From 3228b12622b8feb6caa109952d825e0cd65af171 Mon Sep 17 00:00:00 2001
From: "romain.biessy" <romain.biessy@codeplay.com>
Date: Thu, 24 Aug 2023 09:01:46 +0100
Subject: [PATCH] [SPARSE] Add support for sparse gemv with MKLCPU

---
 CMakeLists.txt                                |   5 +
 README.md                                     |   2 +
 examples/include/example_helper.hpp           |  64 +++-
 examples/sparse_blas/CMakeLists.txt           |  25 ++
 .../compile_time_dispatching/CMakeLists.txt   |  44 +++
 .../sparse_blas_gemv_usm_mklcpu.cpp           | 249 +++++++++++++++
 .../run_time_dispatching/CMakeLists.txt       |  65 ++++
 .../sparse_blas_gemv_usm.cpp                  | 257 ++++++++++++++++
 include/oneapi/mkl.hpp                        |   1 +
 include/oneapi/mkl/detail/backends_table.hpp  |  16 +-
 include/oneapi/mkl/sparse_blas.hpp            |  37 +++
 .../mkl/sparse_blas/detail/helper_types.hpp   |  52 ++++
 .../mklcpu/onemkl_sparse_blas_mklcpu.hpp      |  34 ++
 .../detail/mklcpu/sparse_blas_ct.hpp          |  41 +++
 .../detail/onemkl_sparse_blas_backends.hxx    |  82 +++++
 .../mkl/sparse_blas/detail/sparse_blas_ct.hxx | 121 ++++++++
 .../mkl/sparse_blas/detail/sparse_blas_rt.hpp |  95 ++++++
 include/oneapi/mkl/sparse_blas/types.hpp      |  44 +++
 src/sparse_blas/CMakeLists.txt                |  47 +++
 src/sparse_blas/backends/CMakeLists.txt       |  22 ++
 src/sparse_blas/backends/backend_wrappers.cxx |  83 +++++
 .../backends/mkl_common/mkl_basic.cxx         |  62 ++++
 .../backends/mkl_common/mkl_helper.hpp        |  56 ++++
 .../backends/mkl_common/mkl_operations.cxx    | 125 ++++++++
 .../backends/mklcpu/CMakeLists.txt            |  83 +++++
 .../backends/mklcpu/mklcpu_basic.cpp          |  28 ++
 .../backends/mklcpu/mklcpu_operations.cpp     |  28 ++
 .../backends/mklcpu/mklcpu_wrappers.cpp       |  32 ++
 src/sparse_blas/function_table.hpp            | 291 ++++++++++++++++++
 src/sparse_blas/sparse_blas_loader.cpp        | 167 ++++++++++
 tests/unit_tests/CMakeLists.txt               |   6 +
 tests/unit_tests/sparse_blas/CMakeLists.txt   |  20 ++
 .../sparse_blas/include/sparse_reference.hpp  | 149 +++++++++
 .../sparse_blas/include/test_common.hpp       | 248 +++++++++++++++
 .../sparse_blas/source/CMakeLists.txt         |  56 ++++
 .../sparse_blas/source/sparse_gemv_buffer.cpp | 183 +++++++++++
 .../sparse_blas/source/sparse_gemv_usm.cpp    | 209 +++++++++++++
 37 files changed, 3125 insertions(+), 4 deletions(-)
 create mode 100644 examples/sparse_blas/CMakeLists.txt
 create mode 100644 examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
 create mode 100644 examples/sparse_blas/compile_time_dispatching/sparse_blas_gemv_usm_mklcpu.cpp
 create mode 100644 examples/sparse_blas/run_time_dispatching/CMakeLists.txt
 create mode 100644 examples/sparse_blas/run_time_dispatching/sparse_blas_gemv_usm.cpp
 create mode 100644 include/oneapi/mkl/sparse_blas.hpp
 create mode 100644 include/oneapi/mkl/sparse_blas/detail/helper_types.hpp
 create mode 100644 include/oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp
 create mode 100644 include/oneapi/mkl/sparse_blas/detail/mklcpu/sparse_blas_ct.hpp
 create mode 100644 include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx
 create mode 100644 include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx
 create mode 100644 include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp
 create mode 100644 include/oneapi/mkl/sparse_blas/types.hpp
 create mode 100644 src/sparse_blas/CMakeLists.txt
 create mode 100644 src/sparse_blas/backends/CMakeLists.txt
 create mode 100644 src/sparse_blas/backends/backend_wrappers.cxx
 create mode 100644 src/sparse_blas/backends/mkl_common/mkl_basic.cxx
 create mode 100644 src/sparse_blas/backends/mkl_common/mkl_helper.hpp
 create mode 100644 src/sparse_blas/backends/mkl_common/mkl_operations.cxx
 create mode 100644 src/sparse_blas/backends/mklcpu/CMakeLists.txt
 create mode 100644 src/sparse_blas/backends/mklcpu/mklcpu_basic.cpp
 create mode 100644 src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
 create mode 100644 src/sparse_blas/backends/mklcpu/mklcpu_wrappers.cpp
 create mode 100644 src/sparse_blas/function_table.hpp
 create mode 100644 src/sparse_blas/sparse_blas_loader.cpp
 create mode 100644 tests/unit_tests/sparse_blas/CMakeLists.txt
 create mode 100644 tests/unit_tests/sparse_blas/include/sparse_reference.hpp
 create mode 100644 tests/unit_tests/sparse_blas/include/test_common.hpp
 create mode 100644 tests/unit_tests/sparse_blas/source/CMakeLists.txt
 create mode 100644 tests/unit_tests/sparse_blas/source/sparse_gemv_buffer.cpp
 create mode 100644 tests/unit_tests/sparse_blas/source/sparse_gemv_usm.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e29fcc30..15540c9df 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -105,6 +105,9 @@ if(ENABLE_MKLGPU_BACKEND
 	OR ENABLE_ROCFFT_BACKEND)
   list(APPEND DOMAINS_LIST "dft")
 endif()
+if(ENABLE_MKLCPU_BACKEND)
+  list(APPEND DOMAINS_LIST "sparse_blas")
+endif()
 
 if(ENABLE_PORTBLAS_BACKEND AND
     (ENABLE_MKLCPU_BACKEND OR
@@ -204,6 +207,8 @@ endif()
 if(NOT TARGET_DOMAINS OR TARGET_DOMAINS STREQUAL "None")
   # Set to all by default
   set(TARGET_DOMAINS ${DOMAINS_LIST})
+  # Remove sparse_blas from the default until it is supported by MKLCPU and MKLGPU backends
+  list(REMOVE_ITEM TARGET_DOMAINS "sparse_blas")
 else()
   # Make sure the input was converted to list
   string(REPLACE " " ";" TARGET_DOMAINS ${TARGET_DOMAINS})
diff --git a/README.md b/README.md
index 37a753166..f4a116e98 100644
--- a/README.md
+++ b/README.md
@@ -144,6 +144,8 @@ $> clang++ -fsycl app.o –L$ONEMKL/lib –lonemkl_blas_mklcpu –lonemkl_blas_c
 
 Supported domains: BLAS, LAPACK, RNG, DFT
 
+Support for SPARSE_BLAS domain is in progress and disabled by default. Use it at your own risks.
+
 #### Linux*
 
 <table>
diff --git a/examples/include/example_helper.hpp b/examples/include/example_helper.hpp
index 4f73f8971..312404ee7 100644
--- a/examples/include/example_helper.hpp
+++ b/examples/include/example_helper.hpp
@@ -20,11 +20,19 @@
 #ifndef __EXAMPLE_HELPER_HPP__
 #define __EXAMPLE_HELPER_HPP__
 
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+#include <vector>
+
 //
 // helpers for initializing templated scalar data type values.
 //
 template <typename fp>
-fp set_fp_value(fp arg1, fp arg2 = 0.0) {
+fp set_fp_value(fp arg1, fp /*arg2*/ = fp(0.0)) {
     return arg1;
 }
 
@@ -67,4 +75,58 @@ void rand_matrix(vec &M, oneapi::mkl::transpose trans, int m, int n, int ld) {
     }
 }
 
+template <typename fp, typename intType>
+intType generate_sparse_matrix(const intType nx, intType *ia, intType *ja, fp *a,
+                               const intType index = 0) {
+    intType nz = nx, ny = nx;
+    intType nnz = 0;
+    intType current_row;
+
+    ia[0] = index;
+
+    for (intType iz = 0; iz < nz; iz++) {
+        for (intType iy = 0; iy < ny; iy++) {
+            for (intType ix = 0; ix < nx; ix++) {
+                current_row = iz * nx * ny + iy * nx + ix;
+
+                for (intType sz = -1; sz <= 1; sz++) {
+                    if (iz + sz > -1 && iz + sz < nz) {
+                        for (intType sy = -1; sy <= 1; sy++) {
+                            if (iy + sy > -1 && iy + sy < ny) {
+                                for (intType sx = -1; sx <= 1; sx++) {
+                                    if (ix + sx > -1 && ix + sx < nx) {
+                                        intType current_column =
+                                            current_row + sz * nx * ny + sy * nx + sx;
+                                        ja[nnz] = current_column + index;
+                                        if (current_column == current_row) {
+                                            a[nnz++] = set_fp_value(fp(26.0));
+                                        }
+                                        else {
+                                            a[nnz++] = set_fp_value(fp(-1.0));
+                                        }
+                                    } // end
+                                    // x
+                                    // bounds
+                                    // test
+                                } // end sx loop
+                            } // end y bounds test
+                        } // end sy loop
+                    } // end z bounds test
+                } // end sz loop
+                ia[current_row + 1] = nnz + index;
+
+            } // end ix loop
+        } // end iy loop
+    } // end iz loop
+    return nnz;
+}
+
+template <typename T>
+void free_vec(std::vector<T *> &ptr_vec, sycl::queue queue) {
+    for (auto ptr : ptr_vec) {
+        sycl::free(ptr, queue);
+    }
+    ptr_vec.clear();
+}
+
 #endif //__EXAMPLE_HELPER_HPP__
diff --git a/examples/sparse_blas/CMakeLists.txt b/examples/sparse_blas/CMakeLists.txt
new file mode 100644
index 000000000..721512429
--- /dev/null
+++ b/examples/sparse_blas/CMakeLists.txt
@@ -0,0 +1,25 @@
+#===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+#
+# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+
+add_subdirectory(compile_time_dispatching)
+
+# runtime compilation is only possible with dynamic libraries
+if (BUILD_SHARED_LIBS)
+  add_subdirectory(run_time_dispatching)
+endif()
diff --git a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
new file mode 100644
index 000000000..cb95333b4
--- /dev/null
+++ b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt
@@ -0,0 +1,44 @@
+#===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+#
+# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+
+#Build object from all sources
+set(SPARSE_BLAS_BACKENDS "")
+
+if(ENABLE_MKLCPU_BACKEND)
+  list(APPEND SPARSE_BLAS_BACKENDS "mklcpu")
+endif()
+
+include(WarningsUtils)
+
+foreach(backend ${SPARSE_BLAS_BACKENDS})
+  set(EXAMPLE_NAME example_sparse_blas_gemv_usm_${backend})
+  add_executable(${EXAMPLE_NAME} sparse_blas_gemv_usm_${backend}.cpp)
+  target_include_directories(${EXAMPLE_NAME}
+      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
+      PUBLIC ${PROJECT_SOURCE_DIR}/include
+      PUBLIC ${CMAKE_BINARY_DIR}/bin
+  )
+
+  add_dependencies(${EXAMPLE_NAME} onemkl_sparse_blas_${backend})
+  target_link_libraries(${EXAMPLE_NAME} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_${backend})
+
+  # Register example as ctest
+  add_test(NAME sparse_blas/EXAMPLE/CT/sparse_blas_gemv_usm_${backend} COMMAND ${EXAMPLE_NAME})
+endforeach(backend)
+
diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_gemv_usm_mklcpu.cpp b/examples/sparse_blas/compile_time_dispatching/sparse_blas_gemv_usm_mklcpu.cpp
new file mode 100644
index 000000000..4b86f976d
--- /dev/null
+++ b/examples/sparse_blas/compile_time_dispatching/sparse_blas_gemv_usm_mklcpu.cpp
@@ -0,0 +1,249 @@
+/*******************************************************************************
+* Copyright 2023 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+/*
+*
+*  Content:
+*       This example demonstrates use of DPCPP API oneapi::mkl::sparse::gemv
+*       using unified shared memory to perform general sparse matrix-vector
+*       multiplication on a INTEL CPU SYCL device.
+*
+*       y = alpha * op(A) * x + beta * y
+*
+*       where op() is defined by one of
+*
+*           oneapi::mkl::transpose::{nontrans,trans,conjtrans}
+*
+*
+*       This example demonstrates only single precision (float) data type for
+*       gemv matrix data
+*
+*
+*******************************************************************************/
+
+// stl includes
+#include <iostream>
+#include <vector>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+#include "oneapi/mkl.hpp"
+
+#include "example_helper.hpp"
+
+//
+// Main example for Sparse Matrix-Vector Multiply consisting of
+// initialization of A matrix, x and y vectors as well as
+// scalars alpha and beta.  Then the product
+//
+// y = alpha * op(A) * x + beta * y
+//
+// is performed and finally the results are post processed.
+//
+int run_sparse_matrix_vector_multiply_example(const sycl::device &cpu_dev) {
+    using fp = float;
+    using intType = std::int32_t;
+
+    // Matrix data size
+    intType size = 4;
+    intType nrows = size * size * size;
+
+    // Set scalar fp values
+    fp alpha = set_fp_value(fp(1.0));
+    fp beta = set_fp_value(fp(0.0));
+
+    // Catch asynchronous exceptions
+    auto exception_handler = [](sycl::exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (sycl::exception const &e) {
+                std::cout << "Caught asynchronous SYCL "
+                             "exception during sparse::gemv:\n"
+                          << e.what() << std::endl;
+            }
+        }
+    };
+
+    // create execution queue and buffers of matrix data
+    sycl::queue cpu_queue(cpu_dev, exception_handler);
+    oneapi::mkl::backend_selector<oneapi::mkl::backend::mklcpu> cpu_selector{ cpu_queue };
+
+    intType *ia, *ja;
+    fp *a, *x, *y, *z;
+    std::size_t sizea = static_cast<std::size_t>(27 * nrows);
+    std::size_t sizeja = static_cast<std::size_t>(27 * nrows);
+    std::size_t sizeia = static_cast<std::size_t>(nrows + 1);
+    std::size_t sizevec = static_cast<std::size_t>(nrows);
+
+    ia = (intType *)sycl::malloc_shared(sizeia * sizeof(intType), cpu_queue);
+    ja = (intType *)sycl::malloc_shared(sizeja * sizeof(intType), cpu_queue);
+    a = (fp *)sycl::malloc_shared(sizea * sizeof(fp), cpu_queue);
+    x = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
+    y = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
+    z = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue);
+
+    if (!ia || !ja || !a || !x || !y || !z) {
+        throw std::runtime_error("Failed to allocate USM memory");
+    }
+
+    intType nnz = generate_sparse_matrix<fp, intType>(size, ia, ja, a);
+
+    // Init vectors x and y
+    for (int i = 0; i < nrows; i++) {
+        x[i] = set_fp_value(fp(1.0));
+        y[i] = set_fp_value(fp(0.0));
+        z[i] = set_fp_value(fp(0.0));
+    }
+
+    std::vector<intType *> int_ptr_vec;
+    int_ptr_vec.push_back(ia);
+    int_ptr_vec.push_back(ja);
+    std::vector<fp *> fp_ptr_vec;
+    fp_ptr_vec.push_back(a);
+    fp_ptr_vec.push_back(x);
+    fp_ptr_vec.push_back(y);
+    fp_ptr_vec.push_back(z);
+
+    //
+    // Execute Matrix Multiply
+    //
+
+    oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans;
+    std::cout << "\n\t\tsparse::gemv parameters:\n";
+    std::cout << "\t\t\ttransA = "
+              << (transA == oneapi::mkl::transpose::nontrans
+                      ? "nontrans"
+                      : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
+              << std::endl;
+    std::cout << "\t\t\tnrows = " << nrows << std::endl;
+    std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl;
+
+    // create and initialize handle for a Sparse Matrix in CSR format
+    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
+
+    oneapi::mkl::sparse::init_matrix_handle(cpu_selector, &handle);
+
+    auto ev_set = oneapi::mkl::sparse::set_csr_data(cpu_selector, handle, nrows, nrows, nnz,
+                                                    oneapi::mkl::index_base::zero, ia, ja, a);
+
+    auto ev_opt = oneapi::mkl::sparse::optimize_gemv(cpu_selector, transA, handle, { ev_set });
+
+    auto ev_gemv =
+        oneapi::mkl::sparse::gemv(cpu_selector, transA, alpha, handle, x, beta, y, { ev_opt });
+
+    auto ev_release =
+        oneapi::mkl::sparse::release_matrix_handle(cpu_selector, &handle, { ev_gemv });
+
+    ev_release.wait_and_throw();
+
+    //
+    // Post Processing
+    //
+
+    fp *res = y;
+    for (intType row = 0; row < nrows; row++) {
+        fp tmp = set_fp_value(fp(0.0));
+        for (intType i = ia[row]; i < ia[row + 1]; i++) {
+            tmp += a[i] * x[ja[i]];
+        }
+        z[row] = alpha * tmp + beta * z[row];
+    }
+
+    fp diff = set_fp_value(fp(0.0));
+    for (intType i = 0; i < nrows; i++) {
+        diff += (z[i] - res[i]) * (z[i] - res[i]);
+    }
+    std::cout << "\n\t\t sparse::gemv residual:\n"
+              << "\t\t\t" << diff << "\n\tFinished" << std::endl;
+
+    free_vec(fp_ptr_vec, cpu_queue);
+    free_vec(int_ptr_vec, cpu_queue);
+
+    if (diff > 0)
+        return 1;
+
+    return 0;
+}
+
+//
+// Description of example setup, apis used and supported floating point type
+// precisions
+//
+void print_example_banner() {
+    std::cout << "" << std::endl;
+    std::cout << "########################################################################"
+              << std::endl;
+    std::cout << "# Sparse Matrix-Vector Multiply Example: " << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# y = alpha * op(A) * x + beta * y" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# where A is a sparse matrix in CSR format, x and y are "
+                 "dense vectors"
+              << std::endl;
+    std::cout << "# and alpha, beta are floating point type precision scalars." << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# Using apis:" << std::endl;
+    std::cout << "#   sparse::gemv" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# Using single precision (float) data type" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# Running on Intel CPU device" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "########################################################################"
+              << std::endl;
+    std::cout << std::endl;
+}
+
+//
+// Main entry point for example
+//
+int main(int /*argc*/, char ** /*argv*/) {
+    print_example_banner();
+
+    try {
+        // TODO: Add cuSPARSE compile-time dispatcher in this example once it is supported.
+        sycl::device cpu_dev(sycl::cpu_selector_v);
+
+        std::cout << "Running Sparse BLAS GEMV USM example on CPU device." << std::endl;
+        std::cout << "Device name is: " << cpu_dev.get_info<sycl::info::device::name>()
+                  << std::endl;
+        std::cout << "Running with single precision real data type:" << std::endl;
+
+        run_sparse_matrix_vector_multiply_example(cpu_dev);
+        std::cout << "Sparse BLAS GEMV USM example ran OK." << std::endl;
+    }
+    catch (sycl::exception const &e) {
+        std::cerr << "Caught synchronous SYCL exception during Sparse GEMV:" << std::endl;
+        std::cerr << "\t" << e.what() << std::endl;
+        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
+        return 1;
+    }
+    catch (std::exception const &e) {
+        std::cerr << "Caught std::exception during Sparse GEMV:" << std::endl;
+        std::cerr << "\t" << e.what() << std::endl;
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
new file mode 100644
index 000000000..12d829fe3
--- /dev/null
+++ b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt
@@ -0,0 +1,65 @@
+#===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+#
+# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+
+# NOTE: user needs to set env var SYCL_DEVICE_FILTER to use runtime example (no need to specify backend when building with CMake)
+
+include(WarningsUtils)
+
+# Build object from all example sources
+set(SPARSE_BLAS_RT_SOURCES "sparse_blas_gemv_usm")
+# Set up for the right backend for run-time dispatching examples
+# If users build more than one backend (i.e. mklcpu and mklgpu, or mklcpu and CUDA), they may need to
+# overwrite SYCL_DEVICE_FILTER in their environment to run on the desired backend
+set(DEVICE_FILTERS "")
+if(ENABLE_MKLCPU_BACKEND)
+  list(APPEND DEVICE_FILTERS "cpu")
+endif()
+
+message(STATUS "SYCL_DEVICE_FILTER will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples")
+
+foreach(sparse_blas_rt_sources ${SPARSE_BLAS_RT_SOURCES})
+  add_executable(example_${sparse_blas_rt_sources} ${sparse_blas_rt_sources}.cpp)
+  target_include_directories(example_${sparse_blas_rt_sources}
+      PUBLIC ${PROJECT_SOURCE_DIR}/examples/include
+      PUBLIC ${PROJECT_SOURCE_DIR}/include
+      PUBLIC ${CMAKE_BINARY_DIR}/bin
+  )
+
+  add_dependencies(example_${sparse_blas_rt_sources} onemkl)
+
+  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
+    add_sycl_to_target(TARGET example_${sparse_blas_rt_sources} SOURCES ${SPARSE_BLAS_RT_SOURCES})
+  endif()
+
+  target_link_libraries(example_${sparse_blas_rt_sources}
+      PUBLIC onemkl
+      PUBLIC ONEMKL::SYCL::SYCL
+      PUBLIC ${CMAKE_DL_LIBS}
+      PRIVATE onemkl_warnings
+  )
+
+  # Register example as ctest
+  foreach(device_filter ${DEVICE_FILTERS})
+    add_test(NAME ${domain}/EXAMPLE/RT/${sparse_blas_rt_sources}/${device_filter} COMMAND example_${sparse_blas_rt_sources})
+    set_property(TEST ${domain}/EXAMPLE/RT/${sparse_blas_rt_sources}/${device_filter} PROPERTY
+      ENVIRONMENT LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/lib:$ENV{LD_LIBRARY_PATH}
+      ENVIRONMENT SYCL_DEVICE_FILTER=${device_filter})
+  endforeach(device_filter)
+
+endforeach()
diff --git a/examples/sparse_blas/run_time_dispatching/sparse_blas_gemv_usm.cpp b/examples/sparse_blas/run_time_dispatching/sparse_blas_gemv_usm.cpp
new file mode 100644
index 000000000..fb5dc8a67
--- /dev/null
+++ b/examples/sparse_blas/run_time_dispatching/sparse_blas_gemv_usm.cpp
@@ -0,0 +1,257 @@
+/*******************************************************************************
+* Copyright 2023 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+/*
+*
+*  Content:
+*       This example demonstrates use of DPCPP API oneapi::mkl::sparse::gemv
+*       using unified shared memory to perform general sparse matrix-vector
+*       multiplication on a SYCL device (HOST, CPU, GPU) that is selected
+*       during runtime.
+*
+*       y = alpha * op(A) * x + beta * y
+*
+*       where op() is defined by one of
+*
+*           oneapi::mkl::transpose::{nontrans,trans,conjtrans}
+*
+*
+*       This example demonstrates only single precision (float) data type for
+*       gemv matrix data
+*
+*
+*******************************************************************************/
+
+// stl includes
+#include <iostream>
+#include <vector>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+#include "oneapi/mkl.hpp"
+
+#include "example_helper.hpp"
+
+//
+// Main example for Sparse Matrix-Vector Multiply consisting of
+// initialization of A matrix, x and y vectors as well as
+// scalars alpha and beta.  Then the product
+//
+// y = alpha * op(A) * x + beta * y
+//
+// is performed and finally the results are post processed.
+//
+int run_sparse_matrix_vector_multiply_example(const sycl::device &dev) {
+    using fp = float;
+    using intType = std::int32_t;
+
+    // Matrix data size
+    intType size = 4;
+    intType nrows = size * size * size;
+
+    // Set scalar fp values
+    fp alpha = set_fp_value(fp(1.0));
+    fp beta = set_fp_value(fp(0.0));
+
+    // Catch asynchronous exceptions
+    auto exception_handler = [](sycl::exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (sycl::exception const &e) {
+                std::cout << "Caught asynchronous SYCL "
+                             "exception during sparse::gemv:\n"
+                          << e.what() << std::endl;
+            }
+        }
+    };
+
+    // create execution queue and buffers of matrix data
+    sycl::queue main_queue(dev, exception_handler);
+
+    intType *ia, *ja;
+    fp *a, *x, *y, *z;
+    std::size_t sizea = static_cast<std::size_t>(27 * nrows);
+    std::size_t sizeja = static_cast<std::size_t>(27 * nrows);
+    std::size_t sizeia = static_cast<std::size_t>(nrows + 1);
+    std::size_t sizevec = static_cast<std::size_t>(nrows);
+
+    ia = (intType *)sycl::malloc_shared(sizeia * sizeof(intType), main_queue);
+    ja = (intType *)sycl::malloc_shared(sizeja * sizeof(intType), main_queue);
+    a = (fp *)sycl::malloc_shared(sizea * sizeof(fp), main_queue);
+    x = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), main_queue);
+    y = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), main_queue);
+    z = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), main_queue);
+
+    if (!ia || !ja || !a || !x || !y || !z) {
+        throw std::runtime_error("Failed to allocate USM memory");
+    }
+
+    intType nnz = generate_sparse_matrix<fp, intType>(size, ia, ja, a);
+
+    // Init vectors x and y
+    for (int i = 0; i < nrows; i++) {
+        x[i] = set_fp_value(fp(1.0));
+        y[i] = set_fp_value(fp(0.0));
+        z[i] = set_fp_value(fp(0.0));
+    }
+
+    std::vector<intType *> int_ptr_vec;
+    int_ptr_vec.push_back(ia);
+    int_ptr_vec.push_back(ja);
+    std::vector<fp *> fp_ptr_vec;
+    fp_ptr_vec.push_back(a);
+    fp_ptr_vec.push_back(x);
+    fp_ptr_vec.push_back(y);
+    fp_ptr_vec.push_back(z);
+
+    //
+    // Execute Matrix Multiply
+    //
+
+    oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans;
+    std::cout << "\n\t\tsparse::gemv parameters:\n";
+    std::cout << "\t\t\ttransA = "
+              << (transA == oneapi::mkl::transpose::nontrans
+                      ? "nontrans"
+                      : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans"))
+              << std::endl;
+    std::cout << "\t\t\tnrows = " << nrows << std::endl;
+    std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl;
+
+    // create and initialize handle for a Sparse Matrix in CSR format
+    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
+
+    oneapi::mkl::sparse::init_matrix_handle(main_queue, &handle);
+
+    auto ev_set = oneapi::mkl::sparse::set_csr_data(main_queue, handle, nrows, nrows, nnz,
+                                                    oneapi::mkl::index_base::zero, ia, ja, a);
+
+    auto ev_opt = oneapi::mkl::sparse::optimize_gemv(main_queue, transA, handle, { ev_set });
+
+    auto ev_gemv =
+        oneapi::mkl::sparse::gemv(main_queue, transA, alpha, handle, x, beta, y, { ev_opt });
+
+    auto ev_release = oneapi::mkl::sparse::release_matrix_handle(main_queue, &handle, { ev_gemv });
+
+    ev_release.wait_and_throw();
+
+    //
+    // Post Processing
+    //
+
+    fp *res = y;
+    for (intType row = 0; row < nrows; row++) {
+        fp tmp = set_fp_value(fp(0.0));
+        for (intType i = ia[row]; i < ia[row + 1]; i++) {
+            tmp += a[i] * x[ja[i]];
+        }
+        z[row] = alpha * tmp + beta * z[row];
+    }
+
+    fp diff = set_fp_value(fp(0.0));
+    for (intType i = 0; i < nrows; i++) {
+        diff += (z[i] - res[i]) * (z[i] - res[i]);
+    }
+    std::cout << "\n\t\t sparse::gemv residual:\n"
+              << "\t\t\t" << diff << "\n\tFinished" << std::endl;
+
+    free_vec(fp_ptr_vec, main_queue);
+    free_vec(int_ptr_vec, main_queue);
+
+    if (diff > 0)
+        return 1;
+
+    return 0;
+}
+
+//
+// Description of example setup, apis used and supported floating point type
+// precisions
+//
+void print_example_banner() {
+    std::cout << "" << std::endl;
+    std::cout << "########################################################################"
+              << std::endl;
+    std::cout << "# Sparse Matrix-Vector Multiply Example: " << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# y = alpha * op(A) * x + beta * y" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# where A is a sparse matrix in CSR format, x and y are "
+                 "dense vectors"
+              << std::endl;
+    std::cout << "# and alpha, beta are floating point type precision scalars." << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# Using apis:" << std::endl;
+    std::cout << "#   sparse::gemv" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# Using single precision (float) data type" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "# Device will be selected during runtime." << std::endl;
+    std::cout << "# The environment variable SYCL_DEVICE_FILTER can be used to specify"
+              << std::endl;
+    std::cout << "# SYCL device" << std::endl;
+    std::cout << "# " << std::endl;
+    std::cout << "########################################################################"
+              << std::endl;
+    std::cout << std::endl;
+}
+
+//
+// Main entry point for example
+//
+int main(int /*argc*/, char ** /*argv*/) {
+    print_example_banner();
+
+    try {
+        sycl::device dev = sycl::device();
+
+        if (dev.is_gpu()) {
+            std::cout << "Running Sparse BLAS GEMV USM example on GPU device." << std::endl;
+            std::cout << "Device name is: " << dev.get_info<sycl::info::device::name>()
+                      << std::endl;
+        }
+        else {
+            std::cout << "Running Sparse BLAS GEMV USM example on CPU device." << std::endl;
+            std::cout << "Device name is: " << dev.get_info<sycl::info::device::name>()
+                      << std::endl;
+        }
+        std::cout << "Running with single precision real data type:" << std::endl;
+
+        run_sparse_matrix_vector_multiply_example(dev);
+        std::cout << "Sparse BLAS GEMV USM example ran OK." << std::endl;
+    }
+    catch (sycl::exception const &e) {
+        std::cerr << "Caught synchronous SYCL exception during Sparse GEMV:" << std::endl;
+        std::cerr << "\t" << e.what() << std::endl;
+        std::cerr << "\tSYCL error code: " << e.code().value() << std::endl;
+        return 1;
+    }
+    catch (std::exception const &e) {
+        std::cerr << "Caught std::exception during Sparse GEMV:" << std::endl;
+        std::cerr << "\t" << e.what() << std::endl;
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/include/oneapi/mkl.hpp b/include/oneapi/mkl.hpp
index a49c1ceda..f3e9b8618 100644
--- a/include/oneapi/mkl.hpp
+++ b/include/oneapi/mkl.hpp
@@ -26,5 +26,6 @@
 #include "oneapi/mkl/dft.hpp"
 #include "oneapi/mkl/lapack.hpp"
 #include "oneapi/mkl/rng.hpp"
+#include "oneapi/mkl/sparse_blas.hpp"
 
 #endif //_ONEMKL_HPP_
diff --git a/include/oneapi/mkl/detail/backends_table.hpp b/include/oneapi/mkl/detail/backends_table.hpp
index be5e3d897..ed1facd7f 100644
--- a/include/oneapi/mkl/detail/backends_table.hpp
+++ b/include/oneapi/mkl/detail/backends_table.hpp
@@ -41,7 +41,7 @@ namespace oneapi {
 namespace mkl {
 
 enum class device : uint16_t { x86cpu, intelgpu, nvidiagpu, amdgpu };
-enum class domain : uint16_t { blas, dft, lapack, rng };
+enum class domain : uint16_t { blas, dft, lapack, rng, sparse_blas };
 
 static std::map<domain, std::map<device, std::vector<const char*>>> libraries = {
     { domain::blas,
@@ -161,13 +161,23 @@ static std::map<domain, std::map<device, std::vector<const char*>>> libraries =
 #ifdef ENABLE_CURAND_BACKEND
               LIB_NAME("rng_curand")
 #endif
-          } } } }
+          } } } },
+
+    { domain::sparse_blas,
+      { { device::x86cpu,
+          {
+#ifdef ENABLE_MKLCPU_BACKEND
+              LIB_NAME("sparse_blas_mklcpu")
+#endif
+          } } } },
 };
 
 static std::map<domain, const char*> table_names = { { domain::blas, "mkl_blas_table" },
                                                      { domain::lapack, "mkl_lapack_table" },
                                                      { domain::dft, "mkl_dft_table" },
-                                                     { domain::rng, "mkl_rng_table" } };
+                                                     { domain::rng, "mkl_rng_table" },
+                                                     { domain::sparse_blas,
+                                                       "mkl_sparse_blas_table" } };
 
 } //namespace mkl
 } //namespace oneapi
diff --git a/include/oneapi/mkl/sparse_blas.hpp b/include/oneapi/mkl/sparse_blas.hpp
new file mode 100644
index 000000000..139c30dc5
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas.hpp
@@ -0,0 +1,37 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_HPP_
+#define _ONEMKL_SPARSE_BLAS_HPP_
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+#include "oneapi/mkl/detail/config.hpp"
+
+#ifdef ENABLE_MKLCPU_BACKEND
+#include "sparse_blas/detail/mklcpu/sparse_blas_ct.hpp"
+#endif
+
+#include "sparse_blas/detail/sparse_blas_rt.hpp"
+
+#endif // _ONEMKL_SPARSE_BLAS_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/helper_types.hpp b/include/oneapi/mkl/sparse_blas/detail/helper_types.hpp
new file mode 100644
index 000000000..4964b1eff
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/helper_types.hpp
@@ -0,0 +1,52 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_HELPER_TYPES_HPP_
+#define _ONEMKL_SPARSE_BLAS_DETAIL_HELPER_TYPES_HPP_
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+
+namespace oneapi {
+namespace mkl {
+namespace sparse {
+namespace detail {
+
+struct matrix_handle;
+
+template <typename fpType>
+inline constexpr bool is_fp_supported_v =
+    std::is_same_v<fpType, float> || std::is_same_v<fpType, double> ||
+    std::is_same_v<fpType, std::complex<float>> || std::is_same_v<fpType, std::complex<double>>;
+
+template <typename intType>
+inline constexpr bool is_int_supported_v =
+    std::is_same_v<intType, std::int32_t> || std::is_same_v<intType, std::int64_t>;
+
+template <typename fpType, typename intType>
+inline constexpr bool are_fp_int_supported_v =
+    is_fp_supported_v<fpType>&& is_int_supported_v<intType>;
+
+} // namespace detail
+} // namespace sparse
+} // namespace mkl
+} // namespace oneapi
+
+#endif // _ONEMKL_SPARSE_BLAS_DETAIL_HELPER_TYPES_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp b/include/oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp
new file mode 100644
index 000000000..2535e61f6
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp
@@ -0,0 +1,34 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_ONEMKL_SPARSE_BLAS_MKLCPU_HPP_
+#define _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_ONEMKL_SPARSE_BLAS_MKLCPU_HPP_
+
+#include "oneapi/mkl/detail/export.hpp"
+#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp"
+
+namespace oneapi::mkl::sparse::mklcpu {
+
+namespace detail = oneapi::mkl::sparse::detail;
+
+#include "oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx"
+
+} // namespace oneapi::mkl::sparse::mklcpu
+
+#endif // _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_ONEMKL_SPARSE_BLAS_MKLCPU_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/mklcpu/sparse_blas_ct.hpp b/include/oneapi/mkl/sparse_blas/detail/mklcpu/sparse_blas_ct.hpp
new file mode 100644
index 000000000..bc0089c57
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/mklcpu/sparse_blas_ct.hpp
@@ -0,0 +1,41 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_SPARSE_BLAS_CT_HPP_
+#define _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_SPARSE_BLAS_CT_HPP_
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+#include "oneapi/mkl/detail/backends.hpp"
+#include "oneapi/mkl/detail/backend_selector.hpp"
+
+#include "onemkl_sparse_blas_mklcpu.hpp"
+
+namespace oneapi {
+namespace mkl {
+namespace sparse {
+
+#define BACKEND mklcpu
+#include "oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx"
+#undef BACKEND
+
+} //namespace sparse
+} //namespace mkl
+} //namespace oneapi
+
+#endif // _ONEMKL_SPARSE_BLAS_DETAIL_MKLCPU_SPARSE_BLAS_CT_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx b/include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx
new file mode 100644
index 000000000..7541f004a
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx
@@ -0,0 +1,82 @@
+/***************************************************************************
+*  Copyright(C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0(the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+// This file is meant to be included in each backend onemkl_sparse_blas_BACKEND.hpp files.
+// It is used to exports each symbol to the onemkl_sparse_blas_BACKEND library.
+
+ONEMKL_EXPORT void init_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle);
+
+ONEMKL_EXPORT sycl::event release_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle,
+                                                const std::vector<sycl::event> &dependencies = {});
+
+template <typename fpType, typename intType>
+ONEMKL_EXPORT std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>> set_csr_data(
+    sycl::queue &queue, matrix_handle_t handle, intType num_rows, intType num_cols, intType nnz,
+    index_base index, sycl::buffer<intType, 1> &row_ptr, sycl::buffer<intType, 1> &col_ind,
+    sycl::buffer<fpType, 1> &val);
+
+template <typename fpType, typename intType>
+ONEMKL_EXPORT std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>, sycl::event>
+set_csr_data(sycl::queue &queue, matrix_handle_t handle, intType num_rows, intType num_cols,
+             intType nnz, index_base index, intType *row_ptr, intType *col_ind, fpType *val,
+             const std::vector<sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT sycl::event optimize_gemv(sycl::queue &queue, transpose transpose_val,
+                                        matrix_handle_t handle,
+                                        const std::vector<sycl::event> &dependencies = {});
+
+ONEMKL_EXPORT sycl::event optimize_trsv(sycl::queue &queue, uplo uplo_val, transpose transpose_val,
+                                        diag diag_val, matrix_handle_t handle,
+                                        const std::vector<sycl::event> &dependencies = {});
+
+template <typename fpType>
+ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>> gemv(
+    sycl::queue &queue, transpose transpose_val, const fpType alpha, matrix_handle_t A_handle,
+    sycl::buffer<fpType, 1> &x, const fpType beta, sycl::buffer<fpType, 1> &y);
+
+template <typename fpType>
+ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemv(
+    sycl::queue &queue, transpose transpose_val, const fpType alpha, matrix_handle_t A_handle,
+    const fpType *x, const fpType beta, fpType *y,
+    const std::vector<sycl::event> &dependencies = {});
+
+template <typename fpType>
+ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>> trsv(
+    sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,
+    matrix_handle_t A_handle, sycl::buffer<fpType, 1> &x, sycl::buffer<fpType, 1> &y);
+
+template <typename fpType>
+ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> trsv(
+    sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,
+    matrix_handle_t A_handle, const fpType *x, fpType *y,
+    const std::vector<sycl::event> &dependencies = {});
+
+template <typename fpType>
+ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>> gemm(
+    sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A, transpose transpose_B,
+    const fpType alpha, matrix_handle_t A_handle, sycl::buffer<fpType, 1> &B,
+    const std::int64_t columns, const std::int64_t ldb, const fpType beta,
+    sycl::buffer<fpType, 1> &C, const std::int64_t ldc);
+
+template <typename fpType>
+ONEMKL_EXPORT std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemm(
+    sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A, transpose transpose_B,
+    const fpType alpha, matrix_handle_t A_handle, const fpType *B, const std::int64_t columns,
+    const std::int64_t ldb, const fpType beta, fpType *C, const std::int64_t ldc,
+    const std::vector<sycl::event> &dependencies = {});
diff --git a/include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx b/include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx
new file mode 100644
index 000000000..d769e32c2
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx
@@ -0,0 +1,121 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+// This file is meant to be included in each backend sparse_blas_ct.hpp files
+// Each function calls the implementation from onemkl_sparse_blas_backends.hxx
+
+#ifndef BACKEND
+#error "BACKEND is not defined"
+#endif
+
+inline void init_matrix_handle(backend_selector<backend::BACKEND> selector,
+                               matrix_handle_t *p_handle) {
+    BACKEND::init_matrix_handle(selector.get_queue(), p_handle);
+}
+
+inline sycl::event release_matrix_handle(backend_selector<backend::BACKEND> selector,
+                                         matrix_handle_t *p_handle,
+                                         const std::vector<sycl::event> &dependencies = {}) {
+    return BACKEND::release_matrix_handle(selector.get_queue(), p_handle, dependencies);
+}
+
+template <typename fpType, typename intType>
+std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>> set_csr_data(
+    backend_selector<backend::BACKEND> selector, matrix_handle_t handle, intType num_rows,
+    intType num_cols, intType nnz, index_base index, sycl::buffer<intType, 1> &row_ptr,
+    sycl::buffer<intType, 1> &col_ind, sycl::buffer<fpType, 1> &val) {
+    BACKEND::set_csr_data(selector.get_queue(), handle, num_rows, num_cols, nnz, index, row_ptr,
+                          col_ind, val);
+}
+
+template <typename fpType, typename intType>
+std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>, sycl::event> set_csr_data(
+    backend_selector<backend::BACKEND> selector, matrix_handle_t handle, intType num_rows,
+    intType num_cols, intType nnz, index_base index, intType *row_ptr, intType *col_ind,
+    fpType *val, const std::vector<sycl::event> &dependencies = {}) {
+    return BACKEND::set_csr_data(selector.get_queue(), handle, num_rows, num_cols, nnz, index,
+                                 row_ptr, col_ind, val, dependencies);
+}
+
+inline sycl::event optimize_gemv(backend_selector<backend::BACKEND> selector,
+                                 transpose transpose_val, matrix_handle_t handle,
+                                 const std::vector<sycl::event> &dependencies = {}) {
+    return BACKEND::optimize_gemv(selector.get_queue(), transpose_val, handle, dependencies);
+}
+
+inline sycl::event optimize_trsv(backend_selector<backend::BACKEND> selector, uplo uplo_val,
+                                 transpose transpose_val, diag diag_val, matrix_handle_t handle,
+                                 const std::vector<sycl::event> &dependencies = {}) {
+    return BACKEND::optimize_trsv(selector.get_queue(), uplo_val, transpose_val, diag_val, handle,
+                                  dependencies);
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>> gemv(
+    backend_selector<backend::BACKEND> selector, transpose transpose_val, const fpType alpha,
+    matrix_handle_t A_handle, sycl::buffer<fpType, 1> &x, const fpType beta,
+    sycl::buffer<fpType, 1> &y) {
+    BACKEND::gemv(selector.get_queue(), transpose_val, alpha, A_handle, x, beta, y);
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemv(
+    backend_selector<backend::BACKEND> selector, transpose transpose_val, const fpType alpha,
+    matrix_handle_t A_handle, const fpType *x, const fpType beta, fpType *y,
+    const std::vector<sycl::event> &dependencies = {}) {
+    return BACKEND::gemv(selector.get_queue(), transpose_val, alpha, A_handle, x, beta, y,
+                         dependencies);
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>> trsv(
+    backend_selector<backend::BACKEND> selector, uplo uplo_val, transpose transpose_val,
+    diag diag_val, matrix_handle_t A_handle, sycl::buffer<fpType, 1> &x,
+    sycl::buffer<fpType, 1> &y) {
+    BACKEND::trsv(selector.get_queue(), uplo_val, transpose_val, diag_val, A_handle, x, y);
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> trsv(
+    backend_selector<backend::BACKEND> selector, uplo uplo_val, transpose transpose_val,
+    diag diag_val, matrix_handle_t A_handle, const fpType *x, fpType *y,
+    const std::vector<sycl::event> &dependencies = {}) {
+    return BACKEND::trsv(selector.get_queue(), uplo_val, transpose_val, diag_val, A_handle, x, y,
+                         dependencies);
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>> gemm(
+    backend_selector<backend::BACKEND> selector, layout dense_matrix_layout, transpose transpose_A,
+    transpose transpose_B, const fpType alpha, matrix_handle_t A_handle, sycl::buffer<fpType, 1> &B,
+    const std::int64_t columns, const std::int64_t ldb, const fpType beta,
+    sycl::buffer<fpType, 1> &C, const std::int64_t ldc) {
+    BACKEND::gemm(selector.get_queue(), dense_matrix_layout, transpose_A, transpose_B, alpha,
+                  A_handle, B, columns, ldb, beta, C, ldc);
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemm(
+    backend_selector<backend::BACKEND> selector, layout dense_matrix_layout, transpose transpose_A,
+    transpose transpose_B, const fpType alpha, matrix_handle_t A_handle, const fpType *B,
+    const std::int64_t columns, const std::int64_t ldb, const fpType beta, fpType *C,
+    const std::int64_t ldc, const std::vector<sycl::event> &dependencies = {}) {
+    return BACKEND::gemm(selector.get_queue(), dense_matrix_layout, transpose_A, transpose_B, alpha,
+                         A_handle, B, columns, ldb, beta, C, ldc, dependencies);
+}
diff --git a/include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp b/include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp
new file mode 100644
index 000000000..af46f95a6
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp
@@ -0,0 +1,95 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_SPARSE_BLAS_RT_HPP_
+#define _ONEMKL_SPARSE_BLAS_DETAIL_SPARSE_BLAS_RT_HPP_
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+
+namespace oneapi {
+namespace mkl {
+namespace sparse {
+
+void init_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle);
+
+sycl::event release_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle,
+                                  const std::vector<sycl::event> &dependencies = {});
+
+template <typename fpType, typename intType>
+std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>> set_csr_data(
+    sycl::queue &queue, matrix_handle_t handle, intType num_rows, intType num_cols, intType nnz,
+    index_base index, sycl::buffer<intType, 1> &row_ptr, sycl::buffer<intType, 1> &col_ind,
+    sycl::buffer<fpType, 1> &val);
+
+template <typename fpType, typename intType>
+std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>, sycl::event> set_csr_data(
+    sycl::queue &queue, matrix_handle_t handle, intType num_rows, intType num_cols, intType nnz,
+    index_base index, intType *row_ptr, intType *col_ind, fpType *val,
+    const std::vector<sycl::event> &dependencies = {});
+
+sycl::event optimize_gemv(sycl::queue &queue, transpose transpose_val, matrix_handle_t handle,
+                          const std::vector<sycl::event> &dependencies = {});
+
+sycl::event optimize_trsv(sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,
+                          matrix_handle_t handle,
+                          const std::vector<sycl::event> &dependencies = {});
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>> gemv(
+    sycl::queue &queue, transpose transpose_val, const fpType alpha, matrix_handle_t A_handle,
+    sycl::buffer<fpType, 1> &x, const fpType beta, sycl::buffer<fpType, 1> &y);
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemv(
+    sycl::queue &queue, transpose transpose_val, const fpType alpha, matrix_handle_t A_handle,
+    const fpType *x, const fpType beta, fpType *y,
+    const std::vector<sycl::event> &dependencies = {});
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>> trsv(sycl::queue &queue, uplo uplo_val,
+                                                         transpose transpose_val, diag diag_val,
+                                                         matrix_handle_t A_handle,
+                                                         sycl::buffer<fpType, 1> &x,
+                                                         sycl::buffer<fpType, 1> &y);
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> trsv(
+    sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,
+    matrix_handle_t A_handle, const fpType *x, fpType *y,
+    const std::vector<sycl::event> &dependencies = {});
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>> gemm(
+    sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A, transpose transpose_B,
+    const fpType alpha, matrix_handle_t A_handle, sycl::buffer<fpType, 1> &B,
+    const std::int64_t columns, const std::int64_t ldb, const fpType beta,
+    sycl::buffer<fpType, 1> &C, const std::int64_t ldc);
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemm(
+    sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A, transpose transpose_B,
+    const fpType alpha, matrix_handle_t A_handle, const fpType *B, const std::int64_t columns,
+    const std::int64_t ldb, const fpType beta, fpType *C, const std::int64_t ldc,
+    const std::vector<sycl::event> &dependencies = {});
+
+} // namespace sparse
+} // namespace mkl
+} // namespace oneapi
+
+#endif // _ONEMKL_SPARSE_BLAS_DETAIL_SPARSE_BLAS_RT_HPP_
diff --git a/include/oneapi/mkl/sparse_blas/types.hpp b/include/oneapi/mkl/sparse_blas/types.hpp
new file mode 100644
index 000000000..406c7dd1f
--- /dev/null
+++ b/include/oneapi/mkl/sparse_blas/types.hpp
@@ -0,0 +1,44 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_TYPES_HPP_
+#define _ONEMKL_SPARSE_BLAS_TYPES_HPP_
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+#include <vector>
+
+#include "oneapi/mkl/types.hpp"
+#include "detail/helper_types.hpp"
+
+namespace oneapi {
+namespace mkl {
+namespace sparse {
+
+using matrix_handle_t = detail::matrix_handle*;
+
+} // namespace sparse
+} // namespace mkl
+} // namespace oneapi
+
+#endif // _ONEMKL_SPARSE_BLAS_TYPES_HPP_
diff --git a/src/sparse_blas/CMakeLists.txt b/src/sparse_blas/CMakeLists.txt
new file mode 100644
index 000000000..e66158e44
--- /dev/null
+++ b/src/sparse_blas/CMakeLists.txt
@@ -0,0 +1,47 @@
+#===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+#
+# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+
+add_subdirectory(backends)
+
+if(BUILD_SHARED_LIBS)
+  add_library(onemkl_sparse_blas OBJECT)
+  target_sources(onemkl_sparse_blas PRIVATE sparse_blas_loader.cpp)
+  target_include_directories(onemkl_sparse_blas
+    PRIVATE ${PROJECT_SOURCE_DIR}/include
+            ${PROJECT_SOURCE_DIR}/src
+            ${PROJECT_SOURCE_DIR}/src/include
+            ${CMAKE_BINARY_DIR}/bin
+            $<TARGET_FILE_DIR:onemkl>
+  )
+
+  target_compile_options(onemkl_sparse_blas PRIVATE ${ONEMKL_BUILD_COPT})
+
+  set_target_properties(onemkl_sparse_blas PROPERTIES
+    POSITION_INDEPENDENT_CODE ON
+  )
+  if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
+    add_sycl_to_target(TARGET onemkl_sparse_blas SOURCES sparse_blas_loader.cpp)
+  else()
+    target_link_libraries(onemkl_sparse_blas PUBLIC ONEMKL::SYCL::SYCL)
+  endif()
+
+  include(WarningsUtils)
+  target_link_libraries(onemkl_sparse_blas PRIVATE onemkl_warnings)
+
+endif()
diff --git a/src/sparse_blas/backends/CMakeLists.txt b/src/sparse_blas/backends/CMakeLists.txt
new file mode 100644
index 000000000..72a2dd207
--- /dev/null
+++ b/src/sparse_blas/backends/CMakeLists.txt
@@ -0,0 +1,22 @@
+#===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+#
+# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+
+if(ENABLE_MKLCPU_BACKEND)
+  add_subdirectory(mklcpu)
+endif()
diff --git a/src/sparse_blas/backends/backend_wrappers.cxx b/src/sparse_blas/backends/backend_wrappers.cxx
new file mode 100644
index 000000000..b97bc567b
--- /dev/null
+++ b/src/sparse_blas/backends/backend_wrappers.cxx
@@ -0,0 +1,83 @@
+/*******************************************************************************
+* Copyright 2023 Codeplay Software Ltd.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+/*
+This file lists functions matching those required by sparse_blas_function_table_t in
+src/sparse_blas/function_table.hpp.
+
+To use this:
+
+#define WRAPPER_VERSION <Wrapper version number>
+#define BACKEND         <Backend name eg. mklgpu>
+
+extern "C" sparse_blas_function_table_t mkl_sparse_blas_table = {
+    WRAPPER_VERSION,
+#include "sparse_blas/backends/backend_wrappers.cxx"
+};
+
+Changes to this file should be matched to changes in sparse_blas/function_table.hpp. The required
+function template instantiations must be added to backend_sparse_blas_instantiations.cxx.
+*/
+
+// clang-format off
+oneapi::mkl::sparse::BACKEND::init_matrix_handle,
+oneapi::mkl::sparse::BACKEND::release_matrix_handle,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::set_csr_data,
+oneapi::mkl::sparse::BACKEND::optimize_gemv,
+oneapi::mkl::sparse::BACKEND::optimize_trsv,
+oneapi::mkl::sparse::BACKEND::gemv,
+oneapi::mkl::sparse::BACKEND::gemv,
+oneapi::mkl::sparse::BACKEND::gemv,
+oneapi::mkl::sparse::BACKEND::gemv,
+oneapi::mkl::sparse::BACKEND::gemv,
+oneapi::mkl::sparse::BACKEND::gemv,
+oneapi::mkl::sparse::BACKEND::gemv,
+oneapi::mkl::sparse::BACKEND::gemv,
+oneapi::mkl::sparse::BACKEND::trsv,
+oneapi::mkl::sparse::BACKEND::trsv,
+oneapi::mkl::sparse::BACKEND::trsv,
+oneapi::mkl::sparse::BACKEND::trsv,
+oneapi::mkl::sparse::BACKEND::trsv,
+oneapi::mkl::sparse::BACKEND::trsv,
+oneapi::mkl::sparse::BACKEND::trsv,
+oneapi::mkl::sparse::BACKEND::trsv,
+oneapi::mkl::sparse::BACKEND::gemm,
+oneapi::mkl::sparse::BACKEND::gemm,
+oneapi::mkl::sparse::BACKEND::gemm,
+oneapi::mkl::sparse::BACKEND::gemm,
+oneapi::mkl::sparse::BACKEND::gemm,
+oneapi::mkl::sparse::BACKEND::gemm,
+oneapi::mkl::sparse::BACKEND::gemm,
+oneapi::mkl::sparse::BACKEND::gemm,
+    // clang-format on
diff --git a/src/sparse_blas/backends/mkl_common/mkl_basic.cxx b/src/sparse_blas/backends/mkl_common/mkl_basic.cxx
new file mode 100644
index 000000000..fd3b1563a
--- /dev/null
+++ b/src/sparse_blas/backends/mkl_common/mkl_basic.cxx
@@ -0,0 +1,62 @@
+/*******************************************************************************
+* Copyright 2023 Codeplay Software Ltd.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+void init_matrix_handle(sycl::queue & /*queue*/, detail::matrix_handle **p_handle) {
+    oneapi::mkl::sparse::init_matrix_handle(detail::get_handle(p_handle));
+}
+
+sycl::event release_matrix_handle(sycl::queue &queue, detail::matrix_handle **p_handle,
+                                  const std::vector<sycl::event> &dependencies) {
+    return oneapi::mkl::sparse::release_matrix_handle(queue, detail::get_handle(p_handle),
+                                                      dependencies);
+}
+
+template <typename fpType, typename intType>
+std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>> set_csr_data(
+    sycl::queue &queue, detail::matrix_handle *handle, intType num_rows, intType num_cols,
+    intType /*nnz*/, index_base index, sycl::buffer<intType, 1> &row_ptr,
+    sycl::buffer<intType, 1> &col_ind, sycl::buffer<fpType, 1> &val) {
+    oneapi::mkl::sparse::set_csr_data(queue, detail::get_handle(handle), num_rows, num_cols, index,
+                                      row_ptr, col_ind, val);
+}
+
+template <typename fpType, typename intType>
+std::enable_if_t<detail::are_fp_int_supported_v<fpType, intType>, sycl::event> set_csr_data(
+    sycl::queue &queue, detail::matrix_handle *handle, intType num_rows, intType num_cols,
+    intType /*nnz*/, index_base index, intType *row_ptr, intType *col_ind, fpType *val,
+    const std::vector<sycl::event> &dependencies) {
+    return oneapi::mkl::sparse::set_csr_data(queue, detail::get_handle(handle), num_rows, num_cols,
+                                             index, row_ptr, col_ind, val, dependencies);
+}
+
+#define INSTANTIATE_SET_CSR_DATA(FP_TYPE, INT_TYPE)                                                \
+    template std::enable_if_t<detail::are_fp_int_supported_v<FP_TYPE, INT_TYPE>>                   \
+    set_csr_data<FP_TYPE, INT_TYPE>(                                                               \
+        sycl::queue & queue, detail::matrix_handle * handle, INT_TYPE num_rows, INT_TYPE num_cols, \
+        INT_TYPE nnz, index_base index, sycl::buffer<INT_TYPE, 1> & row_ptr,                       \
+        sycl::buffer<INT_TYPE, 1> & col_ind, sycl::buffer<FP_TYPE, 1> & val);                      \
+    template std::enable_if_t<detail::are_fp_int_supported_v<FP_TYPE, INT_TYPE>, sycl::event>      \
+    set_csr_data<FP_TYPE, INT_TYPE>(sycl::queue & queue, detail::matrix_handle * handle,           \
+                                    INT_TYPE num_rows, INT_TYPE num_cols, INT_TYPE nnz,            \
+                                    index_base index, INT_TYPE * row_ptr, INT_TYPE * col_ind,      \
+                                    FP_TYPE * val, const std::vector<sycl::event> &dependencies)
+
+FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_SET_CSR_DATA);
+
+#undef INSTANTIATE_SET_CSR_DATA
diff --git a/src/sparse_blas/backends/mkl_common/mkl_helper.hpp b/src/sparse_blas/backends/mkl_common/mkl_helper.hpp
new file mode 100644
index 000000000..da5235ee0
--- /dev/null
+++ b/src/sparse_blas/backends/mkl_common/mkl_helper.hpp
@@ -0,0 +1,56 @@
+/*******************************************************************************
+* Copyright 2023 Codeplay Software Ltd.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+// MKLCPU and MKLGPU backends include
+// This include defines its own oneapi::mkl::sparse namespace with some of the types that are used here: matrix_handle_t, index_base, transpose, uolo, diag.
+#include <oneapi/mkl/spblas.hpp>
+
+// Includes are set up so that oneapi::mkl::sparse namespace refers to the MKLCPU and MKLGPU backends namespace (oneMKL product)
+// in this file.
+// oneapi::mkl::sparse::detail namespace refers to the oneMKL interface namespace.
+
+#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp"
+
+namespace oneapi::mkl::sparse::detail {
+
+inline auto get_handle(detail::matrix_handle **handle) {
+    return reinterpret_cast<oneapi::mkl::sparse::matrix_handle_t *>(handle);
+}
+
+inline auto get_handle(detail::matrix_handle *handle) {
+    return reinterpret_cast<oneapi::mkl::sparse::matrix_handle_t>(handle);
+}
+
+} // namespace oneapi::mkl::sparse::detail
+
+#define FOR_EACH_FP_TYPE(INSTANTIATE_MACRO) \
+    INSTANTIATE_MACRO(float);               \
+    INSTANTIATE_MACRO(double);              \
+    INSTANTIATE_MACRO(std::complex<float>); \
+    INSTANTIATE_MACRO(std::complex<double>)
+
+#define FOR_EACH_FP_AND_INT_TYPE_HELPER(INSTANTIATE_MACRO, INT_TYPE) \
+    INSTANTIATE_MACRO(float, INT_TYPE);                              \
+    INSTANTIATE_MACRO(double, INT_TYPE);                             \
+    INSTANTIATE_MACRO(std::complex<float>, INT_TYPE);                \
+    INSTANTIATE_MACRO(std::complex<double>, INT_TYPE)
+
+#define FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_MACRO)                   \
+    FOR_EACH_FP_AND_INT_TYPE_HELPER(INSTANTIATE_MACRO, std::int32_t); \
+    FOR_EACH_FP_AND_INT_TYPE_HELPER(INSTANTIATE_MACRO, std::int64_t)
diff --git a/src/sparse_blas/backends/mkl_common/mkl_operations.cxx b/src/sparse_blas/backends/mkl_common/mkl_operations.cxx
new file mode 100644
index 000000000..c854b63be
--- /dev/null
+++ b/src/sparse_blas/backends/mkl_common/mkl_operations.cxx
@@ -0,0 +1,125 @@
+/*******************************************************************************
+* Copyright 2023 Codeplay Software Ltd.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+sycl::event optimize_gemv(sycl::queue& queue, transpose transpose_val,
+                          detail::matrix_handle* handle,
+                          const std::vector<sycl::event>& dependencies) {
+    return oneapi::mkl::sparse::optimize_gemv(queue, transpose_val, get_handle(handle),
+                                              dependencies);
+}
+
+sycl::event optimize_trsv(sycl::queue& /*queue*/, uplo /*uplo_val*/, transpose /*transpose_val*/,
+                          diag /*diag_val*/, detail::matrix_handle* /*handle*/,
+                          const std::vector<sycl::event>& /*dependencies*/) {
+    throw unimplemented("SPARSE_BLAS", "optimize_trsv");
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>> gemv(
+    sycl::queue& queue, transpose transpose_val, const fpType alpha,
+    detail::matrix_handle* A_handle, sycl::buffer<fpType, 1>& x, const fpType beta,
+    sycl::buffer<fpType, 1>& y) {
+    oneapi::mkl::sparse::gemv(queue, transpose_val, alpha, get_handle(A_handle), x, beta, y);
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemv(
+    sycl::queue& queue, transpose transpose_val, const fpType alpha,
+    detail::matrix_handle* A_handle, const fpType* x, const fpType beta, fpType* y,
+    const std::vector<sycl::event>& dependencies) {
+    return oneapi::mkl::sparse::gemv(queue, transpose_val, alpha, get_handle(A_handle), x, beta, y,
+                                     dependencies);
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>> trsv(sycl::queue& /*queue*/, uplo /*uplo_val*/,
+                                                         transpose /*transpose_val*/,
+                                                         diag /*diag_val*/,
+                                                         detail::matrix_handle* /*A_handle*/,
+                                                         sycl::buffer<fpType, 1>& /*x*/,
+                                                         sycl::buffer<fpType, 1>& /*y*/) {
+    throw unimplemented("SPARSE_BLAS", "trsv");
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> trsv(
+    sycl::queue& /*queue*/, uplo /*uplo_val*/, transpose /*transpose_val*/, diag /*diag_val*/,
+    detail::matrix_handle* /*A_handle*/, const fpType* /*x*/, fpType* /*y*/,
+    const std::vector<sycl::event>& /*dependencies*/) {
+    throw unimplemented("SPARSE_BLAS", "trsv");
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>> gemm(
+    sycl::queue& /*queue*/, layout /*dense_matrix_layout*/, transpose /*transpose_A*/,
+    transpose /*transpose_B*/, const fpType /*alpha*/, detail::matrix_handle* /*A_handle*/,
+    sycl::buffer<fpType, 1>& /*B*/, const std::int64_t /*columns*/, const std::int64_t /*ldb*/,
+    const fpType /*beta*/, sycl::buffer<fpType, 1>& /*C*/, const std::int64_t /*ldc*/) {
+    throw unimplemented("SPARSE_BLAS", "gemm");
+}
+
+template <typename fpType>
+std::enable_if_t<detail::is_fp_supported_v<fpType>, sycl::event> gemm(
+    sycl::queue& /*queue*/, layout /*dense_matrix_layout*/, transpose /*transpose_A*/,
+    transpose /*transpose_B*/, const fpType /*alpha*/, detail::matrix_handle* /*A_handle*/,
+    const fpType* /*B*/, const std::int64_t /*columns*/, const std::int64_t /*ldb*/,
+    const fpType /*beta*/, fpType* /*C*/, const std::int64_t /*ldc*/,
+    const std::vector<sycl::event>& /*dependencies*/) {
+    throw unimplemented("SPARSE_BLAS", "gemm");
+}
+
+#define INSTANTIATE_GEMV(FP_TYPE)                                                          \
+    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>> gemv(                    \
+        sycl::queue& queue, transpose transpose_val, const FP_TYPE alpha,                  \
+        detail::matrix_handle* A_handle, sycl::buffer<FP_TYPE, 1>& x, const FP_TYPE beta,  \
+        sycl::buffer<FP_TYPE, 1>& y);                                                      \
+    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>, sycl::event> gemv(       \
+        sycl::queue& queue, transpose transpose_val, const FP_TYPE alpha,                  \
+        detail::matrix_handle* A_handle, const FP_TYPE* x, const FP_TYPE beta, FP_TYPE* y, \
+        const std::vector<sycl::event>& dependencies)
+
+#define INSTANTIATE_TRSV(FP_TYPE)                                                    \
+    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>> trsv(              \
+        sycl::queue& queue, uplo uplo_val, transpose transpose_val, diag diag_val,   \
+        detail::matrix_handle* A_handle, sycl::buffer<FP_TYPE, 1>& x,                \
+        sycl::buffer<FP_TYPE, 1>& y);                                                \
+    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>, sycl::event> trsv( \
+        sycl::queue& queue, uplo uplo_val, transpose transpose_val, diag diag_val,   \
+        detail::matrix_handle* A_handle, const FP_TYPE* x, FP_TYPE* y,               \
+        const std::vector<sycl::event>& dependencies)
+
+#define INSTANTIATE_GEMM(FP_TYPE)                                                                 \
+    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>> gemm(                           \
+        sycl::queue& queue, layout dense_matrix_layout, transpose transpose_A,                    \
+        transpose transpose_B, const FP_TYPE alpha, detail::matrix_handle* A_handle,              \
+        sycl::buffer<FP_TYPE, 1>& B, const std::int64_t columns, const std::int64_t ldb,          \
+        const FP_TYPE beta, sycl::buffer<FP_TYPE, 1>& C, const std::int64_t ldc);                 \
+    template std::enable_if_t<detail::is_fp_supported_v<FP_TYPE>, sycl::event> gemm(              \
+        sycl::queue& queue, layout dense_matrix_layout, transpose transpose_A,                    \
+        transpose transpose_B, const FP_TYPE alpha, detail::matrix_handle* A_handle,              \
+        const FP_TYPE* B, const std::int64_t columns, const std::int64_t ldb, const FP_TYPE beta, \
+        FP_TYPE* C, const std::int64_t ldc, const std::vector<sycl::event>& dependencies)
+
+FOR_EACH_FP_TYPE(INSTANTIATE_GEMV);
+FOR_EACH_FP_TYPE(INSTANTIATE_TRSV);
+FOR_EACH_FP_TYPE(INSTANTIATE_GEMM);
+
+#undef INSTANTIATE_GEMV
+#undef INSTANTIATE_TRSV
+#undef INSTANTIATE_GEMM
diff --git a/src/sparse_blas/backends/mklcpu/CMakeLists.txt b/src/sparse_blas/backends/mklcpu/CMakeLists.txt
new file mode 100644
index 000000000..a06b77900
--- /dev/null
+++ b/src/sparse_blas/backends/mklcpu/CMakeLists.txt
@@ -0,0 +1,83 @@
+#===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+#
+# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+
+set(LIB_NAME onemkl_sparse_blas_mklcpu)
+set(LIB_OBJ ${LIB_NAME}_obj)
+
+include(WarningsUtils)
+
+add_library(${LIB_NAME})
+add_library(${LIB_OBJ} OBJECT
+  mklcpu_basic.cpp
+  mklcpu_operations.cpp
+  $<$<BOOL:${BUILD_SHARED_LIBS}>: mklcpu_wrappers.cpp>
+)
+
+target_include_directories(${LIB_OBJ}
+  PRIVATE ${PROJECT_SOURCE_DIR}/include
+          ${PROJECT_SOURCE_DIR}/src
+          ${CMAKE_BINARY_DIR}/bin
+)
+
+target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT})
+if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
+  add_sycl_to_target(TARGET ${LIB_OBJ} SOURCES ${SOURCES})
+endif()
+
+if(TARGET MKL::MKL_SYCL::SPARSE)
+  target_link_libraries(${LIB_OBJ}
+    PUBLIC ONEMKL::SYCL::SYCL
+    PUBLIC MKL::MKL_SYCL::SPARSE
+    PRIVATE onemkl_warnings
+  )
+else()
+  target_link_libraries(${LIB_OBJ}
+    PUBLIC ONEMKL::SYCL::SYCL
+    PUBLIC MKL::MKL_DPCPP
+    PRIVATE onemkl_warnings
+  )
+endif()
+
+set_target_properties(${LIB_OBJ} PROPERTIES
+  POSITION_INDEPENDENT_CODE ON
+)
+target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ})
+
+#Set oneMKL libraries as not transitive for dynamic
+if(BUILD_SHARED_LIBS)
+  set_target_properties(${LIB_NAME} PROPERTIES
+    INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL
+  )
+endif()
+
+# Add major version to the library
+set_target_properties(${LIB_NAME} PROPERTIES
+  SOVERSION ${PROJECT_VERSION_MAJOR}
+)
+
+# Add dependencies rpath to the library
+list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>)
+
+# Add the library to install package
+install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets)
+install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets
+  RUNTIME DESTINATION bin
+  ARCHIVE DESTINATION lib
+  LIBRARY DESTINATION lib
+)
diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_basic.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_basic.cpp
new file mode 100644
index 000000000..9ab29ee92
--- /dev/null
+++ b/src/sparse_blas/backends/mklcpu/mklcpu_basic.cpp
@@ -0,0 +1,28 @@
+/*******************************************************************************
+* Copyright 2023 Codeplay Software Ltd.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include "../mkl_common/mkl_helper.hpp"
+
+#include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp"
+
+namespace oneapi::mkl::sparse::mklcpu {
+
+#include "../mkl_common/mkl_basic.cxx"
+
+} // namespace oneapi::mkl::sparse::mklcpu
diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
new file mode 100644
index 000000000..e636b1816
--- /dev/null
+++ b/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp
@@ -0,0 +1,28 @@
+/*******************************************************************************
+* Copyright 2023 Codeplay Software Ltd.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include "../mkl_common/mkl_helper.hpp"
+
+#include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp"
+
+namespace oneapi::mkl::sparse::mklcpu {
+
+#include "../mkl_common/mkl_operations.cxx"
+
+} // namespace oneapi::mkl::sparse::mklcpu
diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_wrappers.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_wrappers.cpp
new file mode 100644
index 000000000..40f75c60c
--- /dev/null
+++ b/src/sparse_blas/backends/mklcpu/mklcpu_wrappers.cpp
@@ -0,0 +1,32 @@
+/*******************************************************************************
+* Copyright 2023 Codeplay Software Ltd.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+
+#include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp"
+
+#include "sparse_blas/function_table.hpp"
+
+#define WRAPPER_VERSION 1
+#define BACKEND         mklcpu
+
+extern "C" sparse_blas_function_table_t mkl_sparse_blas_table = {
+    WRAPPER_VERSION,
+#include "sparse_blas/backends/backend_wrappers.cxx"
+};
diff --git a/src/sparse_blas/function_table.hpp b/src/sparse_blas/function_table.hpp
new file mode 100644
index 000000000..d4656190a
--- /dev/null
+++ b/src/sparse_blas/function_table.hpp
@@ -0,0 +1,291 @@
+/*******************************************************************************
+* Copyright 2023 Codeplay Software Ltd.
+*
+* (*Licensed under the Apache License, Version 2.0 )(the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_FUNCTION_TABLE_HPP_
+#define _ONEMKL_SPARSE_BLAS_FUNCTION_TABLE_HPP_
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+
+typedef struct {
+    int version;
+    void (*init_matrix_handle)(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p_handle);
+
+    sycl::event (*release_matrix_handle)(sycl::queue &queue,
+                                         oneapi::mkl::sparse::matrix_handle_t *p_handle,
+                                         const std::vector<sycl::event> &dependencies);
+
+    // set_csr_data
+    void (*set_csr_data_buffer_rf_i32)(sycl::queue &queue,
+                                       oneapi::mkl::sparse::matrix_handle_t handle,
+                                       std::int32_t num_rows, std::int32_t num_cols,
+                                       std::int32_t nnz, oneapi::mkl::index_base index,
+                                       sycl::buffer<std::int32_t, 1> &row_ptr,
+                                       sycl::buffer<std::int32_t, 1> &col_ind,
+                                       sycl::buffer<float, 1> &val);
+    void (*set_csr_data_buffer_rd_i32)(sycl::queue &queue,
+                                       oneapi::mkl::sparse::matrix_handle_t handle,
+                                       std::int32_t num_rows, std::int32_t num_cols,
+                                       std::int32_t nnz, oneapi::mkl::index_base index,
+                                       sycl::buffer<std::int32_t, 1> &row_ptr,
+                                       sycl::buffer<std::int32_t, 1> &col_ind,
+                                       sycl::buffer<double, 1> &val);
+    void (*set_csr_data_buffer_cf_i32)(sycl::queue &queue,
+                                       oneapi::mkl::sparse::matrix_handle_t handle,
+                                       std::int32_t num_rows, std::int32_t num_cols,
+                                       std::int32_t nnz, oneapi::mkl::index_base index,
+                                       sycl::buffer<std::int32_t, 1> &row_ptr,
+                                       sycl::buffer<std::int32_t, 1> &col_ind,
+                                       sycl::buffer<std::complex<float>, 1> &val);
+    void (*set_csr_data_buffer_cd_i32)(sycl::queue &queue,
+                                       oneapi::mkl::sparse::matrix_handle_t handle,
+                                       std::int32_t num_rows, std::int32_t num_cols,
+                                       std::int32_t nnz, oneapi::mkl::index_base index,
+                                       sycl::buffer<std::int32_t, 1> &row_ptr,
+                                       sycl::buffer<std::int32_t, 1> &col_ind,
+                                       sycl::buffer<std::complex<double>, 1> &val);
+    void (*set_csr_data_buffer_rf_i64)(sycl::queue &queue,
+                                       oneapi::mkl::sparse::matrix_handle_t handle,
+                                       std::int64_t num_rows, std::int64_t num_cols,
+                                       std::int64_t nnz, oneapi::mkl::index_base index,
+                                       sycl::buffer<std::int64_t, 1> &row_ptr,
+                                       sycl::buffer<std::int64_t, 1> &col_ind,
+                                       sycl::buffer<float, 1> &val);
+    void (*set_csr_data_buffer_rd_i64)(sycl::queue &queue,
+                                       oneapi::mkl::sparse::matrix_handle_t handle,
+                                       std::int64_t num_rows, std::int64_t num_cols,
+                                       std::int64_t nnz, oneapi::mkl::index_base index,
+                                       sycl::buffer<std::int64_t, 1> &row_ptr,
+                                       sycl::buffer<std::int64_t, 1> &col_ind,
+                                       sycl::buffer<double, 1> &val);
+    void (*set_csr_data_buffer_cf_i64)(sycl::queue &queue,
+                                       oneapi::mkl::sparse::matrix_handle_t handle,
+                                       std::int64_t num_rows, std::int64_t num_cols,
+                                       std::int64_t nnz, oneapi::mkl::index_base index,
+                                       sycl::buffer<std::int64_t, 1> &row_ptr,
+                                       sycl::buffer<std::int64_t, 1> &col_ind,
+                                       sycl::buffer<std::complex<float>, 1> &val);
+    void (*set_csr_data_buffer_cd_i64)(sycl::queue &queue,
+                                       oneapi::mkl::sparse::matrix_handle_t handle,
+                                       std::int64_t num_rows, std::int64_t num_cols,
+                                       std::int64_t nnz, oneapi::mkl::index_base index,
+                                       sycl::buffer<std::int64_t, 1> &row_ptr,
+                                       sycl::buffer<std::int64_t, 1> &col_ind,
+                                       sycl::buffer<std::complex<double>, 1> &val);
+    sycl::event (*set_csr_data_usm_rf_i32)(sycl::queue &queue,
+                                           oneapi::mkl::sparse::matrix_handle_t handle,
+                                           std::int32_t num_rows, std::int32_t num_cols,
+                                           std::int32_t nnz, oneapi::mkl::index_base index,
+                                           std::int32_t *row_ptr, std::int32_t *col_ind, float *val,
+                                           const std::vector<sycl::event> &dependencies);
+    sycl::event (*set_csr_data_usm_rd_i32)(sycl::queue &queue,
+                                           oneapi::mkl::sparse::matrix_handle_t handle,
+                                           std::int32_t num_rows, std::int32_t num_cols,
+                                           std::int32_t nnz, oneapi::mkl::index_base index,
+                                           std::int32_t *row_ptr, std::int32_t *col_ind,
+                                           double *val,
+                                           const std::vector<sycl::event> &dependencies);
+    sycl::event (*set_csr_data_usm_cf_i32)(sycl::queue &queue,
+                                           oneapi::mkl::sparse::matrix_handle_t handle,
+                                           std::int32_t num_rows, std::int32_t num_cols,
+                                           std::int32_t nnz, oneapi::mkl::index_base index,
+                                           std::int32_t *row_ptr, std::int32_t *col_ind,
+                                           std::complex<float> *val,
+                                           const std::vector<sycl::event> &dependencies);
+    sycl::event (*set_csr_data_usm_cd_i32)(sycl::queue &queue,
+                                           oneapi::mkl::sparse::matrix_handle_t handle,
+                                           std::int32_t num_rows, std::int32_t num_cols,
+                                           std::int32_t nnz, oneapi::mkl::index_base index,
+                                           std::int32_t *row_ptr, std::int32_t *col_ind,
+                                           std::complex<double> *val,
+                                           const std::vector<sycl::event> &dependencies);
+    sycl::event (*set_csr_data_usm_rf_i64)(sycl::queue &queue,
+                                           oneapi::mkl::sparse::matrix_handle_t handle,
+                                           std::int64_t num_rows, std::int64_t num_cols,
+                                           std::int64_t nnz, oneapi::mkl::index_base index,
+                                           std::int64_t *row_ptr, std::int64_t *col_ind, float *val,
+                                           const std::vector<sycl::event> &dependencies);
+    sycl::event (*set_csr_data_usm_rd_i64)(sycl::queue &queue,
+                                           oneapi::mkl::sparse::matrix_handle_t handle,
+                                           std::int64_t num_rows, std::int64_t num_cols,
+                                           std::int64_t nnz, oneapi::mkl::index_base index,
+                                           std::int64_t *row_ptr, std::int64_t *col_ind,
+                                           double *val,
+                                           const std::vector<sycl::event> &dependencies);
+    sycl::event (*set_csr_data_usm_cf_i64)(sycl::queue &queue,
+                                           oneapi::mkl::sparse::matrix_handle_t handle,
+                                           std::int64_t num_rows, std::int64_t num_cols,
+                                           std::int64_t nnz, oneapi::mkl::index_base index,
+                                           std::int64_t *row_ptr, std::int64_t *col_ind,
+                                           std::complex<float> *val,
+                                           const std::vector<sycl::event> &dependencies);
+    sycl::event (*set_csr_data_usm_cd_i64)(sycl::queue &queue,
+                                           oneapi::mkl::sparse::matrix_handle_t handle,
+                                           std::int64_t num_rows, std::int64_t num_cols,
+                                           std::int64_t nnz, oneapi::mkl::index_base index,
+                                           std::int64_t *row_ptr, std::int64_t *col_ind,
+                                           std::complex<double> *val,
+                                           const std::vector<sycl::event> &dependencies);
+
+    // optimize_*
+    sycl::event (*optimize_gemv)(sycl::queue &queue, oneapi::mkl::transpose transpose_val,
+                                 oneapi::mkl::sparse::matrix_handle_t handle,
+                                 const std::vector<sycl::event> &dependencies);
+    sycl::event (*optimize_trsv)(sycl::queue &queue, oneapi::mkl::uplo uplo_val,
+                                 oneapi::mkl::transpose transpose_val, oneapi::mkl::diag diag_val,
+                                 oneapi::mkl::sparse::matrix_handle_t handle,
+                                 const std::vector<sycl::event> &dependencies);
+
+    // gemv
+    void (*gemv_buffer_rf)(sycl::queue &queue, oneapi::mkl::transpose transpose_val,
+                           const float alpha, oneapi::mkl::sparse::matrix_handle_t A_handle,
+                           sycl::buffer<float, 1> &x, const float beta, sycl::buffer<float, 1> &y);
+    void (*gemv_buffer_rd)(sycl::queue &queue, oneapi::mkl::transpose transpose_val,
+                           const double alpha, oneapi::mkl::sparse::matrix_handle_t A_handle,
+                           sycl::buffer<double, 1> &x, const double beta,
+                           sycl::buffer<double, 1> &y);
+    void (*gemv_buffer_cf)(sycl::queue &queue, oneapi::mkl::transpose transpose_val,
+                           const std::complex<float> alpha,
+                           oneapi::mkl::sparse::matrix_handle_t A_handle,
+                           sycl::buffer<std::complex<float>, 1> &x, const std::complex<float> beta,
+                           sycl::buffer<std::complex<float>, 1> &y);
+    void (*gemv_buffer_cd)(sycl::queue &queue, oneapi::mkl::transpose transpose_val,
+                           const std::complex<double> alpha,
+                           oneapi::mkl::sparse::matrix_handle_t A_handle,
+                           sycl::buffer<std::complex<double>, 1> &x,
+                           const std::complex<double> beta,
+                           sycl::buffer<std::complex<double>, 1> &y);
+    sycl::event (*gemv_usm_rf)(sycl::queue &queue, oneapi::mkl::transpose transpose_val,
+                               const float alpha, oneapi::mkl::sparse::matrix_handle_t A_handle,
+                               const float *x, const float beta, float *y,
+                               const std::vector<sycl::event> &dependencies);
+    sycl::event (*gemv_usm_rd)(sycl::queue &queue, oneapi::mkl::transpose transpose_val,
+                               const double alpha, oneapi::mkl::sparse::matrix_handle_t A_handle,
+                               const double *x, const double beta, double *y,
+                               const std::vector<sycl::event> &dependencies);
+    sycl::event (*gemv_usm_cf)(sycl::queue &queue, oneapi::mkl::transpose transpose_val,
+                               const std::complex<float> alpha,
+                               oneapi::mkl::sparse::matrix_handle_t A_handle,
+                               const std::complex<float> *x, const std::complex<float> beta,
+                               std::complex<float> *y,
+                               const std::vector<sycl::event> &dependencies);
+    sycl::event (*gemv_usm_cd)(sycl::queue &queue, oneapi::mkl::transpose transpose_val,
+                               const std::complex<double> alpha,
+                               oneapi::mkl::sparse::matrix_handle_t A_handle,
+                               const std::complex<double> *x, const std::complex<double> beta,
+                               std::complex<double> *y,
+                               const std::vector<sycl::event> &dependencies);
+
+    // trsv
+    void (*trsv_buffer_rf)(sycl::queue &queue, oneapi::mkl::uplo uplo_val,
+                           oneapi::mkl::transpose transpose_val, oneapi::mkl::diag diag_val,
+                           oneapi::mkl::sparse::matrix_handle_t A_handle, sycl::buffer<float, 1> &x,
+                           sycl::buffer<float, 1> &y);
+    void (*trsv_buffer_rd)(sycl::queue &queue, oneapi::mkl::uplo uplo_val,
+                           oneapi::mkl::transpose transpose_val, oneapi::mkl::diag diag_val,
+                           oneapi::mkl::sparse::matrix_handle_t A_handle,
+                           sycl::buffer<double, 1> &x, sycl::buffer<double, 1> &y);
+    void (*trsv_buffer_cf)(sycl::queue &queue, oneapi::mkl::uplo uplo_val,
+                           oneapi::mkl::transpose transpose_val, oneapi::mkl::diag diag_val,
+                           oneapi::mkl::sparse::matrix_handle_t A_handle,
+                           sycl::buffer<std::complex<float>, 1> &x,
+                           sycl::buffer<std::complex<float>, 1> &y);
+    void (*trsv_buffer_cd)(sycl::queue &queue, oneapi::mkl::uplo uplo_val,
+                           oneapi::mkl::transpose transpose_val, oneapi::mkl::diag diag_val,
+                           oneapi::mkl::sparse::matrix_handle_t A_handle,
+                           sycl::buffer<std::complex<double>, 1> &x,
+                           sycl::buffer<std::complex<double>, 1> &y);
+    sycl::event (*trsv_usm_rf)(sycl::queue &queue, oneapi::mkl::uplo uplo_val,
+                               oneapi::mkl::transpose transpose_val, oneapi::mkl::diag diag_val,
+                               oneapi::mkl::sparse::matrix_handle_t A_handle, const float *x,
+                               float *y, const std::vector<sycl::event> &dependencies);
+    sycl::event (*trsv_usm_rd)(sycl::queue &queue, oneapi::mkl::uplo uplo_val,
+                               oneapi::mkl::transpose transpose_val, oneapi::mkl::diag diag_val,
+                               oneapi::mkl::sparse::matrix_handle_t A_handle, const double *x,
+                               double *y, const std::vector<sycl::event> &dependencies);
+    sycl::event (*trsv_usm_cf)(sycl::queue &queue, oneapi::mkl::uplo uplo_val,
+                               oneapi::mkl::transpose transpose_val, oneapi::mkl::diag diag_val,
+                               oneapi::mkl::sparse::matrix_handle_t A_handle,
+                               const std::complex<float> *x, std::complex<float> *y,
+                               const std::vector<sycl::event> &dependencies);
+    sycl::event (*trsv_usm_cd)(sycl::queue &queue, oneapi::mkl::uplo uplo_val,
+                               oneapi::mkl::transpose transpose_val, oneapi::mkl::diag diag_val,
+                               oneapi::mkl::sparse::matrix_handle_t A_handle,
+                               const std::complex<double> *x, std::complex<double> *y,
+                               const std::vector<sycl::event> &dependencies);
+
+    // gemm
+    void (*gemm_buffer_rf)(sycl::queue &queue, oneapi::mkl::layout dense_matrix_layout,
+                           oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B,
+                           const float alpha, oneapi::mkl::sparse::matrix_handle_t A_handle,
+                           sycl::buffer<float, 1> &B, const std::int64_t columns,
+                           const std::int64_t ldb, const float beta, sycl::buffer<float, 1> &C,
+                           const std::int64_t ldc);
+    void (*gemm_buffer_rd)(sycl::queue &queue, oneapi::mkl::layout dense_matrix_layout,
+                           oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B,
+                           const double alpha, oneapi::mkl::sparse::matrix_handle_t A_handle,
+                           sycl::buffer<double, 1> &B, const std::int64_t columns,
+                           const std::int64_t ldb, const double beta, sycl::buffer<double, 1> &C,
+                           const std::int64_t ldc);
+    void (*gemm_buffer_cf)(sycl::queue &queue, oneapi::mkl::layout dense_matrix_layout,
+                           oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B,
+                           const std::complex<float> alpha,
+                           oneapi::mkl::sparse::matrix_handle_t A_handle,
+                           sycl::buffer<std::complex<float>, 1> &B, const std::int64_t columns,
+                           const std::int64_t ldb, const std::complex<float> beta,
+                           sycl::buffer<std::complex<float>, 1> &C, const std::int64_t ldc);
+    void (*gemm_buffer_cd)(sycl::queue &queue, oneapi::mkl::layout dense_matrix_layout,
+                           oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B,
+                           const std::complex<double> alpha,
+                           oneapi::mkl::sparse::matrix_handle_t A_handle,
+                           sycl::buffer<std::complex<double>, 1> &B, const std::int64_t columns,
+                           const std::int64_t ldb, const std::complex<double> beta,
+                           sycl::buffer<std::complex<double>, 1> &C, const std::int64_t ldc);
+    sycl::event (*gemm_usm_rf)(sycl::queue &queue, oneapi::mkl::layout dense_matrix_layout,
+                               oneapi::mkl::transpose transpose_A,
+                               oneapi::mkl::transpose transpose_B, const float alpha,
+                               oneapi::mkl::sparse::matrix_handle_t A_handle, const float *B,
+                               const std::int64_t columns, const std::int64_t ldb, const float beta,
+                               float *C, const std::int64_t ldc,
+                               const std::vector<sycl::event> &dependencies);
+    sycl::event (*gemm_usm_rd)(sycl::queue &queue, oneapi::mkl::layout dense_matrix_layout,
+                               oneapi::mkl::transpose transpose_A,
+                               oneapi::mkl::transpose transpose_B, const double alpha,
+                               oneapi::mkl::sparse::matrix_handle_t A_handle, const double *B,
+                               const std::int64_t columns, const std::int64_t ldb,
+                               const double beta, double *C, const std::int64_t ldc,
+                               const std::vector<sycl::event> &dependencies);
+    sycl::event (*gemm_usm_cf)(sycl::queue &queue, oneapi::mkl::layout dense_matrix_layout,
+                               oneapi::mkl::transpose transpose_A,
+                               oneapi::mkl::transpose transpose_B, const std::complex<float> alpha,
+                               oneapi::mkl::sparse::matrix_handle_t A_handle,
+                               const std::complex<float> *B, const std::int64_t columns,
+                               const std::int64_t ldb, const std::complex<float> beta,
+                               std::complex<float> *C, const std::int64_t ldc,
+                               const std::vector<sycl::event> &dependencies);
+    sycl::event (*gemm_usm_cd)(sycl::queue &queue, oneapi::mkl::layout dense_matrix_layout,
+                               oneapi::mkl::transpose transpose_A,
+                               oneapi::mkl::transpose transpose_B, const std::complex<double> alpha,
+                               oneapi::mkl::sparse::matrix_handle_t A_handle,
+                               const std::complex<double> *B, const std::int64_t columns,
+                               const std::int64_t ldb, const std::complex<double> beta,
+                               std::complex<double> *C, const std::int64_t ldc,
+                               const std::vector<sycl::event> &dependencies);
+} sparse_blas_function_table_t;
+
+#endif // _ONEMKL_SPARSE_BLAS_FUNCTION_TABLE_HPP_
diff --git a/src/sparse_blas/sparse_blas_loader.cpp b/src/sparse_blas/sparse_blas_loader.cpp
new file mode 100644
index 000000000..07d065561
--- /dev/null
+++ b/src/sparse_blas/sparse_blas_loader.cpp
@@ -0,0 +1,167 @@
+/*******************************************************************************
+* Copyright 2023 Codeplay Software Ltd.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include "oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp"
+
+#include "function_table_initializer.hpp"
+#include "sparse_blas/function_table.hpp"
+#include "oneapi/mkl/detail/get_device_id.hpp"
+
+#define FOR_EACH_FP_TYPE(DEFINE_MACRO)      \
+    DEFINE_MACRO(float, _rf);               \
+    DEFINE_MACRO(double, _rd);              \
+    DEFINE_MACRO(std::complex<float>, _cf); \
+    DEFINE_MACRO(std::complex<double>, _cd)
+
+#define FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, INT_TYPE, INT_SUFFIX) \
+    DEFINE_MACRO(float, _rf, INT_TYPE, INT_SUFFIX);                         \
+    DEFINE_MACRO(double, _rd, INT_TYPE, INT_SUFFIX);                        \
+    DEFINE_MACRO(std::complex<float>, _cf, INT_TYPE, INT_SUFFIX);           \
+    DEFINE_MACRO(std::complex<double>, _cd, INT_TYPE, INT_SUFFIX)
+
+#define FOR_EACH_FP_AND_INT_TYPE(DEFINE_MACRO)                         \
+    FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, std::int32_t, _i32); \
+    FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, std::int64_t, _i64)
+
+namespace oneapi::mkl::sparse {
+
+static oneapi::mkl::detail::table_initializer<mkl::domain::sparse_blas,
+                                              sparse_blas_function_table_t>
+    function_tables;
+
+void init_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle) {
+    auto libkey = get_device_id(queue);
+    function_tables[libkey].init_matrix_handle(queue, p_handle);
+}
+
+sycl::event release_matrix_handle(sycl::queue &queue, matrix_handle_t *p_handle,
+                                  const std::vector<sycl::event> &dependencies) {
+    auto libkey = get_device_id(queue);
+    return function_tables[libkey].release_matrix_handle(queue, p_handle, dependencies);
+}
+
+#define DEFINE_SET_CSR_DATA(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX)                              \
+    template <>                                                                                    \
+    void set_csr_data(sycl::queue &queue, matrix_handle_t handle, INT_TYPE num_rows,               \
+                      INT_TYPE num_cols, INT_TYPE nnz, index_base index,                           \
+                      sycl::buffer<INT_TYPE, 1> &row_ptr, sycl::buffer<INT_TYPE, 1> &col_ind,      \
+                      sycl::buffer<FP_TYPE, 1> &val) {                                             \
+        auto libkey = get_device_id(queue);                                                        \
+        function_tables[libkey].set_csr_data_buffer##FP_SUFFIX##INT_SUFFIX(                        \
+            queue, handle, num_rows, num_cols, nnz, index, row_ptr, col_ind, val);                 \
+    }                                                                                              \
+    template <>                                                                                    \
+    sycl::event set_csr_data(sycl::queue &queue, matrix_handle_t handle, INT_TYPE num_rows,        \
+                             INT_TYPE num_cols, INT_TYPE nnz, index_base index, INT_TYPE *row_ptr, \
+                             INT_TYPE *col_ind, FP_TYPE *val,                                      \
+                             const std::vector<sycl::event> &dependencies) {                       \
+        auto libkey = get_device_id(queue);                                                        \
+        return function_tables[libkey].set_csr_data_usm##FP_SUFFIX##INT_SUFFIX(                    \
+            queue, handle, num_rows, num_cols, nnz, index, row_ptr, col_ind, val, dependencies);   \
+    }
+
+FOR_EACH_FP_AND_INT_TYPE(DEFINE_SET_CSR_DATA)
+#undef DEFINE_SET_CSR_DATA
+
+sycl::event optimize_gemv(sycl::queue &queue, transpose transpose_val, matrix_handle_t handle,
+                          const std::vector<sycl::event> &dependencies) {
+    auto libkey = get_device_id(queue);
+    return function_tables[libkey].optimize_gemv(queue, transpose_val, handle, dependencies);
+}
+
+sycl::event optimize_trsv(sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,
+                          matrix_handle_t handle, const std::vector<sycl::event> &dependencies) {
+    auto libkey = get_device_id(queue);
+    return function_tables[libkey].optimize_trsv(queue, uplo_val, transpose_val, diag_val, handle,
+                                                 dependencies);
+}
+
+#define DEFINE_GEMV(FP_TYPE, FP_SUFFIX)                                                           \
+    template <>                                                                                   \
+    void gemv(sycl::queue &queue, transpose transpose_val, const FP_TYPE alpha,                   \
+              matrix_handle_t A_handle, sycl::buffer<FP_TYPE, 1> &x, const FP_TYPE beta,          \
+              sycl::buffer<FP_TYPE, 1> &y) {                                                      \
+        auto libkey = get_device_id(queue);                                                       \
+        function_tables[libkey].gemv_buffer##FP_SUFFIX(queue, transpose_val, alpha, A_handle, x,  \
+                                                       beta, y);                                  \
+    }                                                                                             \
+    template <>                                                                                   \
+    sycl::event gemv(sycl::queue &queue, transpose transpose_val, const FP_TYPE alpha,            \
+                     matrix_handle_t A_handle, const FP_TYPE *x, const FP_TYPE beta, FP_TYPE *y,  \
+                     const std::vector<sycl::event> &dependencies) {                              \
+        auto libkey = get_device_id(queue);                                                       \
+        return function_tables[libkey].gemv_usm##FP_SUFFIX(queue, transpose_val, alpha, A_handle, \
+                                                           x, beta, y, dependencies);             \
+    }
+
+FOR_EACH_FP_TYPE(DEFINE_GEMV)
+#undef DEFINE_GEMV
+
+#define DEFINE_TRSV(FP_TYPE, FP_SUFFIX)                                                          \
+    template <>                                                                                  \
+    void trsv(sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,         \
+              matrix_handle_t A_handle, sycl::buffer<FP_TYPE, 1> &x,                             \
+              sycl::buffer<FP_TYPE, 1> &y) {                                                     \
+        auto libkey = get_device_id(queue);                                                      \
+        function_tables[libkey].trsv_buffer##FP_SUFFIX(queue, uplo_val, transpose_val, diag_val, \
+                                                       A_handle, x, y);                          \
+    }                                                                                            \
+    template <>                                                                                  \
+    sycl::event trsv(sycl::queue &queue, uplo uplo_val, transpose transpose_val, diag diag_val,  \
+                     matrix_handle_t A_handle, const FP_TYPE *x, FP_TYPE *y,                     \
+                     const std::vector<sycl::event> &dependencies) {                             \
+        auto libkey = get_device_id(queue);                                                      \
+        return function_tables[libkey].trsv_usm##FP_SUFFIX(                                      \
+            queue, uplo_val, transpose_val, diag_val, A_handle, x, y, dependencies);             \
+    }
+
+FOR_EACH_FP_TYPE(DEFINE_TRSV)
+#undef DEFINE_TRSV
+
+#define DEFINE_GEMM(FP_TYPE, FP_SUFFIX)                                                          \
+    template <>                                                                                  \
+    void gemm(sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A,             \
+              transpose transpose_B, const FP_TYPE alpha, matrix_handle_t A_handle,              \
+              sycl::buffer<FP_TYPE, 1> &B, const std::int64_t columns, const std::int64_t ldb,   \
+              const FP_TYPE beta, sycl::buffer<FP_TYPE, 1> &C, const std::int64_t ldc) {         \
+        auto libkey = get_device_id(queue);                                                      \
+        function_tables[libkey].gemm_buffer##FP_SUFFIX(queue, dense_matrix_layout, transpose_A,  \
+                                                       transpose_B, alpha, A_handle, B, columns, \
+                                                       ldb, beta, C, ldc);                       \
+    }                                                                                            \
+    template <>                                                                                  \
+    sycl::event gemm(sycl::queue &queue, layout dense_matrix_layout, transpose transpose_A,      \
+                     transpose transpose_B, const FP_TYPE alpha, matrix_handle_t A_handle,       \
+                     const FP_TYPE *B, const std::int64_t columns, const std::int64_t ldb,       \
+                     const FP_TYPE beta, FP_TYPE *C, const std::int64_t ldc,                     \
+                     const std::vector<sycl::event> &dependencies) {                             \
+        auto libkey = get_device_id(queue);                                                      \
+        return function_tables[libkey].gemm_usm##FP_SUFFIX(                                      \
+            queue, dense_matrix_layout, transpose_A, transpose_B, alpha, A_handle, B, columns,   \
+            ldb, beta, C, ldc, dependencies);                                                    \
+    }
+
+FOR_EACH_FP_TYPE(DEFINE_GEMM)
+#undef DEFINE_GEMM
+
+} // namespace oneapi::mkl::sparse
+
+#undef FOR_EACH_FP_TYPE
+#undef FOR_EACH_FP_AND_INT_TYPE
+#undef FOR_EACH_FP_AND_INT_TYPE_HELPER
diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt
index 464d8841c..43b2444a7 100644
--- a/tests/unit_tests/CMakeLists.txt
+++ b/tests/unit_tests/CMakeLists.txt
@@ -65,6 +65,12 @@ set(dft_TEST_LIST
 
 set(dft_TEST_LINK "")
 
+# Sparse BLAS config
+set(sparse_blas_TEST_LIST
+      spblas_source)
+
+set(sparse_blas_TEST_LINK "")
+
 foreach(domain ${TARGET_DOMAINS})
   # Generate RT and CT test lists
   set(${domain}_TEST_LIST_RT ${${domain}_TEST_LIST})
diff --git a/tests/unit_tests/sparse_blas/CMakeLists.txt b/tests/unit_tests/sparse_blas/CMakeLists.txt
new file mode 100644
index 000000000..2c46cd38c
--- /dev/null
+++ b/tests/unit_tests/sparse_blas/CMakeLists.txt
@@ -0,0 +1,20 @@
+#===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+#
+# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+
+add_subdirectory(source)
diff --git a/tests/unit_tests/sparse_blas/include/sparse_reference.hpp b/tests/unit_tests/sparse_blas/include/sparse_reference.hpp
new file mode 100644
index 000000000..fd9ffaa3d
--- /dev/null
+++ b/tests/unit_tests/sparse_blas/include/sparse_reference.hpp
@@ -0,0 +1,149 @@
+/*******************************************************************************
+* Copyright 2023 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#ifndef _SPARSE_REFERENCE_HPP__
+#define _SPARSE_REFERENCE_HPP__
+
+#include <stdexcept>
+
+#include "oneapi/mkl.hpp"
+
+template <typename T>
+inline T conjugate(T) {
+    static_assert(false, "Unsupported type");
+}
+template <>
+inline float conjugate(float t) {
+    return t;
+}
+template <>
+inline double conjugate(double t) {
+    return t;
+}
+template <>
+inline std::complex<float> conjugate(std::complex<float> t) {
+    return std::conj(t);
+}
+template <>
+inline std::complex<double> conjugate(std::complex<double> t) {
+    return std::conj(t);
+}
+
+template <typename T>
+inline T opVal(const T t, const bool isConj) {
+    return (isConj ? conjugate(t) : t);
+};
+
+template <typename fpType, typename intType, typename accIntType, typename accFpType>
+void do_csr_transpose(const oneapi::mkl::transpose opA, intType *ia_t, intType *ja_t, fpType *a_t,
+                      intType a_nrows, intType a_ncols, intType a_ind, accIntType &ia,
+                      accIntType &ja, accFpType &a, const bool structOnlyFlag = false) {
+    const bool isConj = (opA == oneapi::mkl::transpose::conjtrans);
+
+    // initialize ia_t to zero
+    for (intType i = 0; i < a_ncols + 1; ++i) {
+        ia_t[i] = 0;
+    }
+
+    // fill ia_t with counts of columns
+    for (intType i = 0; i < a_nrows; ++i) {
+        const intType st = ia[i] - a_ind;
+        const intType en = ia[i + 1] - a_ind;
+        for (intType j = st; j < en; ++j) {
+            const intType col = ja[j] - a_ind;
+            ia_t[col + 1]++;
+        }
+    }
+    // prefix sum to get official ia_t counts
+    ia_t[0] = a_ind;
+    for (intType i = 0; i < a_ncols; ++i) {
+        ia_t[i + 1] += ia_t[i];
+    }
+
+    // second pass through data to fill transpose structure
+    for (intType i = 0; i < a_nrows; ++i) {
+        const intType st = ia[i] - a_ind;
+        const intType en = ia[i + 1] - a_ind;
+        for (intType j = st; j < en; ++j) {
+            const intType col = ja[j] - a_ind;
+            const intType j_in_a_t = ia_t[col] - a_ind;
+            ia_t[col]++;
+            ja_t[j_in_a_t] = i + a_ind;
+            if (!structOnlyFlag) {
+                const fpType val = a[j];
+                a_t[j_in_a_t] = opVal(val, isConj);
+            }
+        }
+    }
+
+    // adjust ia_t back to original state after filling structure
+    for (intType i = a_ncols; i > 0; --i) {
+        ia_t[i] = ia_t[i - 1];
+    }
+    ia_t[0] = a_ind;
+}
+
+template <typename fpType, typename intType>
+void prepare_reference_gemv_data(const intType *ia, const intType *ja, const fpType *a,
+                                 intType a_nrows, intType a_ncols, intType a_nnz, intType a_ind,
+                                 oneapi::mkl::transpose opA, fpType alpha, fpType beta,
+                                 const fpType *x, fpType *y_ref) {
+    std::size_t opa_nrows =
+        static_cast<std::size_t>((opA == oneapi::mkl::transpose::nontrans) ? a_nrows : a_ncols);
+    const std::size_t nnz = static_cast<std::size_t>(a_nnz);
+
+    // prepare op(A) locally
+    std::vector<intType> iopa;
+    std::vector<intType> jopa;
+    std::vector<fpType> opa;
+    if (opA == oneapi::mkl::transpose::nontrans) {
+        iopa.assign(ia, ia + a_nrows + 1);
+        jopa.assign(ja, ja + nnz);
+        opa.assign(a, a + nnz);
+    }
+    else if (opA == oneapi::mkl::transpose::trans || opA == oneapi::mkl::transpose::conjtrans) {
+        iopa.resize(opa_nrows + 1);
+        jopa.resize(nnz);
+        opa.resize(nnz);
+        do_csr_transpose(opA, iopa.data(), jopa.data(), opa.data(), a_nrows, a_ncols, a_ind, ia, ja,
+                         a);
+    }
+    else {
+        throw std::runtime_error(
+            "unsupported transpose_val (opA) in prepare_reference_gemv_data()");
+    }
+
+    //
+    // do GEMV operation
+    //
+    //  y_ref <- alpha * op(A) * x + beta * y_ref
+    //
+    for (std::size_t row = 0; row < opa_nrows; row++) {
+        fpType tmp = 0;
+        for (intType i = iopa[row] - a_ind; i < iopa[row + 1] - a_ind; i++) {
+            std::size_t iu = static_cast<std::size_t>(i);
+            std::size_t x_ind = static_cast<std::size_t>(jopa[iu] - a_ind);
+            tmp += opa[iu] * x[x_ind];
+        }
+
+        y_ref[row] = alpha * tmp + beta * y_ref[row];
+    }
+}
+
+#endif // _SPARSE_REFERENCE_HPP__
diff --git a/tests/unit_tests/sparse_blas/include/test_common.hpp b/tests/unit_tests/sparse_blas/include/test_common.hpp
new file mode 100644
index 000000000..0cf63e10f
--- /dev/null
+++ b/tests/unit_tests/sparse_blas/include/test_common.hpp
@@ -0,0 +1,248 @@
+/*******************************************************************************
+* Copyright 2023 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#ifndef _TEST_COMMON_HPP__
+#define _TEST_COMMON_HPP__
+
+#include <iostream>
+#include <memory>
+#include <limits>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+#include "test_helper.hpp"
+
+// Sparse BLAS domain needs to call more functions per test so we use this macro helper to select between runtime and compile dispatch for each function
+#ifdef CALL_RT_API
+#define CALL_RT_OR_CT(FUNC, QUEUE, ...) FUNC(QUEUE, __VA_ARGS__)
+#else
+#define CALL_RT_OR_CT(FUNC, QUEUE, ...) TEST_RUN_CT_SELECT(QUEUE, FUNC, __VA_ARGS__);
+#endif
+
+template <typename T>
+struct complex_info {
+    using real_type = T;
+    static const bool is_complex = false;
+};
+
+template <typename T>
+struct complex_info<std::complex<T>> {
+    using real_type = T;
+    static const bool is_complex = true;
+};
+
+void print_error_code(sycl::exception const &e);
+
+// Catch asynchronous exceptions.
+struct exception_handler_t {
+    void operator()(sycl::exception_list exceptions) {
+        for (std::exception_ptr const &e : exceptions) {
+            try {
+                std::rethrow_exception(e);
+            }
+            catch (sycl::exception const &e) {
+                std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << std::endl;
+                print_error_code(e);
+            }
+        }
+    }
+};
+
+// Use a unique_ptr to automatically free device memory on unique_ptr destruction.
+template <class T>
+auto malloc_device_uptr(sycl::queue &q, std::size_t num_elts) {
+    struct Deleter {
+        sycl::queue &q;
+        Deleter(sycl::queue &_q) : q(_q) {}
+        void operator()(T *ptr) {
+            sycl::free(ptr, q);
+        }
+    };
+    return std::unique_ptr<T, Deleter>(sycl::malloc_device<T>(num_elts, q), Deleter(q));
+}
+
+// SYCL buffer creation helper.
+template <typename vec>
+sycl::buffer<typename vec::value_type, 1> make_buffer(const vec &v) {
+    sycl::buffer<typename vec::value_type, 1> buf(v.data(), sycl::range<1>(v.size()));
+    return buf;
+}
+
+template <typename fpType>
+struct rand_scalar {
+    inline fpType operator()(double min, double max) {
+        return (fpType(std::rand()) / fpType(RAND_MAX)) * fpType(max - min) - fpType(min);
+    }
+};
+
+template <typename fpType>
+struct rand_scalar<std::complex<fpType>> {
+    inline std::complex<fpType> operator()(double min, double max) {
+        rand_scalar<fpType> rand;
+        return std::complex<fpType>(rand(min, max), rand(min, max));
+    }
+};
+
+template <typename fpType>
+void rand_vector(std::vector<fpType> &v, std::size_t n) {
+    using fpRealType = typename complex_info<fpType>::real_type;
+    v.resize(n);
+    rand_scalar<fpType> rand;
+    for (std::size_t i = 0; i < n; i++) {
+        v[i] = rand(fpRealType(-0.5), fpRealType(0.5));
+    }
+}
+
+// Creating the 3arrays CSR representation (ia, ja, values)
+// of general random sparse matrix
+// with density (0 < density <= 1.0)
+// -0.5 <= value < 0.5
+// require_diagonal means all diagonal entries guaranteed to be nonzero
+template <typename fpType, typename intType>
+intType generate_random_matrix(const intType nrows, const intType ncols, const double density_val,
+                               intType indexing, std::vector<intType> &ia, std::vector<intType> &ja,
+                               std::vector<fpType> &a, bool require_diagonal = false) {
+    intType nnz = 0;
+    rand_scalar<double> rand_density;
+    rand_scalar<fpType> rand_data;
+
+    ia.push_back(indexing); // starting index of row0.
+    for (intType i = 0; i < nrows; i++) {
+        ia.push_back(nnz + indexing); // ending index of row_i.
+        for (intType j = 0; j < ncols; j++) {
+            if ((require_diagonal && i == j) || (rand_density(0.0, 1.0) < density_val)) {
+                a.push_back(rand_data(-0.5, 0.5));
+                ja.push_back(j + indexing);
+                nnz++;
+            }
+        }
+        ia[static_cast<std::size_t>(i) + 1] = nnz + indexing;
+    }
+    return nnz;
+}
+
+// Shuffle the 3arrays CSR representation (ia, ja, values)
+// of any sparse matrix and set values serially from 0..nnz.
+// Intended for use with sorting.
+template <typename fpType, typename intType>
+void shuffle_data(const intType *ia, intType *ja, fpType *a, const std::size_t nrows) {
+    //
+    // shuffle indices according to random seed
+    //
+    intType indexing = ia[0];
+    for (std::size_t i = 0; i < nrows; ++i) {
+        intType nnz_row = ia[i + 1] - ia[i];
+        for (intType j = ia[i] - indexing; j < ia[i + 1] - indexing; ++j) {
+            intType q = ia[i] - indexing + std::rand() % (nnz_row);
+            // swap element i and q
+            std::swap(ja[q], ja[j]);
+            std::swap(a[q], a[j]);
+        }
+    }
+}
+
+template <typename fpType>
+struct set_fp_value {
+    inline fpType operator()(fpType real, fpType /*imag*/) {
+        return real;
+    }
+};
+
+template <typename scalarType>
+struct set_fp_value<std::complex<scalarType>> {
+    inline auto operator()(scalarType real, scalarType imag) {
+        return std::complex<scalarType>(real, imag);
+    }
+};
+
+inline void wait_and_free(sycl::queue &main_queue, oneapi::mkl::sparse::matrix_handle_t *p_handle) {
+    main_queue.wait();
+    sycl::event ev_release;
+    CALL_RT_OR_CT(ev_release = oneapi::mkl::sparse::release_matrix_handle, main_queue, p_handle);
+    ev_release.wait();
+}
+
+template <typename fpType>
+bool check_equal(fpType x, fpType x_ref, double abs_error_margin, double rel_error_margin,
+                 std::ostream &out) {
+    using fpRealType = typename complex_info<fpType>::real_type;
+    static_assert(std::is_floating_point_v<fpRealType>,
+                  "Expected floating-point real or complex type.");
+
+    const fpRealType epsilon = std::numeric_limits<fpRealType>::epsilon();
+    const auto abs_bound = static_cast<fpRealType>(abs_error_margin) * epsilon;
+    const auto rel_bound = static_cast<fpRealType>(rel_error_margin) * epsilon;
+
+    const auto aerr = std::abs(x - x_ref);
+    const auto rerr = aerr / std::abs(x_ref);
+    const bool valid = (rerr <= rel_bound) || (aerr <= abs_bound);
+    if (!valid) {
+        out << "Mismatching results: actual = " << x << " vs. reference = " << x_ref << "\n";
+        out << " relative error = " << rerr << " absolute error = " << aerr
+            << " relative bound = " << rel_bound << " absolute bound = " << abs_bound << "\n";
+    }
+    return valid;
+}
+
+template <typename vecType1, typename vecType2>
+bool check_equal_vector(const vecType1 &v, const vecType2 &v_ref, double abs_error_factor = 10.0,
+                        double rel_error_factor = 200.0, std::ostream &out = std::cout) {
+    using T = typename vecType2::value_type;
+    std::size_t n = v.size();
+    if (n != v_ref.size()) {
+        out << "Mismatching size got " << n << " expected " << v_ref.size() << "\n";
+        return false;
+    }
+    auto max_norm_ref =
+        *std::max_element(std::begin(v_ref), std::end(v_ref),
+                          [](const T &a, const T &b) { return std::abs(a) < std::abs(b); });
+    // Heuristic for the average-case error margins
+    double abs_error_margin =
+        abs_error_factor * std::abs(max_norm_ref) * std::log2(static_cast<double>(n));
+    double rel_error_margin = rel_error_factor * std::log2(static_cast<double>(n));
+
+    constexpr int max_print = 20;
+    int count = 0;
+    bool valid = true;
+
+    for (std::size_t i = 0; i < n; ++i) {
+        // Allow to convert the unsigned index `i` to a signed one to keep this function generic and allow for `v` and `v_ref` to be a vector, a pointer or a random access iterator.
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wsign-conversion"
+        auto res = v[i];
+        auto ref = v_ref[i];
+#pragma clang diagnostic pop
+        if (!check_equal(res, ref, abs_error_margin, rel_error_margin, out)) {
+            out << " at index i =" << i << "\n";
+            valid = false;
+            ++count;
+            if (count > max_print) {
+                return valid;
+            }
+        }
+    }
+
+    return valid;
+}
+
+#endif // _TEST_COMMON_HPP__
diff --git a/tests/unit_tests/sparse_blas/source/CMakeLists.txt b/tests/unit_tests/sparse_blas/source/CMakeLists.txt
new file mode 100644
index 000000000..51ace5906
--- /dev/null
+++ b/tests/unit_tests/sparse_blas/source/CMakeLists.txt
@@ -0,0 +1,56 @@
+#===============================================================================
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+#
+# SPDX-License-Identifier: Apache-2.0
+#===============================================================================
+
+set(SPBLAS_SOURCES "sparse_gemv_usm.cpp" "sparse_gemv_buffer.cpp")
+
+include(WarningsUtils)
+
+if (BUILD_SHARED_LIBS)
+    add_library(spblas_source_rt OBJECT ${SPBLAS_SOURCES})
+    target_compile_options(spblas_source_rt PRIVATE -DCALL_RT_API -DNOMINMAX)
+    target_include_directories(spblas_source_rt
+            PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
+            PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+            PUBLIC ${PROJECT_SOURCE_DIR}/include
+            PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
+            PUBLIC ${CMAKE_BINARY_DIR}/bin
+            )
+    if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
+        add_sycl_to_target(TARGET spblas_source_rt SOURCES ${SPBLAS_SOURCES})
+    else ()
+        target_link_libraries(spblas_source_rt PUBLIC ONEMKL::SYCL::SYCL)
+    endif ()
+    target_link_libraries(spblas_source_rt PRIVATE onemkl_warnings)
+endif ()
+
+add_library(spblas_source_ct OBJECT ${SPBLAS_SOURCES})
+target_compile_options(spblas_source_ct PRIVATE -DNOMINMAX)
+target_include_directories(spblas_source_ct
+        PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../include
+        PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../include
+        PUBLIC ${PROJECT_SOURCE_DIR}/include
+        PUBLIC ${PROJECT_SOURCE_DIR}/deps/googletest/include
+        PUBLIC ${CMAKE_BINARY_DIR}/bin
+        )
+if (USE_ADD_SYCL_TO_TARGET_INTEGRATION)
+    add_sycl_to_target(TARGET spblas_source_ct SOURCES ${SPBLAS_SOURCES})
+else ()
+    target_link_libraries(spblas_source_ct PUBLIC ONEMKL::SYCL::SYCL)
+endif ()
+target_link_libraries(spblas_source_ct PRIVATE onemkl_warnings)
diff --git a/tests/unit_tests/sparse_blas/source/sparse_gemv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_gemv_buffer.cpp
new file mode 100644
index 000000000..c831514e7
--- /dev/null
+++ b/tests/unit_tests/sparse_blas/source/sparse_gemv_buffer.cpp
@@ -0,0 +1,183 @@
+/*******************************************************************************
+* Copyright 2023 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+#include "oneapi/mkl.hpp"
+#include "oneapi/mkl/detail/config.hpp"
+#include "sparse_reference.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+extern std::vector<sycl::device *> devices;
+
+namespace {
+
+template <typename fpType, typename intType>
+int test(sycl::device *dev, intType nrows, intType ncols, double density_A_matrix,
+         oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha,
+         fpType beta) {
+    sycl::queue main_queue(*dev, exception_handler_t());
+
+    intType int_index = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
+    std::size_t opa_nrows =
+        static_cast<std::size_t>(transpose_val == oneapi::mkl::transpose::nontrans ? nrows : ncols);
+    std::size_t opa_ncols =
+        static_cast<std::size_t>(transpose_val == oneapi::mkl::transpose::nontrans ? ncols : nrows);
+
+    // Input matrix in CSR format
+    std::vector<intType> ia_host, ja_host;
+    std::vector<fpType> a_host;
+    intType nnz = generate_random_matrix<fpType, intType>(nrows, ncols, density_A_matrix, int_index,
+                                                          ia_host, ja_host, a_host);
+
+    // Input and output dense vectors
+    // The host and device output are filled with a dummy value to spot mistakes.
+    std::vector<fpType> x_host;
+    std::vector<fpType> y_host(opa_nrows, fpType(2));
+    rand_vector(x_host, opa_ncols);
+
+    // Shuffle ordering of column indices/values to test sortedness
+    shuffle_data(ia_host.data(), ja_host.data(), a_host.data(), static_cast<std::size_t>(nrows));
+
+    auto ia_buf = make_buffer(ia_host);
+    auto ja_buf = make_buffer(ja_host);
+    auto x_buf = make_buffer(x_host);
+    auto y_buf = make_buffer(y_host);
+    auto a_buf = make_buffer(a_host);
+
+    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
+    sycl::event ev_release;
+    try {
+        CALL_RT_OR_CT(oneapi::mkl::sparse::init_matrix_handle, main_queue, &handle);
+
+        CALL_RT_OR_CT(oneapi::mkl::sparse::set_csr_data, main_queue, handle, nrows, ncols, nnz,
+                      index, ia_buf, ja_buf, a_buf);
+
+        CALL_RT_OR_CT(oneapi::mkl::sparse::optimize_gemv, main_queue, transpose_val, handle);
+
+        CALL_RT_OR_CT(oneapi::mkl::sparse::gemv, main_queue, transpose_val, alpha, handle, x_buf,
+                      beta, y_buf);
+
+        CALL_RT_OR_CT(ev_release = oneapi::mkl::sparse::release_matrix_handle, main_queue, &handle);
+    }
+    catch (const sycl::exception &e) {
+        std::cout << "Caught synchronous SYCL exception during sparse GEMV:\n"
+                  << e.what() << std::endl;
+        print_error_code(e);
+        return 0;
+    }
+    catch (const oneapi::mkl::unimplemented &e) {
+        wait_and_free(main_queue, &handle);
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of sparse GEMV:\n" << error.what() << std::endl;
+        return 0;
+    }
+
+    // Compute reference.
+    // The reference is filled with a dummy value to spot mistakes.
+    std::vector<fpType> y_ref_host(opa_nrows, fpType(2));
+    prepare_reference_gemv_data(ia_host.data(), ja_host.data(), a_host.data(), nrows, ncols, nnz,
+                                int_index, transpose_val, alpha, beta, x_host.data(),
+                                y_ref_host.data());
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    auto y_acc = y_buf.template get_host_access(sycl::read_only);
+    bool valid = check_equal_vector(y_acc, y_ref_host);
+
+    ev_release.wait_and_throw();
+    return static_cast<int>(valid);
+}
+
+class SparseGemvBufferTests : public ::testing::TestWithParam<sycl::device *> {};
+
+template <typename fpType>
+void test_helper(sycl::device *dev, oneapi::mkl::transpose transpose_val) {
+    double density_A_matrix = 0.8;
+    fpType fp_zero = set_fp_value<fpType>()(0.f, 0.f);
+    fpType fp_one = set_fp_value<fpType>()(1.f, 0.f);
+    oneapi::mkl::index_base index_zero = oneapi::mkl::index_base::zero;
+    // Basic test
+    EXPECT_TRUEORSKIP(
+        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_one, fp_zero));
+    // Test index_base 1
+    EXPECT_TRUEORSKIP(test(dev, 4, 6, density_A_matrix, oneapi::mkl::index_base::one, transpose_val,
+                           fp_one, fp_zero));
+    // Test non-default alpha
+    EXPECT_TRUEORSKIP(test(dev, 4, 6, density_A_matrix, index_zero, transpose_val,
+                           set_fp_value<fpType>()(2.f, 1.5f), fp_zero));
+    // Test non-default beta
+    EXPECT_TRUEORSKIP(test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_one,
+                           set_fp_value<fpType>()(3.2f, 1.f)));
+    // Test 0 alpha
+    EXPECT_TRUEORSKIP(
+        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_zero, fp_one));
+    // Test 0 alpha and beta
+    EXPECT_TRUEORSKIP(
+        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_zero, fp_zero));
+    // Test int64 indices
+    EXPECT_TRUEORSKIP(
+        test(dev, 27L, 13L, density_A_matrix, index_zero, transpose_val, fp_one, fp_one));
+}
+
+TEST_P(SparseGemvBufferTests, RealSinglePrecision) {
+    using fpType = float;
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans);
+}
+
+TEST_P(SparseGemvBufferTests, RealDoublePrecision) {
+    using fpType = double;
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans);
+}
+
+TEST_P(SparseGemvBufferTests, ComplexSinglePrecision) {
+    using fpType = std::complex<float>;
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans);
+}
+
+TEST_P(SparseGemvBufferTests, ComplexDoublePrecision) {
+    using fpType = std::complex<double>;
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans);
+}
+
+INSTANTIATE_TEST_SUITE_P(SparseGemvBufferTestSuite, SparseGemvBufferTests,
+                         testing::ValuesIn(devices), ::DeviceNamePrint());
+
+} // anonymous namespace
diff --git a/tests/unit_tests/sparse_blas/source/sparse_gemv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_gemv_usm.cpp
new file mode 100644
index 000000000..e6b0520e1
--- /dev/null
+++ b/tests/unit_tests/sparse_blas/source/sparse_gemv_usm.cpp
@@ -0,0 +1,209 @@
+/*******************************************************************************
+* Copyright 2023 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions
+* and limitations under the License.
+*
+*
+* SPDX-License-Identifier: Apache-2.0
+*******************************************************************************/
+
+#include <algorithm>
+#include <complex>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+#include "oneapi/mkl.hpp"
+#include "oneapi/mkl/detail/config.hpp"
+#include "sparse_reference.hpp"
+#include "test_common.hpp"
+#include "test_helper.hpp"
+
+#include <gtest/gtest.h>
+
+extern std::vector<sycl::device *> devices;
+
+namespace {
+
+template <typename fpType, typename intType>
+int test(sycl::device *dev, intType nrows, intType ncols, double density_A_matrix,
+         oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha,
+         fpType beta) {
+    sycl::queue main_queue(*dev, exception_handler_t());
+
+    intType int_index = (index == oneapi::mkl::index_base::zero) ? 0 : 1;
+    std::size_t opa_nrows =
+        static_cast<std::size_t>(transpose_val == oneapi::mkl::transpose::nontrans ? nrows : ncols);
+    std::size_t opa_ncols =
+        static_cast<std::size_t>(transpose_val == oneapi::mkl::transpose::nontrans ? ncols : nrows);
+
+    // Input matrix in CSR format
+    std::vector<intType> ia_host, ja_host;
+    std::vector<fpType> a_host;
+    intType nnz = generate_random_matrix<fpType, intType>(nrows, ncols, density_A_matrix, int_index,
+                                                          ia_host, ja_host, a_host);
+
+    // Input and output dense vectors
+    // The host and device output are filled with a dummy value to spot mistakes.
+    std::vector<fpType> x_host;
+    std::vector<fpType> y_host(opa_nrows, fpType(2));
+    rand_vector(x_host, opa_ncols);
+
+    // Shuffle ordering of column indices/values to test sortedness
+    shuffle_data(ia_host.data(), ja_host.data(), a_host.data(), static_cast<std::size_t>(nrows));
+
+    auto ia_usm_uptr = malloc_device_uptr<intType>(main_queue, ia_host.size());
+    auto ja_usm_uptr = malloc_device_uptr<intType>(main_queue, ja_host.size());
+    auto x_usm_uptr = malloc_device_uptr<fpType>(main_queue, x_host.size());
+    auto y_usm_uptr = malloc_device_uptr<fpType>(main_queue, y_host.size());
+    auto a_usm_uptr = malloc_device_uptr<fpType>(main_queue, a_host.size());
+
+    intType *ia_usm = ia_usm_uptr.get();
+    intType *ja_usm = ja_usm_uptr.get();
+    fpType *x_usm = x_usm_uptr.get();
+    fpType *y_usm = y_usm_uptr.get();
+    fpType *a_usm = a_usm_uptr.get();
+
+    std::vector<sycl::event> mat_dependencies;
+    std::vector<sycl::event> gemv_dependencies;
+    // Copy host to device
+    mat_dependencies.push_back(
+        main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType)));
+    mat_dependencies.push_back(
+        main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType)));
+    mat_dependencies.push_back(
+        main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType)));
+    gemv_dependencies.push_back(
+        main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType)));
+    gemv_dependencies.push_back(
+        main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType)));
+
+    sycl::event ev_copy, ev_release;
+    oneapi::mkl::sparse::matrix_handle_t handle = nullptr;
+    try {
+        sycl::event ev_set, ev_opt, ev_gemv;
+        CALL_RT_OR_CT(oneapi::mkl::sparse::init_matrix_handle, main_queue, &handle);
+
+        CALL_RT_OR_CT(ev_set = oneapi::mkl::sparse::set_csr_data, main_queue, handle, nrows, ncols,
+                      nnz, index, ia_usm, ja_usm, a_usm, mat_dependencies);
+
+        CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::optimize_gemv, main_queue, transpose_val,
+                      handle, { ev_set });
+
+        gemv_dependencies.push_back(ev_opt);
+        CALL_RT_OR_CT(ev_gemv = oneapi::mkl::sparse::gemv, main_queue, transpose_val, alpha, handle,
+                      x_usm, beta, y_usm, gemv_dependencies);
+
+        CALL_RT_OR_CT(ev_release = oneapi::mkl::sparse::release_matrix_handle, main_queue, &handle,
+                      { ev_gemv });
+
+        ev_copy = main_queue.memcpy(y_host.data(), y_usm, y_host.size() * sizeof(fpType), ev_gemv);
+    }
+    catch (const sycl::exception &e) {
+        std::cout << "Caught synchronous SYCL exception during sparse GEMV:\n"
+                  << e.what() << std::endl;
+        print_error_code(e);
+        return 0;
+    }
+    catch (const oneapi::mkl::unimplemented &e) {
+        wait_and_free(main_queue, &handle);
+        return test_skipped;
+    }
+    catch (const std::runtime_error &error) {
+        std::cout << "Error raised during execution of sparse GEMV:\n" << error.what() << std::endl;
+        return 0;
+    }
+
+    // Compute reference.
+    // The reference is filled with a dummy value to spot mistakes.
+    std::vector<fpType> y_ref_host(opa_nrows, fpType(2));
+    prepare_reference_gemv_data(ia_host.data(), ja_host.data(), a_host.data(), nrows, ncols, nnz,
+                                int_index, transpose_val, alpha, beta, x_host.data(),
+                                y_ref_host.data());
+
+    // Compare the results of reference implementation and DPC++ implementation.
+    ev_copy.wait_and_throw();
+    bool valid = check_equal_vector(y_host, y_ref_host);
+
+    ev_release.wait_and_throw();
+    return static_cast<int>(valid);
+}
+
+class SparseGemvUsmTests : public ::testing::TestWithParam<sycl::device *> {};
+
+template <typename fpType>
+void test_helper(sycl::device *dev, oneapi::mkl::transpose transpose_val) {
+    double density_A_matrix = 0.8;
+    fpType fp_zero = set_fp_value<fpType>()(0.f, 0.f);
+    fpType fp_one = set_fp_value<fpType>()(1.f, 0.f);
+    oneapi::mkl::index_base index_zero = oneapi::mkl::index_base::zero;
+    // Basic test
+    EXPECT_TRUEORSKIP(
+        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_one, fp_zero));
+    // Test index_base 1
+    EXPECT_TRUEORSKIP(test(dev, 4, 6, density_A_matrix, oneapi::mkl::index_base::one, transpose_val,
+                           fp_one, fp_zero));
+    // Test non-default alpha
+    EXPECT_TRUEORSKIP(test(dev, 4, 6, density_A_matrix, index_zero, transpose_val,
+                           set_fp_value<fpType>()(2.f, 1.5f), fp_zero));
+    // Test non-default beta
+    EXPECT_TRUEORSKIP(test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_one,
+                           set_fp_value<fpType>()(3.2f, 1.f)));
+    // Test 0 alpha
+    EXPECT_TRUEORSKIP(
+        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_zero, fp_one));
+    // Test 0 alpha and beta
+    EXPECT_TRUEORSKIP(
+        test(dev, 4, 6, density_A_matrix, index_zero, transpose_val, fp_zero, fp_zero));
+    // Test int64 indices
+    EXPECT_TRUEORSKIP(
+        test(dev, 27L, 13L, density_A_matrix, index_zero, transpose_val, fp_one, fp_one));
+}
+
+TEST_P(SparseGemvUsmTests, RealSinglePrecision) {
+    using fpType = float;
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans);
+}
+
+TEST_P(SparseGemvUsmTests, RealDoublePrecision) {
+    using fpType = double;
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans);
+}
+
+TEST_P(SparseGemvUsmTests, ComplexSinglePrecision) {
+    using fpType = std::complex<float>;
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans);
+}
+
+TEST_P(SparseGemvUsmTests, ComplexDoublePrecision) {
+    using fpType = std::complex<double>;
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::nontrans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::trans);
+    test_helper<fpType>(GetParam(), oneapi::mkl::transpose::conjtrans);
+}
+
+INSTANTIATE_TEST_SUITE_P(SparseGemvUsmTestSuite, SparseGemvUsmTests, testing::ValuesIn(devices),
+                         ::DeviceNamePrint());
+
+} // anonymous namespace