diff --git a/.gitignore b/.gitignore
index e784653f1..86e63bd94 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ src/version.h
 .cproject
 .project
 .settings
+.vscode
 
 html/
 latex/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e402c664..a3929ee31 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.12)
 
-project(SIRIUS VERSION 6.5.3)
+project(SIRIUS VERSION 6.5.4)
 
 # set language and standard
 enable_language(CXX Fortran)
@@ -12,6 +12,7 @@ set(CREATE_FORTRAN_BINDINGS ON CACHE BOOL "build Fortran bindings")
 set(BUILD_DOCS OFF CACHE BOOL "build doxygen doc")
 set(USE_ELPA OFF CACHE BOOL "use scalapack")
 set(USE_MAGMA OFF CACHE BOOL "use MAGMA")
+set(USE_NLCGLIB OFF CACHE BOOL "enable nlcglib")
 set(USE_CUDA OFF CACHE BOOL "use CUDA")
 set(USE_ROCM OFF CACHE BOOL "use ROCM AMD GPU code")
 set(USE_NVTX OFF CACHE BOOL "use Nvidia profiling tools library")
@@ -27,17 +28,13 @@ set(PYTHON2 OFF CACHE STRING "Use Python 2.7")
 set(USE_PROFILER ON CACHE BOOL "measure execution of functions with timer")
 set(USE_MEMORY_POOL ON CACHE BOOL "use memory pool")
 
-if(USE_MAGMA AND NOT USE_CUDA)
-  message(FATAL_ERROR "MAGMA depends on Cuda, must enable Cuda or disable MAGMA")
-endif()
-
 if(USE_CUDA AND USE_ROCM)
   message(FATAL_ERROR "USE_CUDA and USE_ROCM must not be enabled at the same time!")
 endif()
 
-if(USE_MKL AND NOT (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel"))
-  message(FATAL_ERROR "Unsupported compiler")
-endif()
+# if(USE_MKL AND NOT (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel"))
+#   message(FATAL_ERROR "Unsupported compiler")
+# endif()
 
 set_property(CACHE GPU_MODEL PROPERTY STRINGS "none" "P100" "V100" "G10x0")
 
@@ -67,7 +64,7 @@ if (NOT CMAKE_BUILD_TYPE)
   set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "debug" "release" "relwithdebinfo")
 endif()
 
-if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")  # bare name: if() dereferences it once; an empty ID no longer breaks the condition
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
   set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -ggdb -DDEBUG")
   set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
@@ -85,7 +82,7 @@ endif()
 # preserve rpaths when installing and make the install folder relocatable
 # use `CMAKE_SKIP_INSTALL_RPATH` to skip this
 # https://spack.readthedocs.io/en/latest/workflows.html#write-the-cmake-build
-list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES 
+list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES
           "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" isSystemDir)
 # skip RPATH if SIRIUS is installed to system directories
 if(isSystemDir STREQUAL "-1")
@@ -106,7 +103,11 @@ option(BUILD_SHARED_LIBS "Build shared libraries." ON)
 # generate compile_commands.json with compile commands for each target
 set(CMAKE_EXPORT_COMPILE_COMMANDS "YES")
 
-# dependencies
+if(USE_NLCGLIB)  # optional dependencies, requested with the USE_NLCGLIB cache variable
+  find_package(nlcglib REQUIRED)
+  find_package(Kokkos REQUIRED)  # apps/nlcg links ${KOKKOS_LIBRARIES}
+endif()
+
 find_package(MPI REQUIRED)
 find_package(GSL REQUIRED)
 find_package(LibXC 3.0.0 REQUIRED)
@@ -167,15 +168,14 @@ if(USE_CUDA)
 endif(USE_CUDA)
 
 if(USE_ROCM)
-  message(STATUS "WARNING: ROCM enabled, prototype feature! Only limited functionality available.")
-  find_package(ROCM COMPONENTS rocfft hipblas)
-  if(NOT ${ROCM_HIP_PLATFORM} STREQUAL hcc)
-    message(FATAL_ERROR "Compilation on Nvidia platform not supported with ROCM enabled!")
+  if(NOT HIP_HCC_FLAGS)  # defaults are applied only when HIP_HCC_FLAGS is unset or evaluates to false
+    message(STATUS "Using default AMD gpu targets: gfx803, gfx900, gfx906. Set HIP_HCC_FLAGS to override.")
+    set(HIP_HCC_FLAGS ${HIP_HCC_FLAGS} --amdgpu-target=gfx803 --amdgpu-target=gfx900 --amdgpu-target=gfx906)
   endif()
-  add_definitions(${ROCM_DEFINITIONS})
-  include_directories(${ROCM_INCLUDE_DIRS})
-  add_subdirectory(src/gpu/hipblas_port)
-  include_directories(src/gpu/hipblas_port)
+  find_package(HIP REQUIRED)
+  # rocblas and hip have cmake config files, but add incompatible flags for mixed compiler usage, so we use custom find modules
+  find_package(ROCBLAS REQUIRED)
+  find_package(HIPLIBS REQUIRED)
 endif()
 
 # check if git command exists
@@ -232,7 +232,11 @@ if(BUILD_TESTS)
 endif(BUILD_TESTS)
 
 add_subdirectory(apps/atoms)
+add_subdirectory(apps/hydrogen)
 add_subdirectory(apps/dft_loop)
+if(USE_NLCGLIB)
+  add_subdirectory(apps/nlcg)  # built only when nlcglib support is enabled
+endif()
 add_subdirectory(apps/upf)
 add_subdirectory(apps/utils)
 add_subdirectory(python_module)
diff --git a/apps/hydrogen/CMakeLists.txt b/apps/hydrogen/CMakeLists.txt
new file mode 100644
index 000000000..c43a15822
--- /dev/null
+++ b/apps/hydrogen/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_executable(hydrogen hydrogen.cpp)
+target_link_libraries(hydrogen PRIVATE sirius)
+install(TARGETS hydrogen RUNTIME DESTINATION bin)  # relative path: resolved against the install prefix at install time
diff --git a/apps/hydrogen/README.md b/apps/hydrogen/README.md
new file mode 100644
index 000000000..47b1e5b59
--- /dev/null
+++ b/apps/hydrogen/README.md
@@ -0,0 +1 @@
+Solve the hydrogen-like problem.
diff --git a/apps/tests/hydrogen.cpp b/apps/hydrogen/hydrogen.cpp
similarity index 100%
rename from apps/tests/hydrogen.cpp
rename to apps/hydrogen/hydrogen.cpp
diff --git a/apps/tests/hydrogen_plot.py b/apps/hydrogen/hydrogen_plot.py
similarity index 100%
rename from apps/tests/hydrogen_plot.py
rename to apps/hydrogen/hydrogen_plot.py
diff --git a/apps/nlcg/CMakeLists.txt b/apps/nlcg/CMakeLists.txt
new file mode 100644
index 000000000..0c8408dc8
--- /dev/null
+++ b/apps/nlcg/CMakeLists.txt
@@ -0,0 +1,14 @@
+# sirius.nlcg: ground-state driver that links the sirius library, nlcglib and Kokkos.
+# The top-level CMakeLists.txt adds this directory only when USE_NLCGLIB is ON.
+
+# look up Kokkos before the target that links against it is defined
+find_package(Kokkos REQUIRED)
+
+add_executable(sirius.nlcg sirius.nlcg.cpp)
+target_link_libraries(sirius.nlcg PRIVATE
+  sirius
+  nlcglib::nlcglib
+  ${KOKKOS_LIBRARIES})
+set_property(TARGET sirius.nlcg PROPERTY POSITION_INDEPENDENT_CODE OFF)
+# relative destination: resolved against the install prefix at install time
+install(TARGETS sirius.nlcg RUNTIME DESTINATION bin)
diff --git a/apps/nlcg/sirius.nlcg.cpp b/apps/nlcg/sirius.nlcg.cpp
new file mode 100644
index 000000000..e84f47098
--- /dev/null
+++ b/apps/nlcg/sirius.nlcg.cpp
@@ -0,0 +1,303 @@
+#include "utils/profiler.hpp"
+#include <sirius.hpp>
+#include <utils/json.hpp>
+#include "nlcglib/adaptor.hpp"
+#include <nlcglib/nlcglib.hpp>
+
+using namespace sirius;
+using json = nlohmann::json;
+
+const std::string aiida_output_file = "output_aiida.json";
+
+enum class task_t : int // task ids selectable with --task=
+{
+    ground_state_new     = 0 // the only task id that run_tasks() acts on
+};
+
+/// Add the git hash, the size of the world communicator and the OpenMP thread count to a dictionary.
+void json_output_common(json& dict__)
+{
+    dict__["threads_per_rank"] = omp_get_max_threads();
+    dict__["comm_world_size"]  = Communicator::world().size();
+    dict__["git_hash"]         = sirius::git_hash();
+}
+
+/// Create a simulation context from the input file and pass the command line arguments to it.
+std::unique_ptr<Simulation_context> create_sim_ctx(std::string     fname__,
+                                                   cmd_args const& args__)
+{
+    std::unique_ptr<Simulation_context> sim(new Simulation_context(fname__, Communicator::world()));
+
+    /* a Gamma-point run is only accepted when the product of the k-grid dimensions is one;
+       NOTE(review): this is checked before import(args__) is called - confirm the order is intended */
+    auto& params = sim->parameters_input();
+    if (params.gamma_point_ && params.ngridk_[0] * params.ngridk_[1] * params.ngridk_[2] != 1) {
+        TERMINATE("this is not a Gamma-point calculation")
+    }
+    sim->import(args__);
+    return sim;
+}
+
+
+/// Run the SCF ground state search and then the nlcglib solver on the same k-point set.
+/** The k-point set is built from the explicit list vk_ of the input parameters when it is not empty,
+ *  otherwise from the ngridk_ / shiftk_ grid. The nlcglib entry point is selected from the preferred
+ *  memory type of the context and the processing_unit_ string of the nlcg input section.
+ *
+ *  \param [in] ctx          Simulation context; run_tasks() calls initialize() on it before this call.
+ *  \param [in] task         Task id; it is only written to the JSON output.
+ *  \param [in] args         Command line arguments; "test_against" and "output" are read here.
+ *  \param [in] write_output Non-zero to write the JSON output file (never written with "test_against").
+ *  \return Value of dft.total_energy() at the end of the run.
+ */
+double ground_state(Simulation_context& ctx,
+                    task_t              task,
+                    cmd_args const&     args,
+                    int                 write_output)
+{
+    ctx.print_memory_usage(__FILE__, __LINE__);
+
+    auto& inp = ctx.parameters_input();
+
+    std::string ref_file = args.value<std::string>("test_against", "");
+    /* don't write output if we compare against the reference calculation */
+    bool write_state = (ref_file.size() == 0);
+
+    std::shared_ptr<K_point_set> kset;
+    if (ctx.parameters_input().vk_.size() == 0) {
+        kset = std::make_shared<K_point_set>(ctx, ctx.parameters_input().ngridk_, ctx.parameters_input().shiftk_, ctx.use_symmetry());
+    } else {
+        /* an explicit list of k-points is given in the input */
+        kset = std::make_shared<K_point_set>(ctx, ctx.parameters_input().vk_);
+    }
+    DFT_ground_state dft(*kset);
+
+    ctx.print_memory_usage(__FILE__, __LINE__);
+
+    auto& potential = dft.potential();
+    auto& density = dft.density();
+
+    dft.initial_state();
+
+    double initial_tol = ctx.iterative_solver_tolerance();
+
+    /* launch the calculation */
+    auto result = dft.find(inp.density_tol_, inp.energy_tol_, initial_tol, inp.num_dft_iter_, write_state);
+
+    /* parameters of the nlcglib solver, copied from the nlcg input section of the context */
+    auto nlcg_params  = ctx.nlcg_input();
+    double temp       = nlcg_params.T_;
+    double tol        = nlcg_params.tol_;
+    double kappa      = nlcg_params.kappa_;
+    double tau        = nlcg_params.tau_;
+    int maxiter       = nlcg_params.maxiter_;
+    int restart       = nlcg_params.restart_;
+    std::string smear = nlcg_params.smearing_;
+    std::string pu = nlcg_params.processing_unit_;
+    Energy energy(*kset, density, potential);
+
+    nlcglib::smearing_type smearing;
+    if (smear.compare("FD") == 0) {
+        smearing = nlcglib::smearing_type::FERMI_DIRAC;
+    } else if (smear.compare("GS") == 0) {
+        smearing = nlcglib::smearing_type::GAUSSIAN_SPLINE;
+    } else {
+        throw std::runtime_error("invalid smearing type given");
+    }
+
+    /* select the nlcglib routine from the memory location of the data and the requested processing unit;
+       nlcg_mvp2_device and nlcg_mvp2_cpu are the defaults for device and host memory, respectively.
+       NOTE(review): nlcg_mvp2_cpu_device is taken to mean "host memory, executed on the device",
+       by analogy with nlcg_mvp2_device_cpu in the first branch - confirm against nlcglib */
+    if (is_device_memory(ctx.preferred_memory_t())) {
+        if (pu.empty() || pu.compare("gpu") == 0) {
+            nlcglib::nlcg_mvp2_device(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else if (pu.compare("cpu") == 0) {
+            nlcglib::nlcg_mvp2_device_cpu(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else {
+            throw std::runtime_error("invalid processing unit for nlcg given: " + pu);
+        }
+    } else {
+        if (pu.empty() || pu.compare("cpu") == 0) {
+            nlcglib::nlcg_mvp2_cpu(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else if (pu.compare("gpu") == 0) {
+            nlcglib::nlcg_mvp2_cpu_device(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else {
+            throw std::runtime_error("invalid processing unit for nlcg given: " + pu);
+        }
+    }
+
+    if (ctx.control().verification_ >= 1) {
+        dft.check_scf_density();
+    }
+
+    /* note the index order: element [i][j] of the output is stress_tot(j, i) / forces_tot(j, i) */
+    if (ctx.control().print_stress_ && !ctx.full_potential()) {
+        Stress& s       = dft.stress();
+        auto stress_tot = s.calc_stress_total();
+        s.print_info();
+        result["stress"] = std::vector<std::vector<double>>(3, std::vector<double>(3));
+        for (int i = 0; i < 3; i++) {
+            for (int j = 0; j < 3; j++) {
+                result["stress"][i][j] = stress_tot(j, i);
+            }
+        }
+    }
+    if (ctx.control().print_forces_) {
+        Force& f         = dft.forces();
+        auto& forces_tot = f.calc_forces_total();
+        f.print_info();
+        result["forces"] = std::vector<std::vector<double>>(ctx.unit_cell().num_atoms(), std::vector<double>(3));
+        for (int i = 0; i < ctx.unit_cell().num_atoms(); i++) {
+            for (int j = 0; j < 3; j++) {
+                result["forces"][i][j] = forces_tot(j, i);
+            }
+        }
+    }
+
+    /* compare with the reference file; each kind of mismatch aborts with its own error code (1, 2, 3) */
+    if (ref_file.size() != 0) {
+        json dict_ref;
+        std::ifstream ifs(ref_file);
+        if (!ifs.is_open()) {
+            throw std::runtime_error("can't open reference file: " + ref_file);
+        }
+        ifs >> dict_ref;
+
+        double e1 = result["energy"]["total"];
+        double e2 = dict_ref["ground_state"]["energy"]["total"];
+
+        if (std::abs(e1 - e2) > 1e-5) {
+            std::printf("total energy is different: %18.7f computed vs. %18.7f reference\n", e1, e2);
+            ctx.comm().abort(1);
+        }
+        if (result.count("stress") && dict_ref["ground_state"].count("stress")) {
+            double diff{0};
+            auto s1 = result["stress"].get<std::vector<std::vector<double>>>();
+            auto s2 = dict_ref["ground_state"]["stress"].get<std::vector<std::vector<double>>>();
+            for (int i = 0; i < 3; i++) {
+                for (int j = 0; j < 3; j++) {
+                    diff += std::abs(s1[i][j] - s2[i][j]);
+                }
+            }
+            if (diff > 1e-5) {
+                std::printf("total stress is different!");
+                std::cout << "  reference: " << dict_ref["ground_state"]["stress"] << "\n";
+                std::cout << "  computed: " << result["stress"] << "\n";
+                ctx.comm().abort(2);
+            }
+        }
+        if (result.count("forces") && dict_ref["ground_state"].count("forces")) {
+            double diff{0};
+            auto s1 = result["forces"].get<std::vector<std::vector<double>>>();
+            auto s2 = dict_ref["ground_state"]["forces"].get<std::vector<std::vector<double>>>();
+            for (int i = 0; i < ctx.unit_cell().num_atoms(); i++) {
+                for (int j = 0; j < 3; j++) {
+                    diff += std::abs(s1[i][j] - s2[i][j]);
+                }
+            }
+            if (diff > 1e-6) {
+                std::printf("total force is different!");
+                std::cout << "  reference: " << dict_ref["ground_state"]["forces"] << "\n";
+                std::cout << "  computed: " << result["forces"] << "\n";
+                ctx.comm().abort(3);
+            }
+        }
+    }
+
+    /* every rank builds the dictionary; only rank 0 of ctx.comm() writes it to the output file */
+    if (write_state && write_output) {
+        json dict;
+        json_output_common(dict);
+
+        dict["task"] = static_cast<int>(task);
+        dict["ground_state"] = result;
+        dict["counters"] = json::object();
+        dict["counters"]["local_operator_num_applied"] = ctx.num_loc_op_applied();
+        dict["counters"]["band_evp_work_count"] = ctx.evp_work_count();
+
+        if (ctx.comm().rank() == 0) {
+            std::string output_file = args.value<std::string>("output", std::string("output_") +
+                                                              ctx.start_time_tag() + std::string(".json"));
+            std::ofstream ofs(output_file, std::ofstream::out | std::ofstream::trunc);
+            ofs << dict.dump(4);
+        }
+    }
+
+    /* wait for all */
+    ctx.comm().barrier();
+
+    return dft.total_energy();
+}
+
+/// Dispatch the task selected on the command line.
+/** Reads "task" (default 0) and "input" (default "sirius.json") from the arguments. If the input file
+ *  does not exist, rank 0 prints a message and the function returns without doing anything else.
+ *
+ *  \param [in] args Parsed command line arguments.
+ */
+void run_tasks(cmd_args const& args)
+{
+    auto const task_id    = static_cast<task_t>(args.value<int>("task", 0));
+    auto const input_name = args.value<std::string>("input", "sirius.json");
+
+    bool const have_input = utils::file_exists(input_name);
+    if (!have_input && Communicator::world().rank() == 0) {
+        std::printf("input file does not exist\n");
+    }
+    /* nothing to do without an input file or for a task id other than ground_state_new */
+    if (!have_input || task_id != task_t::ground_state_new) {
+        return;
+    }
+
+    /* build and initialize the context, then run the ground state with write_output = 1 */
+    auto sim = create_sim_ctx(input_name, args);
+    sim->initialize();
+    ground_state(*sim, task_id, args, 1);
+}
+
+/// Parse the command line, initialize SIRIUS, run the selected task and finalize.
+int main(int argn, char** argv)
+{
+    cmd_args args;
+
+    /* keys registered with a help text */
+    args.register_key("--input=", "{string} input file name");
+    args.register_key("--output=", "{string} output file name");
+    args.register_key("--task=", "{int} task id");
+    args.register_key("--aiida_output", "write output for AiiDA");
+    args.register_key("--test_against=", "{string} json file with reference values");
+
+    /* keys registered with an empty help text */
+    char const* const plain_keys[] = {
+        "--control.processing_unit=",
+        "--control.verbosity=",
+        "--control.verification=",
+        "--control.mpi_grid_dims=",
+        "--control.std_evp_solver_name=",
+        "--control.gen_evp_solver_name=",
+        "--control.fft_mode=",
+        "--control.memory_usage=",
+        "--parameters.ngridk=",
+        "--parameters.gamma_point=",
+        "--parameters.pw_cutoff=",
+        "--iterative_solver.orthogonalize="
+    };
+    for (auto key : plain_keys) {
+        args.register_key(key, "");
+    }
+
+    args.parse_args(argn, argv);
+    if (args.exist("help")) {
+        std::printf("Usage: %s [options]\n", argv[0]);
+        args.print_help();
+        return 0;
+    }
+
+    /* NOTE(review): the meaning of the argument 1 of initialize()/finalize() is not visible in this file */
+    sirius::initialize(1);
+    run_tasks(args);
+    sirius::finalize(1);
+
+    return 0;
+}
diff --git a/apps/nlcg/sirius.test.nlcg.cpp b/apps/nlcg/sirius.test.nlcg.cpp
new file mode 100644
index 000000000..2d5a8235e
--- /dev/null
+++ b/apps/nlcg/sirius.test.nlcg.cpp
@@ -0,0 +1,392 @@
+#include "utils/profiler.hpp"
+#include <sirius.h>
+#include <utils/json.hpp>
+#include "nlcglib/adaptor.hpp"
+
+using namespace sirius;
+using json = nlohmann::json;
+
+const std::string aiida_output_file = "output_aiida.json";
+
+enum class task_t : int // task ids selectable with --task=
+{
+    ground_state_new     = 0, // ground_state() starts from dft.initial_state()
+    ground_state_restart = 1, // ground_state() loads density and potential with load()
+    k_point_path         = 2  // band energies along a k-point path; rank 0 writes bands.json
+};
+
+/// Add the git hash, the size of the world communicator and the OpenMP thread count to a dictionary.
+void json_output_common(json& dict__)
+{
+    dict__["threads_per_rank"] = omp_get_max_threads();
+    dict__["comm_world_size"]  = Communicator::world().size();
+    dict__["git_hash"]         = sirius::git_hash();
+}
+
+/// Create a simulation context from the input file and pass the command line arguments to it.
+std::unique_ptr<Simulation_context> create_sim_ctx(std::string     fname__,
+                                                   cmd_args const& args__)
+{
+    std::unique_ptr<Simulation_context> sim(new Simulation_context(fname__, Communicator::world()));
+
+    /* a Gamma-point run is only accepted when the product of the k-grid dimensions is one;
+       NOTE(review): this is checked before import(args__) is called - confirm the order is intended */
+    auto& params = sim->parameters_input();
+    if (params.gamma_point_ && params.ngridk_[0] * params.ngridk_[1] * params.ngridk_[2] != 1) {
+        TERMINATE("this is not a Gamma-point calculation")
+    }
+    sim->import(args__);
+    return sim;
+}
+
+
+/// Test driver: run one SCF iteration, call the nlcglib test routine and post-process the result.
+/** The k-point set is always built from the ngridk_ / shiftk_ grid of the input parameters. For
+ *  task_t::ground_state_restart density and potential are loaded with load(); otherwise
+ *  dft.initial_state() is called.
+ *  NOTE(review): storage_file_name is not declared in this file; it presumably comes from sirius.h.
+ *
+ *  \param [in] ctx          Simulation context; run_tasks() calls initialize() on it before this call.
+ *  \param [in] task         Task id; selects restart vs. new start and is written to the JSON output.
+ *  \param [in] args         Command line arguments; "test_against", "repeat_update" and "output" are read.
+ *  \param [in] write_output Non-zero to write the JSON output file (never written with "test_against").
+ *  \return Value of dft.total_energy() at the end of the run.
+ */
+double ground_state(Simulation_context& ctx,
+                    task_t              task,
+                    cmd_args const&     args,
+                    int                 write_output)
+{
+    ctx.print_memory_usage(__FILE__, __LINE__);
+
+    auto& inp = ctx.parameters_input();
+
+    std::string ref_file = args.value<std::string>("test_against", "");
+    /* don't write output if we compare against the reference calculation */
+    bool write_state = (ref_file.size() == 0);
+
+    K_point_set kset(ctx, ctx.parameters_input().ngridk_, ctx.parameters_input().shiftk_, ctx.use_symmetry());
+    DFT_ground_state dft(kset);
+
+    ctx.print_memory_usage(__FILE__, __LINE__);
+
+    auto& potential = dft.potential();
+    auto& density = dft.density();
+
+    /* a restart terminates when the storage file is missing; a new run starts from the initial state */
+    if (task == task_t::ground_state_restart) {
+        if (!utils::file_exists(storage_file_name)) {
+            TERMINATE("storage file is not found");
+        }
+        density.load();
+        potential.load();
+    } else {
+        dft.initial_state();
+    }
+
+
+    double initial_tol = ctx.iterative_solver_tolerance();
+
+    /* launch the calculation */
+    /* the iteration count is fixed to 1 here; inp.num_dft_iter_ is only used by the repeat_update loop */
+    int num_dft_iter = 1;
+    auto result = dft.find(inp.density_tol_, inp.energy_tol_, initial_tol, num_dft_iter, write_state);
+
+    /* the Energy object is passed to the nlcglib test routine chosen by the preferred memory type */
+    std::cout << "call my stub solver: " << "\n";
+    Energy energy(kset, density, potential, nlcglib::smearing_type::FERMI_DIRAC);
+    if(is_device_memory(ctx.preferred_memory_t())) {
+        // nlcglib::nlcg_mvp2_cuda(energy);
+        nlcglib::test_nlcg_mvp2_cuda(energy);
+    } else {
+        nlcglib::test_nlcg_mvp2(energy);
+    }
+
+    if (ctx.control().verification_ >= 1) {
+        dft.check_scf_density();
+    }
+
+    /* optionally call dft.update() followed by dft.find() "repeat_update" times; the last result is kept */
+    auto repeat_update = args.value<int>("repeat_update", 0);
+    if (repeat_update) {
+        for (int i = 0; i < repeat_update; i++) {
+            dft.update();
+            result = dft.find(inp.density_tol_, inp.energy_tol_, initial_tol, inp.num_dft_iter_, write_state);
+        }
+    }
+
+    //dft.print_magnetic_moment();
+
+    /* note the index order: element [i][j] of the output is stress_tot(j, i) */
+    if (ctx.control().print_stress_ && !ctx.full_potential()) {
+        Stress& s       = dft.stress();
+        auto stress_tot = s.calc_stress_total();
+        s.print_info();
+        result["stress"] = std::vector<std::vector<double>>(3, std::vector<double>(3));
+        for (int i = 0; i < 3; i++) {
+            for (int j = 0; j < 3; j++) {
+                result["stress"][i][j] = stress_tot(j, i);
+            }
+        }
+    }
+    /* one row per atom: element [i][j] of the output is forces_tot(j, i) */
+    if (ctx.control().print_forces_) {
+        Force& f         = dft.forces();
+        auto& forces_tot = f.calc_forces_total();
+        f.print_info();
+        result["forces"] = std::vector<std::vector<double>>(ctx.unit_cell().num_atoms(), std::vector<double>(3));
+        for (int i = 0; i < ctx.unit_cell().num_atoms(); i++) {
+            for (int j = 0; j < 3; j++) {
+                result["forces"][i][j] = forces_tot(j, i);
+            }
+        }
+    }
+
+    /* compare with the reference file; each kind of mismatch aborts with its own error code (1, 2, 3) */
+    if (ref_file.size() != 0) {
+        json dict_ref;
+        std::ifstream(ref_file) >> dict_ref;
+
+        double e1 = result["energy"]["total"];
+        double e2 = dict_ref["ground_state"]["energy"]["total"];
+
+        if (std::abs(e1 - e2) > 1e-5) {
+            std::printf("total energy is different: %18.7f computed vs. %18.7f reference\n", e1, e2);
+            ctx.comm().abort(1);
+        }
+        if (result.count("stress") && dict_ref["ground_state"].count("stress")) {
+            double diff{0};
+            auto s1 = result["stress"].get<std::vector<std::vector<double>>>();
+            auto s2 = dict_ref["ground_state"]["stress"].get<std::vector<std::vector<double>>>();
+            for (int i = 0; i < 3; i++) {
+                for (int j = 0; j < 3; j++) {
+                    diff += std::abs(s1[i][j] - s2[i][j]);
+                }
+            }
+            if (diff > 1e-5) {
+                std::printf("total stress is different!");
+                std::cout << "  reference: " << dict_ref["ground_state"]["stress"] << "\n";
+                std::cout << "  computed: " << result["stress"] << "\n";
+                ctx.comm().abort(2);
+            }
+        }
+        if (result.count("forces") && dict_ref["ground_state"].count("forces")) {
+            double diff{0};
+            auto s1 = result["forces"].get<std::vector<std::vector<double>>>();
+            auto s2 = dict_ref["ground_state"]["forces"].get<std::vector<std::vector<double>>>();
+            for (int i = 0; i < ctx.unit_cell().num_atoms(); i++) {
+                for (int j = 0; j < 3; j++) {
+                    diff += std::abs(s1[i][j] - s2[i][j]);
+                }
+            }
+            if (diff > 1e-6) {
+                std::printf("total force is different!");
+                std::cout << "  reference: " << dict_ref["ground_state"]["forces"] << "\n";
+                std::cout << "  computed: " << result["forces"] << "\n";
+                ctx.comm().abort(3);
+            }
+        }
+    }
+
+    /* every rank builds the dictionary; only rank 0 of ctx.comm() writes it to the output file */
+    if (write_state && write_output) {
+        json dict;
+        json_output_common(dict);
+
+        dict["task"] = static_cast<int>(task);
+        dict["ground_state"] = result;
+        dict["timers"] = utils::timer::serialize();
+        dict["counters"] = json::object();
+        dict["counters"]["local_operator_num_applied"] = Local_operator::num_applied();
+        dict["counters"]["band_evp_work_count"] = Band::evp_work_count();
+
+        if (ctx.comm().rank() == 0) {
+            std::string output_file = args.value<std::string>("output", std::string("output_") +
+                                                              ctx.start_time_tag() + std::string(".json"));
+            std::ofstream ofs(output_file, std::ofstream::out | std::ofstream::trunc);
+            ofs << dict.dump(4);
+        }
+    }
+
+    /* wait for all */
+    ctx.comm().barrier();
+
+    return dft.total_energy();
+}
+
+/// Run a task based on a command line input.
+/** Tasks 0 and 1 run ground_state(); task 2 computes the band energies along the k-point path given in
+ *  the input file ("kpoints_path" labels with coordinates from "kpoints_rel") and rank 0 writes bands.json.
+ */
+void run_tasks(cmd_args const& args)
+{
+    /* get the task id */
+    task_t task = static_cast<task_t>(args.value<int>("task", 0));
+    /* get the input file name */
+    std::string fname = args.value<std::string>("input", "sirius.json");
+    if (!utils::file_exists(fname)) {
+        if (Communicator::world().rank() == 0) {
+            std::printf("input file does not exist\n");
+        }
+        return;
+    }
+
+    if (task == task_t::ground_state_new || task == task_t::ground_state_restart) {
+        auto ctx = create_sim_ctx(fname, args);
+        ctx->initialize();
+        ground_state(*ctx, task, args, 1);
+    }
+
+    if (task == task_t::k_point_path) {
+        auto ctx = create_sim_ctx(fname, args);
+        ctx->iterative_solver_tolerance(1e-12);
+        ctx->gamma_point(false);
+        ctx->initialize();
+
+        Potential potential(*ctx);
+
+        Density density(*ctx);
+
+        K_point_set ks(*ctx);
+
+        json inp;
+        std::ifstream(fname) >> inp;
+
+        /* list of pairs (label, k-point vector) */
+        std::vector<std::pair<std::string, std::vector<double>>> vertex;
+
+        auto labels = inp["kpoints_path"].get<std::vector<std::string>>();
+        for (auto e: labels) {
+            auto v = inp["kpoints_rel"][e].get<std::vector<double>>();
+            vertex.push_back({e, v});
+        }
+        if (vertex.empty()) {
+            TERMINATE("kpoints_path is empty");
+        }
+
+        std::vector<double> x_axis;
+        std::vector<std::pair<double, std::string>> x_ticks;
+
+        /* first point */
+        x_axis.push_back(0);
+        x_ticks.push_back({0, vertex[0].first});
+        ks.add_kpoint(&vertex[0].second[0], 1.0);
+
+        /* each segment gets max(10, 30 * Cartesian length) points; t accumulates the path length */
+        double t{0};
+        for (size_t i = 0; i < vertex.size() - 1; i++) {
+            vector3d<double> v0 = vector3d<double>(vertex[i].second);
+            vector3d<double> v1 = vector3d<double>(vertex[i + 1].second);
+            vector3d<double> dv = v1 - v0;
+            vector3d<double> dv_cart = ctx->unit_cell().reciprocal_lattice_vectors() * dv;
+            int np = std::max(10, static_cast<int>(30 * dv_cart.length()));
+            for (int j = 1; j <= np; j++) {
+                vector3d<double> v = v0 + dv * static_cast<double>(j) / np;
+                ks.add_kpoint(&v[0], 1.0);
+                t += dv_cart.length() / np;
+                x_axis.push_back(t);
+            }
+            x_ticks.push_back({t, vertex[i + 1].first});
+        }
+
+        ks.initialize();
+
+        /* the density is obtained with load(); the potential is generated from it */
+        density.load();
+        potential.generate(density);
+        Band band(*ctx);
+        Hamiltonian0 H0(potential);
+        if (!ctx->full_potential()) {
+            band.initialize_subspace(ks, H0);
+            if (ctx->hubbard_correction()) {
+                TERMINATE("fix me");
+                potential.U().hubbard_compute_occupation_numbers(ks); // TODO: this is wrong; U matrix should come form the saved file
+                potential.U().calculate_hubbard_potential_and_energy();
+            }
+        }
+        band.solve(ks, H0, true);
+
+        ks.sync_band_energies();
+        if (Communicator::world().rank() == 0) {
+            json dict;
+            dict["header"] = {};
+            dict["header"]["x_axis"] = x_axis;
+            dict["header"]["x_ticks"] = std::vector<json>();
+            dict["header"]["num_bands"] = ctx->num_bands();
+            dict["header"]["num_mag_dims"] = ctx->num_mag_dims();
+            for (auto& e: x_ticks) {
+                json j;
+                j["x"] = e.first;
+                j["label"] = e.second;
+                dict["header"]["x_ticks"].push_back(j);
+            }
+            dict["bands"] = std::vector<json>();
+
+            for (int ik = 0; ik < ks.num_kpoints(); ik++) {
+                json bnd_k;
+                bnd_k["kpoint"] = std::vector<double>(3, 0);
+                for (int x = 0; x < 3; x++) {
+                    bnd_k["kpoint"][x] = ks[ik]->vk()[x];
+                }
+                std::vector<double> bnd_e;
+
+                for (int ispn = 0; ispn < ctx->num_spin_dims(); ispn++) {
+                    for (int j = 0; j < ctx->num_bands(); j++) {
+                        bnd_e.push_back(ks[ik]->band_energy(j, ispn));
+                    }
+                }
+                bnd_k["values"] = bnd_e;
+                dict["bands"].push_back(bnd_k);
+            }
+            std::ofstream ofs("bands.json", std::ofstream::out | std::ofstream::trunc);
+            ofs << dict.dump(4);
+        }
+    }
+}
+
+/// Parse the command line, initialize SIRIUS, run the selected task and finalize.
+int main(int argn, char** argv)
+{
+    cmd_args args;
+
+    /* keys registered with a help text */
+    args.register_key("--input=", "{string} input file name");
+    args.register_key("--output=", "{string} output file name");
+    args.register_key("--task=", "{int} task id");
+    args.register_key("--aiida_output", "write output for AiiDA");
+    args.register_key("--test_against=", "{string} json file with reference values");
+    args.register_key("--repeat_update=", "{int} number of times to repeat update()");
+
+    /* keys registered with an empty help text */
+    char const* const plain_keys[] = {
+        "--control.processing_unit=",
+        "--control.verbosity=",
+        "--control.verification=",
+        "--control.mpi_grid_dims=",
+        "--control.std_evp_solver_name=",
+        "--control.gen_evp_solver_name=",
+        "--control.fft_mode=",
+        "--control.memory_usage=",
+        "--parameters.ngridk=",
+        "--parameters.gamma_point=",
+        "--parameters.pw_cutoff=",
+        "--iterative_solver.orthogonalize="
+    };
+    for (auto key : plain_keys) {
+        args.register_key(key, "");
+    }
+
+    args.parse_args(argn, argv);
+    if (args.exist("help")) {
+        std::printf("Usage: %s [options]\n", argv[0]);
+        args.print_help();
+        return 0;
+    }
+
+    /* NOTE(review): the meaning of the argument 1 of initialize()/finalize() is not visible in this file */
+    sirius::initialize(1);
+    run_tasks(args);
+    sirius::finalize(1);
+
+    return 0;
+}
diff --git a/apps/tests/CMakeLists.txt b/apps/tests/CMakeLists.txt
index 666779d0a..d9b8a8a26 100644
--- a/apps/tests/CMakeLists.txt
+++ b/apps/tests/CMakeLists.txt
@@ -1,6 +1,6 @@
-set(_tests "test_hdf5;test_allgather;mt_function;splindex;hydrogen;\
+set(_tests "test_hdf5;test_allgather;mt_function;splindex;\
 read_atom;test_mdarray;test_xc;test_hloc;\
-test_mpi_grid;test_enu;test_eigen_v2;test_gemm;test_gemm2;test_wf_inner_v3;test_memop;\
+test_mpi_grid;test_enu;test_eigen;test_gemm;test_gemm2;test_wf_inner_v3;test_memop;\
 test_mem_pool;test_mem_alloc;test_examples;test_fft_full_grid;test_wf_inner_v4;test_bcast_v2;test_p2p_cyclic;\
 test_wf_ortho_6;test_mixer_v1;test_davidson;test_lapw_xc")
 
diff --git a/apps/tests/test_eigen_v2.cpp b/apps/tests/test_eigen.cpp
similarity index 85%
rename from apps/tests/test_eigen_v2.cpp
rename to apps/tests/test_eigen.cpp
index f45865c51..2cea59668 100644
--- a/apps/tests/test_eigen_v2.cpp
+++ b/apps/tests/test_eigen.cpp
@@ -12,18 +12,18 @@ double test_diag(BLACS_grid const& blacs_grid__,
                std::string name__,
                Eigensolver& solver)
 {
-    dmatrix<T> A = random_symmetric<T>(N__, bs__, blacs_grid__);
-    dmatrix<T> A_ref(N__, N__, blacs_grid__, bs__, bs__);
-    A >> A_ref;
+    auto A_ref = random_symmetric<T>(N__, bs__, blacs_grid__);
+    dmatrix<T> A(N__, N__, blacs_grid__, bs__, bs__, solver.host_memory_t());
+    A_ref >> A;
 
-    dmatrix<T> Z(N__, N__, blacs_grid__, bs__, bs__);
+    dmatrix<T> Z(N__, N__, blacs_grid__, bs__, bs__, solver.host_memory_t());
 
     dmatrix<T> B;
     dmatrix<T> B_ref;
     if (test_gen__) {
-        B = random_positive_definite<T>(N__, bs__, blacs_grid__);
-        B_ref = dmatrix<T>(N__, N__, blacs_grid__, bs__, bs__);
-        B >> B_ref;
+        B_ref = random_positive_definite<T>(N__, bs__, blacs_grid__);
+        B = dmatrix<T>(N__, N__, blacs_grid__, bs__, bs__, solver.host_memory_t());
+        B_ref >> B;
     }
 
     std::vector<double> eval(nev__);
@@ -125,7 +125,7 @@ void test_diag2(BLACS_grid const& blacs_grid__,
                 std::string name__,
                 std::string fname__)
 {
-    auto solver = Eigensolver_factory(get_ev_solver_t(name__));
+    auto solver = Eigensolver_factory(name__, nullptr);
 
     matrix<double_complex> full_mtrx;
     int n;
@@ -179,7 +179,7 @@ void call_test(std::vector<int> mpi_grid__,
                int repeat__,
                int type__)
 {
-    auto solver = Eigensolver_factory(get_ev_solver_t(name__));
+    auto solver = Eigensolver_factory(name__, nullptr);
     BLACS_grid blacs_grid(Communicator::world(), mpi_grid__[0], mpi_grid__[1]);
     if (fname__.length() == 0) {
         Measurement m;
@@ -205,19 +205,19 @@ void call_test(std::vector<int> mpi_grid__,
 
 int main(int argn, char** argv)
 {
-    cmd_args args;
-    args.register_key("--mpi_grid_dims=", "{int int} dimensions of MPI grid");
-    args.register_key("--N=", "{int} total size of the matrix");
-    args.register_key("--n=", "{int} size of the sub-matrix to diagonalize");
-    args.register_key("--nev=", "{int} number of eigen-vectors");
-    args.register_key("--bs=", "{int} block size");
-    args.register_key("--repeat=", "{int} number of repeats");
-    args.register_key("--gen", "test generalized problem");
-    args.register_key("--name=", "{string} name of the solver");
-    args.register_key("--file=", "{string} input file name");
-    args.register_key("--type=", "{int} data type: 0-real, 1-complex");
-
-    args.parse_args(argn, argv);
+    cmd_args args(argn, argv, {
+        {"mpi_grid_dims=", "{int int} dimensions of MPI grid"},
+        {"N=", "{int} total size of the matrix"},
+        {"n=", "{int} size of the sub-matrix to diagonalize"},
+        {"nev=", "{int} number of eigen-vectors"},
+        {"bs=", "{int} block size"},
+        {"repeat=", "{int} number of repeats"},
+        {"gen", "test generalized problem"},
+        {"name=", "{string} name of the solver"},
+        {"file=", "{string} input file name"},
+        {"type=", "{int} data type: 0-real, 1-complex"}
+    });
+
     if (args.exist("help")) {
         printf("Usage: %s [options]\n", argv[0]);
         args.print_help();
diff --git a/apps/tests/test_gemm2.cpp b/apps/tests/test_gemm2.cpp
index b8badf68a..cbd693de3 100644
--- a/apps/tests/test_gemm2.cpp
+++ b/apps/tests/test_gemm2.cpp
@@ -57,6 +57,7 @@ double test_gemm(int M, int N, int K, int transa, linalg_t la__, memory_t memA__
                        a.at(memA__), a.ld(), b.at(memB__), b.ld(),
                        &linalg_const<gemm_type>::zero(),
                        c.at(memC__), c.ld());
+    double t2 = t + utils::wtime();
     if (is_device_memory(memC__)) {
         c.copy_to(memory_t::host);
     }
@@ -65,6 +66,7 @@ double test_gemm(int M, int N, int K, int transa, linalg_t la__, memory_t memA__
     double perf = nop_gemm * 1e-9 * M * N * K / t;
     printf("execution time (sec) : %12.6f\n", t);
     printf("performance (GFlops) : %12.6f\n", perf);
+    printf("blas time (sec)      : %12.6f\n", t2);
 
     return perf;
 }
diff --git a/apps/tests/test_gen_eig.cpp b/apps/tests/test_gen_eig.cpp
deleted file mode 100644
index c71259c73..000000000
--- a/apps/tests/test_gen_eig.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-#include <sirius.h>
-
-using namespace sirius;
-
-void test(cmd_args& args)
-{
-    std::vector<int> mpi_grid_dims;
-    mpi_grid_dims = args.value< std::vector<int> >("mpi_grid", mpi_grid_dims);
-
-    linalg<scalapack>::set_cyclic_block_size(32);
-
-    MPI_grid mpi_grid(mpi_grid_dims, Platform::comm_world());
-    
-    BLACS_grid blacs_grid(mpi_grid.communicator(), mpi_grid.dimension_size(0), mpi_grid.dimension_size(1));
-    
-    generalized_evp_elpa2 evp(blacs_grid);
-
-    HDF5_tree h_in("1_h.h5", false);
-    HDF5_tree o_in("2_o.h5", false);
-
-    int nrow, ncol;
-
-    h_in.read("nrow", &nrow);
-    h_in.read("ncol", &ncol);
-
-    mdarray<double_complex, 2> h(nrow, ncol);
-    mdarray<double_complex, 2> o(nrow, ncol);
-
-    h_in.read_mdarray("matrix", h);
-    o_in.read_mdarray("matrix", o);
-
-    dmatrix<double_complex> h1(nrow, ncol, blacs_grid);
-    dmatrix<double_complex> o1(nrow, ncol, blacs_grid);
-
-    int num_bands = 1234;
-    std::vector<double> eval(num_bands);
-    dmatrix<double_complex> z1(nrow, num_bands, blacs_grid);
-    z1.zero();
-    
-    for (int k = 0; k < 10; k++)
-    {
-        for (int i = 0; i < ncol; i++)
-        {
-            for (int j = 0; j < ncol; j++) 
-            {
-                h1.set(i, j, h(i, j));
-                o1.set(i, j, o(i, j));
-            }
-        }
-
-        Timer t("solve_evp");
-        evp.solve(nrow, h1.num_rows_local(), h1.num_cols_local(), num_bands, 
-                  h1.ptr(), h1.ld(), o1.ptr(), o1.ld(), &eval[0], z1.ptr(), z1.ld());
-        t.stop();
-    }
-    double tval = Timer::value("solve_evp");
-    if (mpi_grid.communicator().rank() == 0)
-    {
-        printf("mpi gird: %i %i\n", mpi_grid_dims[0], mpi_grid_dims[1]);
-        printf("matrix size: %i\n", nrow);
-        printf("average time on 10 runs: %f\n", tval / 10.0);
-    }
-}
-
-int main(int argn, char** argv)
-{
-    Platform::initialize(1);
-
-    cmd_args args;
-    args.register_key("--mpi_grid=", "{vector int} MPI grid dimensions");
-    args.parse_args(argn, argv);
-
-    if (argn == 1)
-    {
-        printf("Usage: ./dft_loop [options] \n");
-        args.print_help();
-        exit(0);
-    }
-    
-    test(args);
-
-    Platform::finalize();
-}
diff --git a/apps/utils/unit_cell_tools.cpp b/apps/utils/unit_cell_tools.cpp
index 208568fc8..590a61561 100644
--- a/apps/utils/unit_cell_tools.cpp
+++ b/apps/utils/unit_cell_tools.cpp
@@ -219,6 +219,42 @@ void create_qe_input(cmd_args const& args__)
     fclose(fout);
 }
 
+/// Write the structure read from the SIRIUS input (--input=, default "sirius.json") to "input.xml" for Exciting.
+void create_exciting_input(cmd_args const& args__)
+{
+    Simulation_context ctx(args__.value<std::string>("input", "sirius.json"), Communicator::self());
+    FILE* fout = fopen("input.xml", "w");
+    if (!fout) {
+        perror("input.xml"); /* fprintf() on a null stream is undefined: report why the file can't be opened, then stop */
+        return;
+    }
+    fprintf(fout, "<input>\n");
+    fprintf(fout, "  <title> converted from SIRIUS json input </title>\n");
+    fprintf(fout, "  <structure speciespath=\"./\" autormt=\"false\">\n");
+    fprintf(fout, "    <crystal scale=\"1\">\n");
+    for (int i = 0; i < 3; i++) {
+        auto v = ctx.unit_cell().lattice_vector(i);
+        fprintf(fout, "      <basevect> %18.12f %18.12f %18.12f </basevect>\n", v[0], v[1], v[2]);
+    }
+    fprintf(fout, "    </crystal>\n");
+    for (int iat = 0; iat < ctx.unit_cell().num_atom_types(); iat++) {
+        fprintf(fout, "    <species speciesfile=\"%s.xml\" rmt=\"2.0\">\n", ctx.unit_cell().atom_type(iat).label().c_str());
+        for (int ia = 0; ia < ctx.unit_cell().atom_type(iat).num_atoms(); ia++) {
+            auto v = ctx.unit_cell().atom(ctx.unit_cell().atom_type(iat).atom_id(ia)).position();
+            fprintf(fout, "      <atom coord=\"%18.12f %18.12f %18.12f\" bfcmt=\"0.0 0.0 0.0\"/>\n", v[0], v[1], v[2]);
+        }
+        fprintf(fout, "</species>\n");
+    }
+    fprintf(fout, "  </structure>\n");
+    fprintf(fout, "  <groundstate do=\"fromscratch\" ngridk=\"2 2 2\" rgkmax=\"4.0\" gmaxvr=\"16\" maxscl=\"2\"  kptgroups=\"1\">\n");
+    fprintf(fout, "    <libxc exchange=\"XC_LDA_X\" correlation=\"XC_LDA_C_PZ\"/>\n");
+    fprintf(fout, "    <sirius densityinit=\"true\" density=\"true\" vha=\"true\" xc=\"true\" eigenstates=\"true\" sfacg=\"true\" cfun=\"true\"/>\n");
+    fprintf(fout, "    <spin/>\n");
+    fprintf(fout, "  </groundstate>\n");
+    fprintf(fout, "</input>\n");
+    fclose(fout);
+}
+
 void convert_to_mol(cmd_args& args__)
 {
     Simulation_context ctx(args__.value<std::string>("input", "sirius.json"), Communicator::self());
@@ -274,6 +310,7 @@ int main(int argn, char** argv)
     args.register_key("--input=", "{string} input file name");
     args.register_key("--supercell=", "{string} transformation matrix (9 numbers)");
     args.register_key("--qe", "create input for QE");
+    args.register_key("--xml", "create Exciting XML input");
     args.register_key("--find_primitive", "find a primitive cell");
     args.register_key("--cif", "create CIF file");
     args.register_key("--mol", "convert to molecule input file");
@@ -296,6 +333,9 @@ int main(int argn, char** argv)
     if (args.exist("qe")) {
         create_qe_input(args);
     }
+    if (args.exist("xml")) {
+        create_exciting_input(args);
+    }
     if (args.exist("cif")) {
         Simulation_context ctx(args.value<std::string>("input", "sirius.json"), Communicator::self());
         ctx.unit_cell().write_cif();
diff --git a/ci/easybuild-jenkins.sh b/ci/easybuild-jenkins.sh
index 20d4665b8..e7f96d722 100755
--- a/ci/easybuild-jenkins.sh
+++ b/ci/easybuild-jenkins.sh
@@ -22,7 +22,7 @@ module load EasyBuild-custom/cscs
     eb magma-2.5.1-CrayGNU-19.10-cuda-10.1.eb -r
     eb SpFFT-0.9.8-CrayGNU-19.10-cuda-10.1.eb -r
     eb mpi4py-3.0.2-CrayGNU-19.10-python3-cuda10.1.eb -r
-    eb ELPA-2019.05.001-CrayGNU-19.10.eb -r
+    eb ELPA-2019.11.001-CrayGNU-19.10.eb -r
 )
 
 chmod ao+rx ${EASYBUILD_PREFIX}
diff --git a/ci/github_run_verification.sh b/ci/github_run_verification.sh
index ec96bd904..8dc06a488 100755
--- a/ci/github_run_verification.sh
+++ b/ci/github_run_verification.sh
@@ -2,5 +2,5 @@
 
 export PATH=$HOME/local/bin:$HOME/reframe/bin:$PATH
 git clone https://github.com/eth-cscs/reframe.git $HOME/reframe
-reframe -C ./reframe/config.py --system=linux:cpu -c ./reframe/checks -R -r --tag serial --exec-policy=serial
+reframe -C ./reframe/config.py --system=linux:cpu -c ./reframe/checks -R -r --tag serial --exec-policy=serial --skip-prgenv-check
 
diff --git a/cmake/modules/FindElpa.cmake b/cmake/modules/FindElpa.cmake
index 6c90863e0..396c6d2ce 100644
--- a/cmake/modules/FindElpa.cmake
+++ b/cmake/modules/FindElpa.cmake
@@ -3,7 +3,13 @@
 include(FindPackageHandleStandardArgs)
 find_package(PkgConfig)
 
-pkg_search_module(_ELPA elpa elpa_openmp)
+pkg_search_module(_ELPA  # the first of these modules that pkg-config can resolve is used
+  elpa
+  elpa_openmp
+  elpa_openmp-2019.11.001
+  elpa_openmp-2019.05.001
+  elpa-2019.11.001
+  elpa-2019.05.001)
 
 find_library(ELPA_LIBRARIES
   NAMES elpa elpa_openmp
@@ -15,8 +21,8 @@ find_library(ELPA_LIBRARIES
   DOC "elpa libraries list")
 
 find_path(ELPA_INCLUDE_DIR
-  NAMES elpa.h elpa_constants.h
-  PATH_SUFFIXES include/elpa_openmp-$ENV{EBVERSIONELPA}/elpa include/elpa_openmp-$ENV{EBVERSIONELPA} elpa
+  NAMES elpa/elpa.h elpa/elpa_constants.h
+  PATH_SUFFIXES include/elpa_openmp-$ENV{EBVERSIONELPA}
   HINTS
   ${_ELPA_INCLUDE_DIRS}
   ENV ELPAROOT
@@ -24,6 +30,8 @@ find_path(ELPA_INCLUDE_DIR
 
 find_package_handle_standard_args(Elpa "DEFAULT_MSG" ELPA_LIBRARIES ELPA_INCLUDE_DIR)
 
+message(STATUS "ELPA_INCLUDE_DIR: ${ELPA_INCLUDE_DIR}")  # informational: report the include directory that was picked
+
 if(Elpa_FOUND AND NOT TARGET sirius::elpa)
   add_library(sirius::elpa INTERFACE IMPORTED)
   set_target_properties(sirius::elpa PROPERTIES
diff --git a/cmake/modules/FindHIP.cmake b/cmake/modules/FindHIP.cmake
new file mode 100644
index 000000000..a52c3921c
--- /dev/null
+++ b/cmake/modules/FindHIP.cmake
@@ -0,0 +1,593 @@
+###############################################################################
+# FindHIP.cmake
+###############################################################################
+
+# Copyright (c) 2008-2020 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+###############################################################################
+# SET: Variable defaults
+###############################################################################
+# User defined flags: one cache entry per compiler driver (HIPCC, HCC, NVCC), empty by default
+set(HIP_HIPCC_FLAGS "" CACHE STRING "Semicolon delimited flags for HIPCC")
+set(HIP_HCC_FLAGS "" CACHE STRING "Semicolon delimited flags for HCC")
+set(HIP_NVCC_FLAGS "" CACHE STRING "Semicolon delimited flags for NVCC")
+mark_as_advanced(HIP_HIPCC_FLAGS HIP_HCC_FLAGS HIP_NVCC_FLAGS)
+set(_hip_configuration_types ${CMAKE_CONFIGURATION_TYPES} ${CMAKE_BUILD_TYPE} Debug MinSizeRel Release RelWithDebInfo)  # every configuration that gets its own flag variables
+list(REMOVE_DUPLICATES _hip_configuration_types)
+foreach(config ${_hip_configuration_types})
+    string(TOUPPER ${config} config_upper)  # variable names carry the upper-case configuration name, e.g. HIP_HIPCC_FLAGS_DEBUG
+    set(HIP_HIPCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for HIPCC")
+    set(HIP_HCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for HCC")
+    set(HIP_NVCC_FLAGS_${config_upper} "" CACHE STRING "Semicolon delimited flags for NVCC")
+    mark_as_advanced(HIP_HIPCC_FLAGS_${config_upper} HIP_HCC_FLAGS_${config_upper} HIP_NVCC_FLAGS_${config_upper})
+endforeach()
+option(HIP_HOST_COMPILATION_CPP "Host code compilation mode" ON)
+option(HIP_VERBOSE_BUILD "Print out the commands run while compiling the HIP source file.  With the Makefile generator this defaults to VERBOSE variable specified on the command line, but can be forced on with this option." OFF)
+mark_as_advanced(HIP_HOST_COMPILATION_CPP)
+
+###############################################################################
+# Set HIP CMAKE Flags
+###############################################################################
+# Copy the invocation styles from CXX to HIP: each CMAKE_*_HIP_* variable below takes the value of its CXX counterpart
+set(CMAKE_HIP_ARCHIVE_CREATE ${CMAKE_CXX_ARCHIVE_CREATE})
+set(CMAKE_HIP_ARCHIVE_APPEND ${CMAKE_CXX_ARCHIVE_APPEND})
+set(CMAKE_HIP_ARCHIVE_FINISH ${CMAKE_CXX_ARCHIVE_FINISH})
+set(CMAKE_SHARED_LIBRARY_SONAME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG})
+set(CMAKE_SHARED_LIBRARY_CREATE_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS})
+set(CMAKE_SHARED_LIBRARY_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_CXX_FLAGS})
+#set(CMAKE_SHARED_LIBRARY_LINK_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS})
+set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG})
+set(CMAKE_SHARED_LIBRARY_RUNTIME_HIP_FLAG_SEP ${CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG_SEP})
+set(CMAKE_SHARED_LIBRARY_LINK_STATIC_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_STATIC_CXX_FLAGS})
+set(CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_HIP_FLAGS ${CMAKE_SHARED_LIBRARY_LINK_DYNAMIC_CXX_FLAGS})
+
+# Set the CMake flags to use the HCC compiler. NOTE(review): HIP_HIPCC_CMAKE_LINKER_HELPER is only located by find_program() further down and HCC_PATH is not set anywhere above, so both may expand to nothing here - confirm.
+set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <LANGUAGE_COMPILE_FLAGS> <LINK_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES>")
+set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> <SONAME_FLAG><TARGET_SONAME> -o <TARGET> <LINK_LIBRARIES> -shared" )
+set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_PATH} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+
+###############################################################################
+# FIND: HIP and associated helper binaries
+###############################################################################
+
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../" REALPATH)  # parent of the directory that holds this module
+
+# HIP is supported on Linux only (tested here as: UNIX, but neither Apple nor Cygwin)
+if(UNIX AND NOT APPLE AND NOT CYGWIN)
+    # Search for the HIP installation unless HIP_ROOT_DIR is already set
+    if(NOT HIP_ROOT_DIR)
+        # Look only in the listed places (NO_DEFAULT_PATH): $ROCM_PATH/hip, $HIP_PATH, ${_IMPORT_PREFIX} and /opt/rocm/hip
+        find_path(
+            HIP_ROOT_DIR
+            NAMES bin/hipconfig
+            PATHS
+            "$ENV{ROCM_PATH}/hip"
+            ENV HIP_PATH
+            ${_IMPORT_PREFIX}
+            /opt/rocm/hip
+            DOC "HIP installed location"
+            NO_DEFAULT_PATH
+            )
+        if(NOT EXISTS ${HIP_ROOT_DIR})
+            if(HIP_FIND_REQUIRED)
+                message(FATAL_ERROR "Specify HIP_ROOT_DIR")
+            elseif(NOT HIP_FIND_QUIETLY)
+                message("HIP_ROOT_DIR not found or specified")
+            endif()
+        endif()
+        # And push it back to the cache (FORCE replaces the entry written by find_path)
+        set(HIP_ROOT_DIR ${HIP_ROOT_DIR} CACHE PATH "HIP installed location" FORCE)
+    endif()
+
+    # Find HIPCC executable: first only in the bin/ subdirectory of the HIP/ROCm locations listed below
+    find_program(
+        HIP_HIPCC_EXECUTABLE
+        NAMES hipcc
+        PATHS
+        "${HIP_ROOT_DIR}"
+        ENV ROCM_PATH
+        ENV HIP_PATH
+        /opt/rocm
+        /opt/rocm/hip
+        PATH_SUFFIXES bin
+        NO_DEFAULT_PATH
+        )
+    if(NOT HIP_HIPCC_EXECUTABLE)
+        # Now search in default paths
+        find_program(HIP_HIPCC_EXECUTABLE hipcc)
+    endif()
+    mark_as_advanced(HIP_HIPCC_EXECUTABLE)
+
+    # Find HIPCONFIG executable: same two-step search as for hipcc
+    find_program(
+        HIP_HIPCONFIG_EXECUTABLE
+        NAMES hipconfig
+        PATHS
+        "${HIP_ROOT_DIR}"
+        ENV ROCM_PATH
+        ENV HIP_PATH
+        /opt/rocm
+        /opt/rocm/hip
+        PATH_SUFFIXES bin
+        NO_DEFAULT_PATH
+        )
+    if(NOT HIP_HIPCONFIG_EXECUTABLE)
+        # Now search in default paths
+        find_program(HIP_HIPCONFIG_EXECUTABLE hipconfig)
+    endif()
+    mark_as_advanced(HIP_HIPCONFIG_EXECUTABLE)
+
+    # Find HIPCC_CMAKE_LINKER_HELPER executable: same two-step search as for hipcc
+    find_program(
+        HIP_HIPCC_CMAKE_LINKER_HELPER
+        NAMES hipcc_cmake_linker_helper
+        PATHS
+        "${HIP_ROOT_DIR}"
+        ENV ROCM_PATH
+        ENV HIP_PATH
+        /opt/rocm
+        /opt/rocm/hip
+        PATH_SUFFIXES bin
+        NO_DEFAULT_PATH
+        )
+    if(NOT HIP_HIPCC_CMAKE_LINKER_HELPER)
+        # Now search in default paths
+        find_program(HIP_HIPCC_CMAKE_LINKER_HELPER hipcc_cmake_linker_helper)
+    endif()
+    mark_as_advanced(HIP_HIPCC_CMAKE_LINKER_HELPER)
+
+    if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_VERSION)
+        # Compute the version with `hipconfig --version`; any text on stderr makes the version fall back to 0.0.0
+        execute_process(
+            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --version
+            OUTPUT_VARIABLE _hip_version
+            ERROR_VARIABLE _hip_error
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+            ERROR_STRIP_TRAILING_WHITESPACE
+            )
+        if(NOT _hip_error)
+            set(HIP_VERSION ${_hip_version} CACHE STRING "Version of HIP as computed from hipcc")
+        else()
+            set(HIP_VERSION "0.0.0" CACHE STRING "Version of HIP as computed by FindHIP()")
+        endif()
+        mark_as_advanced(HIP_VERSION)
+    endif()
+    if(HIP_VERSION)  # split the version string on '.' into major / minor / patch
+        string(REPLACE "." ";" _hip_version_list "${HIP_VERSION}")
+        list(GET _hip_version_list 0 HIP_VERSION_MAJOR)
+        list(GET _hip_version_list 1 HIP_VERSION_MINOR)
+        list(GET _hip_version_list 2 HIP_VERSION_PATCH)  # NOTE(review): needs at least three dot-separated parts - confirm hipconfig always prints them
+        set(HIP_VERSION_STRING "${HIP_VERSION}")
+    endif()
+
+    if(HIP_HIPCONFIG_EXECUTABLE AND NOT HIP_PLATFORM)
+        # Compute the platform with `hipconfig --platform` and cache whatever it prints
+        execute_process(
+            COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --platform
+            OUTPUT_VARIABLE _hip_platform
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+            )
+        set(HIP_PLATFORM ${_hip_platform} CACHE STRING "HIP platform as computed by hipconfig")
+        mark_as_advanced(HIP_PLATFORM)
+    endif()
+endif()
+
+include(FindPackageHandleStandardArgs)  # provides find_package_handle_standard_args()
+find_package_handle_standard_args(
+    HIP
+    REQUIRED_VARS  # HIP is reported as found only when all four variables below are set
+    HIP_ROOT_DIR
+    HIP_HIPCC_EXECUTABLE
+    HIP_HIPCONFIG_EXECUTABLE
+    HIP_PLATFORM
+    VERSION_VAR HIP_VERSION  # checked against the version requested in find_package(), if any
+    )
+
+###############################################################################
+# MACRO: HIP_FIND_HELPER_FILE(<name> <extension>) - locate FindHIP/<name>.<extension> next to this file and cache its path as HIP_<name>
+###############################################################################
+macro(HIP_FIND_HELPER_FILE _name _extension)
+    set(_hip_full_name "${_name}.${_extension}")
+    get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+    set(HIP_${_name} "${CMAKE_CURRENT_LIST_DIR}/FindHIP/${_hip_full_name}")
+    if(NOT EXISTS "${HIP_${_name}}")
+        set(error_message "${_hip_full_name} not found in ${CMAKE_CURRENT_LIST_DIR}/FindHIP")
+        if(HIP_FIND_REQUIRED)  # a missing helper is fatal only for find_package(HIP REQUIRED)
+            message(FATAL_ERROR "${error_message}")
+        else()
+            if(NOT HIP_FIND_QUIETLY)
+                message(STATUS "${error_message}")
+            endif()
+        endif()
+    endif()
+    # Set this variable as internal, so the user isn't bugged with it; the docstring names the helper file.
+    set(HIP_${_name} ${HIP_${_name}} CACHE INTERNAL "Location of ${_hip_full_name}" FORCE)
+endmacro()
+
+###############################################################################
+hip_find_helper_file(run_make2cmake cmake)  # caches HIP_run_make2cmake = <this dir>/FindHIP/run_make2cmake.cmake
+hip_find_helper_file(run_hipcc cmake)  # caches HIP_run_hipcc = <this dir>/FindHIP/run_hipcc.cmake
+###############################################################################
+
+###############################################################################
+# MACRO: HIP_RESET_FLAGS() - unset HIP_{HIPCC,HCC,NVCC}_FLAGS and their per-configuration variants
+###############################################################################
+macro(HIP_RESET_FLAGS)
+    unset(HIP_HIPCC_FLAGS)  # no CACHE keyword: only the normal variable is removed, a cache entry of the same name is left alone
+    unset(HIP_HCC_FLAGS)
+    unset(HIP_NVCC_FLAGS)
+    foreach(config ${_hip_configuration_types})
+        string(TOUPPER ${config} config_upper)  # per-configuration names use the upper-case configuration
+        unset(HIP_HIPCC_FLAGS_${config_upper})
+        unset(HIP_HCC_FLAGS_${config_upper})
+        unset(HIP_NVCC_FLAGS_${config_upper})
+    endforeach()
+endmacro()
+
+###############################################################################
+# MACRO: HIP_GET_SOURCES_AND_OPTIONS(<sources> <cmake_options> <hipcc_options> <hcc_options> <nvcc_options> args...) - sort args into the five named lists
+###############################################################################
+macro(HIP_GET_SOURCES_AND_OPTIONS _sources _cmake_options _hipcc_options _hcc_options _nvcc_options)
+    set(${_sources})  # start with all five output variables unset
+    set(${_cmake_options})
+    set(${_hipcc_options})
+    set(${_hcc_options})
+    set(${_nvcc_options})
+    set(_hipcc_found_options FALSE)  # at most one of these three flags is TRUE: it selects the list that receives the next args
+    set(_hcc_found_options FALSE)
+    set(_nvcc_found_options FALSE)
+    foreach(arg ${ARGN})
+        if("x${arg}" STREQUAL "xHIPCC_OPTIONS")  # section keyword: following args go to the hipcc list
+            set(_hipcc_found_options TRUE)
+            set(_hcc_found_options FALSE)
+            set(_nvcc_found_options FALSE)
+        elseif("x${arg}" STREQUAL "xHCC_OPTIONS")  # section keyword: following args go to the hcc list
+            set(_hipcc_found_options FALSE)
+            set(_hcc_found_options TRUE)
+            set(_nvcc_found_options FALSE)
+        elseif("x${arg}" STREQUAL "xNVCC_OPTIONS")  # section keyword: following args go to the nvcc list
+            set(_hipcc_found_options FALSE)
+            set(_hcc_found_options FALSE)
+            set(_nvcc_found_options TRUE)
+        elseif(
+                "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR
+                "x${arg}" STREQUAL "xSTATIC" OR
+                "x${arg}" STREQUAL "xSHARED" OR
+                "x${arg}" STREQUAL "xMODULE"
+                )
+            list(APPEND ${_cmake_options} ${arg})  # collected in the cmake-options list wherever they appear
+        else()
+            if(_hipcc_found_options)
+                list(APPEND ${_hipcc_options} ${arg})
+            elseif(_hcc_found_options)
+                list(APPEND ${_hcc_options} ${arg})
+            elseif(_nvcc_found_options)
+                list(APPEND ${_nvcc_options} ${arg})
+            else()
+                # No section keyword seen yet: assume this is a file
+                list(APPEND ${_sources} ${arg})
+            endif()
+        endif()
+    endforeach()
+endmacro()
+
+###############################################################################
+# MACRO: HIP_INCLUDE_DIRECTORIES(dir...) - append an include argument for each directory to HIP_HIPCC_INCLUDE_ARGS_USER
+###############################################################################
+set(HIP_HIPCC_INCLUDE_ARGS_USER "")
+macro(HIP_INCLUDE_DIRECTORIES)
+    foreach(dir ${ARGN})
+        list(APPEND HIP_HIPCC_INCLUDE_ARGS_USER $<$<BOOL:${dir}>:-I${dir}>)  # generator expression: yields -I<dir> only when <dir> evaluates as true
+    endforeach()
+endmacro()
+
+###############################################################################
+# FUNCTION: Helper to avoid clashes of files with the same basename but different paths
+###############################################################################
+function(HIP_COMPUTE_BUILD_PATH path build_path)
+    # Derive a relative, sanitised directory name from <path> and hand it back to the caller
+    # in the variable named by <build_path>.
+    # Start from CMake-style (forward slash) separators.
+    file(TO_CMAKE_PATH "${path}" _rel_dir)
+    if(IS_ABSOLUTE "${_rel_dir}")
+        # Paths that start with the current binary dir are taken relative to it, all others relative to the source dir
+        set(_anchor "${CMAKE_CURRENT_SOURCE_DIR}")
+        string(FIND "${_rel_dir}" "${CMAKE_CURRENT_BINARY_DIR}" _binary_prefix_at)
+        if(_binary_prefix_at EQUAL 0)
+            set(_anchor "${CMAKE_CURRENT_BINARY_DIR}")
+        endif()
+        file(RELATIVE_PATH _rel_dir "${_anchor}" "${_rel_dir}")
+    endif()
+
+    # Sanitise: drop leading '/', then replace ':' by '_', "../" by "__/" and ' ' by '_'
+    string(REGEX REPLACE "^[/]+" "" _rel_dir "${_rel_dir}")
+    string(REPLACE ":" "_" _rel_dir "${_rel_dir}")
+    string(REPLACE "../" "__/" _rel_dir "${_rel_dir}")
+    string(REPLACE " " "_" _rel_dir "${_rel_dir}")
+    # Keep only the directory part (the file name is stripped)
+    get_filename_component(_rel_dir "${_rel_dir}" PATH)
+
+    set(${build_path} "${_rel_dir}" PARENT_SCOPE)
+endfunction()
+
+###############################################################################
+# MACRO: HIP_PARSE_HIPCC_OPTIONS(<prefix> args...) - append args to <prefix>; after an upper-case configuration keyword, to <prefix>_<CONFIG> instead
+###############################################################################
+macro(HIP_PARSE_HIPCC_OPTIONS _option_prefix)
+    set(_hip_found_config)  # empty suffix: options go to <prefix> until a configuration keyword is seen
+    foreach(arg ${ARGN})
+        # Determine if we are dealing with a per-configuration flag, i.e. arg equals an upper-cased configuration name
+        foreach(config ${_hip_configuration_types})
+            string(TOUPPER ${config} config_upper)
+            if(arg STREQUAL "${config_upper}")
+                set(_hip_found_config _${arg})
+                # Clear arg to prevent it from being processed anymore (the keyword itself is not an option)
+                set(arg)
+            endif()
+        endforeach()
+        if(arg)  # NOTE(review): this also skips options whose text CMake treats as false (e.g. 0 or OFF) - confirm that is intended
+            list(APPEND ${_option_prefix}${_hip_found_config} "${arg}")
+        endif()
+    endforeach()
+endmacro()
+
+###############################################################################
+# MACRO: HIP_INCLUDE_HIPCC_DEPENDENCIES(<dependency_file>) - include the file (creating it if missing) and set HIP_HIPCC_DEPEND
+###############################################################################
+macro(HIP_INCLUDE_HIPCC_DEPENDENCIES dependency_file)
+    set(HIP_HIPCC_DEPEND)
+    set(HIP_HIPCC_DEPEND_REGENERATE FALSE)
+
+    # Create the dependency file (header comment only) if it doesn't exist
+    if(NOT EXISTS ${dependency_file})
+        file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. Do not edit.\n")
+    endif()
+    # Include the dependency file; NOTE(review): presumably a generated file sets HIP_HIPCC_DEPEND - confirm against its generator
+    include(${dependency_file})
+
+    # Verify the existence of all the included files
+    if(HIP_HIPCC_DEPEND)
+        foreach(f ${HIP_HIPCC_DEPEND})
+            if(NOT EXISTS ${f})
+                # If they aren't there, regenerate the file again
+                set(HIP_HIPCC_DEPEND_REGENERATE TRUE)
+            endif()
+        endforeach()
+    else()
+        # No dependencies, so regenerate the file
+        set(HIP_HIPCC_DEPEND_REGENERATE TRUE)
+    endif()
+
+    # Regenerate if needed: the file becomes its own only dependency and is reset to just the header comment
+    if(HIP_HIPCC_DEPEND_REGENERATE)
+        set(HIP_HIPCC_DEPEND ${dependency_file})
+        file(WRITE ${dependency_file} "# Generated by: FindHIP.cmake. Do not edit.\n")
+    endif()
+endmacro()
+
+###############################################################################
+# MACRO: Prepare cmake commands for the target
+###############################################################################
+macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files _source_files)
+    set(_hip_flags "")
+    string(TOUPPER "${CMAKE_BUILD_TYPE}" _hip_build_configuration)
+    if(HIP_HOST_COMPILATION_CPP)
+        set(HIP_C_OR_CXX CXX)
+    else()
+        set(HIP_C_OR_CXX C)
+    endif()
+    set(generated_extension ${CMAKE_${HIP_C_OR_CXX}_OUTPUT_EXTENSION})
+
+    # Initialize list of includes with those specified by the user. Append with
+    # ones specified to cmake directly.
+    set(HIP_HIPCC_INCLUDE_ARGS ${HIP_HIPCC_INCLUDE_ARGS_USER})
+
+    # Add the include directories
+    set(include_directories_generator "$<TARGET_PROPERTY:${_target},INCLUDE_DIRECTORIES>")
+    list(APPEND HIP_HIPCC_INCLUDE_ARGS "$<$<BOOL:${include_directories_generator}>:-I$<JOIN:${include_directories_generator}, -I>>")
+
+    get_directory_property(_hip_include_directories INCLUDE_DIRECTORIES)
+    list(REMOVE_DUPLICATES _hip_include_directories)
+    if(_hip_include_directories)
+        foreach(dir ${_hip_include_directories})
+            list(APPEND HIP_HIPCC_INCLUDE_ARGS $<$<BOOL:${dir}>:-I${dir}>)
+        endforeach()
+    endif()
+
+    HIP_GET_SOURCES_AND_OPTIONS(_hip_sources _hip_cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN})
+    HIP_PARSE_HIPCC_OPTIONS(HIP_HIPCC_FLAGS ${_hipcc_options})
+    HIP_PARSE_HIPCC_OPTIONS(HIP_HCC_FLAGS ${_hcc_options})
+    HIP_PARSE_HIPCC_OPTIONS(HIP_NVCC_FLAGS ${_nvcc_options})
+
+    # Add the compile definitions
+    set(compile_definition_generator "$<TARGET_PROPERTY:${_target},COMPILE_DEFINITIONS>")
+    list(APPEND HIP_HIPCC_FLAGS "$<$<BOOL:${compile_definition_generator}>:-D$<JOIN:${compile_definition_generator}, -D>>")
+
+    # Check if we are building shared library.
+    set(_hip_build_shared_libs FALSE)
+    list(FIND _hip_cmake_options SHARED _hip_found_SHARED)
+    list(FIND _hip_cmake_options MODULE _hip_found_MODULE)
+    if(_hip_found_SHARED GREATER -1 OR _hip_found_MODULE GREATER -1)
+        set(_hip_build_shared_libs TRUE)
+    endif()
+    list(FIND _hip_cmake_options STATIC _hip_found_STATIC)
+    if(_hip_found_STATIC GREATER -1)
+        set(_hip_build_shared_libs FALSE)
+    endif()
+
+    # If we are building a shared library, add extra flags to HIP_HIPCC_FLAGS
+    if(_hip_build_shared_libs)
+        list(APPEND HIP_HCC_FLAGS "-fPIC")
+        list(APPEND HIP_NVCC_FLAGS "--shared -Xcompiler '-fPIC'")
+    endif()
+
+    # Set host compiler
+    set(HIP_HOST_COMPILER "${CMAKE_${HIP_C_OR_CXX}_COMPILER}")
+
+    # Set compiler flags
+    set(_HIP_HOST_FLAGS "set(CMAKE_HOST_FLAGS ${CMAKE_${HIP_C_OR_CXX}_FLAGS})")
+    set(_HIP_HIPCC_FLAGS "set(HIP_HIPCC_FLAGS ${HIP_HIPCC_FLAGS})")
+    set(_HIP_HCC_FLAGS "set(HIP_HCC_FLAGS ${HIP_HCC_FLAGS})")
+    set(_HIP_NVCC_FLAGS "set(HIP_NVCC_FLAGS ${HIP_NVCC_FLAGS})")
+    foreach(config ${_hip_configuration_types})
+        string(TOUPPER ${config} config_upper)
+        set(_HIP_HOST_FLAGS "${_HIP_HOST_FLAGS}\nset(CMAKE_HOST_FLAGS_${config_upper} ${CMAKE_${HIP_C_OR_CXX}_FLAGS_${config_upper}})")
+        set(_HIP_HIPCC_FLAGS "${_HIP_HIPCC_FLAGS}\nset(HIP_HIPCC_FLAGS_${config_upper} ${HIP_HIPCC_FLAGS_${config_upper}})")
+        set(_HIP_HCC_FLAGS "${_HIP_HCC_FLAGS}\nset(HIP_HCC_FLAGS_${config_upper} ${HIP_HCC_FLAGS_${config_upper}})")
+        set(_HIP_NVCC_FLAGS "${_HIP_NVCC_FLAGS}\nset(HIP_NVCC_FLAGS_${config_upper} ${HIP_NVCC_FLAGS_${config_upper}})")
+    endforeach()
+
+    # Reset the output variable
+    set(_hip_generated_files "")
+    set(_hip_source_files "")
+
+    # Iterate over all arguments and create custom commands for all source files
+    foreach(file ${ARGN})
+        # Ignore any file marked as a HEADER_FILE_ONLY
+        get_source_file_property(_is_header ${file} HEADER_FILE_ONLY)
+        # Allow per source file overrides of the format. Also allows compiling non .cu files.
+        get_source_file_property(_hip_source_format ${file} HIP_SOURCE_PROPERTY_FORMAT)
+        if((${file} MATCHES "\\.cu$" OR _hip_source_format) AND NOT _is_header)
+            set(host_flag FALSE)
+        else()
+            set(host_flag TRUE)
+        endif()
+
+        if(NOT host_flag)
+            # Determine output directory
+            HIP_COMPUTE_BUILD_PATH("${file}" hip_build_path)
+            set(hip_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/${hip_build_path}")
+
+            get_filename_component(basename ${file} NAME)
+            set(generated_file_path "${hip_compile_output_dir}/${CMAKE_CFG_INTDIR}")
+            set(generated_file_basename "${_target}_generated_${basename}${generated_extension}")
+
+            # Set file names
+            set(generated_file "${generated_file_path}/${generated_file_basename}")
+            set(cmake_dependency_file "${hip_compile_output_dir}/${generated_file_basename}.depend")
+            set(custom_target_script_pregen "${hip_compile_output_dir}/${generated_file_basename}.cmake.pre-gen")
+            set(custom_target_script "${hip_compile_output_dir}/${generated_file_basename}.cmake")
+
+            # Set properties for object files
+            set_source_files_properties("${generated_file}"
+                PROPERTIES
+                EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked
+                )
+
+            # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path
+            get_filename_component(file_path "${file}" PATH)
+            if(IS_ABSOLUTE "${file_path}")
+                set(source_file "${file}")
+            else()
+                set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}")
+            endif()
+
+            # Bring in the dependencies
+            HIP_INCLUDE_HIPCC_DEPENDENCIES(${cmake_dependency_file})
+
+            # Configure the build script
+            configure_file("${HIP_run_hipcc}" "${custom_target_script_pregen}" @ONLY)
+            file(GENERATE
+                OUTPUT "${custom_target_script}"
+                INPUT "${custom_target_script_pregen}"
+                )
+            set(main_dep DEPENDS ${source_file})
+            if(CMAKE_GENERATOR MATCHES "Makefiles")
+                set(verbose_output "$(VERBOSE)")
+            elseif(HIP_VERBOSE_BUILD)
+                set(verbose_output ON)
+            else()
+                set(verbose_output OFF)
+            endif()
+
+            # Build the comment string
+            file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}")
+            set(hip_build_comment_string "Building HIPCC object ${generated_file_relative_path}")
+
+            # Build the generated file and dependency file
+            add_custom_command(
+                OUTPUT ${generated_file}
+                # These output files depend on the source_file and the contents of cmake_dependency_file
+                ${main_dep}
+                DEPENDS ${HIP_HIPCC_DEPEND}
+                DEPENDS ${custom_target_script}
+                # Make sure the output directory exists before trying to write to it.
+                COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}"
+                COMMAND ${CMAKE_COMMAND} ARGS
+                -D verbose:BOOL=${verbose_output}
+                -D build_configuration:STRING=${_hip_build_configuration}
+                -D "generated_file:STRING=${generated_file}"
+                -P "${custom_target_script}"
+                WORKING_DIRECTORY "${hip_compile_output_dir}"
+                COMMENT "${hip_build_comment_string}"
+                )
+
+            # Make sure the build system knows the file is generated
+            set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE)
+            list(APPEND _hip_generated_files ${generated_file})
+            list(APPEND _hip_source_files ${file})
+        endif()
+    endforeach()
+
+    # Set the return parameter
+    set(${_generated_files} ${_hip_generated_files})
+    set(${_source_files} ${_hip_source_files})
+endmacro()
+
+###############################################################################
+# HIP_ADD_EXECUTABLE
+###############################################################################
+macro(HIP_ADD_EXECUTABLE hip_target) # Adds executable target `hip_target`; ARGN carries the sources and options
+    # Split ARGN into sources and option groups, then set up the per-source hipcc build commands
+    HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN})
+    HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    if(_source_files) # these enter the target through _generated_files, so drop them from the plain source list
+        list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    if("x${HCC_HOME}" STREQUAL "x") # HCC_HOME not provided: derive it from the environment, else use the default path
+        if(NOT "x$ENV{ROCM_PATH}" STREQUAL "x") # test the environment value itself; `DEFINED $ENV{...}` looked up a CMake variable named after that value
+            set(HCC_HOME "$ENV{ROCM_PATH}/hcc")
+        elseif(NOT "x$ENV{HIP_PATH}" STREQUAL "x")
+            set(HCC_HOME "$ENV{HIP_PATH}/../hcc")
+        else()
+            set(HCC_HOME "/opt/rocm/hcc")
+        endif()
+    endif()
+    set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>") # link rule for LINKER_LANGUAGE HIP; HCC_HOME is passed as the helper's first argument
+    add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
+    set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE HIP)
+endmacro()
+
+###############################################################################
+# HIP_ADD_LIBRARY
+###############################################################################
+macro(HIP_ADD_LIBRARY hip_target) # Adds library target `hip_target`; ARGN carries the sources and options
+    # Separate the sources from the options (CMake options plus the hipcc, hcc and nvcc option groups)
+    HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN})
+    HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) # _cmake_options is forwarded as well, so a STATIC/SHARED/MODULE keyword is visible to that macro
+    if(_source_files)
+        list(REMOVE_ITEM _sources ${_source_files}) # drop the sources reported back in _source_files; they enter the target through _generated_files
+    endif()
+    add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources})
+    set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX}) # linker language comes from HIP_C_OR_CXX (set outside this macro), not the HIP link rule
+endmacro()
+
+# vim: ts=4:sw=4:expandtab:smartindent
diff --git a/cmake/modules/FindHIP/run_hipcc.cmake b/cmake/modules/FindHIP/run_hipcc.cmake
new file mode 100644
index 000000000..a02806c26
--- /dev/null
+++ b/cmake/modules/FindHIP/run_hipcc.cmake
@@ -0,0 +1,188 @@
+###############################################################################
+# Runs commands using HIPCC
+###############################################################################
+
+# Copyright (c) 2008-2020 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+###############################################################################
+# This file runs the hipcc commands to produce the desired output file
+# along with the dependency file needed by CMake to compute dependencies.
+#
+# Input variables:
+#
+# verbose:BOOL=<>               OFF: Be as quiet as possible (default); ON: Describe each step
+# build_configuration:STRING=<> Build configuration (any letter case). Defaults to Debug.
+# generated_file:STRING=<>      File to generate. Mandatory argument.
+
+if(NOT build_configuration)
+    set(build_configuration Debug)
+endif()
+string(TOUPPER "${build_configuration}" build_configuration) # the per-configuration flag variables read below are named with an upper-case suffix
+if(NOT generated_file)
+    message(FATAL_ERROR "You must specify generated_file on the command line")
+endif()
+
+# Set these up as variables to make reading the generated file easier
+set(HIP_HIPCC_EXECUTABLE "@HIP_HIPCC_EXECUTABLE@") # path
+set(HIP_HIPCONFIG_EXECUTABLE "@HIP_HIPCONFIG_EXECUTABLE@") #path
+set(HIP_HOST_COMPILER "@HIP_HOST_COMPILER@") # path
+set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path
+set(HIP_run_make2cmake "@HIP_run_make2cmake@") # path
+set(HCC_HOME "@HCC_HOME@") #path
+
+@_HIP_HOST_FLAGS@ # placeholder named after the _HIP_HOST_FLAGS variable assembled by HIP_PREPARE_TARGET_COMMANDS
+@_HIP_HIPCC_FLAGS@
+@_HIP_HCC_FLAGS@
+@_HIP_NVCC_FLAGS@
+set(HIP_HIPCC_INCLUDE_ARGS "@HIP_HIPCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly)
+
+set(cmake_dependency_file "@cmake_dependency_file@") # path
+set(source_file "@source_file@") # path
+set(host_flag "@host_flag@") # bool
+
+# Determine compiler and compiler flags
+execute_process(COMMAND "${HIP_HIPCONFIG_EXECUTABLE}" --platform OUTPUT_VARIABLE HIP_PLATFORM OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT host_flag)
+    set(__CC ${HIP_HIPCC_EXECUTABLE})
+    if(HIP_PLATFORM STREQUAL "hcc")
+        if(NOT "x${HCC_HOME}" STREQUAL "x")
+            set(ENV{HCC_HOME} "${HCC_HOME}") # exported to the environment of the processes started by this script
+        endif()
+        set(__CC_FLAGS ${HIP_HIPCC_FLAGS} ${HIP_HCC_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_HCC_FLAGS_${build_configuration}})
+    else()
+        set(__CC_FLAGS ${HIP_HIPCC_FLAGS} ${HIP_NVCC_FLAGS} ${HIP_HIPCC_FLAGS_${build_configuration}} ${HIP_NVCC_FLAGS_${build_configuration}})
+    endif()
+else()
+    set(__CC ${HIP_HOST_COMPILER})
+    set(__CC_FLAGS ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}})
+endif()
+set(__CC_INCLUDES ${HIP_HIPCC_INCLUDE_ARGS})
+
+# hip_execute_process(status COMMAND <program> [<args>...])
+#   status     - Text echoed before the command when `verbose` is true
+#   command    - Must be the literal keyword COMMAND, as in execute_process
+#   ARGN       - The program to run followed by its arguments
+#   HIP_result - Output: RESULT_VARIABLE of the executed command
+macro(hip_execute_process status command)
+    if(NOT "x${command}" STREQUAL "xCOMMAND")
+        message(FATAL_ERROR "Malformed call to hip_execute_process.  Missing COMMAND as second argument. (command = ${command})")
+    endif()
+    if(verbose)
+        execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status})
+        # Assemble a printable copy of the command line
+        set(hip_echo_words)
+        foreach(hip_word ${ARGN})
+            # Backslash-escape embedded double quotes
+            string(REPLACE "\"" "\\\"" hip_word ${hip_word})
+            string(FIND "${hip_word}" " " hip_space_pos)
+            if(hip_space_pos EQUAL -1)
+                list(APPEND hip_echo_words ${hip_word})
+            else()
+                # Words containing a space are shown wrapped in quotes
+                list(APPEND hip_echo_words "\"${hip_word}\"")
+            endif()
+        endforeach()
+        # Print the assembled command line
+        execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${hip_echo_words})
+    endif()
+    # Execute the real command; its result is stored in HIP_result
+    execute_process(COMMAND ${ARGN} RESULT_VARIABLE HIP_result)
+endmacro()
+
+# Intermediate dependency files: raw compiler output (.pre) and converted list (.tmp)
+set(hip_depend_pre "${cmake_dependency_file}.pre")
+set(hip_depend_tmp "${cmake_dependency_file}.tmp")
+
+# Aborts the script if the preceding hip_execute_process call reported a failure
+function(hip_abort_on_failure)
+    if(HIP_result)
+        message(FATAL_ERROR "Error generating ${generated_file}")
+    endif()
+endfunction()
+
+# Step 1: delete the target file (the result of this step is not checked)
+hip_execute_process(
+    "Removing ${generated_file}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+    )
+
+# Step 2: let the compiler write the dependency file (-M)
+hip_execute_process(
+    "Generating dependency file: ${hip_depend_pre}"
+    COMMAND "${__CC}"
+    -M
+    "${source_file}"
+    -o "${hip_depend_pre}"
+    ${__CC_FLAGS}
+    ${__CC_INCLUDES}
+    )
+hip_abort_on_failure()
+
+# Step 3: convert it into a cmake readable dependency file (written to a temp file)
+hip_execute_process(
+    "Generating temporary cmake readable file: ${hip_depend_tmp}"
+    COMMAND "${CMAKE_COMMAND}"
+    -D "input_file:FILEPATH=${hip_depend_pre}"
+    -D "output_file:FILEPATH=${hip_depend_tmp}"
+    -D "verbose=${verbose}"
+    -P "${HIP_run_make2cmake}"
+    )
+hip_abort_on_failure()
+
+# Step 4: copy the temp file over the real dependency file only if it differs
+hip_execute_process(
+    "Copy if different ${hip_depend_tmp} to ${cmake_dependency_file}"
+    COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${hip_depend_tmp}" "${cmake_dependency_file}"
+    )
+hip_abort_on_failure()
+
+# Step 5: remove both intermediate files
+hip_execute_process(
+    "Removing ${hip_depend_tmp} and ${hip_depend_pre}"
+    COMMAND "${CMAKE_COMMAND}" -E remove "${hip_depend_tmp}" "${hip_depend_pre}"
+    )
+hip_abort_on_failure()
+
+# Step 6: compile the source into the output file
+hip_execute_process(
+    "Generating ${generated_file}"
+    COMMAND "${__CC}"
+    -c
+    "${source_file}"
+    -o "${generated_file}"
+    ${__CC_FLAGS}
+    ${__CC_INCLUDES}
+    )
+
+# Final status: report success when verbose, otherwise clean up and fail
+if(NOT HIP_result)
+    if(verbose)
+        message("Generated ${generated_file} successfully.")
+    endif()
+else()
+    # Compilation failed: make sure that the output file is deleted
+    hip_execute_process(
+        "Removing ${generated_file}"
+        COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}"
+        )
+    message(FATAL_ERROR "Error generating file ${generated_file}")
+endif()
+# vim: ts=4:sw=4:expandtab:smartindent
diff --git a/cmake/modules/FindHIP/run_make2cmake.cmake b/cmake/modules/FindHIP/run_make2cmake.cmake
new file mode 100644
index 000000000..e7b179aa0
--- /dev/null
+++ b/cmake/modules/FindHIP/run_make2cmake.cmake
@@ -0,0 +1,70 @@
+###############################################################################
+# Computes dependencies using HIPCC
+###############################################################################
+
+# Copyright (c) 2008-2020 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+###############################################################################
+# This file converts dependency files generated using hipcc to a format that
+# cmake can understand.
+
+# Input variables:
+#
+# input_file:STRING=<> Dependency file to parse. Required argument
+# output_file:STRING=<> Output file to generate. Required argument
+
+if(NOT input_file OR NOT output_file)
+    message(FATAL_ERROR "You must specify input_file and output_file on the command line")
+endif()
+
+file(READ "${input_file}" depend_text)
+
+# Defined before the conditional so the list() calls further down always have a variable to work on
+set(dependency_list "")
+set(hip_hipcc_depend "")
+if (NOT "${depend_text}" STREQUAL "")
+    # Start a new line at every " /", remove the leading "target:" part, then make each line (minus trailing " \") a list item
+    string(REPLACE " /" "\n/" depend_text "${depend_text}")
+    string(REGEX REPLACE "^.*:" "" depend_text "${depend_text}")
+    string(REGEX REPLACE "[ \\\\]*\n" ";" depend_text "${depend_text}")
+
+    foreach(file ${depend_text})
+        string(REGEX REPLACE "^ +" "" file "${file}")
+        if(NOT EXISTS "${file}")
+            # Missing files are skipped outright instead of being blanked and still handed to the ABSOLUTE conversion
+            message(WARNING " Removing non-existent dependency file: ${file}")
+        elseif(NOT IS_DIRECTORY "${file}")
+            get_filename_component(file_absolute "${file}" ABSOLUTE)
+            list(APPEND dependency_list "${file_absolute}")
+        endif()
+    endforeach()
+endif()
+
+# Remove the duplicate entries and sort them.
+list(REMOVE_DUPLICATES dependency_list)
+list(SORT dependency_list)
+
+foreach(file ${dependency_list})
+    string(APPEND hip_hipcc_depend " \"${file}\"\n")
+endforeach()
+
+file(WRITE "${output_file}" "# Generated by: FindHIP.cmake. Do not edit.\nSET(HIP_HIPCC_DEPEND\n ${hip_hipcc_depend})\n\n")
+# vim: ts=4:sw=4:expandtab:smartindent
diff --git a/cmake/modules/FindHIPLIBS.cmake b/cmake/modules/FindHIPLIBS.cmake
new file mode 100644
index 000000000..1f0a90a5c
--- /dev/null
+++ b/cmake/modules/FindHIPLIBS.cmake
@@ -0,0 +1,92 @@
+#  Copyright (c) 2019 ETH Zurich
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#
+#  1. Redistributions of source code must retain the above copyright notice,
+#     this list of conditions and the following disclaimer.
+#  2. Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#  3. Neither the name of the copyright holder nor the names of its contributors
+#     may be used to endorse or promote products derived from this software
+#     without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+#  POSSIBILITY OF SUCH DAMAGE.
+
+
+#.rst:
+# FindHIPLIBS
+# -----------
+#
+# This module searches for the HIP runtime library (hip_hcc) and the HIP and HSA headers.
+#
+# The following variables are set
+#
+# ::
+#
+#   HIPLIBS_FOUND           - True if hiplibs is found
+#   HIPLIBS_LIBRARIES       - The required libraries
+#   HIPLIBS_INCLUDE_DIRS    - The required include directory
+#
+# The following import target is created
+#
+# ::
+#
+#   HIPLIBS::hiplibs
+
+# Set paths to look for the library from the ROOT variables. If policy CMP0074 exists, the find commands use HIPLIBS_ROOT automatically.
+if(NOT POLICY CMP0074)
+    set(_HIPLIBS_PATHS ${HIPLIBS_ROOT} $ENV{HIPLIBS_ROOT})
+endif()
+
+if(NOT _HIPLIBS_PATHS) # no explicit hint available: fall back to /opt/rocm
+    set(_HIPLIBS_PATHS /opt/rocm)
+endif()
+
+find_path(
+    HIPLIBS_INCLUDE_DIRS
+    NAMES "hip/hip_runtime_api.h"
+    HINTS ${_HIPLIBS_PATHS}
+    PATH_SUFFIXES "hip/include" "include"
+)
+find_library(
+    HIPLIBS_LIBRARIES
+    NAMES "hip_hcc"
+    HINTS ${_HIPLIBS_PATHS} # was _ROCBLAS_PATHS, a variable this module never sets
+    PATH_SUFFIXES "hip/lib" "lib" "lib64"
+)
+find_path(
+    HSA_INCLUDE_DIRS
+    NAMES "hsa/hsa.h"
+    HINTS ${_HIPLIBS_PATHS}
+    PATH_SUFFIXES "hip/include" "include"
+)
+
+# check if found
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(HIPLIBS REQUIRED_VARS HIPLIBS_INCLUDE_DIRS HSA_INCLUDE_DIRS HIPLIBS_LIBRARIES)
+
+list(APPEND HIPLIBS_INCLUDE_DIRS ${HSA_INCLUDE_DIRS}) # HIPLIBS_INCLUDE_DIRS now carries both the HIP and the HSA header directories
+
+# add target to link against
+if(HIPLIBS_FOUND)
+    if(NOT TARGET HIPLIBS::hiplibs)
+        add_library(HIPLIBS::hiplibs INTERFACE IMPORTED)
+    endif()
+    set_property(TARGET HIPLIBS::hiplibs PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${HIPLIBS_INCLUDE_DIRS})
+    set_property(TARGET HIPLIBS::hiplibs PROPERTY INTERFACE_LINK_LIBRARIES ${HIPLIBS_LIBRARIES})
+endif()
+
+# prevent clutter in cache (includes the HSA header path found above)
+mark_as_advanced(HIPLIBS_FOUND HIPLIBS_LIBRARIES HIPLIBS_INCLUDE_DIRS HSA_INCLUDE_DIRS)
diff --git a/cmake/modules/FindROCBLAS.cmake b/cmake/modules/FindROCBLAS.cmake
new file mode 100644
index 000000000..6b6b6d498
--- /dev/null
+++ b/cmake/modules/FindROCBLAS.cmake
@@ -0,0 +1,84 @@
+#  Copyright (c) 2019 ETH Zurich
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions are met:
+#
+#  1. Redistributions of source code must retain the above copyright notice,
+#     this list of conditions and the following disclaimer.
+#  2. Redistributions in binary form must reproduce the above copyright
+#     notice, this list of conditions and the following disclaimer in the
+#     documentation and/or other materials provided with the distribution.
+#  3. Neither the name of the copyright holder nor the names of its contributors
+#     may be used to endorse or promote products derived from this software
+#     without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+#  POSSIBILITY OF SUCH DAMAGE.
+
+
+#.rst:
+# FindROCBLAS
+# -----------
+#
+# This module searches for the rocBLAS library.
+#
+# The following variables are set
+#
+# ::
+#
+#   ROCBLAS_FOUND           - True if rocblas is found
+#   ROCBLAS_LIBRARIES       - The required libraries
+#   ROCBLAS_INCLUDE_DIRS    - The required include directory
+#
+# The following import target is created
+#
+# ::
+#
+#   ROCBLAS::rocblas
+
+# Set paths to look for the library from the ROOT variables. If policy CMP0074 exists, the find commands use ROCBLAS_ROOT automatically.
+if(NOT POLICY CMP0074)
+    set(_ROCBLAS_PATHS ${ROCBLAS_ROOT} $ENV{ROCBLAS_ROOT})
+endif()
+
+if(NOT _ROCBLAS_PATHS) # no explicit hint available: fall back to /opt/rocm
+    set(_ROCBLAS_PATHS /opt/rocm)
+endif()
+
+find_library(
+    ROCBLAS_LIBRARIES
+    NAMES "rocblas"
+    HINTS ${_ROCBLAS_PATHS}
+    PATH_SUFFIXES "rocblas/lib" "rocblas" "lib" "lib64" # flat lib/lib64 added as fallbacks, matching the flat "include" accepted for the header
+)
+find_path(
+    ROCBLAS_INCLUDE_DIRS
+    NAMES "rocblas.h"
+    HINTS ${_ROCBLAS_PATHS}
+    PATH_SUFFIXES "rocblas/include" "include"
+)
+
+# check if found
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ROCBLAS REQUIRED_VARS ROCBLAS_INCLUDE_DIRS ROCBLAS_LIBRARIES )
+
+# add target to link against
+if(ROCBLAS_FOUND)
+    if(NOT TARGET ROCBLAS::rocblas) # create the imported target only once; properties are (re)set on every run
+        add_library(ROCBLAS::rocblas INTERFACE IMPORTED)
+    endif()
+    set_property(TARGET ROCBLAS::rocblas PROPERTY INTERFACE_LINK_LIBRARIES ${ROCBLAS_LIBRARIES})
+    set_property(TARGET ROCBLAS::rocblas PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${ROCBLAS_INCLUDE_DIRS})
+endif()
+
+# prevent clutter in cache
+mark_as_advanced(ROCBLAS_FOUND ROCBLAS_LIBRARIES ROCBLAS_INCLUDE_DIRS)
diff --git a/cmake/modules/FindROCM.cmake b/cmake/modules/FindROCM.cmake
deleted file mode 100644
index f8d2f6b6b..000000000
--- a/cmake/modules/FindROCM.cmake
+++ /dev/null
@@ -1,440 +0,0 @@
-# - Find the ROCM library
-#
-# Usage:
-#   find_package(ROCM [REQUIRED] [QUIET] COMPONENTS [components ...] )
-#
-# Compnents available:
-#  - hipblas
-#  - hipsparse
-#  - rocfft
-#  - rocblas
-#  - rocsparse
-#
-# Commands made available:
-#   rocm_hip_add_library(<name> <sources> [STATIC | SHARED] [FLAGS] <flags> [OUTPUT_DIR] <dir> [INCLUDE_DIRS] <dirs ...>)
-#    --- Compiles source files into an imported library with hipcc. No global defitions or include directories are taken into account.
-#
-# The following variables can be set for compilation:
-#   ROCM_HIPCC_FLAGS ----------------- Flags passed on to hipcc compiler
-#   ROCM_HIPCC_FLAGS_DEBUG ----------- Flags passed on to hipcc compiler in DEBUG mode
-#   ROCM_HIPCC_FLAGS_RELEASE --------- Flags passed on to hipcc compiler in RELEASE mode
-#   ROCM_HIPCC_FLAGS_RELWITHDEBINFO -- Flags passed on to hipcc compiler in RELWITHDEBINFO mode
-#   ROCM_HIPCC_FLAGS_MINSIZEREL ------ Flags passed on to hipcc compiler in MINSIZEREL mode
-#
-# The following variables can be set to specify a search location
-#   ROCM_ROOT ------------ if set, the libraries are exclusively searched under this path
-#   <COMPONENT>_ROOT ------ if set, search for component specific libraries at given path. Takes precedence over ROCM_ROOT
-#
-# The following variables are generated:
-#   ROCM_FOUND ------------------- true if ROCM is found on the system
-#   ROCM_LIBRARIES --------------- full path to ROCM
-#   ROCM_INCLUDE_DIRS ------------ ROCM include directories
-#   ROCM_DEFINITIONS ------------- ROCM definitions
-#   ROCM_HCC_EXECUTABLE ---------- ROCM HCC compiler
-#   ROCM_HCC-CONFIG_EXECUTABLE --- ROCM HCC config
-#   ROCM_HIPCC_EXECUTABLE -------- HIPCC compiler
-#   ROCM_HIPCONFIG_EXECUTABLE ---- hip config
-#   ROCM_HIPIFY-PERL_EXECUTABLE -- hipify
-#   ROCM_HIP_PLATFORM ------------ Platform identifier: "hcc" or "nvcc"
-#
-
-
-set(ROCM_HIPCC_FLAGS "" CACHE STRING "Flags for HIPCC Compiler")
-set(ROCM_HIPCC_FLAGS_DEBUG "-g" CACHE STRING "Debug flags for HIPCC Compiler")
-set(ROCM_HIPCC_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "Release flags for HIPCC Compiler")
-set(ROCM_HIPCC_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG" CACHE STRING "Release with debug flags for HIPCC Compiler")
-set(ROCM_HIPCC_FLAGS_MINSIZEREL "-Os -DNDEBUG" CACHE STRING "Minimum size flags for HIPCC Compiler")
-
-#If environment variable ROCM_ROOT is specified
-if(NOT ROCM_ROOT AND ENV{ROCM_ROOT})
-    file(TO_CMAKE_PATH "$ENV{ROCM_ROOT}" ROCM_ROOT)
-    set(ROCM_ROOT "${ROCM_ROOT}" CACHE PATH "Root directory for ROCM installation.")
-endif()
-
-set(ROCM_FOUND FALSE)
-set(ROCM_LIBRARIES)
-set(ROCM_INCLUDE_DIRS)
-set(ROCM_DEFINITIONS)
-unset(ROCM_HCC_EXECUTABLE)
-unset(ROCM_HCC-CONFIG_EXECUTABLE)
-unset(ROCM_HIPCC_EXECUTABLE)
-unset(ROCM_HIPCONFIG_EXECUTABLE)
-unset(ROCM_HIPFIY-PERL-EXECUTABLE)
-unset(ROCM_HIP_PLATFORM)
-
-include(FindPackageHandleStandardArgs)
-
-
-# Finds libraries and include path for rocm modules
-# IN:
-#   - module_name: name of a module (e.g. hcc)
-#   - following arguments: name of libraries required
-# OUT:
-#   - ROCM_LIBRARIES: Appends to list of libraries
-#   - ROCM_INCLUDE_DIRS: Appends to include dirs
-function(find_rcm_module module_name)
-    # convert module name to upper case for consistent variable naming
-    string(TOUPPER ${module_name} MODULE_NAME_UPPER)
-
-
-    if(DEFINED ${MODULE_NAME_UPPER}_ROOT)
-	set(ROOT_DIR ${${MODULE_NAME_UPPER}_ROOT})
-    elseif(DEFINED ROCM_ROOT)
-	set(ROOT_DIR ${ROCM_ROOT})
-    endif()
-
-    # get abosolute path to avoid issues with tilde
-    if(ROOT_DIR)
-        get_filename_component(ROOT_DIR ${ROOT_DIR} ABSOLUTE)
-    endif()
-
-    # remove module name from input arguments
-    set(LIBRARY_NAMES ${ARGV})
-    list(REMOVE_AT LIBRARY_NAMES 0)
-
-    if(${ROCM_FIND_REQUIRED})
-	set(ROCM_${MODULE_NAME_UPPER}_FIND_REQUIRED TRUE)
-    else()
-	set(ROCM_${MODULE_NAME_UPPER}_FIND_REQUIRED FALSE)
-    endif()
-    if(${ROCM_FIND_QUIETLY})
-	set(ROCM_${MODULE_NAME_UPPER}_FIND_QUIETLY TRUE)
-    else()
-	set(ROCM_${MODULE_NAME_UPPER}_FIND_QUIETLY FALSE)
-    endif()
-
-    set(ROCM_LIBRARIES_${MODULE_NAME_UPPER})
-
-    if(ROOT_DIR)
-        # find libraries
-        foreach(library_name IN LISTS LIBRARY_NAMES)
-            find_library(
-                ROCM_LIBRARIES_${library_name}
-                NAMES ${library_name}
-                PATHS ${ROOT_DIR}
-                PATH_SUFFIXES "lib" "${module_name}/lib"
-                NO_DEFAULT_PATH
-            )
-	    find_package_handle_standard_args(ROCM_${MODULE_NAME_UPPER} FAIL_MESSAGE
-                "For ROCM module ${module_name}, library ${library_name} could not be found. Please specify ROCM_ROOT or ${MODULE_NAME_UPPER}_ROOT." 
-                REQUIRED_VARS ROCM_LIBRARIES_${library_name})
-	    if(ROCM_LIBRARIES_${library_name})
-		list(APPEND ROCM_LIBRARIES_${MODULE_NAME_UPPER} ${ROCM_LIBRARIES_${library_name}})
-		mark_as_advanced(ROCM_LIBRARIES_${library_name})
-	    endif()
-        endforeach()
-
-        # find include directory
-        find_path(
-            ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}
-            NAMES ${module_name}/include
-	    PATHS ${ROOT_DIR} ${ROOT_DIR}/..
-            NO_DEFAULT_PATH
-        )
-        # set include directory for module if found
-        if(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER})
-            set(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER} ${ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}}/${module_name}/include)
-        endif()
-
-    else()
-
-        foreach(library_name IN LISTS LIBRARY_NAMES)
-            find_library(
-                ROCM_LIBRARIES_${library_name}
-                NAMES ${library_name}
-                PATHS /opt/rocm
-                PATH_SUFFIXES "lib" "lib64" "${module_name}/lib" "rocm/${module_name}/lib"
-            )
-	    find_package_handle_standard_args(ROCM_${MODULE_NAME_UPPER} FAIL_MESSAGE
-                "For ROCM module ${module_name}, library ${library_name} could not be found. Please specify ROCM_ROOT or ${MODULE_NAME_UPPER}_ROOT." 
-                REQUIRED_VARS ROCM_LIBRARIES_${library_name})
-	    if(ROCM_LIBRARIES_${library_name})
-		list(APPEND ROCM_LIBRARIES_${MODULE_NAME_UPPER} ${ROCM_LIBRARIES_${library_name}})
-		mark_as_advanced(ROCM_LIBRARIES_${library_name})
-	    endif()
-        endforeach()
-
-        # find include directory
-        find_path(
-            ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}
-            NAMES ${module_name}/include
-            PATHS /opt/rocm/
-        )
-        # set include directory for module if found
-        if(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER})
-            set(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER} ${ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}}/${module_name}/include)
-        endif()
-    endif()
-
-
-    # check if all required parts found
-    find_package_handle_standard_args(ROCM_${MODULE_NAME_UPPER} FAIL_MESSAGE
-        "ROCM module ${module_name} could not be found. Please specify ROCM_ROOT or ${MODULE_NAME_UPPER}_ROOT." 
-        REQUIRED_VARS ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER})
-    if(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER})
-	mark_as_advanced(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER})
-    endif()
-
-    # set global variables
-    if(ROCM_LIBRARIES_${MODULE_NAME_UPPER})
-        set(ROCM_LIBRARIES ${ROCM_LIBRARIES} ${ROCM_LIBRARIES_${MODULE_NAME_UPPER}} PARENT_SCOPE)
-    endif()
-    if(ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER})
-        set(ROCM_INCLUDE_DIRS ${ROCM_INCLUDE_DIRS} ${ROCM_INCLUDE_DIRS_${MODULE_NAME_UPPER}} PARENT_SCOPE)
-    endif()
-
-endfunction()
-
-
-# Finds executables of rocm modules
-# IN:
-#   - module_name: name of a module (e.g. hcc)
-#   - executable_name: name of the executable (e.g. hcc)
-# OUT:
-#   - ROCM_${executable_name}_EXECUTABLE: Path to executable
-function(find_rocm_executable module_name executable_name)
-    string(TOUPPER ${module_name} MODULE_NAME_UPPER)
-    string(TOUPPER ${executable_name} EXECUTABLE_NAME_UPPER)
-    unset(ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE PARENT_SCOPE)
-
-    if(DEFINED ${MODULE_NAME_UPPER}_ROOT)
-	set(ROOT_DIR ${${MODULE_NAME_UPPER}_ROOT})
-    elseif(DEFINED ROCM_ROOT)
-	set(ROOT_DIR ${ROCM_ROOT})
-    endif()
-
-    # get abosolute path to avoid issues with tilde
-    if(ROOT_DIR)
-        get_filename_component(ROOT_DIR ${ROOT_DIR} ABSOLUTE)
-    endif()
-
-    if(ROOT_DIR)
-            find_file(
-                ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE
-                NAMES ${executable_name}
-		PATHS ${ROOT_DIR}
-		PATH_SUFFIXES "bin" "${module_name}/bin"
-                NO_DEFAULT_PATH
-            )
-    else()
-            find_file(
-                ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE
-                NAMES ${executable_name}
-                PATHS "/opt/rocm"
-		PATH_SUFFIXES "bin" "${module_name}/bin"
-            )
-    endif()
-    set(ROCM_${EXECUTABLE_NAME_UPPER} ROCM_${EXECUTABLE_NAME_UPPER} PARENT_SCOPE)
-
-    if(${ROCM_FIND_REQUIRED})
-	set(ROCM_${MODULE_NAME_UPPER}_${EXECUTABLE_NAME_UPPER}_FIND_REQUIRED TRUE)
-    else()
-	set(ROCM_${MODULE_NAME_UPPER}_${EXECUTABLE_NAME_UPPER}_FIND_REQUIRED FALSE)
-    endif()
-    if(${ROCM_FIND_QUIETLY})
-	set(ROCM_${MODULE_NAME_UPPER}_${EXECUTABLE_NAME_UPPER}_FIND_QUIETLY TRUE)
-    else()
-	set(ROCM_${MODULE_NAME_UPPER}_${EXECUTABLE_NAME_UPPER}_FIND_QUIETLY FALSE)
-    endif()
-    find_package_handle_standard_args(ROCM FAIL_MESSAGE
-	"ROCM_${MODULE_NAME_UPPER}_${EXECUTABLE_NAME_UPPER} ${executable_name} executable could not be found. Please specify ROCM_ROOT or ${MODULE_NAME_UPPER}_ROOT."
-        REQUIRED_VARS ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE)
-    if(ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE)
-	set(ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE ${ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE} PARENT_SCOPE)
-	mark_as_advanced(ROCM_${EXECUTABLE_NAME_UPPER}_EXECUTABLE)
-    endif()
-endfunction()
-
-
-
-# find compilers
-find_rocm_executable(hcc hcc)
-find_rocm_executable(hip hipcc)
-
-if(ROCM_HIPCC_EXECUTABLE AND ROCM_HCC_EXECUTABLE)
-    set(ROCM_FOUND TRUE)
-else()
-    set(ROCM_FOUND FALSE)
-    return()
-endif()
-
-
-# find other executables and libraries
-find_rocm_executable(hcc hcc-config)
-find_rocm_executable(hip hipconfig)
-find_rocm_executable(hip hipify-perl)
-find_rcm_module(hcc LTO OptRemarks mcwamp mcwamp_cpu mcwamp_hsa hc_am)
-find_rcm_module(hip hip_hcc)
-find_rcm_module(rocm hsa-runtime64)
-
-
-# parse hip config
-execute_process(COMMAND ${ROCM_HIPCONFIG_EXECUTABLE} -P OUTPUT_VARIABLE ROCM_HIP_PLATFORM RESULT_VARIABLE RESULT_VALUE)
-if(NOT ${RESULT_VALUE} EQUAL 0)
-    message(FATAL_ERROR "Error parsing platform identifier from hipconfig! Code: ${RESULT_VALUE}")
-endif()
-if(NOT ROCM_HIP_PLATFORM)
-    message(FATAL_ERROR "Empty platform identifier from hipconfig!")
-endif()
-
-# set definitions
-if("${ROCM_HIP_PLATFORM}" STREQUAL "hcc")
-    set(ROCM_DEFINITIONS -D__HIP_PLATFORM_HCC__)
-elseif("${ROCM_HIP_PLATFORM}" STREQUAL "nvcc")
-    set(ROCM_DEFINITIONS -D__HIP_PLATFORM_NVCC__)
-else()
-    message(FATAL_ERROR "Could not parse platform identifier from hipconfig! Value: ${ROCM_HIP_PLATFORM}")
-endif()
-
-# find libraries for each specified components
-foreach(module_name IN LISTS ROCM_FIND_COMPONENTS)
-    # set required libaries for each module
-    if("${module_name}" STREQUAL "hipblas")
-        find_rcm_module(hipblas hipblas)
-    elseif("${module_name}" STREQUAL "hipsparse")
-        find_rcm_module(hipsparse hipsparse)
-    elseif("${module_name}" STREQUAL "rocblas")
-        find_rcm_module(rocblas rocblas)
-    elseif("${module_name}" STREQUAL "rocsparse")
-        find_rcm_module(rocsparse rocsparse)
-    elseif("${module_name}" STREQUAL "rocfft")
-        find_rcm_module(rocfft rocfft rocfft-device)
-    else()
-        message(FATAL_ERROR "Unrecognized component \"${module_name}\" in FindROCM module!")
-    endif()
-endforeach()
-
-
-# Generates library compiled with hipcc
-# Usage:
-#   rocm_hip_add_library(<name> <sources> [STATIC | SHARED] [FLAGS] <flags> [OUTPUT_DIR] <dir> [INCLUDE_DIRS] <dirs ...>)
-macro(rocm_hip_add_library)
-    cmake_parse_arguments(
-        HIP_LIB
-        "SHARED;STATIC"
-        "OUTPUT_DIR"
-        "FLAGS;INCLUDE_DIRS"
-        ${ARGN}
-    )
-    # allow either STATIC or SHARED
-    if(HIP_LIB_SHARED AND HIP_LIB_STATIC)
-        message(FATAL_ERROR "rocm_hip_add_library: library cannot by both static and shared!")
-    endif()
-
-    # default to SHARED
-    if(NOT (HIP_LIB_SHARED OR HIP_LIB_STATIC))
-        set(HIP_LIB_SHARED TRUE)
-    endif()
-
-    # default to current binary output directory
-    if(NOT HIP_LIB_OUTPUT_DIR)
-	set(HIP_LIB_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
-    endif()
-
-    # parse positional arguments
-    list(LENGTH HIP_LIB_UNPARSED_ARGUMENTS NARGS)
-    if(${NARGS} LESS 2)
-        message(FATAL_ERROR "rocm_hip_add_library: Not enough arguments!")
-    endif()
-    list(GET HIP_LIB_UNPARSED_ARGUMENTS 0 HIP_LIB_NAME)
-    list(REMOVE_AT HIP_LIB_UNPARSED_ARGUMENTS 0)
-    set(HIP_LIB_SOURCES ${HIP_LIB_UNPARSED_ARGUMENTS})
-
-    # generate include flags
-    set(_ROCM_FULL_PATH_INCLUDE_FLAGS)
-    foreach(_rocm_iternal_dir IN LISTS HIP_LIB_INCLUDE_DIRS)
-	if(NOT IS_ABSOLUTE ${_rocm_iternal_dir})
-	    get_filename_component(_rocm_iternal_dir ${_rocm_iternal_dir} ABSOLUTE)
-	endif()
-	list(APPEND _ROCM_FULL_PATH_INCLUDE_FLAGS -I${_rocm_iternal_dir})
-    endforeach()
-
-    # generate full path to source files
-    unset(_ROCM_SOURCES)
-    foreach(source IN LISTS HIP_LIB_SOURCES)
-	if(NOT IS_ABSOLUTE ${source})
-	    get_filename_component(source ${source} ABSOLUTE)
-	endif()
-	set(_ROCM_SOURCES ${_ROCM_SOURCES} ${source})
-    endforeach()
-    get_filename_component(HIP_LIB_OUTPUT_DIR ${HIP_LIB_OUTPUT_DIR} ABSOLUTE)
-
-    # generate flags to use
-    set(_ROCM_STD_FLAGS ${HIP_LIB_FLAGS} ${ROCM_HIPCC_FLAGS})
-    list(FILTER _ROCM_STD_FLAGS INCLUDE REGEX -std=)
-    set(_ROCM_FLAGS ${HIP_LIB_FLAGS})
-    if(CMAKE_CXX_STANDARD AND NOT _ROCM_STD_FLAGS)
-	list(APPEND _ROCM_FLAGS -std=c++${CMAKE_CXX_STANDARD})
-    endif()
-    if(CMAKE_BUILD_TYPE)
-	string(TOUPPER ${CMAKE_BUILD_TYPE} _ROCM_BUILD_TYPE_UPPER)
-	list(APPEND _ROCM_FLAGS ${ROCM_HIPCC_FLAGS_${_ROCM_BUILD_TYPE_UPPER}})
-    endif()
-
-    if(NOT ROCM_HIPCC_EXECUTABLE)
-	    message(FATAL_ERROR "HIPCC executable not found!")
-    endif()
-
-    # create imported shared library
-    if(HIP_LIB_SHARED)
-        set(_ROCM_FLAGS ${_ROCM_FLAGS} -fPIC)
-    endif()
-
-    # compile all files to .o
-    set(_ROCM_OBJS)
-    set(_ROCM_OBJ_TARGETS)
-    foreach(_rocm_file IN LISTS _ROCM_SOURCES)
-
-	# create output directory for .o file
-	get_filename_component(_ROCM_CURRENT_DIR ${_rocm_file} DIRECTORY)
-	file(RELATIVE_PATH _ROCM_CURRENT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ${_ROCM_CURRENT_DIR})
-	set(_ROCM_OBJ_OUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${HIP_LIB_NAME}.dir/${_ROCM_CURRENT_DIR}")
-	file(MAKE_DIRECTORY ${_ROCM_OBJ_OUT_DIR})
-
-	# set .o name and path
-	get_filename_component(_ROCM_FILE_NAME_ONLY ${_rocm_file} NAME)
-	set(_ROCM_OBJ_FILE ${_ROCM_OBJ_OUT_DIR}/${_ROCM_FILE_NAME_ONLY}.o)
-	list(APPEND _ROCM_OBJS ${_ROCM_OBJ_FILE})
-	list(APPEND _ROCM_OBJ_TARGETS HIP_TARGET_${_ROCM_FILE_NAME_ONLY})
-
-	# compile .o file
-	add_custom_target(HIP_TARGET_${_ROCM_FILE_NAME_ONLY} COMMAND ${ROCM_HIPCC_EXECUTABLE} -c ${_rocm_file} -o ${_ROCM_OBJ_FILE} ${_ROCM_FLAGS} ${_ROCM_FULL_PATH_INCLUDE_FLAGS}
-	    WORKING_DIRECTORY ${_ROCM_OBJ_OUT_DIR} SOURCES ${_rocm_file})
-
-    endforeach()
-
-    # compile shared library
-    if(HIP_LIB_SHARED)
-	add_custom_target(HIP_TARGET_${HIP_LIB_NAME} COMMAND ${ROCM_HIPCC_EXECUTABLE} ${_ROCM_OBJS} -fPIC --shared -o ${HIP_LIB_OUTPUT_DIR}/lib${HIP_LIB_NAME}.so
-	    ${_ROCM_FLAGS} ${_ROCM_FULL_PATH_INCLUDE_FLAGS}
-	    WORKING_DIRECTORY ${HIP_LIB_OUTPUT_DIR})
-
-	add_library(${HIP_LIB_NAME} INTERFACE)
-	target_link_libraries(${HIP_LIB_NAME} INTERFACE ${HIP_LIB_OUTPUT_DIR}/lib${HIP_LIB_NAME}.so)
-
-	# add depencies
-	add_dependencies(${HIP_LIB_NAME} HIP_TARGET_${HIP_LIB_NAME})
-	foreach(_rocm_target IN LISTS _ROCM_OBJ_TARGETS)
-	    add_dependencies(HIP_TARGET_${HIP_LIB_NAME} ${_rocm_target})
-	endforeach()
-    endif()
-
-    # static library
-    if(HIP_LIB_STATIC)
-        # create library from object files
-        add_library(${HIP_LIB_NAME} ${_ROCM_OBJS})
-        set_target_properties(${HIP_LIB_NAME} PROPERTIES LINKER_LANGUAGE CXX)
-        set_source_files_properties(
-            ${_ROCM_OBJS}
-            PROPERTIES
-            EXTERNAL_OBJECT true
-            GENERATED true
-            )
-	# add dependencies
-	foreach(_rocm_target IN LISTS _ROCM_OBJ_TARGETS)
-	    add_dependencies(${HIP_LIB_NAME} ${_rocm_target})
-	endforeach()
-    endif()
-
-endmacro()
-
diff --git a/cmake/modules/Findmpi4py.cmake b/cmake/modules/Findmpi4py.cmake
index f47001c57..485b5ec88 100644
--- a/cmake/modules/Findmpi4py.cmake
+++ b/cmake/modules/Findmpi4py.cmake
@@ -17,10 +17,10 @@ if(NOT exit_code EQUAL "0")
 endif()
 
 
-find_package_handle_standard_args(MPI4PY DEFAULT_MSG MPI4PY_INCLUDE_DIR)
-mark_as_advanced(MPI4PY_FOUND MPI4PY_INCLUDE_DIR)
+find_package_handle_standard_args(mpi4py DEFAULT_MSG MPI4PY_INCLUDE_DIR)
+mark_as_advanced(mpi4py_FOUND MPI4PY_INCLUDE_DIR)
 
-if(MPI4PY_FOUND AND NOT TARGET mpi4py::mpi4py)
+if(mpi4py_FOUND AND NOT TARGET mpi4py::mpi4py)
   add_library(mpi4py::mpi4py INTERFACE IMPORTED)
   set_target_properties(mpi4py::mpi4py PROPERTIES
                                        INTERFACE_INCLUDE_DIRECTORIES "${MPI4PY_INCLUDE_DIR}")
diff --git a/doc/doxygen.cfg b/doc/doxygen.cfg
index 4c84a8b31..30f497f49 100644
--- a/doc/doxygen.cfg
+++ b/doc/doxygen.cfg
@@ -38,7 +38,7 @@ PROJECT_NAME           = "SIRIUS"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = "6.5.1"
+PROJECT_NUMBER         = "6.5.4"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/examples/fp-lapw/Eu6C60/C.xml b/examples/fp-lapw/Eu6C60/C.xml
new file mode 100644
index 000000000..d155c634c
--- /dev/null
+++ b/examples/fp-lapw/Eu6C60/C.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<spdb xsi:noNamespaceSchemaLocation="../../xml/species.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <sp chemicalSymbol="C" name="carbon" z="-6.00000" mass="21894.16673">
+    <muffinTin rmin="0.100000E-06" radius="1.4500" rinf="21.1565" radialmeshPoints="350"/>
+    <atomicState n="1" l="0" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="2" l="0" kappa="1" occ="2.00000" core="false"/>
+    <atomicState n="2" l="1" kappa="1" occ="1.00000" core="false"/>
+    <atomicState n="2" l="1" kappa="2" occ="1.00000" core="false"/>
+    <basis>
+      <default type="lapw" trialEnergy="0.0000" searchE="true"/>
+      <custom l="0" type="lapw" trialEnergy="-0.5012" searchE="true"/>
+      <lo l="0">
+        <wf matchingOrder="0" trialEnergy="-0.5012" searchE="true"/>
+        <wf matchingOrder="1" trialEnergy="-0.5012" searchE="true"/>
+      </lo>
+      <lo l="0">
+        <wf matchingOrder="1" trialEnergy="-0.5012" searchE="true"/>
+        <wf matchingOrder="2" trialEnergy="-0.5012" searchE="true"/>
+      </lo>
+      <custom l="1" type="lapw" trialEnergy="-0.1990" searchE="true"/>
+      <lo l="1">
+        <wf matchingOrder="0" trialEnergy="-0.1990" searchE="true"/>
+        <wf matchingOrder="1" trialEnergy="-0.1990" searchE="true"/>
+      </lo>
+      <lo l="1">
+        <wf matchingOrder="1" trialEnergy="-0.1990" searchE="true"/>
+        <wf matchingOrder="2" trialEnergy="-0.1990" searchE="true"/>
+      </lo>
+      <!-- Semi-Core States -->
+    </basis>
+  </sp>
+</spdb>
diff --git a/examples/fp-lapw/Eu6C60/Eu.xml b/examples/fp-lapw/Eu6C60/Eu.xml
new file mode 100644
index 000000000..b4287fae8
--- /dev/null
+++ b/examples/fp-lapw/Eu6C60/Eu.xml
@@ -0,0 +1,74 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<spdb xsi:noNamespaceSchemaLocation="../../xml/species.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <sp chemicalSymbol="Eu" name="europium" z="-63.0000" mass="277013.4257">
+    <muffinTin rmin="0.100000E-06" radius="2.2000" rinf="28.6992" radialmeshPoints="550"/>
+    <atomicState n="1" l="0" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="2" l="0" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="2" l="1" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="2" l="1" kappa="2" occ="4.00000" core="true"/>
+    <atomicState n="3" l="0" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="3" l="1" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="3" l="1" kappa="2" occ="4.00000" core="true"/>
+    <atomicState n="3" l="2" kappa="2" occ="4.00000" core="true"/>
+    <atomicState n="3" l="2" kappa="3" occ="6.00000" core="true"/>
+    <atomicState n="4" l="0" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="4" l="1" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="4" l="1" kappa="2" occ="4.00000" core="true"/>
+    <atomicState n="4" l="2" kappa="2" occ="4.00000" core="false"/>
+    <atomicState n="4" l="2" kappa="3" occ="6.00000" core="false"/>
+    <atomicState n="4" l="3" kappa="3" occ="3.00000" core="false"/>
+    <atomicState n="4" l="3" kappa="4" occ="4.00000" core="false"/>
+    <atomicState n="5" l="0" kappa="1" occ="2.00000" core="false"/>
+    <atomicState n="5" l="1" kappa="1" occ="2.00000" core="false"/>
+    <atomicState n="5" l="1" kappa="2" occ="4.00000" core="false"/>
+    <atomicState n="6" l="0" kappa="1" occ="2.00000" core="false"/>
+    <basis>
+      <default type="lapw" trialEnergy="0.0000" searchE="true"/>
+      <custom l="0" type="lapw" trialEnergy="-0.1383" searchE="true"/>
+      <lo l="0">
+        <wf matchingOrder="0" trialEnergy="-0.1383" searchE="true"/>
+        <wf matchingOrder="1" trialEnergy="-0.1383" searchE="true"/>
+      </lo>
+      <lo l="0">
+        <wf matchingOrder="1" trialEnergy="-0.1383" searchE="true"/>
+        <wf matchingOrder="2" trialEnergy="-0.1383" searchE="true"/>
+      </lo>
+      <custom l="1" type="lapw" trialEnergy="-0.8764" searchE="true"/>
+      <lo l="1">
+        <wf matchingOrder="0" trialEnergy="-0.8764" searchE="true"/>
+        <wf matchingOrder="1" trialEnergy="-0.8764" searchE="true"/>
+      </lo>
+      <lo l="1">
+        <wf matchingOrder="1" trialEnergy="-0.8764" searchE="true"/>
+        <wf matchingOrder="2" trialEnergy="-0.8764" searchE="true"/>
+      </lo>
+      <custom l="2" type="lapw" trialEnergy="-4.9470" searchE="true"/>
+      <lo l="2">
+        <wf matchingOrder="0" trialEnergy="-4.9470" searchE="true"/>
+        <wf matchingOrder="1" trialEnergy="-4.9470" searchE="true"/>
+      </lo>
+      <lo l="2">
+        <wf matchingOrder="1" trialEnergy="-4.9470" searchE="true"/>
+        <wf matchingOrder="2" trialEnergy="-4.9470" searchE="true"/>
+      </lo>
+      <custom l="3" type="lapw" trialEnergy="-0.1087" searchE="true"/>
+      <lo l="3">
+        <wf matchingOrder="0" trialEnergy="-0.1087" searchE="true"/>
+        <wf matchingOrder="1" trialEnergy="-0.1087" searchE="true"/>
+      </lo>
+      <lo l="3">
+        <wf matchingOrder="1" trialEnergy="-0.1087" searchE="true"/>
+        <wf matchingOrder="2" trialEnergy="-0.1087" searchE="true"/>
+      </lo>
+      <!-- Semi-Core States -->
+      <lo l="0">
+        <wf matchingOrder="0" trialEnergy="-0.1383" searchE="true"/>
+        <wf matchingOrder="0" trialEnergy="-1.6451" searchE="true"/>
+      </lo>
+      <lo l="0">
+        <wf matchingOrder="0" trialEnergy="-1.6451" searchE="true"/>
+        <wf matchingOrder="1" trialEnergy="-1.6451" searchE="true"/>
+      </lo>
+    </basis>
+  </sp>
+</spdb>
diff --git a/examples/fp-lapw/Eu6C60/input.xml b/examples/fp-lapw/Eu6C60/input.xml
new file mode 100644
index 000000000..7c796be5b
--- /dev/null
+++ b/examples/fp-lapw/Eu6C60/input.xml
@@ -0,0 +1,151 @@
+<input>
+  <title> converted from SIRIUS json input </title>
+  <structure speciespath="./" autormt="true">
+    <crystal scale="1">
+      <basevect>    20.653194760596     0.000000000000     0.000000000000 </basevect>
+      <basevect>     0.000000000000    20.653194760596     0.000000000000 </basevect>
+      <basevect>     0.000000000000     0.000000000000    20.653194760596 </basevect>
+    </crystal>
+    <species speciesfile="Eu.xml" rmt="2.0">
+      <atom coord="    0.724100000000     0.000000000000     0.500000000000" bfcmt="0.0 0.0 1.0"/>
+      <atom coord="    0.275900000000     0.000000000000     0.500000000000" bfcmt="0.0 0.0 1.0"/>
+      <atom coord="    0.500000000000     0.724100000000     0.000000000000" bfcmt="0.0 0.0 1.0"/>
+      <atom coord="    0.500000000000     0.275900000000     0.000000000000" bfcmt="0.0 0.0 1.0"/>
+      <atom coord="    0.000000000000     0.500000000000     0.724100000000" bfcmt="0.0 0.0 1.0"/>
+      <atom coord="    0.000000000000     0.500000000000     0.275900000000" bfcmt="0.0 0.0 1.0"/>
+      <atom coord="    0.224100000000     0.500000000000     0.000000000000" bfcmt="0.0 0.0 1.0"/>
+      <atom coord="    0.775900000000     0.500000000000     0.000000000000" bfcmt="0.0 0.0 1.0"/>
+      <atom coord="    0.000000000000     0.224100000000     0.500000000000" bfcmt="0.0 0.0 1.0"/>
+      <atom coord="    0.000000000000     0.775900000000     0.500000000000" bfcmt="0.0 0.0 1.0"/>
+      <atom coord="    0.500000000000     0.000000000000     0.224100000000" bfcmt="0.0 0.0 1.0"/>
+      <atom coord="    0.500000000000     0.000000000000     0.775900000000" bfcmt="0.0 0.0 1.0"/>
+</species>
+    <species speciesfile="C.xml" rmt="2.0">
+      <atom coord="    0.662900000000     0.067100000000     0.000000000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.337100000000     0.067100000000     0.000000000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.337100000000     0.932900000000     0.000000000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.662900000000     0.932900000000     0.000000000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.000000000000     0.662900000000     0.067100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.000000000000     0.337100000000     0.067100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.000000000000     0.337100000000     0.932900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.000000000000     0.662900000000     0.932900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.067100000000     0.000000000000     0.662900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.067100000000     0.000000000000     0.337100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.932900000000     0.000000000000     0.337100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.932900000000     0.000000000000     0.662900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.162900000000     0.567100000000     0.500000000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.837100000000     0.567100000000     0.500000000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.837100000000     0.432900000000     0.500000000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.162900000000     0.432900000000     0.500000000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.500000000000     0.162900000000     0.567100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.500000000000     0.837100000000     0.567100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.500000000000     0.837100000000     0.432900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.500000000000     0.162900000000     0.432900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.567100000000     0.500000000000     0.162900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.567100000000     0.500000000000     0.837100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.432900000000     0.500000000000     0.837100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.432900000000     0.500000000000     0.162900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.721500000000     0.129900000000     0.104700000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.278500000000     0.129900000000     0.895300000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.278500000000     0.870100000000     0.104700000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.721500000000     0.870100000000     0.895300000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.104700000000     0.721500000000     0.129900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.895300000000     0.278500000000     0.129900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.104700000000     0.278500000000     0.870100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.895300000000     0.721500000000     0.870100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.129900000000     0.104700000000     0.721500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.129900000000     0.895300000000     0.278500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.870100000000     0.104700000000     0.278500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.870100000000     0.895300000000     0.721500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.278500000000     0.870100000000     0.895300000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.721500000000     0.870100000000     0.104700000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.721500000000     0.129900000000     0.895300000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.278500000000     0.129900000000     0.104700000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.895300000000     0.278500000000     0.870100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.104700000000     0.721500000000     0.870100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.895300000000     0.721500000000     0.129900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.104700000000     0.278500000000     0.129900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.870100000000     0.895300000000     0.278500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.870100000000     0.104700000000     0.721500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.129900000000     0.895300000000     0.721500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.129900000000     0.104700000000     0.278500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.221500000000     0.629900000000     0.604700000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.778500000000     0.629900000000     0.395300000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.778500000000     0.370100000000     0.604700000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.221500000000     0.370100000000     0.395300000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.604700000000     0.221500000000     0.629900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.395300000000     0.778500000000     0.629900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.604700000000     0.778500000000     0.370100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.395300000000     0.221500000000     0.370100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.629900000000     0.604700000000     0.221500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.629900000000     0.395300000000     0.778500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.370100000000     0.604700000000     0.778500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.370100000000     0.395300000000     0.221500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.778500000000     0.370100000000     0.395300000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.221500000000     0.370100000000     0.604700000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.221500000000     0.629900000000     0.395300000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.778500000000     0.629900000000     0.604700000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.395300000000     0.778500000000     0.370100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.604700000000     0.221500000000     0.370100000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.395300000000     0.221500000000     0.629900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.604700000000     0.778500000000     0.629900000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.370100000000     0.395300000000     0.778500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.370100000000     0.604700000000     0.221500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.629900000000     0.395300000000     0.221500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.629900000000     0.604700000000     0.778500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.755400000000     0.064600000000     0.208500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.244600000000     0.064600000000     0.791500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.244600000000     0.935400000000     0.208500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.755400000000     0.935400000000     0.791500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.208500000000     0.755400000000     0.064600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.791500000000     0.244600000000     0.064600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.208500000000     0.244600000000     0.935400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.791500000000     0.755400000000     0.935400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.064600000000     0.208500000000     0.755400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.064600000000     0.791500000000     0.244600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.935400000000     0.208500000000     0.244600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.935400000000     0.791500000000     0.755400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.244600000000     0.935400000000     0.791500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.755400000000     0.935400000000     0.208500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.755400000000     0.064600000000     0.791500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.244600000000     0.064600000000     0.208500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.791500000000     0.244600000000     0.935400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.208500000000     0.755400000000     0.935400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.791500000000     0.755400000000     0.064600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.208500000000     0.244600000000     0.064600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.935400000000     0.791500000000     0.244600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.935400000000     0.208500000000     0.755400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.064600000000     0.791500000000     0.755400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.064600000000     0.208500000000     0.244600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.255400000000     0.564600000000     0.708500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.744600000000     0.564600000000     0.291500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.744600000000     0.435400000000     0.708500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.255400000000     0.435400000000     0.291500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.708500000000     0.255400000000     0.564600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.291500000000     0.744600000000     0.564600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.708500000000     0.744600000000     0.435400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.291500000000     0.255400000000     0.435400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.564600000000     0.708500000000     0.255400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.564600000000     0.291500000000     0.744600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.435400000000     0.708500000000     0.744600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.435400000000     0.291500000000     0.255400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.744600000000     0.435400000000     0.291500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.255400000000     0.435400000000     0.708500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.255400000000     0.564600000000     0.291500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.744600000000     0.564600000000     0.708500000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.291500000000     0.744600000000     0.435400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.708500000000     0.255400000000     0.435400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.291500000000     0.255400000000     0.564600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.708500000000     0.744600000000     0.564600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.435400000000     0.291500000000     0.744600000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.435400000000     0.708500000000     0.255400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.564600000000     0.291500000000     0.255400000000" bfcmt="0.0 0.0 0.0"/>
+      <atom coord="    0.564600000000     0.708500000000     0.744600000000" bfcmt="0.0 0.0 0.0"/>
+</species>
+  </structure>
+  <groundstate do="fromscratch" ngridk="6 6 6" rgkmax="5.8" gmaxvr="16" maxscl="2"  kptgroups="24">
+    <libxc exchange="XC_LDA_X" correlation="XC_LDA_C_PZ"/>
+    <sirius densityinit="true" density="true" vha="true" xc="true" eigenstates="true" sfacg="true" cfun="true"/>
+    <spin/>
+  </groundstate>
+</input>
diff --git a/examples/fp-lapw/Eu6C60/sirius.json b/examples/fp-lapw/Eu6C60/sirius.json
index 4b40c4cb8..75d27d6f6 100644
--- a/examples/fp-lapw/Eu6C60/sirius.json
+++ b/examples/fp-lapw/Eu6C60/sirius.json
@@ -16,7 +16,7 @@
         "aw_cutoff" : 7.0,
         "pw_cutoff" : 20.00,
         "auto_rmt" : 1,
-        "use_symmetry": 1,
+        "use_symmetry": true,
         "ngridk" : [1, 1, 1],
         "potential_tol" : 1e-5,
         "energy_tol" : 1e-8,
diff --git a/examples/fp-lapw/Mn_MOF/C.xml b/examples/fp-lapw/Mn_MOF/C.xml
new file mode 100644
index 000000000..0cb3eb23f
--- /dev/null
+++ b/examples/fp-lapw/Mn_MOF/C.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<spdb xsi:noNamespaceSchemaLocation="../../xml/species.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <sp chemicalSymbol="C" name="carbon" z="-6.00000" mass="21894.16673">
+    <muffinTin rmin="0.100000E-05" radius="1.2500" rinf="20.0" radialmeshPoints="250"/>
+    <atomicState n="1" l="0" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="2" l="0" kappa="1" occ="2.00000" core="false"/>
+    <atomicState n="2" l="1" kappa="1" occ="1.00000" core="false"/>
+    <atomicState n="2" l="1" kappa="2" occ="1.00000" core="false"/>
+    <basis>
+      <default type="lapw" trialEnergy="0.1500" searchE="false"/>
+      <custom l="0" type="apw+lo" trialEnergy="-0.2" searchE="false"/>
+      <custom l="1" type="apw+lo" trialEnergy="0.15" searchE="false"/>
+    </basis>
+  </sp>
+</spdb>
diff --git a/examples/fp-lapw/Mn_MOF/H.xml b/examples/fp-lapw/Mn_MOF/H.xml
new file mode 100644
index 000000000..046e4c160
--- /dev/null
+++ b/examples/fp-lapw/Mn_MOF/H.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<spdb xsi:noNamespaceSchemaLocation="../../xml/species.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <sp chemicalSymbol="H" name="hydrogen" z="-1.00000" mass="1837.362220">
+    <muffinTin rmin="0.100000E-05" radius="1.1000" rinf="19.5924" radialmeshPoints="200"/>
+    <atomicState n="1" l="0" kappa="1" occ="1.00000" core="false"/>
+    <basis>
+      <default type="lapw" trialEnergy="0.1500" searchE="false"/>
+      <custom l="0" type="apw+lo" trialEnergy="0.1500" searchE="false"/>
+    </basis>
+  </sp>
+</spdb>
diff --git a/examples/fp-lapw/Mn_MOF/Mn.xml b/examples/fp-lapw/Mn_MOF/Mn.xml
new file mode 100644
index 000000000..df9b2d239
--- /dev/null
+++ b/examples/fp-lapw/Mn_MOF/Mn.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<spdb xsi:noNamespaceSchemaLocation="../../xml/species.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <sp chemicalSymbol="Mn" name="manganese" z="-25.0000" mass="100145.9369">
+    <muffinTin rmin="0.100000E-04" radius="2.0000" rinf="23.8969" radialmeshPoints="350"/>
+    <atomicState n="1" l="0" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="2" l="0" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="2" l="1" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="2" l="1" kappa="2" occ="4.00000" core="true"/>
+    <atomicState n="3" l="0" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="3" l="1" kappa="1" occ="2.00000" core="false"/>
+    <atomicState n="3" l="1" kappa="2" occ="4.00000" core="false"/>
+    <atomicState n="3" l="2" kappa="2" occ="3.00000" core="false"/>
+    <atomicState n="3" l="2" kappa="3" occ="2.00000" core="false"/>
+    <atomicState n="4" l="0" kappa="1" occ="2.00000" core="false"/>
+    <basis>
+      <default type="lapw" trialEnergy="0.15" searchE="false"/>
+
+      <custom l="0" type="apw+lo" trialEnergy="0.15" searchE="false"/>
+
+      <custom l="1" type="apw+lo" trialEnergy="-1.76" searchE="false"/>
+      <lo l="1">
+        <wf matchingOrder="0" trialEnergy="-1.76" searchE="false"/>
+        <wf matchingOrder="0" trialEnergy="0.15" searchE="false"/>
+      </lo>
+
+      <custom l="2" type="apw+lo" trialEnergy="0.10" searchE="false"/>
+
+    </basis>
+  </sp>
+</spdb>
diff --git a/examples/fp-lapw/Mn_MOF/N.xml b/examples/fp-lapw/Mn_MOF/N.xml
new file mode 100644
index 000000000..9d387c749
--- /dev/null
+++ b/examples/fp-lapw/Mn_MOF/N.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<spdb xsi:noNamespaceSchemaLocation="../../xml/species.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <sp chemicalSymbol="N" name="nitrogen" z="-7.00000" mass="25532.72506">
+    <muffinTin rmin="0.100000E-05" radius="1.4500" rinf="18.9090" radialmeshPoints="250"/>
+    <atomicState n="1" l="0" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="2" l="0" kappa="1" occ="2.00000" core="false"/>
+    <atomicState n="2" l="1" kappa="1" occ="1.00000" core="false"/>
+    <atomicState n="2" l="1" kappa="2" occ="2.00000" core="false"/>
+    <basis>
+      <default type="lapw" trialEnergy="0.1500" searchE="false"/>
+      <custom l="0" type="apw+lo" trialEnergy="-0.4" searchE="false"/>
+      <custom l="1" type="apw+lo" trialEnergy="0.15" searchE="false"/>
+    </basis>
+  </sp>
+</spdb>
diff --git a/examples/fp-lapw/Mn_MOF/O.xml b/examples/fp-lapw/Mn_MOF/O.xml
new file mode 100644
index 000000000..d853bf114
--- /dev/null
+++ b/examples/fp-lapw/Mn_MOF/O.xml
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<spdb xsi:noNamespaceSchemaLocation="../../xml/species.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <sp chemicalSymbol="O" name="oxygen" z="-8.00000" mass="29165.12203">
+    <muffinTin rmin="0.100000E-05" radius="1.4500" rinf="17.0873" radialmeshPoints="200"/>
+    <atomicState n="1" l="0" kappa="1" occ="2.00000" core="true"/>
+    <atomicState n="2" l="0" kappa="1" occ="2.00000" core="false"/>
+    <atomicState n="2" l="1" kappa="1" occ="2.00000" core="false"/>
+    <atomicState n="2" l="1" kappa="2" occ="2.00000" core="false"/>
+    <basis>
+      <default type="lapw" trialEnergy="0.1500" searchE="false"/>
+      <custom l="0" type="apw+lo" trialEnergy="-0.8" searchE="false"/>
+      <custom l="1" type="apw+lo" trialEnergy="0.15" searchE="false"/>
+    </basis>
+  </sp>
+</spdb>
diff --git a/examples/fp-lapw/Mn_MOF/input.xml b/examples/fp-lapw/Mn_MOF/input.xml
new file mode 100644
index 000000000..be0a3667e
--- /dev/null
+++ b/examples/fp-lapw/Mn_MOF/input.xml
@@ -0,0 +1,131 @@
+<input>
+
+  <title>MnMOF</title>
+
+  <structure speciespath="./" autormt="false">
+    <crystal scale="27.30843234633042796443">
+      <basevect>  1.0  0.0  0.0 </basevect>
+      <basevect>  0.0  0.57961386755241851774  0.0 </basevect>
+      <basevect> -0.31792901467718496989  0.0  0.53166490266417548958 </basevect>
+    </crystal>
+    <species speciesfile="Mn.xml" rmt="2.2">
+      <atom coord="0.759870000  0.253600000  0.509140000" bfcmt="0.0  0.0  4.0"/>
+      <atom coord="0.259870000  0.753600000  0.509140000" bfcmt="0.0  0.0 -4.0"/>
+      <atom coord="0.759870000  0.746400000  0.009140000" bfcmt="0.0  0.0  4.0"/>
+      <atom coord="0.259870000  0.246400000  0.009140000" bfcmt="0.0  0.0 -4.0"/>
+    </species>
+    <species speciesfile="O.xml" rmt="1.2">
+      <atom coord="0.712840000     0.335340000     0.691430000"/>
+      <atom coord="0.212840000     0.835340000     0.691430000"/>
+      <atom coord="0.712840000     0.664660000     0.191430000"/>
+      <atom coord="0.212840000     0.164660000     0.191430000"/>
+      <atom coord="0.312850000     0.337520000     0.828020000"/>
+      <atom coord="0.812850000     0.837520000     0.828020000"/>
+      <atom coord="0.312850000     0.662480000     0.328020000"/>
+      <atom coord="0.812850000     0.162480000     0.328020000"/>
+      <atom coord="0.601810000     0.144950000     0.345310000"/>
+      <atom coord="0.101810000     0.644950000     0.345310000"/>
+      <atom coord="0.601810000     0.855050000     0.845310000"/>
+      <atom coord="0.101810000     0.355050000     0.845310000"/>
+      <atom coord="0.421830000     0.136800000     0.175080000"/>
+      <atom coord="0.921830000     0.636800000     0.175080000"/>
+      <atom coord="0.421830000     0.863200000     0.675080000"/>
+      <atom coord="0.921830000     0.363200000     0.675080000"/>
+      <atom coord="0.816440000     0.030440000     0.656690000"/>
+      <atom coord="0.316440000     0.530440000     0.656690000"/>
+      <atom coord="0.816440000     0.969560000     0.156690000"/>
+      <atom coord="0.316440000     0.469560000     0.156690000"/>
+      <atom coord="0.213870000     0.020560000     0.874250000"/>
+      <atom coord="0.713870000     0.520560000     0.874250000"/>
+      <atom coord="0.213870000     0.979440000     0.374250000"/>
+      <atom coord="0.713870000     0.479440000     0.374250000"/>
+    </species>
+    <species speciesfile="N.xml" rmt="1.1">
+      <atom coord="0.457100000     0.192500000     0.744810000"/>
+      <atom coord="0.957100000     0.692500000     0.744810000"/>
+      <atom coord="0.457100000     0.807500000     0.244810000"/>
+      <atom coord="0.957100000     0.307500000     0.244810000"/>
+    </species>
+    <species speciesfile="C.xml" rmt="1.05">
+      <atom coord="0.511130000     0.212150000     0.257610000"/>
+      <atom coord="0.011130000     0.712150000     0.257610000"/>
+      <atom coord="0.511130000     0.787850000     0.757610000"/>
+      <atom coord="0.011130000     0.287850000     0.757610000"/>
+      <atom coord="0.451810000     0.262100000     0.588180000"/>
+      <atom coord="0.951810000     0.762100000     0.588180000"/>
+      <atom coord="0.451810000     0.737900000     0.088180000"/>
+      <atom coord="0.951810000     0.237900000     0.088180000"/>
+      <atom coord="0.560980000     0.225420000     0.906940000"/>
+      <atom coord="0.060980000     0.725420000     0.906940000"/>
+      <atom coord="0.560980000     0.774580000     0.406940000"/>
+      <atom coord="0.060980000     0.274580000     0.406940000"/>
+      <atom coord="0.722680000     0.477800000     0.746110000"/>
+      <atom coord="0.222680000     0.977800000     0.746110000"/>
+      <atom coord="0.722680000     0.522200000     0.246110000"/>
+      <atom coord="0.222680000     0.022200000     0.246110000"/>
+      <atom coord="0.283410000     0.473040000     0.751650000"/>
+      <atom coord="0.783410000     0.973040000     0.751650000"/>
+      <atom coord="0.283410000     0.526960000     0.251650000"/>
+      <atom coord="0.783410000     0.026960000     0.251650000"/>
+    </species>
+    <species speciesfile="H.xml" rmt="0.85">
+      <atom coord="0.007360000     0.155400000     0.750330000"/>
+      <atom coord="0.507360000     0.655400000     0.750330000"/>
+      <atom coord="0.007360000     0.844600000     0.250330000"/>
+      <atom coord="0.507360000     0.344600000     0.250330000"/>
+      <atom coord="0.520060000     0.217080000     0.579490000"/>
+      <atom coord="0.020060000     0.717080000     0.579490000"/>
+      <atom coord="0.520060000     0.782920000     0.079490000"/>
+      <atom coord="0.020060000     0.282920000     0.079490000"/>
+      <atom coord="0.568320000     0.354400000     0.930630000"/>
+      <atom coord="0.068320000     0.854400000     0.930630000"/>
+      <atom coord="0.568320000     0.645600000     0.430630000"/>
+      <atom coord="0.068320000     0.145600000     0.430630000"/>
+      <atom coord="0.375040000     0.228710000     0.472820000"/>
+      <atom coord="0.875040000     0.728710000     0.472820000"/>
+      <atom coord="0.375040000     0.771290000     0.972820000"/>
+      <atom coord="0.875040000     0.271290000     0.972820000"/>
+      <atom coord="0.562740000     0.162120000     0.015260000"/>
+      <atom coord="0.062740000     0.662120000     0.015260000"/>
+      <atom coord="0.562740000     0.837880000     0.515260000"/>
+      <atom coord="0.062740000     0.337880000     0.515260000"/>
+      <atom coord="0.456700000     0.392330000     0.601370000"/>
+      <atom coord="0.956700000     0.892330000     0.601370000"/>
+      <atom coord="0.456700000     0.607670000     0.101370000"/>
+      <atom coord="0.956700000     0.107670000     0.101370000"/>
+      <atom coord="0.627300000     0.184670000     0.890970000"/>
+      <atom coord="0.127300000     0.684670000     0.890970000"/>
+      <atom coord="0.627300000     0.815330000     0.390970000"/>
+      <atom coord="0.127300000     0.315330000     0.390970000"/>
+      <atom coord="0.444930000     0.067770000     0.725760000"/>
+      <atom coord="0.944930000     0.567770000     0.725760000"/>
+      <atom coord="0.444930000     0.932230000     0.225760000"/>
+      <atom coord="0.944930000     0.432230000     0.225760000"/>
+      <atom coord="0.395210000     0.241450000     0.760900000"/>
+      <atom coord="0.895210000     0.741450000     0.760900000"/>
+      <atom coord="0.395210000     0.758550000     0.260900000"/>
+      <atom coord="0.895210000     0.258550000     0.260900000"/>
+      <atom coord="0.240230000     0.071570000     0.676230000"/>
+      <atom coord="0.740230000     0.571570000     0.676230000"/>
+      <atom coord="0.240230000     0.928430000     0.176230000"/>
+      <atom coord="0.740230000     0.428430000     0.176230000"/>
+      <atom coord="0.725460000     0.043550000     0.770910000"/>
+      <atom coord="0.225460000     0.543550000     0.770910000"/>
+      <atom coord="0.725460000     0.956450000     0.270910000"/>
+      <atom coord="0.225460000     0.456450000     0.270910000"/>
+    </species>
+  </structure>
+
+  <groundstate
+    do="fromscratch"
+    ngridk="2 4 4"
+    rgkmax="4.6"
+    gmaxvr="16"
+    maxscl="2"
+    kptgroups="24">
+    <libxc exchange="XC_LDA_X" correlation="XC_LDA_C_PZ"/>
+    <sirius densityinit="true" density="true" vha="true" xc="true" eigenstates="true" sfacg="true" cfun="true"/>
+    <spin/>
+  </groundstate>
+
+</input>
diff --git a/python_module/pybind11 b/python_module/pybind11
index f7bc18f52..4f72ef846 160000
--- a/python_module/pybind11
+++ b/python_module/pybind11
@@ -1 +1 @@
-Subproject commit f7bc18f528bb35cd06c93d0a58c17e6eea3fa68c
+Subproject commit 4f72ef846fe8453596230ac285eeaa0ce3278bb4
diff --git a/python_module/sirius/edft/free_energy.py b/python_module/sirius/edft/free_energy.py
index 02d73b163..03a7c381a 100644
--- a/python_module/sirius/edft/free_energy.py
+++ b/python_module/sirius/edft/free_energy.py
@@ -29,37 +29,6 @@ def s(x):
         return _s(x)
 
 
-class OldFreeEnergy:
-    def __init__(self, H, energy, T):
-        """
-
-        """
-        self.H = H
-        self.energy = energy
-        self.kw = energy.kpointset.w
-        self.T = T
-        self.kb = (physical_constants['Boltzmann constant in eV/K'][0] /
-                   physical_constants['Hartree energy in eV'][0])
-
-    def entropy(self, fn):
-        ns = 2 if self.energy.kpointset.ctx().num_mag_dims() == 0 else 1
-        S = s(np.sqrt(fn/ns))
-        return self.kb * self.T * np.real(np.sum(self.kw*S))
-
-    def __call__(self, X, fn):
-        """
-        Keyword Arguments:
-        X --
-        f --
-        """
-
-        self.energy.kpointset.fn = fn
-        ns = 2 if self.energy.kpointset.ctx().num_mag_dims() == 0 else 1
-        entropy = s(np.sqrt(fn/ns))
-        E, HX = self.energy.compute(X)
-        return E + self.kb * self.T * np.real(np.sum(self.kw*entropy)), HX
-
-
 class FreeEnergy:
     """
     copied from Baarman implementation
diff --git a/python_module/sirius/edft/neugebaur.py b/python_module/sirius/edft/neugebaur.py
index 708f4df70..1542861dd 100644
--- a/python_module/sirius/edft/neugebaur.py
+++ b/python_module/sirius/edft/neugebaur.py
@@ -1,5 +1,4 @@
-"""
-Freysoldt, C., Boeck, S., & Neugebauer, J., Direct minimization technique
+"""Freysoldt, C., Boeck, S., & Neugebauer, J., Direct minimization technique
 for metals in density functional theory.
 http://dx.doi.org/10.1103/PhysRevB.79.241103
 """
@@ -249,10 +248,8 @@ def __init__(self, free_energy):
         self.M = free_energy
         self._save = False
 
-
     def step(self, X, f, eta, G_X, G_eta, xi_trial, F0, slope, kwargs):
         """
-
         Keyword Arguments:
         X         --
         f         -- occupation numbers (just for debugging, not needed)
@@ -382,7 +379,6 @@ def run(self, X, fn,
         is_converged -- bool
         """
 
-        use_g_eta=False
         if cgtype == 'PR':
             cg_update = polak_ribiere
         elif cgtype == 'FR':
@@ -401,10 +397,10 @@ def run(self, X, fn,
         kw = kset.w
         m = kset.ctx().max_occupancy()
         # set occupation numbers from band energies
-        fn, _ = self.M.smearing.fn(kset.e)
+        fn = kset.fn
         # ek = self.M.smearing.ek(fn)
 
-        eta = diag(kset.e)
+        eta = diag(self.M.smearing.ek(fn))
         w, U = eta.eigh()
         ek = w
         X = X@U
diff --git a/reframe/checks/sirius_scf_check.py b/reframe/checks/sirius_scf_check.py
index 904c3ea84..d621cf195 100644
--- a/reframe/checks/sirius_scf_check.py
+++ b/reframe/checks/sirius_scf_check.py
@@ -8,7 +8,6 @@
 test_folders = ['test01', 'test02', 'test03', 'test04', 'test05', 'test06', 'test07', 'test08',
     'test09', 'test10', 'test11', 'test12', 'test13', 'test14', 'test15', 'test16', 'test17', 'test18']
 
-
 @sn.sanity_function
 def load_json(filename):
     '''This will load a json data from a file.'''
diff --git a/reframe/config.py b/reframe/config.py
index 2362310ab..685cd471b 100644
--- a/reframe/config.py
+++ b/reframe/config.py
@@ -1,113 +1,101 @@
-
-class ReframeSettings:
-    job_poll_intervals = [1, 2, 3]
-    job_submit_timeout = 60
-    checks_path = ['checks/']
-    checks_path_recurse = False
-    site_configuration = {
-        'systems': {
-            'osx': {
-                'descr': 'OSX notebook with MacPort',
-                'hostnames': ['localhost'],
-                'modules_system': None,
-                'resourcesdir': '',
-                'partitions': {
-                    'cpu': {
-                        'scheduler': 'local+mpirun',
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'CPU execution',
-                        'max_jobs': 1
-                    }
-                }
-            },
-            'linux': {
-                'descr': 'Ubuntu linux box',
-                'hostnames': ['localhost'],
-                'modules_system': None,
-                'resourcesdir': '',
-                'partitions': {
-                    'cpu': {
-                        'scheduler': 'local+local',
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'CPU execution',
-                        'max_jobs': 1
-                    }
+site_configuration = {
+    'systems': [
+        {
+            'name' : 'osx',
+            'descr': 'OSX notebook with MacPort',
+            'hostnames': ['localhost'],
+            'resourcesdir': '',
+            'partitions': [
+                {
+                    'name' : 'cpu',
+                    'scheduler': 'local',
+                    'launcher' : 'mpirun',
+                    'environs': ['builtin'],
+                    'descr': 'CPU execution',
+                    'max_jobs': 1
                 }
-            }
+            ]
         },
-
-        'environments': {
-            'osx': {
-                'PrgEnv-gnu': {
-                    'type': 'ProgEnvironment',
-                    'modules': [],
-                    'cc':  'mpicc',
-                    'cxx': 'mpic++',
-                    'ftn': 'mpif90',
+        {
+            'name' : 'linux',
+            'descr': 'Ubuntu linux box',
+            'hostnames': ['localhost'],
+            'resourcesdir': '',
+            'partitions': [
+                {
+                    'name' : 'cpu',
+                    'scheduler': 'local',
+                    'launcher' : 'local',
+                    'environs': ['builtin'],
+                    'descr': 'CPU execution',
+                    'max_jobs': 1
+                }
+            ]
+        }
+    ],
+    'environments': [
+        {
+            'name': 'builtin',
+            'target_systems': ['osx:cpu', 'linux:cpu'],
+            'cc': 'mpicc',
+            'cxx': 'mpic++',
+            'ftn': 'mpif90'
+        }
+    ],
+    'logging': [
+        {
+            'level': 'debug',
+            'handlers': [
+                {
+                    'type': 'file',
+                    'name': 'reframe.log',
+                    'level': 'debug',
+                    'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s',   # noqa: E501
+                    'append': False
+                },
+                {
+                    'type': 'stream',
+                    'name': 'stdout',
+                    'level': 'info',
+                    'format': '%(message)s'
+                },
+                {
+                    'type': 'file',
+                    'name': 'reframe.out',
+                    'level': 'info',
+                    'format': '%(message)s',
+                    'append': False
                 }
-            },
-            'linux': {
-                'PrgEnv-gnu': {
-                    'type': 'ProgEnvironment',
-                    'modules': [],
-                    'cc':  'mpicc',
-                    'cxx': 'mpic++',
-                    'ftn': 'mpif90',
+            ],
+            'handlers_perflog': [
+                {
+                    'type': 'filelog',
+                    'prefix': '%(check_system)s/%(check_partition)s',
+                    'level': 'info',
+                    'format': '%(check_job_completion_time)s|reframe %(version)s|%(check_info)s|jobid=%(check_jobid)s|num_tasks=%(check_num_tasks)s|%(check_perf_var)s=%(check_perf_value)s|ref=%(check_perf_ref)s (l=%(check_perf_lower_thres)s, u=%(check_perf_upper_thres)s)|%(check_perf_unit)s',   # noqa: E501
+                    'datefmt': '%FT%T%:z',
+                    'append': True
+                },
+                {
+                    'type': 'graylog',
+                    'address': 'graylog-server:12345',
+                    'level': 'info',
+                    'format': '%(message)s',
+                    'extras': {
+                        'facility': 'reframe',
+                        'data-version': '1.0',
+                    }
                 }
-            }
+            ]
         }
-    }
-
-    logging_config = {
-        'level': 'DEBUG',
-        'handlers': [
-            {
-                'type': 'file',
-                'name': 'reframe.log',
-                'level': 'DEBUG',
-                'format': '[%(asctime)s] %(levelname)s: '
-                          '%(check_info)s: %(message)s',
-                'append': False,
-            },
-
-            # Output handling
-            {
-                'type': 'stream',
-                'name': 'stdout',
-                'level': 'INFO',
-                'format': '%(message)s'
-            },
-            {
-                'type': 'file',
-                'name': 'reframe.out',
-                'level': 'INFO',
-                'format': '%(message)s',
-                'append': False,
-            }
-        ]
-    }
-
-    perf_logging_config = {
-        'level': 'DEBUG',
-        'handlers': [
-            {
-                'type': 'filelog',
-                'prefix': '%(check_system)s/%(check_partition)s',
-                'level': 'INFO',
-                'format': (
-                    '%(check_job_completion_time)s|reframe %(version)s|'
-                    '%(check_info)s|jobid=%(check_jobid)s|'
-                    'num_tasks=%(check_num_tasks)s|'
-                    '%(check_perf_var)s=%(check_perf_value)s|'
-                    'ref=%(check_perf_ref)s '
-                    '(l=%(check_perf_lower_thres)s, '
-                    'u=%(check_perf_upper_thres)s)|'
-                    '%(check_perf_unit)s'
-                ),
-                'append': True
-            }
-        ]
-    }
-
-settings = ReframeSettings()
+    ],
+    'general': [
+        {
+            'check_search_path': [
+                'checks/'
+            ],
+            'check_search_recursive': True
+        }
+    ]
+}
 
diff --git a/reframe/cscs.py b/reframe/cscs.py
deleted file mode 100644
index 6a2411d5e..000000000
--- a/reframe/cscs.py
+++ /dev/null
@@ -1,567 +0,0 @@
-#
-# CSCS ReFrame settings
-#
-
-
-class ReframeSettings:
-    reframe_module = 'reframe'
-    job_poll_intervals = [1, 2, 3]
-    job_submit_timeout = 60
-    checks_path = ['checks/']
-    checks_path_recurse = True
-    site_configuration = {
-        'systems': {
-            'ault': {
-                'descr': 'Ault TDS',
-                'hostnames': ['ault'],
-                'modules_system': 'lmod',
-                'resourcesdir': '/apps/common/UES/reframe/resources',
-                'partitions': {
-                    'login': {
-                        'scheduler': 'local',
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'Login nodes',
-                        'max_jobs': 4
-                    },
-                    'amdv100': {
-                        'scheduler': 'nativeslurm',
-                        'access':  ['-pamdv100'],
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'AMD Naples 32c + 2x NVIDIA V100',
-                        'max_jobs': 100,
-                    },
-                    'amdvega': {
-                        'scheduler': 'nativeslurm',
-                        'access':  ['-pamdvega'],
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'AMD Naples 32c + 3x AMD GFX900',
-                        'max_jobs': 100,
-                    },
-                    'intelv100': {
-                        'scheduler': 'nativeslurm',
-                        'access':  ['-pintelv100'],
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'Intel Skylake 36c + 4x NVIDIA V100',
-                        'max_jobs': 100,
-                    },
-                    'intel': {
-                        'scheduler': 'nativeslurm',
-                        'access':  ['-pintel'],
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'Intel Skylake 36c',
-                        'max_jobs': 100,
-                    }
-                }
-            },
-
-            'tave': {
-                'descr': 'Grand Tave',
-                'hostnames': ['tave'],
-                'modules_system': 'tmod',
-                'resourcesdir': '/apps/common/UES/reframe/resources',
-                'partitions': {
-                    'login': {
-                        'scheduler': 'local',
-                        'environs': ['PrgEnv-cray', 'PrgEnv-gnu',
-                                     'PrgEnv-intel', 'PrgEnv-pgi'],
-                        'descr': 'Login nodes',
-                        'max_jobs': 4
-                    },
-                    'compute': {
-                        'scheduler': 'nativeslurm',
-                        'environs': ['PrgEnv-cray', 'PrgEnv-gnu',
-                                     'PrgEnv-intel', 'PrgEnv-pgi'],
-                        'descr': 'Intel Xeon Phi',
-                        'max_jobs': 100,
-                    }
-                }
-            },
-
-            'daint': {
-                'descr': 'Piz Daint',
-                'hostnames': ['daint'],
-                'modules_system': 'tmod',
-                'resourcesdir': '/apps/common/UES/reframe/resources',
-                'partitions': {
-                    'login': {
-                        'scheduler': 'local',
-                        'modules': [],
-                        'access':  [],
-                        'environs': ['PrgEnv-cray', 'PrgEnv-gnu',
-                                     'PrgEnv-intel', 'PrgEnv-pgi'],
-                        'descr': 'Login nodes',
-                        'max_jobs': 4
-                    },
-
-                    'gpu': {
-                        'scheduler': 'local+srun',
-                        'modules': ['daint-gpu'],
-                        'access':  ['--constraint=gpu'],
-                        'environs': ['PrgEnv-cray', 'PrgEnv-gnu',
-                                     'PrgEnv-intel', 'PrgEnv-pgi'],
-                        'descr': 'Hybrid nodes (Haswell/P100)',
-                        'max_jobs': 100,
-                        'resources': {
-                            'switches': ['--switches={num_switches}']
-                        }
-                    },
-
-                    'mc': {
-                        'scheduler': 'nativeslurm',
-                        'modules': ['daint-mc'],
-                        'access':  ['--constraint=mc'],
-                        'environs': ['PrgEnv-cray', 'PrgEnv-gnu',
-                                     'PrgEnv-intel', 'PrgEnv-pgi'],
-                        'descr': 'Multicore nodes (Broadwell)',
-                        'max_jobs': 100,
-                        'resources': {
-                            'switches': ['--switches={num_switches}']
-                        }
-                    }
-                }
-            },
-
-            'dom': {
-                'descr': 'Dom TDS',
-                'hostnames': ['dom'],
-                'modules_system': 'tmod',
-                'resourcesdir': '/apps/common/UES/reframe/resources',
-                'partitions': {
-                    # FIXME: temporarily disable PrgEnv-pgi on all partitions
-                    'login': {
-                        'scheduler': 'local',
-                        'modules': [],
-                        'access':  [],
-                        'environs': ['PrgEnv-cray', 'PrgEnv-gnu',
-                                     'PrgEnv-intel'],
-                        'descr': 'Login nodes',
-                        'max_jobs': 4
-                    },
-
-                    'gpu': {
-                        'scheduler': 'nativeslurm',
-                        'modules': ['daint-gpu'],
-                        'access':  ['--constraint=gpu'],
-                        'environs': ['PrgEnv-cray', 'PrgEnv-gnu',
-                                     'PrgEnv-intel'],
-                        'descr': 'Hybrid nodes (Haswell/P100)',
-                        'max_jobs': 100,
-                        'resources': {
-                            'switches': ['--switches={num_switches}']
-                        }
-                    },
-
-                    'mc': {
-                        'scheduler': 'nativeslurm',
-                        'modules': ['daint-mc'],
-                        'access':  ['--constraint=mc'],
-                        'environs': ['PrgEnv-cray', 'PrgEnv-gnu',
-                                     'PrgEnv-intel'],
-                        'descr': 'Multicore nodes (Broadwell)',
-                        'max_jobs': 100,
-                        'resources': {
-                            'switches': ['--switches={num_switches}']
-                        }
-                    },
-                }
-            },
-
-            'fulen': {
-                'descr': 'Fulen',
-                'hostnames': [r'fulen-ln\d+'],
-                'modules_system': 'tmod',
-                'resourcesdir': '/apps/common/UES/reframe/resources',
-                'partitions': {
-                    'login': {
-                        'scheduler': 'local',
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'Login nodes',
-                        'max_jobs': 1
-                    },
-
-                    'normal': {
-                        'scheduler': 'nativeslurm',
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'Compute nodes - default partition',
-                    },
-
-                    'fat': {
-                        'scheduler': 'nativeslurm',
-                        'environs': ['PrgEnv-gnu'],
-                        'access': ['--partition fat'],
-                        'descr': 'High-memory compute nodes',
-                    },
-
-                    'gpu': {
-                        'scheduler': 'nativeslurm',
-                        'environs': ['PrgEnv-gnu'],
-                        'access': ['--partition gpu'],
-                        'descr': 'Hybrid compute nodes',
-                    },
-                }
-            },
-
-            'kesch': {
-                'descr': 'Kesch MCH',
-                'hostnames': ['keschln-\d+'],
-                'modules_system': 'tmod',
-                'resourcesdir': '/apps/common/UES/reframe/resources',
-                'partitions': {
-                    'login': {
-                        'scheduler': 'local',
-                        'environs': ['PrgEnv-cray', 'PrgEnv-cray-nompi',
-                                     'PrgEnv-pgi', 'PrgEnv-pgi-nompi',
-                                     'PrgEnv-gnu', 'PrgEnv-gnu-nompi'],
-                        'descr': 'Kesch login nodes',
-                    },
-                    'pn': {
-                        'scheduler': 'nativeslurm',
-                        'access': ['--partition=pn-regression'],
-                        'environs': ['PrgEnv-cray', 'PrgEnv-cray-nompi',
-                                     'PrgEnv-pgi', 'PrgEnv-pgi-nompi',
-                                     'PrgEnv-gnu', 'PrgEnv-gnu-nompi',
-                                     'PrgEnv-cray-c2sm',
-                                     'PrgEnv-pgi-c2sm',
-                                     'PrgEnv-gnu-c2sm',
-                                     'PrgEnv-cray-c2sm-gpu',
-                                     'PrgEnv-pgi-c2sm-gpu',
-                                     'PrgEnv-gnu-c2sm-gpu'],
-                        'descr': 'Kesch post-processing nodes'
-                    },
-
-                    'cn': {
-                        'scheduler': 'nativeslurm',
-                        'access': ['--partition=cn-regression'],
-                        'environs': ['PrgEnv-cray', 'PrgEnv-cray-nompi',
-                                     'PrgEnv-pgi', 'PrgEnv-pgi-nompi',
-                                     'PrgEnv-gnu', 'PrgEnv-gnu-nompi',
-                                     'PrgEnv-cray-c2sm',
-                                     'PrgEnv-pgi-c2sm',
-                                     'PrgEnv-gnu-c2sm',
-                                     'PrgEnv-cray-c2sm-gpu',
-                                     'PrgEnv-pgi-c2sm-gpu',
-                                     'PrgEnv-gnu-c2sm-gpu'],
-                        'descr': 'Kesch compute nodes',
-                        'resources': {
-                            '_rfm_gpu': ['--gres=gpu:{num_gpus_per_node}'],
-                        }
-                    }
-                }
-            },
-
-            'leone': {
-                'descr': 'Leone',
-                'hostnames': ['leone'],
-                'modules_system': 'tmod',
-                'resourcesdir': '/apps/common/UES/reframe/resources',
-                'partitions': {
-                    'login': {
-                        'scheduler': 'local',
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'Leone login nodes',
-                        'max_jobs': 1
-                    },
-
-                    'normal': {
-                        'scheduler': 'nativeslurm',
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': ('Leone compute nodes - '
-                                  'default partition'),
-                        'max_jobs': 10
-                    },
-                }
-            },
-
-            'monch': {
-                'descr': 'Monch PASC',
-                'hostnames': ['monch'],
-                'modules_system': 'tmod',
-                'resourcesdir': '/apps/common/UES/reframe/resources',
-                'partitions': {
-                    'login': {
-                        'scheduler': 'local',
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'Monch login nodes',
-                        'max_jobs': 1
-                    },
-
-                    'compute': {
-                        'scheduler': 'slurm+mpirun',
-                        'access': ['--partition=compute'],
-                        'environs': ['PrgEnv-gnu'],
-                        'descr': 'Monch compute nodes',
-                        'max_jobs': 10
-                    }
-                }
-            },
-
-            'generic': {
-                'descr': 'Generic example system',
-                'partitions': {
-                    'login': {
-                        'scheduler': 'local',
-                        'modules': [],
-                        'access': [],
-                        'environs': ['builtin-gcc'],
-                        'descr': 'Login nodes'
-                    }
-                }
-            }
-        },
-
-        'environments': {
-
-            'ault': {
-                'PrgEnv-gnu': {
-                    'type': 'ProgEnvironment',
-                    # defaults were gcc/8.3.0, cuda/10.1, openmpi/4.0.0
-                    'modules': ['gcc', 'cuda/10.1', 'openmpi'],
-                    'cc':  'mpicc',
-                    'cxx': 'mpicxx',
-                    'ftn': 'mpif90',
-                },
-                'builtin': {
-                    'type': 'ProgEnvironment',
-                    'cc':  'cc',
-                    'cxx': '',
-                    'ftn': '',
-                },
-                'builtin-gcc': {
-                    'type': 'ProgEnvironment',
-                    'cc':  'gcc',
-                    'cxx': 'g++',
-                    'ftn': 'gfortran',
-                }
-            },
-
-            'kesch': {
-                'PrgEnv-pgi-nompi': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['PrgEnv-pgi/17.10'],
-                    'cc': 'pgcc',
-                    'cxx': 'pgc++',
-                    'ftn': 'pgf90',
-                },
-                'PrgEnv-pgi': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['PrgEnv-pgi/17.10_gdr'],
-                    'cc': 'mpicc',
-                    'cxx': 'mpicxx',
-                    'ftn': 'mpif90',
-                },
-                'PrgEnv-cray': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['PrgEnv-cray/1.0.2_gdr'],
-                },
-                'PrgEnv-cray-nompi': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['PrgEnv-cray'],
-                },
-                'PrgEnv-gnu': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['gmvapich2/17.02_cuda_8.0_gdr'],
-                    'variables': {
-                        'LD_PRELOAD': '$(pkg-config --variable=libdir mvapich2-gdr)/libmpi.so'
-                    },
-                    'cc': 'mpicc',
-                    'cxx': 'mpicxx',
-                    'ftn': 'mpif90',
-                },
-                'PrgEnv-gnu-nompi': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['PrgEnv-gnu'],
-                    'cc': 'gcc',
-                    'cxx': 'g++',
-                    'ftn': 'gfortran',
-                },
-                'PrgEnv-cray-c2sm': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['c2sm-rcm/1.00.00-kesch',
-                                'c2sm/cray-env/base'],
-                },
-                'PrgEnv-cray-c2sm-gpu': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['c2sm-rcm/1.00.00-kesch',
-                                'c2sm/cray-env/gpu'],
-                },
-                'PrgEnv-pgi-c2sm': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['c2sm-rcm/1.00.00-kesch',
-                                'c2sm/pgi-env/base'],
-                    'cc': 'mpicc',
-                    'cxx': 'mpicxx',
-                    'ftn': 'mpif90',
-                },
-                'PrgEnv-pgi-c2sm-gpu': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['c2sm-rcm/1.00.00-kesch',
-                                'c2sm/pgi-env/gpu'],
-                    'cc': 'mpicc',
-                    'cxx': 'mpicxx',
-                    'ftn': 'mpif90',
-                },
-                'PrgEnv-gnu-c2sm': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['c2sm-rcm/1.00.00-kesch',
-                                'c2sm/gnu-env/base'],
-                    'cc': 'mpicc',
-                    'cxx': 'mpicxx',
-                    'ftn': 'mpif90',
-                },
-                'PrgEnv-gnu-c2sm-gpu': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['c2sm-rcm/1.00.00-kesch',
-                                'c2sm/gnu-env/gpu'],
-                    'cc': 'mpicc',
-                    'cxx': 'mpicxx',
-                    'ftn': 'mpif90',
-                },
-            },
-
-            'leone': {
-                'PrgEnv-gnu': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['PrgEnv-gnu/leone-foss-2016b'],
-                    'cc':  'mpicc',
-                    'cxx': 'mpicxx',
-                    'ftn': 'mpif90',
-                },
-            },
-
-            'monch': {
-                'PrgEnv-gnu': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['PrgEnv-gnu'],
-                    'cc':  'mpicc',
-                    'cxx': 'mpicxx',
-                    'ftn': 'mpif90',
-                }
-            },
-
-            '*': {
-                'PrgEnv-cray': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['PrgEnv-cray'],
-                },
-
-                'PrgEnv-gnu': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['PrgEnv-gnu'],
-                },
-
-                'PrgEnv-intel': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['PrgEnv-intel'],
-                },
-
-                'PrgEnv-pgi': {
-                    'type': 'ProgEnvironment',
-                    'modules': ['PrgEnv-pgi'],
-                },
-
-                'builtin': {
-                    'type': 'ProgEnvironment',
-                    'cc':  'cc',
-                    'cxx': '',
-                    'ftn': '',
-                },
-
-                'builtin-gcc': {
-                    'type': 'ProgEnvironment',
-                    'cc':  'gcc',
-                    'cxx': 'g++',
-                    'ftn': 'gfortran',
-                }
-            }
-        },
-
-        'modes': {
-            '*': {
-                'maintenance': [
-                    '--exec-policy=async',
-                    '--strict',
-                    '--output=$APPS/UES/$USER/regression/maintenance',
-                    '--perflogdir=$APPS/UES/$USER/regression/maintenance/logs',
-                    '--stage=$SCRATCH/regression/maintenance/stage',
-                    '--reservation=maintenance',
-                    '--save-log-files',
-                    '--tag=maintenance',
-                    '--timestamp=%F_%H-%M-%S'
-                ],
-                'production': [
-                    '--exec-policy=async',
-                    '--strict',
-                    '--output=$APPS/UES/$USER/regression/production',
-                    '--perflogdir=$APPS/UES/$USER/regression/production/logs',
-                    '--stage=$SCRATCH/regression/production/stage',
-                    '--save-log-files',
-                    '--tag=production',
-                    '--timestamp=%F_%H-%M-%S'
-                ]
-            }
-        }
-    }
-
-    logging_config = {
-        'level': 'DEBUG',
-        'handlers': [
-            {
-                'type': 'file',
-                'name': 'reframe.log',
-                'level': 'DEBUG',
-                'format': '[%(asctime)s] %(levelname)s: '
-                          '%(check_info)s: %(message)s',
-                'append': False,
-            },
-
-            # Output handling
-            {
-                'type': 'stream',
-                'name': 'stdout',
-                'level': 'INFO',
-                'format': '%(message)s'
-            },
-            {
-                'type': 'file',
-                'name': 'reframe.out',
-                'level': 'INFO',
-                'format': '%(message)s',
-                'append': False,
-            }
-        ]
-    }
-
-    perf_logging_config = {
-        'level': 'DEBUG',
-        'handlers': [
-            #@ {
-            #@     'type': 'graylog',
-            #@     'host': 'your-server-here',
-            #@     'port': 12345,
-            #@     'level': 'INFO',
-            #@     'format': '%(message)s',
-            #@     'extras': {
-            #@         'facility': 'reframe',
-            #@         'data-version': '1.0',
-            #@     }
-            #@ },
-            {
-                'type': 'filelog',
-                'prefix': '%(check_system)s/%(check_partition)s',
-                'level': 'INFO',
-                'format': (
-                    '%(asctime)s|reframe %(version)s|'
-                    '%(check_info)s|jobid=%(check_jobid)s|'
-                    '%(check_perf_var)s=%(check_perf_value)s|'
-                    'ref=%(check_perf_ref)s '
-                    '(l=%(check_perf_lower_thres)s, '
-                    'u=%(check_perf_upper_thres)s)|'
-                    '%(check_perf_unit)s'
-                ),
-                'append': True
-            }
-        ]
-    }
-
-
-settings = ReframeSettings()
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index caac7f27f..a1199ba58 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -6,7 +6,7 @@ if(CREATE_FORTRAN_BINDINGS)
   set(_FSOURCES "api/sirius_api.cpp;api/sirius.f90")
 endif()
 
-if(USE_CUDA)
+if(USE_CUDA OR USE_ROCM)
   file(GLOB_RECURSE CUFILES "gpu/*.cu")
   set(_CUSOURCES "${CUFILES}")
 endif()
@@ -66,32 +66,28 @@ set(_SOURSES
   "SDDK/gvec.cpp"
   "SDDK/matrix_storage.cpp"
   "gpu/acc.cpp"
+  "gpu/acc_blas.cpp"
+  "gpu/cusolver.cpp"
+  "gpu/cublas.cpp"
   "sht/sht.cpp"
   "sirius_version.cpp"
   "mixer/mixer_functions.cpp"
+  "linalg/eigensolver.cpp"
+  "nlcglib/adaptor.cpp"
   )
 
-if(USE_ROCM)
-  add_library(sirius_rocm_interface STATIC ./gpu/rocfft_interface.cpp)
-  set_target_properties(sirius_rocm_interface PROPERTIES POSITION_INDEPENDENT_CODE ON)
-  # don't install as it is STATIC and not meant to be used by external projects
-  target_link_libraries(sirius_rocm_interface PUBLIC OpenMP::OpenMP_CXX)
 
-  # use include directories and definitions of sirius_rocm_interface (using custom targets not possible with generators)
-  set(DEFINITIONS_PROP "$<TARGET_PROPERTY:sirius_rocm_interface,COMPILE_DEFINITIONS>")
-  set(DEFINITIONS_GENERATOR "$<$<BOOL:${DEFINITIONS_PROP}>:-D$<JOIN:${DEFINITIONS_PROP}, -D>>")
-  set(INCLUDE_DIR_PROP "$<TARGET_PROPERTY:sirius_rocm_interface,INCLUDE_DIRECTORIES>")
-  set(INCLUDE_DIR_GENERATOR "$<$<BOOL:${INCLUDE_DIR_PROP}>:-I$<JOIN:${INCLUDE_DIR_PROP}, -I>>")
+# create library with .cpp, .cu and .f90 sources
 
-  # create gpu library compiled with hip
-  file(GLOB_RECURSE HIPFILES_KERNELS "gpu/*.cu")
-  file(GLOB_RECURSE HIPFILES_SDDK "SDDK/*.cu")
-  rocm_hip_add_library(sirius_rocm SHARED ${HIPFILES_SDDK} ${HIPFILES_KERNELS}
-    FLAGS ${DEFINITIONS_GENERATOR} ${INCLUDE_DIR_GENERATOR} "-Wno-macro-redefined" OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
+if(USE_ROCM)
+  set(HIP_HCC_FLAGS ${HIP_HCC_FLAGS} -fno-gpu-rdc) # required for linking with compiler other than hcc
+  # macro from FindHIP package, which compiles all .cu files with hcc and cpp files with the set c++ compiler
+  HIP_ADD_LIBRARY(sirius SHARED "${_SOURSES};${_CUSOURCES};${_FSOURCES}")
+else()
+  add_library(sirius "${_SOURSES};${_CUSOURCES};${_FSOURCES}")
 endif()
 
-# create library with .cpp, .cu and .f90 sources
-add_library(sirius "${_SOURSES};${_CUSOURCES};${_FSOURCES}")
+
 target_link_libraries(sirius PUBLIC OpenMP::OpenMP_CXX
                                     SpFFT::spfft
                                     GSL::gsl
@@ -100,15 +96,18 @@ target_link_libraries(sirius PUBLIC OpenMP::OpenMP_CXX
                                     sirius::libxc
                                     sirius::libspg
                                     sirius::hdf5
+                                    $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
                                     $<TARGET_NAME_IF_EXISTS:sirius::elpa>
                                     $<TARGET_NAME_IF_EXISTS:sirius::magma>
                                     $<TARGET_NAME_IF_EXISTS:sirius::libvdwxc>
                                     $<TARGET_NAME_IF_EXISTS:sirius::cudalibs>
                                     $<TARGET_NAME_IF_EXISTS:sirius_rocm>
                                     $<TARGET_NAME_IF_EXISTS:sirius_rocm_interface>
-                                    $<TARGET_NAME_IF_EXISTS:hipblas_port>
-                                    $<$<BOOL:${USE_ROCM}>:${ROCM_LIBRARIES}>
+                                    $<TARGET_NAME_IF_EXISTS:nlcglib::nlcglib>
+                                    $<TARGET_NAME_IF_EXISTS:kokkos::kokkos>
                                     $<$<BOOL:${USE_NVTX}>:nvToolsExt>
+                                    $<$<BOOL:${USE_ROCM}>:ROCBLAS::rocblas>
+                                    $<$<BOOL:${USE_ROCM}>:HIPLIBS::hiplibs>
                                     )
 
 target_include_directories(sirius PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/src>
@@ -120,12 +119,15 @@ target_compile_definitions(sirius PUBLIC
   $<$<BOOL:${USE_MEMORY_POOL}>:__USE_MEMORY_POOL>
   $<$<BOOL:${DEBUG_MEMORY_POOL}>:__DEBUG_MEMORY_POOL>
   $<$<BOOL:${USE_ELPA}>:__ELPA>
+  $<$<BOOL:${USE_NLCGLIB}>:__NLCGLIB>
   $<$<BOOL:${USE_CUDA}>:__GPU __CUDA>
+  $<$<BOOL:${USE_ROCM}>:__HIP_PLATFORM_HCC__>
   $<$<BOOL:${USE_NVTX}>:__CUDA_NVTX>
   $<$<BOOL:${USE_MAGMA}>:__MAGMA>
   $<$<BOOL:${USE_ROCM}>:__GPU __ROCM>
   $<$<BOOL:${USE_VDWXC}>:__USE_VDWXC>
   $<$<BOOL:${HAVE_LIBVDW_WITH_MPI}>:__HAVE_VDWXC_MPI>
+  $<$<AND:$<BOOL:${USE_MAGMA}>,$<BOOL:${USE_ROCM}>>:HAVE_HIP> # Required for magma headers; only when MAGMA and ROCm are both enabled
 )
 
 set_target_properties(sirius PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/src/SDDK/dmatrix.cpp b/src/SDDK/dmatrix.cpp
index f2a9b29be..1a9c6644a 100644
--- a/src/SDDK/dmatrix.cpp
+++ b/src/SDDK/dmatrix.cpp
@@ -30,12 +30,9 @@ template <typename T>
 dmatrix<T>::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__, int bs_col__,
                     memory_t mem_type__)
     : matrix<T>(splindex<splindex_t::block_cyclic>(num_rows__, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(),
-                                                   bs_row__)
-                    .local_size(),
+                                                   bs_row__).local_size(),
                 splindex<splindex_t::block_cyclic>(num_cols__, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(),
-                                                   bs_col__)
-                    .local_size(),
-                mem_type__)
+                                                   bs_col__).local_size(), mem_type__)
     , num_rows_(num_rows__)
     , num_cols_(num_cols__)
     , bs_row_(bs_row__)
@@ -47,28 +44,14 @@ dmatrix<T>::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid
     init();
 }
 
-template <typename T>
-dmatrix<T>::dmatrix(int num_rows__, int num_cols__, memory_t mem_type__)
-    : matrix<T>(num_rows__, num_cols__, mem_type__)
-    , num_rows_(num_rows__)
-    , num_cols_(num_cols__)
-    , bs_row_(1)
-    , bs_col_(1)
-    , spl_row_(num_rows_, 1, 0, bs_row_)
-    , spl_col_(num_cols_, 1, 0, bs_col_)
-{
-}
-
 template <typename T>
 dmatrix<T>::dmatrix(T* ptr__, int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__,
                     int bs_col__)
     : matrix<T>(ptr__,
                 splindex<splindex_t::block_cyclic>(num_rows__, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(),
-                                                   bs_row__)
-                    .local_size(),
+                                                   bs_row__).local_size(),
                 splindex<splindex_t::block_cyclic>(num_cols__, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(),
-                                                   bs_col__)
-                    .local_size())
+                                                   bs_col__).local_size())
     , num_rows_(num_rows__)
     , num_cols_(num_cols__)
     , bs_row_(bs_row__)
@@ -81,23 +64,15 @@ dmatrix<T>::dmatrix(T* ptr__, int num_rows__, int num_cols__, BLACS_grid const&
 }
 
 template <typename T>
-dmatrix<T>::dmatrix(memory_pool& mp__, int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__,
-                    int bs_col__)
-    : matrix<T>(splindex<splindex_t::block_cyclic>(num_rows__, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(),
-                                                   bs_row__)
-                    .local_size(),
-                splindex<splindex_t::block_cyclic>(num_cols__, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(),
-                                                   bs_col__)
-                    .local_size(), mp__)
+dmatrix<T>::dmatrix(int num_rows__, int num_cols__, memory_t mem_type__)
+    : matrix<T>(num_rows__, num_cols__, mem_type__)
     , num_rows_(num_rows__)
     , num_cols_(num_cols__)
-    , bs_row_(bs_row__)
-    , bs_col_(bs_col__)
-    , blacs_grid_(&blacs_grid__)
-    , spl_row_(num_rows_, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), bs_row_)
-    , spl_col_(num_cols_, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(), bs_col_)
+    , bs_row_(1)
+    , bs_col_(1)
+    , spl_row_(num_rows_, 1, 0, bs_row_)
+    , spl_col_(num_cols_, 1, 0, bs_col_)
 {
-    init();
 }
 
 template <typename T>
@@ -113,6 +88,24 @@ dmatrix<T>::dmatrix(T* ptr__, int num_rows__, int num_cols__)
     init();
 }
 
+template <typename T>
+dmatrix<T>::dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__, int bs_col__,
+                    memory_pool& mp__)
+    : matrix<T>(splindex<splindex_t::block_cyclic>(num_rows__, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(),
+                                                   bs_row__).local_size(),
+                splindex<splindex_t::block_cyclic>(num_cols__, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(),
+                                                   bs_col__).local_size(), mp__)
+    , num_rows_(num_rows__)
+    , num_cols_(num_cols__)
+    , bs_row_(bs_row__)
+    , bs_col_(bs_col__)
+    , blacs_grid_(&blacs_grid__)
+    , spl_row_(num_rows_, blacs_grid__.num_ranks_row(), blacs_grid__.rank_row(), bs_row_)
+    , spl_col_(num_cols_, blacs_grid__.num_ranks_col(), blacs_grid__.rank_col(), bs_col_)
+{
+    init();
+}
+
 template <typename T>
 void dmatrix<T>::set(int ir0__, int jc0__, int mr__, int nc__, T* ptr__, int ld__)
 {
diff --git a/src/SDDK/dmatrix.hpp b/src/SDDK/dmatrix.hpp
index b9badcf90..824bc53ae 100644
--- a/src/SDDK/dmatrix.hpp
+++ b/src/SDDK/dmatrix.hpp
@@ -83,34 +83,17 @@ class dmatrix : public matrix<T>
     {
     }
 
-    dmatrix(int num_rows__,
-            int num_cols__,
-            BLACS_grid const& blacs_grid__,
-            int bs_row__,
-            int bs_col__,
+    dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__, int bs_col__,
             memory_t mem_type__ = memory_t::host);
 
-    dmatrix(int num_rows__,
-            int num_cols__,
-            memory_t mem_type__ = memory_t::host);
+    dmatrix(int num_rows__, int num_cols__, memory_t mem_type__ = memory_t::host);
+
+    dmatrix(T* ptr__, int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__, int bs_col__);
+
+    dmatrix(int num_rows__, int num_cols__, BLACS_grid const& blacs_grid__, int bs_row__, int bs_col__,
+            memory_pool& mp__);
 
-    dmatrix(T* ptr__,
-            int num_rows__,
-            int num_cols__,
-            BLACS_grid const& blacs_grid__,
-            int bs_row__,
-            int bs_col__);
-
-    dmatrix(memory_pool& mp__,
-            int num_rows__,
-            int num_cols__,
-            BLACS_grid const& blacs_grid__,
-            int bs_row__,
-            int bs_col__);
-
-    dmatrix(T* ptr__,
-            int num_rows__,
-            int num_cols__);
+    dmatrix(T* ptr__, int num_rows__, int num_cols__);
 
     dmatrix(dmatrix<T>&& src) = default;
 
diff --git a/src/SDDK/geometry3d.hpp b/src/SDDK/geometry3d.hpp
index c5910858a..54b9926fd 100644
--- a/src/SDDK/geometry3d.hpp
+++ b/src/SDDK/geometry3d.hpp
@@ -63,8 +63,17 @@ class vector3d : public std::array<T, 3>
         }
     }
 
+    vector3d& operator=(std::initializer_list<T> v__)
+    {
+        assert(v__.size() == 3);
+        for (int x : {0, 1, 2}) {
+            (*this)[x] = v__.begin()[x];
+        }
+        return *this;
+    }
+
     /// Create from std::vector.
-    vector3d(std::vector<T> v__)
+    vector3d(const std::vector<T>& v__)
     {
         assert(v__.size() == 3);
         for (int x : {0, 1, 2}) {
@@ -72,6 +81,15 @@ class vector3d : public std::array<T, 3>
         }
     }
 
+    vector3d& operator=(const std::vector<T>& v__)
+    {
+        assert(v__.size() == 3);
+        for (int x : {0, 1, 2}) {
+            (*this)[x] = v__[x];
+        }
+        return *this;
+    }
+
     /// Create from raw pointer.
     vector3d(T const* ptr__)
     {
diff --git a/src/SDDK/gvec.hpp b/src/SDDK/gvec.hpp
index a84634fcf..2de6e55c3 100644
--- a/src/SDDK/gvec.hpp
+++ b/src/SDDK/gvec.hpp
@@ -423,32 +423,33 @@ class Gvec
 
     /// Return G vector in Cartesian coordinates.
     template <index_domain_t idx_t>
-    inline vector3d<double> gvec_cart(int ig__) const
+    inline std::enable_if_t<idx_t == index_domain_t::local, vector3d<double>>
+    gvec_cart(int ig__) const
     {
-        switch (idx_t) {
-            case index_domain_t::local: {
-                return vector3d<double>(gvec_cart_(0, ig__), gvec_cart_(1, ig__), gvec_cart_(2, ig__));
-            }
-            case index_domain_t::global: {
-                auto G = gvec_by_full_index(gvec_full_index_(ig__));
-                return lattice_vectors_ * vector3d<double>(G[0], G[1], G[2]);
-            }
-        }
+        return vector3d<double>(gvec_cart_(0, ig__), gvec_cart_(1, ig__), gvec_cart_(2, ig__));
+    }
+
+    /// Return G vector in Cartesian coordinates.
+    template <index_domain_t idx_t>
+    inline std::enable_if_t<idx_t == index_domain_t::global, vector3d<double>> gvec_cart(int ig__) const
+    {
+        auto G = gvec_by_full_index(gvec_full_index_(ig__));
+        return lattice_vectors_ * vector3d<double>(G[0], G[1], G[2]);
     }
 
     /// Return G+k vector in Cartesian coordinates.
     template <index_domain_t idx_t>
-    inline vector3d<double> gkvec_cart(int ig__) const
+    inline std::enable_if_t<idx_t == index_domain_t::local, vector3d<double>> gkvec_cart(int ig__) const
     {
-        switch (idx_t) {
-            case index_domain_t::local: {
-                return vector3d<double>(gkvec_cart_(0, ig__), gkvec_cart_(1, ig__), gkvec_cart_(2, ig__));
-            }
-            case index_domain_t::global: {
-                auto G = gvec_by_full_index(gvec_full_index_(ig__));
-                return lattice_vectors_ * (vector3d<double>(G[0], G[1], G[2]) + vk_);
-            }
-        }
+        return vector3d<double>(gkvec_cart_(0, ig__), gkvec_cart_(1, ig__), gkvec_cart_(2, ig__));
+    }
+
+    /// Return G+k vector in Cartesian coordinates.
+    template <index_domain_t idx_t>
+    inline std::enable_if_t<idx_t == index_domain_t::global, vector3d<double>> gkvec_cart(int ig__) const
+    {
+        auto G = gvec_by_full_index(gvec_full_index_(ig__));
+        return lattice_vectors_ * (vector3d<double>(G[0], G[1], G[2]) + vk_);
     }
 
     inline int shell(int ig__) const
diff --git a/src/SDDK/memory.hpp b/src/SDDK/memory.hpp
index dd5645821..1d384f38d 100644
--- a/src/SDDK/memory.hpp
+++ b/src/SDDK/memory.hpp
@@ -907,6 +907,11 @@ class mdarray
         this->allocate(memory__);
     }
 
+    /*
+     * 1D array constructors
+     *
+     */
+
     /// 1D array with memory allocation.
     mdarray(mdarray_index_descriptor const& d0,
             memory_t memory__   = memory_t::host,
@@ -919,6 +924,50 @@ class mdarray
         this->allocate(memory__);
     }
 
+    /// 1D array with memory pool allocation.
+    mdarray(mdarray_index_descriptor const& d0, memory_pool& mp__, std::string label__ = "")
+    {
+        static_assert(N == 1, "wrong number of dimensions");
+
+        this->label_ = label__;
+        this->init_dimensions({d0});
+        this->allocate(mp__);
+    }
+
+
+    /// 1D array with host pointer wrapper.
+    mdarray(T* ptr__,
+            mdarray_index_descriptor const& d0,
+            std::string label__ = "")
+    {
+        static_assert(N == 1, "wrong number of dimensions");
+
+        this->label_ = label__;
+        this->init_dimensions({d0});
+        this->raw_ptr_ = ptr__;
+    }
+
+    /// 1D array with host and device pointer wrapper.
+    mdarray(T* ptr__,
+            T* ptr_device__,
+            mdarray_index_descriptor const& d0,
+            std::string label__ = "")
+    {
+        static_assert(N == 1, "wrong number of dimensions");
+
+        this->label_ = label__;
+        this->init_dimensions({d0});
+        this->raw_ptr_ = ptr__;
+#ifdef __GPU
+        this->raw_ptr_device_ = ptr_device__;
+#endif
+    }
+
+    /*
+     * 2D array constructors
+     *
+     */
+
     /// 2D array with memory allocation.
     mdarray(mdarray_index_descriptor const& d0,
             mdarray_index_descriptor const& d1,
@@ -994,42 +1043,6 @@ class mdarray
         this->allocate(memory__);
     }
 
-    mdarray(T* ptr__,
-            mdarray_index_descriptor const& d0,
-            std::string label__ = "")
-    {
-        static_assert(N == 1, "wrong number of dimensions");
-
-        this->label_ = label__;
-        this->init_dimensions({d0});
-        this->raw_ptr_ = ptr__;
-    }
-
-    mdarray(T* ptr__,
-            T* ptr_device__,
-            mdarray_index_descriptor const& d0,
-            std::string label__ = "")
-    {
-        static_assert(N == 1, "wrong number of dimensions");
-
-        this->label_ = label__;
-        this->init_dimensions({d0});
-        this->raw_ptr_ = ptr__;
-#ifdef __GPU
-        this->raw_ptr_device_ = ptr_device__;
-#endif
-    }
-
-    /// 1D array with memory pool allocation.
-    mdarray(mdarray_index_descriptor const& d0, memory_pool& mp__, std::string label__ = "")
-    {
-        static_assert(N == 1, "wrong number of dimensions");
-
-        this->label_ = label__;
-        this->init_dimensions({d0});
-        this->allocate(mp__);
-    }
-
     /// Wrap a pointer into 2D array.
     mdarray(T* ptr__,
             mdarray_index_descriptor const& d0,
diff --git a/src/SDDK/omp.hpp b/src/SDDK/omp.hpp
index fba19668d..a7dd7f843 100644
--- a/src/SDDK/omp.hpp
+++ b/src/SDDK/omp.hpp
@@ -19,12 +19,18 @@
 
 /** \file omp.hpp
  *
- *  \brief Substitution of OMP functions.
+ *  \brief Add or substitute OMP functions.
  */
 
 #ifndef __OMP_HPP__
 #define __OMP_HPP__
 
+#if defined(_OPENMP)
+
+#include <omp.h>
+
+#else
+
 inline int omp_get_max_threads()
 {
     return 1;
@@ -50,3 +56,5 @@ inline double omp_get_wtime()
 }
 
 #endif
+
+#endif
diff --git a/src/SDDK/wave_functions.cpp b/src/SDDK/wave_functions.cpp
index 2de92865d..59ce5d109 100644
--- a/src/SDDK/wave_functions.cpp
+++ b/src/SDDK/wave_functions.cpp
@@ -307,6 +307,16 @@ void Wave_functions::allocate(spin_range spins__, memory_t mem__)
     }
 }
 
+void Wave_functions::allocate(spin_range spins__, memory_pool& mp__)
+{
+    for (int s : spins__) {
+        pw_coeffs(s).allocate(mp__);
+        if (has_mt()) {
+            mt_coeffs(s).allocate(mp__);
+        }
+    }
+}
+
 void Wave_functions::deallocate(spin_range spins__, memory_t mem__)
 {
     for (int s : spins__) {
diff --git a/src/SDDK/wave_functions.hpp b/src/SDDK/wave_functions.hpp
index ecabb06db..37f210137 100644
--- a/src/SDDK/wave_functions.hpp
+++ b/src/SDDK/wave_functions.hpp
@@ -27,14 +27,13 @@
 
 #include <cstdlib>
 #include <iostream>
-#include <omp.h>
 #include "linalg/linalg.hpp"
-#include "linalg/eigenproblem.hpp"
-#include "hdf5_tree.hpp"
+#include "SDDK/hdf5_tree.hpp"
 #include "utils/env.hpp"
-#include "gvec.hpp"
+#include "SDDK/gvec.hpp"
 #include "matrix_storage.hpp"
 #ifdef __GPU
+using double_complex = std::complex<double>;
 extern "C" void add_square_sum_gpu(double_complex const* wf__, int num_rows_loc__, int nwf__, int reduced__,
                                    int mpi_rank__, double* result__);
 
@@ -306,6 +305,22 @@ class Wave_functions
         return offset_mt_coeffs_[ialoc__];
     }
 
+    inline memory_t preferred_memory_t() const
+    {
+        return preferred_memory_t_;
+    }
+
+    inline double_complex checksum(device_t pu__, int ispn__, int i0__, int n__) const
+    {
+        return checksum_pw(pu__, ispn__, i0__, n__) + checksum_mt(pu__, ispn__, i0__, n__);
+    }
+
+    inline void zero(device_t pu__, int ispn__, int i0__, int n__) // TODO: pass memory_t
+    {
+        zero_pw(pu__, ispn__, i0__, n__);
+        zero_mt(pu__, ispn__, i0__, n__);
+    }
+
     /// Copy values from another wave-function.
     /** \param [in] pu   Type of processging unit which copies data.
      *  \param [in] n    Number of wave-functions to copy.
@@ -327,39 +342,25 @@ class Wave_functions
     /// Checksum of muffin-tin coefficients.
     double_complex checksum_mt(device_t pu__, int ispn__, int i0__, int n__) const;
 
-    inline double_complex checksum(device_t pu__, int ispn__, int i0__, int n__) const
-    {
-        return checksum_pw(pu__, ispn__, i0__, n__) + checksum_mt(pu__, ispn__, i0__, n__);
-    }
-
     void zero_pw(device_t pu__, int ispn__, int i0__, int n__);
 
     void zero_mt(device_t pu__, int ispn__, int i0__, int n__);
 
-    inline void zero(device_t pu__, int ispn__, int i0__, int n__) // TODO: pass memory_t
-    {
-        zero_pw(pu__, ispn__, i0__, n__);
-        zero_mt(pu__, ispn__, i0__, n__);
-    }
-
     void scale(memory_t mem__, int ispn__, int i0__, int n__, double beta__);
 
-    mdarray<double, 1> l2norm(device_t pu__, spin_range spins__, int n__) const;
+    sddk::mdarray<double, 1> l2norm(device_t pu__, spin_range spins__, int n__) const;
 
     /// Normalize the functions.
     void normalize(device_t pu__, spin_range spins__, int n__);
 
     void allocate(spin_range spins__, memory_t mem__);
 
+    void allocate(spin_range spins__, memory_pool& mp__);
+
     void deallocate(spin_range spins__, memory_t mem__);
 
     void copy_to(spin_range spins__, memory_t mem__, int i0__, int n__);
 
-    inline memory_t preferred_memory_t() const
-    {
-        return preferred_memory_t_;
-    }
-
     void print_checksum(device_t pu__, std::string label__, int N__, int n__) const;
 };
 
diff --git a/src/SDDK/wf_inner.cpp b/src/SDDK/wf_inner.cpp
index 3bfe3a279..03c5b1614 100644
--- a/src/SDDK/wf_inner.cpp
+++ b/src/SDDK/wf_inner.cpp
@@ -24,6 +24,7 @@
  */
 #include "wf_inner.hpp"
 #include "utils/profiler.hpp"
+#include "SDDK/omp.hpp"
 #include <chrono>
 
 namespace sddk {
diff --git a/src/SDDK/wf_ortho.cpp b/src/SDDK/wf_ortho.cpp
index 0fe2038d2..ce1a9f117 100644
--- a/src/SDDK/wf_ortho.cpp
+++ b/src/SDDK/wf_ortho.cpp
@@ -27,6 +27,7 @@
 #include "wf_inner.hpp"
 #include "wf_trans.hpp"
 #include "utils/profiler.hpp"
+#include "linalg/eigensolver.hpp"
 
 namespace sddk {
 
@@ -109,7 +110,7 @@ void orthogonalize(memory_t mem__, linalg_t la__, int ispn__, std::vector<Wave_f
         std::vector<double> eo(n__);
         dmatrix<T> evec(o__.num_rows(), o__.num_cols(), o__.blacs_grid(), o__.bs_row(), o__.bs_col());
 
-        auto solver = Eigensolver_factory(ev_solver_t::scalapack);
+        auto solver = Eigensolver_factory("scalapack", nullptr);
         solver->solve(n__, o__, eo.data(), evec);
 
         if (o__.comm().rank() == 0) {
diff --git a/src/SDDK/wf_trans.cpp b/src/SDDK/wf_trans.cpp
index 7fa7527fa..c59f2c889 100644
--- a/src/SDDK/wf_trans.cpp
+++ b/src/SDDK/wf_trans.cpp
@@ -25,6 +25,7 @@
 
 #include "wf_trans.hpp"
 #include "utils/profiler.hpp"
+#include "SDDK/omp.hpp"
 
 namespace sddk {
 
diff --git a/src/api/generated.f90 b/src/api/generated.f90
index c135259c1..ce591bd4c 100644
--- a/src/api/generated.f90
+++ b/src/api/generated.f90
@@ -592,6 +592,25 @@ subroutine sirius_add_xc_functional_aux(handler,name)&
 call sirius_add_xc_functional_aux(handler,name)
 end subroutine sirius_add_xc_functional
 
+!> @brief Add one of the XC functionals.
+!> @param [in] gs_handler Handler of the ground state
+!> @param [in] name LibXC label of the functional.
+subroutine sirius_insert_xc_functional(gs_handler,name)
+implicit none
+type(C_PTR), intent(in) :: gs_handler
+character(C_CHAR), dimension(*), intent(in) :: name
+interface
+subroutine sirius_insert_xc_functional_aux(gs_handler,name)&
+&bind(C, name="sirius_insert_xc_functional")
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), intent(in) :: gs_handler
+character(C_CHAR), dimension(*), intent(in) :: name
+end subroutine
+end interface
+
+call sirius_insert_xc_functional_aux(gs_handler,name)
+end subroutine sirius_insert_xc_functional
+
 !> @brief Set dimensions of the MPI grid.
 !> @param [in] handler Simulation context handler
 !> @param [in] ndims Number of dimensions.
@@ -1513,55 +1532,55 @@ subroutine sirius_set_band_occupancies_aux(ks_handler,ik,ispn,band_occupancies)&
 call sirius_set_band_occupancies_aux(ks_handler,ik,ispn,band_occupancies)
 end subroutine sirius_set_band_occupancies
 
-!> @brief Get band energies.
+!> @brief Get band occupancies.
 !> @param [in] ks_handler K-point set handler.
 !> @param [in] ik Global index of k-point.
 !> @param [in] ispn Spin component.
-!> @param [out] band_energies Array of band energies.
-subroutine sirius_get_band_energies(ks_handler,ik,ispn,band_energies)
+!> @param [out] band_occupancies Array of band occupancies.
+subroutine sirius_get_band_occupancies(ks_handler,ik,ispn,band_occupancies)
 implicit none
 type(C_PTR), intent(in) :: ks_handler
 integer(C_INT), intent(in) :: ik
 integer(C_INT), intent(in) :: ispn
-real(C_DOUBLE), intent(out) :: band_energies
+real(C_DOUBLE), intent(out) :: band_occupancies
 interface
-subroutine sirius_get_band_energies_aux(ks_handler,ik,ispn,band_energies)&
-&bind(C, name="sirius_get_band_energies")
+subroutine sirius_get_band_occupancies_aux(ks_handler,ik,ispn,band_occupancies)&
+&bind(C, name="sirius_get_band_occupancies")
 use, intrinsic :: ISO_C_BINDING
 type(C_PTR), intent(in) :: ks_handler
 integer(C_INT), intent(in) :: ik
 integer(C_INT), intent(in) :: ispn
-real(C_DOUBLE), intent(out) :: band_energies
+real(C_DOUBLE), intent(out) :: band_occupancies
 end subroutine
 end interface
 
-call sirius_get_band_energies_aux(ks_handler,ik,ispn,band_energies)
-end subroutine sirius_get_band_energies
+call sirius_get_band_occupancies_aux(ks_handler,ik,ispn,band_occupancies)
+end subroutine sirius_get_band_occupancies
 
-!> @brief Get band occupancies.
+!> @brief Get band energies.
 !> @param [in] ks_handler K-point set handler.
 !> @param [in] ik Global index of k-point.
 !> @param [in] ispn Spin component.
-!> @param [out] band_occupancies Array of band occupancies.
-subroutine sirius_get_band_occupancies(ks_handler,ik,ispn,band_occupancies)
+!> @param [out] band_energies Array of band energies.
+subroutine sirius_get_band_energies(ks_handler,ik,ispn,band_energies)
 implicit none
 type(C_PTR), intent(in) :: ks_handler
 integer(C_INT), intent(in) :: ik
 integer(C_INT), intent(in) :: ispn
-real(C_DOUBLE), intent(out) :: band_occupancies
+real(C_DOUBLE), intent(out) :: band_energies
 interface
-subroutine sirius_get_band_occupancies_aux(ks_handler,ik,ispn,band_occupancies)&
-&bind(C, name="sirius_get_band_occupancies")
+subroutine sirius_get_band_energies_aux(ks_handler,ik,ispn,band_energies)&
+&bind(C, name="sirius_get_band_energies")
 use, intrinsic :: ISO_C_BINDING
 type(C_PTR), intent(in) :: ks_handler
 integer(C_INT), intent(in) :: ik
 integer(C_INT), intent(in) :: ispn
-real(C_DOUBLE), intent(out) :: band_occupancies
+real(C_DOUBLE), intent(out) :: band_energies
 end subroutine
 end interface
 
-call sirius_get_band_occupancies_aux(ks_handler,ik,ispn,band_occupancies)
-end subroutine sirius_get_band_occupancies
+call sirius_get_band_energies_aux(ks_handler,ik,ispn,band_energies)
+end subroutine sirius_get_band_energies
 
 !> @brief Get D-operator matrix
 !> @param [in] handler Simulation context handler.
@@ -3574,3 +3593,22 @@ subroutine sirius_set_callback_function_aux(handler,label,fptr,error_code)&
 call sirius_set_callback_function_aux(handler,label,fptr,error_code_ptr)
 end subroutine sirius_set_callback_function
 
+!> @brief Robust wave function optimizer
+!> @param [in] handler Ground state handler
+!> @param [in] ks_handler K-point set handler
+subroutine sirius_nlcg(handler,ks_handler)
+implicit none
+type(C_PTR), intent(in) :: handler
+type(C_PTR), intent(in) :: ks_handler
+interface
+subroutine sirius_nlcg_aux(handler,ks_handler)&
+&bind(C, name="sirius_nlcg")
+use, intrinsic :: ISO_C_BINDING
+type(C_PTR), intent(in) :: handler
+type(C_PTR), intent(in) :: ks_handler
+end subroutine
+end interface
+
+call sirius_nlcg_aux(handler,ks_handler)
+end subroutine sirius_nlcg
+
diff --git a/src/api/sirius_api.cpp b/src/api/sirius_api.cpp
index 9d58a5988..a7785c225 100644
--- a/src/api/sirius_api.cpp
+++ b/src/api/sirius_api.cpp
@@ -28,6 +28,10 @@
 #include "utils/any_ptr.hpp"
 #include "utils/profiler.hpp"
 #include "error_codes.hpp"
+#ifdef __NLCGLIB
+#include "nlcglib/adaptor.hpp"
+#include "nlcglib/nlcglib.hpp"
+#endif
 
 static inline void sirius_exit(int error_code__, std::string msg__ = "")
 {
@@ -518,6 +522,19 @@ void sirius_add_xc_functional(void* const* handler__,
     sim_ctx.add_xc_functional(std::string(name__));
 }
 
+/* @fortran begin function void sirius_insert_xc_functional         Add one of the XC functionals.
+   @fortran argument in required void* gs_handler                Handler of the ground state
+   @fortran argument in required string name                     LibXC label of the functional.
+   @fortran end */
+void
+sirius_insert_xc_functional(void* const* gs_handler__,
+                            char const* name__)
+{
+    auto& gs = get_gs(gs_handler__);
+    auto& potential = gs.potential();
+    potential.insert_xc_functionals({name__});
+}
+
 /* @fortran begin function void sirius_set_mpi_grid_dims      Set dimensions of the MPI grid.
    @fortran argument in required void*  handler               Simulation context handler
    @fortran argument in required int    ndims                 Number of dimensions.
@@ -748,6 +765,105 @@ void sirius_find_ground_state(void*  const* gs_handler__,
     auto result = gs.find(rho_tol, etol, ctx.iterative_solver_tolerance(), niter, save);
 }
 
+/* @fortran begin function void sirius_find_ground_state_robust     Find the ground state using the robust
+   wave-function optimization.
+   @fortran argument in required void*  gs_handler                  Handler of the ground state.
+   @fortran argument in required void*  ks_handler                  Handler of the k-point set.
+   @fortran argument in optional double scf_density_tol             Tolerance on RMS in density.
+   @fortran argument in optional double scf_energy_tol              Tolerance in total energy difference.
+   @fortran argument in optional int    scf_ninit__                 Number of SCF iterations.
+   @fortran argument in optional double temp__                      Temperature.
+   @fortran argument in optional double tol__                       Tolerance.
+   @fortran argument in optional int    cg_restart__                CG restart.
+   @fortran argument in optional char[] smearing__                  Smearing "FD" for Fermi-Dirac or "GS" for Gaussian-Spline.
+   @fortran argument in optional double kappa__                     Scalar preconditioner for pseudo Hamiltonian
+   ground state.
+   @fortran end */
+void sirius_find_ground_state_robust(void*  const* gs_handler__,
+                                     void*  const* ks_handler__,
+                                     double const* scf_density_tol__,
+                                     double const* scf_energy_tol__,
+                                     int    const* scf_ninit__,
+                                     double const* temp__,
+                                     double const* tol__)
+{
+    /* SCF warm-up followed by nlcglib direct minimization; null optional pointers fall
+     * back to the input-file values. Throws std::runtime_error on invalid settings. */
+#ifdef __NLCGLIB
+    auto& gs = get_gs(gs_handler__);
+    auto& ctx = gs.ctx();
+    auto& inp = ctx.parameters_input();
+    gs.initial_state();
+
+    double rho_tol = inp.density_tol_;
+    if (scf_density_tol__) {
+        rho_tol = *scf_density_tol__;
+    }
+
+    double etol = inp.energy_tol_;
+    if (scf_energy_tol__) {
+        etol = *scf_energy_tol__;
+    }
+
+    int niter = inp.num_dft_iter_;
+    if (scf_ninit__) {
+        niter = *scf_ninit__;
+    }
+
+    // do a couple of SCF iterations to obtain a good initial guess
+    bool save_state = false;
+    auto result = gs.find(rho_tol, etol, ctx.iterative_solver_tolerance(), niter, save_state);
+
+    // now call the direct (nlcg) solver
+    auto& potential = gs.potential();
+    auto& density = gs.density();
+    auto& kset = get_ks(ks_handler__);
+
+    auto nlcg_params  = ctx.nlcg_input();
+    // temperature and tolerance passed by the caller take precedence over the input file
+    double temp       = temp__ ? *temp__ : nlcg_params.T_;
+    double tol        = tol__ ? *tol__ : nlcg_params.tol_;
+    double kappa      = nlcg_params.kappa_;
+    double tau        = nlcg_params.tau_;
+    int maxiter       = nlcg_params.maxiter_;
+    int restart       = nlcg_params.restart_;
+    std::string smear = nlcg_params.smearing_;
+    std::string pu = nlcg_params.processing_unit_;
+
+    nlcglib::smearing_type smearing;
+    if (smear.compare("FD") == 0) {
+        smearing = nlcglib::smearing_type::FERMI_DIRAC;
+    } else if (smear.compare("GS") == 0) {
+        smearing = nlcglib::smearing_type::GAUSSIAN_SPLINE;
+    } else {
+        throw std::runtime_error("invalid smearing type given");
+    }
+
+    sirius::Energy energy(kset, density, potential);
+    if (is_device_memory(ctx.preferred_memory_t())) {
+        if (pu.empty() || pu.compare("gpu") == 0) {
+            nlcglib::nlcg_mvp2_device(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else if (pu.compare("cpu") == 0) {
+            nlcglib::nlcg_mvp2_device_cpu(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else {
+            throw std::runtime_error("invalid processing unit for nlcg given: " + pu);
+        }
+    } else {
+        // data lives on the host: run on the CPU unless the GPU is requested explicitly
+        if (pu.empty() || pu.compare("cpu") == 0) {
+            nlcglib::nlcg_mvp2_cpu(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else if (pu.compare("gpu") == 0) {
+            nlcglib::nlcg_mvp2_cpu_device(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else {
+            throw std::runtime_error("invalid processing unit for nlcg given: " + pu);
+        }
+    }
+#else
+    throw std::runtime_error("SIRIUS was not compiled with NLCG option.");
+#endif
+}
+
+
 /* @fortran begin function void sirius_update_ground_state   Update a ground state object after change of atomic coordinates or lattice vectors.
    @fortran argument in  required void*  gs_handler          Ground-state handler.
    @fortran end */
@@ -1344,39 +1460,38 @@ void sirius_set_band_occupancies(void*  const* ks_handler__,
     }
 }
 
-/* @fortran begin function void sirius_get_band_energies         Get band energies.
-   @fortran argument in  required void*   ks_handler             K-point set handler.
-   @fortran argument in  required int     ik                     Global index of k-point.
-   @fortran argument in  required int     ispn                   Spin component.
-   @fortran argument out required double  band_energies          Array of band energies.
+/* @fortran begin function void sirius_get_band_occupancies   Get band occupancies.
+   @fortran argument in  required void*   ks_handler          K-point set handler.
+   @fortran argument in  required int     ik                  Global index of k-point.
+   @fortran argument in  required int     ispn                Spin component.
+   @fortran argument out  required double  band_occupancies    Array of band occupancies.
    @fortran end */
-void sirius_get_band_energies(void*  const* ks_handler__,
-                              int    const* ik__,
-                              int    const* ispn__,
-                              double*       band_energies__)
+void
+sirius_get_band_occupancies(void* const* ks_handler__, int const* ik__, int const* ispn__,
+                            double* band_occupancies__)
 {
     auto& ks = get_ks(ks_handler__);
-    int ik = *ik__ - 1;
+    int ik   = *ik__ - 1;
     for (int i = 0; i < ks.ctx().num_bands(); i++) {
-        band_energies__[i] = ks[ik]->band_energy(i, *ispn__);
+        band_occupancies__[i] = ks[ik]->band_occupancy(i, *ispn__);
     }
 }
 
-/* @fortran begin function void sirius_get_band_occupancies      Get band occupancies.
+/* @fortran begin function void sirius_get_band_energies         Get band energies.
    @fortran argument in  required void*   ks_handler             K-point set handler.
    @fortran argument in  required int     ik                     Global index of k-point.
    @fortran argument in  required int     ispn                   Spin component.
-   @fortran argument out required double  band_occupancies       Array of band occupancies.
+   @fortran argument out required double  band_energies          Array of band energies.
    @fortran end */
-void sirius_get_band_occupancies(void*  const* ks_handler__,
-                                 int    const* ik__,
-                                 int    const* ispn__,
-                                 double*       band_occupancies__)
+void sirius_get_band_energies(void*  const* ks_handler__,
+                              int    const* ik__,
+                              int    const* ispn__,
+                              double*       band_energies__)
 {
     auto& ks = get_ks(ks_handler__);
     int ik = *ik__ - 1;
     for (int i = 0; i < ks.ctx().num_bands(); i++) {
-        band_occupancies__[i] = ks[ik]->band_occupancy(i, *ispn__);
+        band_energies__[i] = ks[ik]->band_energy(i, *ispn__);
     }
 }
 
@@ -3596,4 +3711,64 @@ void sirius_set_callback_function(void* const* handler__, char const* label__, v
     }, error_code__);
 }
 
+/* @fortran begin function void sirius_nlcg                       Robust wave function optimizer
+   @fortran argument in  required void*    handler                Ground state handler
+   @fortran argument in  required void*    ks_handler             K-point set handler
+   @fortran end */
+
+void sirius_nlcg(void* const* handler__,
+                 void* const* ks_handler__)
+{
+    /* Run the nlcglib direct minimizer on an existing ground state, configured by the
+     * nlcg input section. Throws std::runtime_error on invalid settings. */
+#ifdef __NLCGLIB
+    auto& gs = get_gs(handler__);
+    auto& potential = gs.potential();
+    auto& density = gs.density();
+    auto& kset = get_ks(ks_handler__);
+    auto& ctx = kset.ctx();
+
+    auto nlcg_params  = ctx.nlcg_input();
+    double temp       = nlcg_params.T_;
+    double tol        = nlcg_params.tol_;
+    double kappa      = nlcg_params.kappa_;
+    double tau        = nlcg_params.tau_;
+    int maxiter       = nlcg_params.maxiter_;
+    int restart       = nlcg_params.restart_;
+    std::string smear = nlcg_params.smearing_;
+    std::string pu = nlcg_params.processing_unit_;
+
+    nlcglib::smearing_type smearing;
+    if (smear.compare("FD") == 0) {
+        smearing = nlcglib::smearing_type::FERMI_DIRAC;
+    } else if (smear.compare("GS") == 0) {
+        smearing = nlcglib::smearing_type::GAUSSIAN_SPLINE;
+    } else {
+        throw std::runtime_error("invalid smearing type given");
+    }
+
+    sirius::Energy energy(kset, density, potential);
+    if (is_device_memory(ctx.preferred_memory_t())) {
+        if (pu.empty() || pu.compare("gpu") == 0) {
+            nlcglib::nlcg_mvp2_device(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else if (pu.compare("cpu") == 0) {
+            nlcglib::nlcg_mvp2_device_cpu(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else {
+            throw std::runtime_error("invalid processing unit for nlcg given: " + pu);
+        }
+    } else {
+        // data lives on the host: run on the CPU unless the GPU is requested explicitly
+        if (pu.empty() || pu.compare("cpu") == 0) {
+            nlcglib::nlcg_mvp2_cpu(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else if (pu.compare("gpu") == 0) {
+            nlcglib::nlcg_mvp2_cpu_device(energy, smearing, temp, tol, kappa, tau, maxiter, restart);
+        } else {
+            throw std::runtime_error("invalid processing unit for nlcg given: " + pu);
+        }
+    }
+#else
+    throw std::runtime_error("SIRIUS was not compiled with NLCG option.");
+#endif
+}
+
 } // extern "C"
diff --git a/src/band/band.cpp b/src/band/band.cpp
index bda4aebbc..99a830b40 100644
--- a/src/band/band.cpp
+++ b/src/band/band.cpp
@@ -263,9 +263,9 @@ void Band::initialize_subspace(Hamiltonian_k& Hk__, int num_ao__) const
 
     auto& gen_solver = ctx_.gen_evp_solver();
 
-    dmatrix<T> hmlt(mp, num_phi_tot, num_phi_tot, ctx_.blacs_grid(), bs, bs);
-    dmatrix<T> ovlp(mp, num_phi_tot, num_phi_tot, ctx_.blacs_grid(), bs, bs);
-    dmatrix<T> evec(mp, num_phi_tot, num_phi_tot, ctx_.blacs_grid(), bs, bs);
+    sddk::dmatrix<T> hmlt(num_phi_tot, num_phi_tot, ctx_.blacs_grid(), bs, bs, mp);
+    sddk::dmatrix<T> ovlp(num_phi_tot, num_phi_tot, ctx_.blacs_grid(), bs, bs, mp);
+    sddk::dmatrix<T> evec(num_phi_tot, num_phi_tot, ctx_.blacs_grid(), bs, bs, mp);
 
     std::vector<double> eval(num_bands);
 
diff --git a/src/band/diag_full_potential.cpp b/src/band/diag_full_potential.cpp
index 27db64293..da7a35c46 100644
--- a/src/band/diag_full_potential.cpp
+++ b/src/band/diag_full_potential.cpp
@@ -40,23 +40,29 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k& Hk__) const
 
     auto& kp = Hk__.kp();
 
-    auto mem_type = (ctx_.gen_evp_solver_type() == ev_solver_t::magma) ? memory_t::host_pinned : memory_t::host;
-    int  ngklo    = kp.gklo_basis_size();
-    int  bs       = ctx_.cyclic_block_size();
+    auto& solver = ctx_.gen_evp_solver();
 
-    dmatrix<double_complex> h(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, mem_type);
-    dmatrix<double_complex> o(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, mem_type);
+    /* total eigen-value problem size */
+    int ngklo = kp.gklo_basis_size();
 
-    if (ctx_.gen_evp_solver_type() == ev_solver_t::cusolver || ctx_.processing_unit() == device_t::GPU) {
-        h.allocate(ctx_.mem_pool(memory_t::device));
-        o.allocate(ctx_.mem_pool(memory_t::device));
-    }
+    /* block size of scalapack 2d block-cyclic distribution */
+    int bs = ctx_.cyclic_block_size();
 
-    ctx_.print_memory_usage(__FILE__, __LINE__);
+    sddk::dmatrix<double_complex> h(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, ctx_.mem_pool(solver.host_memory_t()));
+    sddk::dmatrix<double_complex> o(ngklo, ngklo, ctx_.blacs_grid(), bs, bs, ctx_.mem_pool(solver.host_memory_t()));
 
     /* setup Hamiltonian and overlap */
     Hk__.set_fv_h_o(h, o);
 
+    //if (ctx_.gen_evp_solver_type() == ev_solver_t::cusolver || ctx_.processing_unit() == device_t::GPU) {
+    //    //h.allocate(ctx_.mem_pool(memory_t::device));
+    //    //o.allocate(ctx_.mem_pool(memory_t::device));
+    //    h.deallocate(memory_t::device);
+    //    o.deallocate(memory_t::device);
+    //}
+
+    ctx_.print_memory_usage(__FILE__, __LINE__);
+
     if (ctx_.control().verification_ >= 1) {
         double max_diff = check_hermitian(h, ngklo);
         if (max_diff > 1e-12) {
@@ -89,13 +95,9 @@ Band::diag_full_potential_first_variation_exact(Hamiltonian_k& Hk__) const
 
     std::vector<double> eval(ctx_.num_fv_states());
 
-    PROFILE_START("sirius::Band::diag_fv_exact|genevp");
-    auto& solver = ctx_.gen_evp_solver();
-
     if (solver.solve(kp.gklo_basis_size(), ctx_.num_fv_states(), h, o, eval.data(), kp.fv_eigen_vectors())) {
         TERMINATE("error in generalized eigen-value problem");
     }
-    PROFILE_STOP("sirius::Band::diag_fv_exact|genevp");
     kp.set_fv_eigen_values(&eval[0]);
 
     for (int i = 0; i < ctx_.num_fv_states(); i++) {
@@ -326,6 +328,12 @@ void Band::get_singular_components(Hamiltonian_k& Hk__, mdarray<double, 2>& o_di
 
     auto& std_solver = ctx_.std_evp_solver();
 
+    /* tolerance for the norm of L2-norms of the residuals, used for
+     * relative convergence criterion. We can only compute this after
+     * we have the first residual norms available */
+    double relative_frobenius_tolerance{0};
+    double current_frobenius_norm{0};
+
     /* start iterative diagonalization */
     for (int k = 0; k < itso.num_steps_; k++) {
         /* apply Hamiltonian and overlap operators to the new basis functions */
@@ -377,7 +385,7 @@ void Band::get_singular_components(Hamiltonian_k& Hk__, mdarray<double, 2>& o_di
         /* solve standard eigen-value problem with the size N */
         if (std_solver.solve(N, ncomp, ovlp, &eval[0], evec)) {
             std::stringstream s;
-            s << "[sirius::Band::get_singular_components] error in diagonalziation";
+            s << "[sirius::Band::get_singular_components] error in diagonalization";
             TERMINATE(s);
         }
 
@@ -394,28 +402,45 @@ void Band::get_singular_components(Hamiltonian_k& Hk__, mdarray<double, 2>& o_di
             kp.message(4, __function_name__, "eval[%i]=%20.16f, diff=%20.16f\n", i, eval[i], std::abs(eval[i] - eval_old[i]));
         }
 
+        bool last_iteration = k == (itso.num_steps_ - 1);
+
         /* don't compute residuals on last iteration */
-        if (k != itso.num_steps_ - 1) {
+        if (!last_iteration) {
             /* get new preconditionined residuals, and also opsi and psi as a by-product */
-            n = sirius::residuals(ctx_.preferred_memory_t(), ctx_.blas_linalg_t(), 0,
+            auto result = sirius::residuals(ctx_.preferred_memory_t(), ctx_.blas_linalg_t(), 0,
                                   N, ncomp, eval, evec, ophi, phi, opsi, psi, res, o_diag__, diag1,
                                   itso.converge_by_energy_, itso.residual_tolerance_,
                                   [&](int i, int ispn){return std::abs(eval[i] - eval_old[i]) < itso.energy_tolerance_;});
+            n = result.first;
+            current_frobenius_norm = result.second;
+
+            /* set the relative tolerance convergence criterion */
+            if (k == 0) {
+                relative_frobenius_tolerance = current_frobenius_norm * itso.relative_tolerance_;
+            }
+
             kp.message(3, __function_name__, "number of added residuals: %i\n", n);
             if (ctx_.control().print_checksum_) {
                 res.print_checksum(ctx_.processing_unit(), "res", 0, n);
             }
         }
+        /* verify convergence criteria */
+        bool converged_by_relative_tol = k > 0 && current_frobenius_norm < relative_frobenius_tolerance ;
+        bool converged_by_absolute_tol = n <= itso.min_num_res_;
+        bool converged = converged_by_absolute_tol || converged_by_relative_tol;
+
+        /* check if running out of space */
+        bool should_restart = N + n > num_phi;
 
         /* check if we run out of variational space or eigen-vectors are converged or it's a last iteration */
-        if (N + n > num_phi || n <= itso.min_num_res_ || k == (itso.num_steps_ - 1)) {
+        if (should_restart || converged || last_iteration) {
             PROFILE("sirius::Band::get_singular_components|update_phi");
             /* recompute wave-functions */
             /* \Psi_{i} = \sum_{mu} \phi_{mu} * Z_{mu, i} */
             transform(ctx_.preferred_memory_t(), ctx_.blas_linalg_t(), 0, phi, 0, N, evec, 0, 0, psi, 0, ncomp);
 
             /* exit the loop if the eigen-vectors are converged or this is a last iteration */
-            if (n <= itso.min_num_res_ || k == (itso.num_steps_ - 1)) {
+            if (converged || last_iteration) {
                 break;
             } else { /* otherwise, set Psi as a new trial basis */
                 kp.message(3, __function_name__, "%s", "subspace size limit reached\n");
@@ -593,8 +618,17 @@ void Band::diag_full_potential_first_variation_davidson(Hamiltonian_k& Hk__) con
 
     auto& std_solver = ctx_.std_evp_solver();
 
+    /* tolerance for the norm of L2-norms of the residuals, used for
+     * relative convergence criterion. We can only compute this after
+     * we have the first residual norms available */
+    double relative_frobenius_tolerance{0};
+    double current_frobenius_norm{0};
+
     /* start iterative diagonalization */
     for (int k = 0; k < itso.num_steps_; k++) {
+
+        bool last_iteration = k == (itso.num_steps_ - 1);
+
         /* apply Hamiltonian and overlap operators to the new basis functions */
         if (k == 0) {
             Hk__.apply_fv_h_o(false, true, 0, nlo, phi, &hphi, &ophi);
@@ -627,23 +661,42 @@ void Band::diag_full_potential_first_variation_davidson(Hamiltonian_k& Hk__) con
         }
 
         /* don't compute residuals on last iteration */
-        if (k != itso.num_steps_ - 1) {
+        if (!last_iteration) {
             /* get new preconditionined residuals, and also hpsi and opsi as a by-product */
-            n = sirius::residuals(ctx_.preferred_memory_t(), ctx_.blas_linalg_t(), 0,
+            auto result = sirius::residuals(ctx_.preferred_memory_t(), ctx_.blas_linalg_t(), 0,
                                   N, num_bands, eval, evec, hphi, ophi, hpsi, opsi, res, h_o_diag.first, h_o_diag.second,
                                   itso.converge_by_energy_, itso.residual_tolerance_,
                                   [&](int i, int ispn){return std::abs(eval[i] - eval_old[i]) < itso.energy_tolerance_;});
+            n = result.first;
+            current_frobenius_norm = result.second;
+
+            /* set the relative tolerance convergence criterion */
+            if (k == 0) {
+                relative_frobenius_tolerance = current_frobenius_norm * itso.relative_tolerance_;
+            }
+        }
+
+        /* verify convergence criteria */
+        bool converged_by_relative_tol = k > 0 && current_frobenius_norm < relative_frobenius_tolerance ;
+        bool converged_by_absolute_tol = n <= itso.min_num_res_;
+        bool converged = converged_by_absolute_tol || converged_by_relative_tol;
+
+        /* check if running out of space */
+        bool should_restart = N + n > num_phi;
+
+        if (converged) {
+            kp.message(3, __function_name__, "converged by %s tolerance\n", converged_by_relative_tol ? "relative" : "absolute");
         }
 
         /* check if we run out of variational space or eigen-vectors are converged or it's a last iteration */
-        if (N + n > num_phi || n <= itso.min_num_res_ || k == (itso.num_steps_ - 1)) {
+        if (should_restart || converged || last_iteration) {
             PROFILE("sirius::Band::diag_fv_davidson|update_phi");
             /* recompute wave-functions */
             /* \Psi_{i} = \sum_{mu} \phi_{mu} * Z_{mu, i} */
             transform(ctx_.preferred_memory_t(), ctx_.blas_linalg_t(), 0, phi, 0, N, evec, 0, 0, psi, 0, num_bands);
 
             /* exit the loop if the eigen-vectors are converged or this is a last iteration */
-            if (n <= itso.min_num_res_ || k == (itso.num_steps_ - 1)) {
+            if (converged || last_iteration) {
                 break;
             } else { /* otherwise, set Psi as a new trial basis */
                 kp.message(3, __function_name__, "%s", "subspace size limit reached\n");
@@ -685,13 +738,9 @@ void Band::diag_full_potential_second_variation(Hamiltonian_k& Hk__) const
     /* product of the second-variational Hamiltonian and a first-variational wave-function */
     std::vector<Wave_functions> hpsi;
     for (int i = 0; i < ctx_.num_mag_comp(); i++) {
-        hpsi.push_back(std::move(Wave_functions(kp.gkvec_partition(),
-                                                unit_cell_.num_atoms(),
-                                                [this](int ia) {
-                                                    return unit_cell_.atom(ia).mt_basis_size();
-                                                },
-                                                ctx_.num_fv_states(),
-                                                ctx_.preferred_memory_t())));
+        hpsi.push_back(Wave_functions(kp.gkvec_partition(), unit_cell_.num_atoms(),
+                                      [this](int ia) { return unit_cell_.atom(ia).mt_basis_size(); },
+                                      ctx_.num_fv_states(), ctx_.preferred_memory_t()));
     }
 
     /* compute product of magnetic field and wave-function */
@@ -723,10 +772,10 @@ void Band::diag_full_potential_second_variation(Hamiltonian_k& Hk__) const
     int bs  = ctx_.cyclic_block_size();
 
     if (ctx_.processing_unit() == device_t::GPU) {
-        kp.fv_states().allocate(spin_range(0), memory_t::device);
+        kp.fv_states().allocate(spin_range(0), ctx_.mem_pool(memory_t::device));
         kp.fv_states().copy_to(spin_range(0), memory_t::device, 0, nfv);
         for (int i = 0; i < ctx_.num_mag_comp(); i++) {
-            hpsi[i].allocate(spin_range(0), memory_t::device);
+            hpsi[i].allocate(spin_range(0), ctx_.mem_pool(memory_t::device));
             hpsi[i].copy_to(spin_range(0), memory_t::device, 0, nfv);
         }
     }
@@ -754,7 +803,7 @@ void Band::diag_full_potential_second_variation(Hamiltonian_k& Hk__) const
     if (ctx_.num_mag_dims() != 3) {
         dmatrix<double_complex> h(nfv, nfv, ctx_.blacs_grid(), bs, bs);
         if (ctx_.blacs_grid().comm().size() == 1 && ctx_.processing_unit() == device_t::GPU) {
-            h.allocate(memory_t::device);
+            h.allocate(ctx_.mem_pool(memory_t::device));
         }
         /* perform one or two consecutive diagonalizations */
         for (int ispn = 0; ispn < ctx_.num_spins(); ispn++) {
@@ -776,7 +825,7 @@ void Band::diag_full_potential_second_variation(Hamiltonian_k& Hk__) const
         int nb = ctx_.num_bands();
         dmatrix<double_complex> h(nb, nb, ctx_.blacs_grid(), bs, bs);
         if (ctx_.blacs_grid().comm().size() == 1 && ctx_.processing_unit() == device_t::GPU) {
-            h.allocate(memory_t::device);
+            h.allocate(ctx_.mem_pool(memory_t::device));
         }
         /* compute <wf_i | h * wf_j> for up-up block */
         inner(mem, la, 0, kp.fv_states(), 0, nfv, hpsi[0], 0, nfv, h, 0, 0);
diff --git a/src/band/diag_pseudo_potential.cpp b/src/band/diag_pseudo_potential.cpp
index 8ff0ae503..39b3b4c8e 100644
--- a/src/band/diag_pseudo_potential.cpp
+++ b/src/band/diag_pseudo_potential.cpp
@@ -228,7 +228,7 @@ Band::diag_pseudo_potential_exact(int ispn__, Hamiltonian_k& Hk__) const
 
         std::vector<double> eo(kp.num_gkvec());
 
-        auto solver = Eigensolver_factory(ev_solver_t::scalapack);
+        auto solver = Eigensolver_factory("scalapack", nullptr);
         solver->solve(kp.num_gkvec(), ovlp1, eo.data(), evec);
 
         for (int i = 0; i < kp.num_gkvec(); i++) {
@@ -314,11 +314,11 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
 
     const int bs = ctx_.cyclic_block_size();
 
-    dmatrix<T> hmlt(mp, num_phi, num_phi, ctx_.blacs_grid(), bs, bs);
-    dmatrix<T> ovlp(mp, num_phi, num_phi, ctx_.blacs_grid(), bs, bs);
-    dmatrix<T> evec(mp, num_phi, num_phi, ctx_.blacs_grid(), bs, bs);
-    dmatrix<T> hmlt_old(mp, num_phi, num_phi, ctx_.blacs_grid(), bs, bs);
-    dmatrix<T> ovlp_old(mp, num_phi, num_phi, ctx_.blacs_grid(), bs, bs);
+    dmatrix<T> hmlt(num_phi, num_phi, ctx_.blacs_grid(), bs, bs, mp);
+    dmatrix<T> ovlp(num_phi, num_phi, ctx_.blacs_grid(), bs, bs, mp);
+    dmatrix<T> evec(num_phi, num_phi, ctx_.blacs_grid(), bs, bs, mp);
+    dmatrix<T> hmlt_old(num_phi, num_phi, ctx_.blacs_grid(), bs, bs, mp);
+    dmatrix<T> ovlp_old(num_phi, num_phi, ctx_.blacs_grid(), bs, bs, mp);
 
     if (is_device_memory(ctx_.aux_preferred_memory_t())) {
         auto& mpd = ctx_.mem_pool(memory_t::device);
@@ -402,11 +402,7 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
             if (std::abs(kp.band_occupancy(j__, ispn__)) < ctx_.min_occupancy() * ctx_.max_occupancy()) {
                 tol += empy_tol;
             }
-            if (std::abs(eval[j__] - eval_old[j__]) > tol) {
-                return false;
-            } else {
-                return true;
-            }
+            return std::abs(eval[j__] - eval_old[j__]) <= tol;
         };
 
         if (itso.init_eval_old_) {
@@ -428,8 +424,8 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
             }
         }
 
-        /* fisrt phase: setup and diagonalize reduced Hamiltonian and get eigen-values;
-         * this is done before the main itertive loop */
+        /* first phase: setup and diagonalize reduced Hamiltonian and get eigen-values;
+         * this is done before the main iterative loop */
 
         /* apply Hamiltonian and S operators to the basis functions */
         Hk__.apply_h_s<T>(spin_range(nc_mag ? 2 : ispin_step), 0, num_bands, phi, &hphi, &sphi);
@@ -452,7 +448,7 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
             double max_diff = check_hermitian(hmlt, num_bands);
             if (max_diff > 1e-12) {
                 std::stringstream s;
-                s << "H matrix is not hermitian, max_err = " << max_diff;
+                s << "H matrix is not Hermitian, max_err = " << max_diff;
                 WARNING(s);
             }
         }
@@ -464,14 +460,12 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
         /* current subspace size */
         int N = num_bands;
 
-        PROFILE_START("sirius::Band::diag_pseudo_potential_davidson|evp");
         /* solve generalized eigen-value problem with the size N and get lowest num_bands eigen-vectors */
         if (std_solver.solve(N, num_bands, hmlt, &eval[0], evec)) {
             std::stringstream s;
             s << "error in diagonalziation";
             TERMINATE(s);
         }
-        PROFILE_STOP("sirius::Band::diag_pseudo_potential_davidson|evp");
 
         ctx_.evp_work_count(1);
 
@@ -482,20 +476,47 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
         /* number of newly added basis functions */
         int n{0};
 
+        /* tolerance on the Frobenius norm of the residuals (square root of the
+         * sum of squared per-band L2-norms), used for the relative convergence
+         * criterion; it can only be set once the first residual norms are available */
+        double relative_frobenius_tolerance{0};
+        double current_frobenius_norm{0};
+
         /* second phase: start iterative diagonalization */
         for (int k = 0; k < itso.num_steps_; k++) {
 
+            bool last_iteration = k == (itso.num_steps_ - 1);
+
             /* don't compute residuals on last iteration */
-            if (k != itso.num_steps_ - 1) {
-                /* get new preconditionined residuals, and also hpsi and opsi as a by-product */
-                n = sirius::residuals<T>(ctx_.preferred_memory_t(), ctx_.blas_linalg_t(), nc_mag ? 2 : ispin_step,
+            if (!last_iteration) {
+                /* get new preconditioned residuals, and also hpsi and opsi as a by-product */
+                auto result = sirius::residuals<T>(ctx_.preferred_memory_t(), ctx_.blas_linalg_t(), nc_mag ? 2 : ispin_step,
                                          N, num_bands, eval, evec, hphi, sphi, hpsi, spsi, res, h_o_diag.first,
                                          h_o_diag.second, itso.converge_by_energy_, itso.residual_tolerance_,
                                          is_converged);
+                n = result.first;
+                current_frobenius_norm = result.second;
+
+                /* set the relative tolerance convergence criterion */
+                if (k == 0) {
+                    relative_frobenius_tolerance = current_frobenius_norm * itso.relative_tolerance_;
+                }
+            }
+
+            /* verify convergence criteria */
+            bool converged_by_relative_tol = k > 0 && current_frobenius_norm < relative_frobenius_tolerance ;
+            bool converged_by_absolute_tol = n <= itso.min_num_res_;
+            bool converged = converged_by_absolute_tol || converged_by_relative_tol;
+
+            /* check if running out of space */
+            bool should_restart = N + n > num_phi;
+
+            if (converged) {
+                kp.message(3, __function_name__, "converged by %s tolerance\n", converged_by_relative_tol ? "relative" : "absolute");
             }
 
             /* check if we run out of variational space or eigen-vectors are converged or it's a last iteration */
-            if (N + n > num_phi || n <= itso.min_num_res_ || k == (itso.num_steps_ - 1)) {
+            if (should_restart || converged || last_iteration) {
                 PROFILE("sirius::Band::diag_pseudo_potential_davidson|update_phi");
                 /* recompute wave-functions */
                 /* \Psi_{i} = \sum_{mu} \phi_{mu} * Z_{mu, i} */
@@ -511,7 +532,7 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
                     kp.message(2, __function_name__, "%s", "wave-functions are not recomputed\n");
                 }
 
-                if (k == (itso.num_steps_ - 1) && n > itso.min_num_res_) {
+                if (last_iteration && !converged) {
                     std::stringstream s;
                     s << "[sirius::Band::diag_pseudo_potential_davidson] maximum number of iterations reached, but " <<
                          n << " residual(s) did not converge for k-point " << kp.vk();
@@ -519,7 +540,7 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
                 }
 
                 /* exit the loop if the eigen-vectors are converged or this is a last iteration */
-                if (n <= itso.min_num_res_ || k == (itso.num_steps_ - 1)) {
+                if (converged || last_iteration) {
                     kp.message(3, __function_name__, "end of iterative diagonalization; n=%i, k=%i\n", n, k);
                     break;
                 } else { /* otherwise, set Psi as a new trial basis */
@@ -553,7 +574,7 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
                 }
             }
 
-            /* expand variational subspace with new basis vectors obtatined from residuals */
+            /* expand variational subspace with new basis vectors obtained from residuals */
             for (int ispn = 0; ispn < num_sc; ispn++) {
                 phi.copy_from(res, n, ispn, 0, ispn, N);
             }
@@ -574,7 +595,7 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
                 double max_diff = check_hermitian(hmlt, N + n);
                 if (max_diff > 1e-12) {
                     std::stringstream s;
-                    s << "H matrix is not hermitian, max_err = " << max_diff;
+                    s << "H matrix is not Hermitian, max_err = " << max_diff;
                     WARNING(s);
                 }
             }
@@ -587,7 +608,7 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
                     double max_diff = check_hermitian(ovlp, N + n);
                     if (max_diff > 1e-12) {
                         std::stringstream s;
-                        s << "S matrix is not hermitian, max_err = " << max_diff;
+                        s << "S matrix is not Hermitian, max_err = " << max_diff;
                         WARNING(s);
                     }
                 }
@@ -603,14 +624,14 @@ Band::diag_pseudo_potential_davidson(Hamiltonian_k& Hk__) const
                 /* solve standard eigen-value problem with the size N */
                 if (std_solver.solve(N, num_bands, hmlt, &eval[0], evec)) {
                     std::stringstream s;
-                    s << "error in diagonalziation";
+                    s << "error in diagonalization";
                     TERMINATE(s);
                 }
             } else {
                 /* solve generalized eigen-value problem with the size N */
                 if (gen_solver.solve(N, num_bands, hmlt, ovlp, &eval[0], evec)) {
                     std::stringstream s;
-                    s << "error in diagonalziation";
+                    s << "error in diagonalization";
                     TERMINATE(s);
                 }
             }
@@ -747,9 +768,9 @@ Band::diag_S_davidson(Hamiltonian_k& Hk__) const
 
     const int bs = ctx_.cyclic_block_size();
 
-    dmatrix<T> ovlp(mp, num_phi, num_phi, ctx_.blacs_grid(), bs, bs);
-    dmatrix<T> evec(mp, num_phi, num_phi, ctx_.blacs_grid(), bs, bs);
-    dmatrix<T> ovlp_old(mp, num_phi, num_phi, ctx_.blacs_grid(), bs, bs);
+    dmatrix<T> ovlp(num_phi, num_phi, ctx_.blacs_grid(), bs, bs, mp);
+    dmatrix<T> evec(num_phi, num_phi, ctx_.blacs_grid(), bs, bs, mp);
+    dmatrix<T> ovlp_old(num_phi, num_phi, ctx_.blacs_grid(), bs, bs, mp);
 
     if (is_device_memory(ctx_.aux_preferred_memory_t())) {
         auto& mpd = ctx_.mem_pool(memory_t::device);
@@ -809,6 +830,12 @@ Band::diag_S_davidson(Hamiltonian_k& Hk__) const
     mdarray<double, 1> eval_old(nevec);
     eval_old = [](){return 1e10;};
 
+    /* tolerance on the Frobenius norm of the residuals (square root of the
+     * sum of squared per-band L2-norms), used for the relative convergence
+     * criterion; it can only be set once the first residual norms are available */
+    double relative_frobenius_tolerance{0};
+    double current_frobenius_norm{0};
+
     for (int k = 0; k < itso.num_steps_; k++) {
 
         /* apply Hamiltonian and S operators to the basis functions */
@@ -849,20 +876,37 @@ Band::diag_S_davidson(Hamiltonian_k& Hk__) const
             }
 
             /* get new preconditionined residuals, and also opsi and psi as a by-product */
-            n = sirius::residuals<T>(ctx_.preferred_memory_t(), ctx_.blas_linalg_t(), nc_mag ? 2 : 0,
+            auto result = sirius::residuals<T>(ctx_.preferred_memory_t(), ctx_.blas_linalg_t(), nc_mag ? 2 : 0,
                                      N, nevec, eval, evec, sphi, phi, spsi, psi, res, o_diag, o_diag1,
                                      itso.converge_by_energy_, itso.residual_tolerance_,
                                      [&](int i, int ispn){return std::abs(eval[i] - eval_old[i]) < iterative_solver_tolerance;});
+            n = result.first;
+            current_frobenius_norm = result.second;
+
+            /* set the relative tolerance convergence criterion */
+            if (k == 0) {
+                relative_frobenius_tolerance = current_frobenius_norm * itso.relative_tolerance_;
+            }
         }
 
+        /* verify convergence criteria */
+        bool converged_by_relative_tol = k > 0 && current_frobenius_norm < relative_frobenius_tolerance ;
+        bool converged_by_absolute_tol = n <= itso.min_num_res_;
+        bool converged = converged_by_absolute_tol || converged_by_relative_tol;
+
+        /* check if running out of space */
+        bool should_restart = N + n > num_phi;
+
+        bool last_iteration = k == (itso.num_steps_ - 1);
+
         /* check if we run out of variational space or eigen-vectors are converged or it's a last iteration */
-        if (N + n > num_phi || n <= itso.min_num_res_ || k == (itso.num_steps_ - 1)) {
+        if (should_restart || converged || last_iteration) {
             /* recompute wave-functions */
             /* \Psi_{i} = \sum_{mu} \phi_{mu} * Z_{mu, i} */
             transform(ctx_.preferred_memory_t(), ctx_.blas_linalg_t(), nc_mag ? 2 : 0, phi, 0, N, evec, 0, 0, psi, 0, nevec);
 
             /* exit the loop if the eigen-vectors are converged or this is a last iteration */
-            if (n <= itso.min_num_res_ || k == (itso.num_steps_ - 1)) {
+            if (converged || last_iteration) {
                 break;
             } else { /* otherwise, set Psi as a new trial basis */
                 kp.message(3, __function_name__, "%s", "subspace size limit reached\n");
diff --git a/src/band/residuals.cpp b/src/band/residuals.cpp
index 6026dc052..1b1abb597 100644
--- a/src/band/residuals.cpp
+++ b/src/band/residuals.cpp
@@ -52,19 +52,19 @@ compute_residuals(sddk::memory_t mem_type__, sddk::spin_range spins__, int num_b
             }
         } else {
 #if defined(__GPU)
-            compute_residuals_gpu(hpsi__.pw_coeffs(ispn).prime().at(memory_t::device),
-                                  opsi__.pw_coeffs(ispn).prime().at(memory_t::device),
-                                  res__.pw_coeffs(ispn).prime().at(memory_t::device),
+            compute_residuals_gpu(hpsi__.pw_coeffs(ispn).prime().at(sddk::memory_t::device),
+                                  opsi__.pw_coeffs(ispn).prime().at(sddk::memory_t::device),
+                                  res__.pw_coeffs(ispn).prime().at(sddk::memory_t::device),
                                   res__.pw_coeffs(ispn).num_rows_loc(),
                                   num_bands__,
-                                  eval__.at(memory_t::device));
+                                  eval__.at(sddk::memory_t::device));
             if (res__.has_mt()) {
-                compute_residuals_gpu(hpsi__.mt_coeffs(ispn).prime().at(memory_t::device),
-                                      opsi__.mt_coeffs(ispn).prime().at(memory_t::device),
-                                      res__.mt_coeffs(ispn).prime().at(memory_t::device),
+                compute_residuals_gpu(hpsi__.mt_coeffs(ispn).prime().at(sddk::memory_t::device),
+                                      opsi__.mt_coeffs(ispn).prime().at(sddk::memory_t::device),
+                                      res__.mt_coeffs(ispn).prime().at(sddk::memory_t::device),
                                       res__.mt_coeffs(ispn).num_rows_loc(),
                                       num_bands__,
-                                      eval__.at(memory_t::device));
+                                      eval__.at(sddk::memory_t::device));
             }
 #endif
         }
@@ -73,8 +73,9 @@ compute_residuals(sddk::memory_t mem_type__, sddk::spin_range spins__, int num_b
 
 /// Apply preconditioner to the residuals.
 static void
-apply_preconditioner(memory_t mem_type__, spin_range spins__, int num_bands__, Wave_functions& res__,
-                     mdarray<double, 2> const& h_diag__, mdarray<double, 2> const& o_diag__, mdarray<double, 1>& eval__)
+apply_preconditioner(sddk::memory_t mem_type__, sddk::spin_range spins__, int num_bands__, sddk::Wave_functions& res__,
+                     sddk::mdarray<double, 2> const& h_diag__, sddk::mdarray<double, 2> const& o_diag__,
+                     sddk::mdarray<double, 1>& eval__)
 {
     for (int ispn: spins__) {
         if (is_host_memory(mem_type__)) {
@@ -96,19 +97,19 @@ apply_preconditioner(memory_t mem_type__, spin_range spins__, int num_bands__, W
             }
         } else {
 #if defined(__GPU)
-            apply_preconditioner_gpu(res__.pw_coeffs(ispn).prime().at(memory_t::device),
+            apply_preconditioner_gpu(res__.pw_coeffs(ispn).prime().at(sddk::memory_t::device),
                                      res__.pw_coeffs(ispn).num_rows_loc(),
                                      num_bands__,
-                                     eval__.at(memory_t::device),
-                                     h_diag__.at(memory_t::device, 0, ispn),
-                                     o_diag__.at(memory_t::device, 0, ispn));
+                                     eval__.at(sddk::memory_t::device),
+                                     h_diag__.at(sddk::memory_t::device, 0, ispn),
+                                     o_diag__.at(sddk::memory_t::device, 0, ispn));
             if (res__.has_mt()) {
-                apply_preconditioner_gpu(res__.mt_coeffs(ispn).prime().at(memory_t::device),
+                apply_preconditioner_gpu(res__.mt_coeffs(ispn).prime().at(sddk::memory_t::device),
                                          res__.mt_coeffs(ispn).num_rows_loc(),
                                          num_bands__,
-                                         eval__.at(memory_t::device),
-                                         h_diag__.at(memory_t::device, res__.pw_coeffs(ispn).num_rows_loc(), ispn),
-                                         o_diag__.at(memory_t::device, res__.pw_coeffs(ispn).num_rows_loc(), ispn));
+                                         eval__.at(sddk::memory_t::device),
+                                         h_diag__.at(sddk::memory_t::device, res__.pw_coeffs(ispn).num_rows_loc(), ispn),
+                                         o_diag__.at(sddk::memory_t::device, res__.pw_coeffs(ispn).num_rows_loc(), ispn));
             }
 #endif
         }
@@ -116,10 +117,11 @@ apply_preconditioner(memory_t mem_type__, spin_range spins__, int num_bands__, W
 }
 
 template <typename T>
-static inline int
-normalized_preconditioned_residuals(memory_t mem_type__, spin_range spins__, int num_bands__, mdarray<double,1>& eval__,
-                                    Wave_functions& hpsi__, Wave_functions& opsi__, Wave_functions& res__,
-                                    mdarray<double, 2> const& h_diag__, mdarray<double, 2> const& o_diag__,
+static std::pair<int, double>
+normalized_preconditioned_residuals(sddk::memory_t mem_type__, sddk::spin_range spins__, int num_bands__,
+                                    sddk::mdarray<double,1>& eval__, sddk::Wave_functions& hpsi__,
+                                    sddk::Wave_functions& opsi__, sddk::Wave_functions& res__,
+                                    sddk::mdarray<double, 2> const& h_diag__, sddk::mdarray<double, 2> const& o_diag__,
                                     double norm_tolerance__)
 {
     PROFILE("sirius::normalized_preconditioned_residuals");
@@ -134,6 +136,11 @@ normalized_preconditioned_residuals(memory_t mem_type__, spin_range spins__, int
     /* compute norm of the "raw" residuals */
     auto res_norm = res__.l2norm(pu, spins__, num_bands__);
 
+    auto frobenius_norm = 0.0;
+    for (int i = 0; i < num_bands__; i++)
+        frobenius_norm += res_norm[i] * res_norm[i];
+    frobenius_norm = std::sqrt(frobenius_norm);
+
     /* apply preconditioner */
     apply_preconditioner(mem_type__, spins__, num_bands__, res__, h_diag__, o_diag__, eval__);
 
@@ -160,7 +167,7 @@ normalized_preconditioned_residuals(memory_t mem_type__, spin_range spins__, int
     if (std::is_same<T, double>::value && res__.comm().rank() == 0 && n != 0 && spins__() != 2) {
         if (is_device_memory(res__.preferred_memory_t())) {
 #if defined(__GPU)
-            make_real_g0_gpu(res__.pw_coeffs(spins__()).prime().at(memory_t::device), res__.pw_coeffs(spins__()).prime().ld(), n);
+            make_real_g0_gpu(res__.pw_coeffs(spins__()).prime().at(sddk::memory_t::device), res__.pw_coeffs(spins__()).prime().ld(), n);
 #endif
         } else {
             for (int i = 0; i < n; i++) {
@@ -169,28 +176,29 @@ normalized_preconditioned_residuals(memory_t mem_type__, spin_range spins__, int
         }
     }
 
-    return n;
+    return std::make_pair(n, frobenius_norm);
 }
 
 /// Compute residuals from eigen-vectors.
 template <typename T>
-int
-residuals(memory_t mem_type__, linalg_t la_type__, int ispn__, int N__, int num_bands__, mdarray<double, 1>& eval__,
-          dmatrix<T>& evec__, Wave_functions& hphi__, Wave_functions& ophi__, Wave_functions& hpsi__,
-          Wave_functions& opsi__, Wave_functions& res__, mdarray<double, 2> const& h_diag__,
-          mdarray<double, 2> const& o_diag__, bool estimate_eval__, double norm_tolerance__,
+std::pair<int, double>
+residuals(sddk::memory_t mem_type__, sddk::linalg_t la_type__, int ispn__, int N__, int num_bands__,
+          sddk::mdarray<double, 1>& eval__, sddk::dmatrix<T>& evec__, sddk::Wave_functions& hphi__,
+          sddk::Wave_functions& ophi__, sddk::Wave_functions& hpsi__, sddk::Wave_functions& opsi__,
+          sddk::Wave_functions& res__, sddk::mdarray<double, 2> const& h_diag__,
+          sddk::mdarray<double, 2> const& o_diag__, bool estimate_eval__, double norm_tolerance__,
           std::function<bool(int, int)> is_converged__)
 {
     PROFILE("sirius::residuals");
 
     assert(N__ != 0);
 
-    mdarray<double, 1> res_norm;
-    dmatrix<T> evec_tmp;
-    mdarray<double, 1> eval_tmp;
+    sddk::mdarray<double, 1> res_norm;
+    sddk::dmatrix<T> evec_tmp;
+    sddk::mdarray<double, 1> eval_tmp;
 
-    dmatrix<T>* evec_ptr{nullptr};
-    mdarray<double, 1>* eval_ptr{nullptr};
+    sddk::dmatrix<T>* evec_ptr{nullptr};
+    sddk::mdarray<double, 1>* eval_ptr{nullptr};
 
     int n{0};
     if (estimate_eval__) {
@@ -204,9 +212,9 @@ residuals(memory_t mem_type__, linalg_t la_type__, int ispn__, int N__, int num_
         n = static_cast<int>(ev_idx.size());
 
         if (n) {
-            eval_tmp = mdarray<double, 1>(n);
+            eval_tmp = sddk::mdarray<double, 1>(n);
             eval_ptr = &eval_tmp;
-            evec_tmp = dmatrix<T>(N__, n, evec__.blacs_grid(), evec__.bs_row(), evec__.bs_col());
+            evec_tmp = sddk::dmatrix<T>(N__, n, evec__.blacs_grid(), evec__.bs_row(), evec__.bs_col());
             evec_ptr = &evec_tmp;
 
             int num_rows_local = evec_tmp.num_rows_local();
@@ -228,47 +236,45 @@ residuals(memory_t mem_type__, linalg_t la_type__, int ispn__, int N__, int num_
                 }
             }
             if (is_device_memory(mem_type__) && evec_tmp.blacs_grid().comm().size() == 1) {
-                evec_tmp.allocate(memory_t::device);
+                evec_tmp.allocate(sddk::memory_t::device);
             }
             if (is_device_memory(mem_type__)) {
-                eval_tmp.allocate(memory_t::device).copy_to(memory_t::device);
+                eval_tmp.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
             }
         }
     } else { /* compute all residuals first */
         if (is_device_memory(mem_type__)) {
-            eval__.allocate(memory_t::device).copy_to(memory_t::device);
+            eval__.allocate(sddk::memory_t::device).copy_to(sddk::memory_t::device);
         }
         evec_ptr = &evec__;
         eval_ptr = &eval__;
         n = num_bands__;
     }
     if (!n) {
-        return 0;
+        return std::make_pair(0, 0);
     }
 
     /* compute H\Psi_{i} = \sum_{mu} H\phi_{mu} * Z_{mu, i} and O\Psi_{i} = \sum_{mu} O\phi_{mu} * Z_{mu, i} */
-    transform<T>(mem_type__, la_type__, ispn__, {&hphi__, &ophi__}, 0, N__, *evec_ptr, 0, 0, {&hpsi__, &opsi__}, 0, n);
+    sddk::transform<T>(mem_type__, la_type__, ispn__, {&hphi__, &ophi__}, 0, N__, *evec_ptr, 0, 0, {&hpsi__, &opsi__}, 0, n);
 
-    n = normalized_preconditioned_residuals<T>(mem_type__, spin_range(ispn__), n, *eval_ptr, hpsi__, opsi__, res__,
+    return normalized_preconditioned_residuals<T>(mem_type__, sddk::spin_range(ispn__), n, *eval_ptr, hpsi__, opsi__, res__,
                                                h_diag__, o_diag__, norm_tolerance__);
-
-    return n;
 }
 
-template
-int
-residuals<double>(memory_t mem_type__, linalg_t la_type__, int ispn__, int N__, int num_bands__, mdarray<double, 1>& eval__,
-                  dmatrix<double>& evec__, Wave_functions& hphi__, Wave_functions& ophi__, Wave_functions& hpsi__,
-                  Wave_functions& opsi__, Wave_functions& res__, mdarray<double, 2> const& h_diag__,
-                  mdarray<double, 2> const& o_diag__, bool estimate_eval__, double norm_tolerance__,
+template std::pair<int, double>
+residuals<double>(sddk::memory_t mem_type__, sddk::linalg_t la_type__, int ispn__, int N__, int num_bands__,
+                  sddk::mdarray<double, 1>& eval__, sddk::dmatrix<double>& evec__, sddk::Wave_functions& hphi__,
+                  sddk::Wave_functions& ophi__, sddk::Wave_functions& hpsi__, sddk::Wave_functions& opsi__,
+                  sddk::Wave_functions& res__, sddk::mdarray<double, 2> const& h_diag__,
+                  sddk::mdarray<double, 2> const& o_diag__, bool estimate_eval__, double norm_tolerance__,
                   std::function<bool(int, int)> is_converged__);
 
-template
-int
-residuals<double_complex>(memory_t mem_type__, linalg_t la_type__, int ispn__, int N__, int num_bands__, mdarray<double, 1>& eval__,
-                          dmatrix<double_complex>& evec__, Wave_functions& hphi__, Wave_functions& ophi__, Wave_functions& hpsi__,
-                          Wave_functions& opsi__, Wave_functions& res__, mdarray<double, 2> const& h_diag__,
-                          mdarray<double, 2> const& o_diag__, bool estimate_eval__, double norm_tolerance__,
-                          std::function<bool(int, int)> is_converged__);
+template std::pair<int, double>
+residuals<double_complex>(sddk::memory_t mem_type__, sddk::linalg_t la_type__, int ispn__, int N__, int num_bands__,
+                          sddk::mdarray<double, 1>& eval__, sddk::dmatrix<double_complex>& evec__,
+                          sddk::Wave_functions& hphi__, sddk::Wave_functions& ophi__, sddk::Wave_functions& hpsi__,
+                          sddk::Wave_functions& opsi__, sddk::Wave_functions& res__,
+                          sddk::mdarray<double, 2> const& h_diag__, sddk::mdarray<double, 2> const& o_diag__,
+                          bool estimate_eval__, double norm_tolerance__, std::function<bool(int, int)> is_converged__);
 
 } // namespace
diff --git a/src/band/residuals.hpp b/src/band/residuals.hpp
index c332e5ca2..9bff9aea5 100644
--- a/src/band/residuals.hpp
+++ b/src/band/residuals.hpp
@@ -70,13 +70,13 @@ extern "C" void make_real_g0_gpu(double_complex* res__,
 namespace sirius {
 
 /// Compute preconditionined residuals.
-/** The residuals of wave-functions are difined as:
+/** The residuals of wave-functions are defined as:
     \f[
       R_{i} = \hat H \psi_{i} - \epsilon_{i} \hat S \psi_{i}
     \f]
  */
 template <typename T>
-int
+std::pair<int, double>
 residuals(sddk::memory_t mem_type__, sddk::linalg_t la_type__, int ispn__, int N__, int num_bands__,
           sddk::mdarray<double, 1>& eval__, sddk::dmatrix<T>& evec__, sddk::Wave_functions& hphi__,
           sddk::Wave_functions& ophi__, sddk::Wave_functions& hpsi__,
diff --git a/src/density/augmentation_operator.hpp b/src/density/augmentation_operator.hpp
index 7463c0dfb..0f099ca79 100644
--- a/src/density/augmentation_operator.hpp
+++ b/src/density/augmentation_operator.hpp
@@ -191,6 +191,11 @@ class Augmentation_operator_gvec_deriv
         return q_pw_;
     }
 
+    mdarray<double, 2> & q_pw()
+    {
+        return q_pw_;
+    }
+
     double q_pw(int i__, int ig__) const
     {
         return q_pw_(i__, ig__);
diff --git a/src/density/density.cpp b/src/density/density.cpp
index e4b3fb70f..cdb1b5bc5 100644
--- a/src/density/density.cpp
+++ b/src/density/density.cpp
@@ -548,9 +548,10 @@ void Density::add_k_point_contribution_rg(K_point* kp__)
             if (!kp__->spinor_wave_functions().pw_coeffs(ispn).spl_num_col().global_index_size()) {
                 continue;
             }
-
-            for (int i = 0; i < kp__->spinor_wave_functions().pw_coeffs(ispn).spl_num_col().local_size(); i++) {
+            int ncols =kp__->spinor_wave_functions().pw_coeffs(ispn).spl_num_col().local_size();
+            for (int ii = 0; ii < ncols; ii++) {
                 /* global index of the band */
+                int i = ncols-1-ii;
                 int j    = kp__->spinor_wave_functions().pw_coeffs(ispn).spl_num_col()[i];
                 double w = kp__->band_occupancy(j, ispn) * kp__->weight() / omega;
 
@@ -1247,7 +1248,7 @@ mdarray<double_complex, 2> Density::generate_rho_aug()
         }
     }
 
-    // TODO: the GPU memory consumption here is huge, rewrite this; split gloc in blocks and 
+    // TODO: the GPU memory consumption here is huge, rewrite this; split gloc in blocks and
     //       overlap transfer of Q(G) for two consequtive blocks within one atom type
 
     if (ctx_.augmentation_op(0)) {
diff --git a/src/geometry/force.cpp b/src/geometry/force.cpp
index a3f3292d8..aaf035957 100644
--- a/src/geometry/force.cpp
+++ b/src/geometry/force.cpp
@@ -310,7 +310,6 @@ mdarray<double, 2> const& Force::calc_forces_ewald()
 
             /* cartesian form for getting cartesian force components */
             vector3d<double> gvec_cart = ctx_.gvec().gvec_cart<index_domain_t::local>(igloc);
-            double_complex rho(0, 0);
 
             double scalar_part = prefac * (rho_tmp[igloc] * ctx_.gvec_phase_factor(ig, ja)).imag() *
                                  static_cast<double>(unit_cell.atom(ja).zn()) * std::exp(-g2 / (4 * alpha)) / g2;
@@ -372,8 +371,14 @@ mdarray<double, 2> const& Force::calc_forces_us()
             break;
         }
         case device_t::GPU: {
+#ifdef __ROCM
+            // ROCm does not support cublasXt functionality
+            mp = &ctx_.mem_pool(memory_t::host);
+            la = linalg_t::blas;
+#else
             mp = &ctx_.mem_pool(memory_t::host_pinned);
             la = linalg_t::cublasxt;
+#endif
             break;
         }
     }
diff --git a/src/geometry/stress.cpp b/src/geometry/stress.cpp
index 1c7f2541c..8bf0a06f2 100644
--- a/src/geometry/stress.cpp
+++ b/src/geometry/stress.cpp
@@ -323,9 +323,16 @@ matrix3d<double> Stress::calc_stress_us()
             break;
         }
         case device_t::GPU: {
+#ifdef __ROCM
+            // ROCm does not support cublasXt functionality
+            mp = &ctx_.mem_pool(memory_t::host_pinned);
+            la = linalg_t::blas;
+            qmem = memory_t::host;
+#else
             mp = &ctx_.mem_pool(memory_t::host_pinned);
             la = linalg_t::cublasxt;
             qmem = memory_t::device;
+#endif
             break;
         }
     }
@@ -337,6 +344,11 @@ matrix3d<double> Stress::calc_stress_us()
         }
 
         q_deriv.prepare(atom_type, ri, ri_dq);
+#ifdef __ROCM
+        // ROCm does not support cublasXt functionality - data required on host
+        q_deriv.q_pw().allocate(memory_t::host);
+#endif
+
 
         int nbf = atom_type.mt_basis_size();
 
@@ -362,6 +374,10 @@ matrix3d<double> Stress::calc_stress_us()
         for (int ispin = 0; ispin < ctx_.num_mag_dims() + 1; ispin++) {
             for (int nu = 0; nu < 3; nu++) {
                 q_deriv.generate_pw_coeffs(atom_type, nu);
+#ifdef __ROCM
+                // ROCm does not support cublasXt functionality - data required on host
+                q_deriv.q_pw().copy_to(memory_t::host);
+#endif
 
                 for (int mu = 0; mu < 3; mu++) {
                     PROFILE_START("sirius::Stress|us|prepare");
diff --git a/src/gpu/acc.hpp b/src/gpu/acc.hpp
index 188947d06..fa5daa191 100644
--- a/src/gpu/acc.hpp
+++ b/src/gpu/acc.hpp
@@ -214,7 +214,9 @@ inline void sync_stream(stream_id sid__)
 /// Reset device.
 inline void reset()
 {
+#ifdef __CUDA
     CALL_DEVICE_API(ProfilerStop, ());
+#endif
     CALL_DEVICE_API(DeviceReset, ());
 }
 
@@ -385,7 +387,18 @@ inline void zero(T* ptr__, int ld__, int nrow__, int ncol__)
 template <typename T>
 inline T* allocate(size_t size__) {
     T* ptr{nullptr};
-    CALL_DEVICE_API(Malloc, (&ptr, size__ * sizeof(T)));
+#if defined(__CUDA) || defined(__ROCM)
+    // Allocate only if acc::num_devices() is non-zero; on failure print the requested and free sizes (MB).
+    if (acc::num_devices()) {
+        acc_error_t error;
+        error = GPU_PREFIX(Malloc)(&ptr, size__ * sizeof(T));
+        if (error != GPU_PREFIX(Success)) {
+            std::printf("Device memory allocation of %zu MB failed; available memory %zu MB\n",
+                (size__ * sizeof(T)) >> 20, static_cast<size_t>(get_free_mem()) >> 20);
+            stack_backtrace();
+        }
+    }
+#endif
     return ptr;
 }
 
diff --git a/src/gpu/acc_blas.cpp b/src/gpu/acc_blas.cpp
new file mode 100644
index 000000000..e3179d8cf
--- /dev/null
+++ b/src/gpu/acc_blas.cpp
@@ -0,0 +1,32 @@
+#if defined(__CUDA) || defined(__ROCM)
+#include "acc_blas.hpp"
+
+namespace accblas {
+
+::acc::blas::handle_t&
+null_stream_handle()
+{
+    static ::acc::blas::handle_t null_stream_handle_;
+    return null_stream_handle_;
+}
+
+std::vector<::acc::blas::handle_t>&
+stream_handles()
+{
+    static std::vector<::acc::blas::handle_t> stream_handles_;
+    return stream_handles_;
+}
+
+#if defined(__CUDA)
+namespace xt {
+cublasXtHandle_t&
+cublasxt_handle()
+{
+    static cublasXtHandle_t handle;
+    return handle;
+}
+} // namespace xt
+#endif
+}
+
+#endif
diff --git a/src/gpu/acc_blas.hpp b/src/gpu/acc_blas.hpp
new file mode 100644
index 000000000..b307007df
--- /dev/null
+++ b/src/gpu/acc_blas.hpp
@@ -0,0 +1,382 @@
+// Copyright (c) 2013-2017 Anton Kozhevnikov, Thomas Schulthess
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
+// the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
+//    following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
+//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/** \file acc_blas.hpp
+ *
+ *  \brief Blas functions for execution on GPUs.
+ */
+
+#ifndef __ACCBLAS_HPP__
+#define __ACCBLAS_HPP__
+
+#include <unistd.h>
+#include <vector>
+#include "acc_blas_api.hpp"
+#include "acc.hpp"
+
+namespace accblas {
+
+#ifdef __CUDA
+/// Translate a cuBLAS status code into a human-readable message.
+inline const char*
+error_message(::acc::blas::status_t status)
+{
+    switch (status) {
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "the library was not initialized";
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "resource allocation failed inside the library";
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "the parameters m,n,k<0";
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "the device does not support double-precision";
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "an access to GPU memory space failed";
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "the function failed to launch on the GPU";
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "an internal library operation failed";
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "the requested functionality is not supported";
+        default:
+            return "gpublas status unknown";
+    }
+}
+#else
+inline const char*
+error_message(::acc::blas::status_t status)
+{
+    return rocblas_status_to_string(status);
+}
+#endif
+
+inline ::acc::blas::operation_t
+get_gpublasOperation_t(char c)
+{
+    switch (c) {
+        case 'n':
+        case 'N': {
+            return ::acc::blas::operation::None;
+        }
+        case 't':
+        case 'T': {
+            return ::acc::blas::operation::Transpose;
+        }
+        case 'c':
+        case 'C': {
+            return ::acc::blas::operation::ConjugateTranspose;
+        }
+        default: {
+            throw std::runtime_error("get_gpublasOperation_t(): wrong operation");
+        }
+    }
+    return ::acc::blas::operation::None; // make compiler happy
+}
+
+inline ::acc::blas::side_mode_t
+get_gpublasSideMode_t(char c)
+{
+    switch (c) {
+        case 'l':
+        case 'L': {
+            return ::acc::blas::side::Left;
+        }
+        case 'r':
+        case 'R': {
+            return ::acc::blas::side::Right;
+        }
+        default: {
+            throw std::runtime_error("get_gpublasSideMode_t(): wrong side");
+        }
+    }
+    return ::acc::blas::side::Left; // make compiler happy
+}
+
+inline ::acc::blas::fill_mode_t
+get_gpublasFillMode_t(char c)
+{
+    switch (c) {
+        case 'u':
+        case 'U': {
+            return ::acc::blas::fill::Upper;
+        }
+        case 'l':
+        case 'L': {
+            return ::acc::blas::fill::Lower;
+        }
+        default: {
+            throw std::runtime_error("get_gpublasFillMode_t(): wrong mode");
+        }
+    }
+    return ::acc::blas::fill::Upper; // make compiler happy
+}
+
+inline ::acc::blas::diagonal_t
+get_gpublasDiagonal_t(char c)
+{
+    switch (c) {
+        case 'n':
+        case 'N': {
+            return ::acc::blas::diagonal::NonUnit;
+        }
+        case 'u':
+        case 'U': {
+            return ::acc::blas::diagonal::Unit;
+        }
+        default: {
+            throw std::runtime_error("get_gpublasDiagonal_t(): wrong diagonal type");
+        }
+    }
+    return ::acc::blas::diagonal::NonUnit; // make compiler happy
+}
+
+#define CALL_GPU_BLAS(func__, args__)                                                                                  \
+    {                                                                                                                  \
+        ::acc::blas::status_t status;                                                                                  \
+        if ((status = func__ args__) != ::acc::blas::status::Success) {                                                \
+            std::printf("%s\n", error_message(status)); /* report the decoded status text */                           \
+            char nm[1024];                                                                                             \
+            gethostname(nm, 1024);                                                                                     \
+            std::printf("hostname: %s\n", nm);                                                                         \
+            std::printf("Error in %s at line %i of file %s\n", #func__, __LINE__, __FILE__);                           \
+            stack_backtrace();                                                                                         \
+        }                                                                                                              \
+    }
+
+/// Store the default (null) stream handler.
+::acc::blas::handle_t& null_stream_handle();
+
+
+/// Store the gpublas handlers associated with acc streams.
+std::vector<::acc::blas::handle_t>& stream_handles();
+
+inline void
+create_stream_handles()
+{
+    // acc::set_device();
+    CALL_GPU_BLAS(::acc::blas::create, (&null_stream_handle()));
+
+    stream_handles() = std::vector<::acc::blas::handle_t>(acc::num_streams());
+    for (int i = 0; i < acc::num_streams(); i++) {
+        CALL_GPU_BLAS(::acc::blas::create, (&stream_handles()[i]));
+
+        CALL_GPU_BLAS(::acc::blas::set_stream, (stream_handles()[i], acc::stream(stream_id(i))));
+    }
+}
+
+inline void
+destroy_stream_handles()
+{
+    // acc::set_device();
+    CALL_GPU_BLAS(::acc::blas::destroy, (null_stream_handle()));
+    for (int i = 0; i < acc::num_streams(); i++) {
+        CALL_GPU_BLAS(::acc::blas::destroy, (stream_handles()[i]));
+    }
+}
+
+inline ::acc::blas::handle_t
+stream_handle(int id__)
+{
+    return (id__ == -1) ? null_stream_handle() : stream_handles()[id__];
+}
+
+inline void
+zgemv(char transa, int32_t m, int32_t n, acc_complex_double_t* alpha, acc_complex_double_t* a, int32_t lda,
+      acc_complex_double_t* x, int32_t incx, acc_complex_double_t* beta, acc_complex_double_t* y, int32_t incy,
+      int stream_id)
+{
+    // acc::set_device();
+    CALL_GPU_BLAS(::acc::blas::zgemv, (stream_handle(stream_id), get_gpublasOperation_t(transa), m, n,
+                                       reinterpret_cast<const ::acc::blas::complex_double_t*>(alpha),
+                                       reinterpret_cast<const ::acc::blas::complex_double_t*>(a), lda,
+                                       reinterpret_cast<const ::acc::blas::complex_double_t*>(x), incx,
+                                       reinterpret_cast<const ::acc::blas::complex_double_t*>(beta),
+                                       reinterpret_cast<::acc::blas::complex_double_t*>(y), incy));
+}
+
+inline void
+zgemm(char transa, char transb, int32_t m, int32_t n, int32_t k, acc_complex_double_t const* alpha,
+      acc_complex_double_t const* a, int32_t lda, acc_complex_double_t const* b, int32_t ldb,
+      acc_complex_double_t const* beta, acc_complex_double_t* c, int32_t ldc, int stream_id)
+{
+    // acc::set_device();
+    CALL_GPU_BLAS(::acc::blas::zgemm,
+                  (stream_handle(stream_id), get_gpublasOperation_t(transa), get_gpublasOperation_t(transb), m, n, k,
+                   reinterpret_cast<const ::acc::blas::complex_double_t*>(alpha),
+                   reinterpret_cast<const ::acc::blas::complex_double_t*>(a), lda,
+                   reinterpret_cast<const ::acc::blas::complex_double_t*>(b), ldb,
+                   reinterpret_cast<const ::acc::blas::complex_double_t*>(beta),
+                   reinterpret_cast<::acc::blas::complex_double_t*>(c), ldc));
+}
+
+inline void
+dgemm(char transa, char transb, int32_t m, int32_t n, int32_t k, double const* alpha, double const* a, int32_t lda,
+      double const* b, int32_t ldb, double const* beta, double* c, int32_t ldc, int stream_id)
+{
+    // acc::set_device();
+    CALL_GPU_BLAS(::acc::blas::dgemm, (stream_handle(stream_id), get_gpublasOperation_t(transa),
+                                       get_gpublasOperation_t(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
+}
+
+inline void
+dtrmm(char side__, char uplo__, char transa__, char diag__, int m__, int n__, double const* alpha__, double const* A__,
+      int lda__, double* B__, int ldb__, int stream_id)
+{
+    ::acc::blas::side_mode_t side   = get_gpublasSideMode_t(side__);
+    ::acc::blas::fill_mode_t uplo   = get_gpublasFillMode_t(uplo__);
+    ::acc::blas::operation_t transa = get_gpublasOperation_t(transa__);
+    ::acc::blas::diagonal_t diag    = get_gpublasDiagonal_t(diag__);
+    // acc::set_device();
+#ifdef __CUDA
+    CALL_GPU_BLAS(::acc::blas::dtrmm, (stream_handle(stream_id), side, uplo, transa, diag, m__, n__, alpha__, A__,
+                                       lda__, B__, ldb__, B__, ldb__));
+#else
+    // rocblas trmm function does not take three matrices
+    CALL_GPU_BLAS(::acc::blas::dtrmm,
+                  (stream_handle(stream_id), side, uplo, transa, diag, m__, n__, alpha__, A__, lda__, B__, ldb__));
+#endif
+}
+
+inline void
+ztrmm(char side__, char uplo__, char transa__, char diag__, int m__, int n__, acc_complex_double_t const* alpha__,
+      acc_complex_double_t const* A__, int lda__, acc_complex_double_t* B__, int ldb__, int stream_id)
+{
+    ::acc::blas::side_mode_t side   = get_gpublasSideMode_t(side__);
+    ::acc::blas::fill_mode_t uplo   = get_gpublasFillMode_t(uplo__);
+    ::acc::blas::operation_t transa = get_gpublasOperation_t(transa__);
+    ::acc::blas::diagonal_t diag    = get_gpublasDiagonal_t(diag__);
+    // acc::set_device();
+#ifdef __CUDA
+    CALL_GPU_BLAS(::acc::blas::ztrmm, (stream_handle(stream_id), side, uplo, transa, diag, m__, n__,
+                                       reinterpret_cast<const ::acc::blas::complex_double_t*>(alpha__),
+                                       reinterpret_cast<const ::acc::blas::complex_double_t*>(A__), lda__,
+                                       reinterpret_cast<::acc::blas::complex_double_t*>(B__), ldb__,
+                                       reinterpret_cast<::acc::blas::complex_double_t*>(B__), ldb__));
+#else
+    // rocblas trmm function does not take three matrices
+    CALL_GPU_BLAS(::acc::blas::ztrmm, (stream_handle(stream_id), side, uplo, transa, diag, m__, n__,
+                                       reinterpret_cast<const ::acc::blas::complex_double_t*>(alpha__),
+                                       reinterpret_cast<const ::acc::blas::complex_double_t*>(A__), lda__,
+                                       reinterpret_cast<::acc::blas::complex_double_t*>(B__), ldb__));
+#endif
+}
+
+inline void
+dger(int m, int n, double const* alpha, double const* x, int incx, double const* y, int incy, double* A, int lda,
+     int stream_id)
+{
+    // acc::set_device();
+    CALL_GPU_BLAS(::acc::blas::dger, (stream_handle(stream_id), m, n, alpha, x, incx, y, incy, A, lda));
+}
+
+inline void
+zgeru(int m, int n, acc_complex_double_t const* alpha, acc_complex_double_t const* x, int incx,
+      acc_complex_double_t const* y, int incy, acc_complex_double_t* A, int lda, int stream_id)
+{
+    // acc::set_device();
+    CALL_GPU_BLAS(::acc::blas::zgeru,
+                  (stream_handle(stream_id), m, n, reinterpret_cast<const ::acc::blas::complex_double_t*>(alpha),
+                   reinterpret_cast<const ::acc::blas::complex_double_t*>(x), incx,
+                   reinterpret_cast<const ::acc::blas::complex_double_t*>(y), incy,
+                   reinterpret_cast<::acc::blas::complex_double_t*>(A), lda));
+}
+
+inline void
+zaxpy(int n__, acc_complex_double_t const* alpha__, acc_complex_double_t const* x__, int incx__,
+      acc_complex_double_t* y__, int incy__)
+{
+    // acc::set_device();
+    CALL_GPU_BLAS(::acc::blas::zaxpy,
+                  (null_stream_handle(), n__, reinterpret_cast<const ::acc::blas::complex_double_t*>(alpha__),
+                   reinterpret_cast<const ::acc::blas::complex_double_t*>(x__), incx__,
+                   reinterpret_cast<::acc::blas::complex_double_t*>(y__), incy__));
+}
+
+#if defined(__CUDA)
+namespace xt {
+
+cublasXtHandle_t& cublasxt_handle();
+
+inline void
+create_handle()
+{
+    int device_id[1];
+    device_id[0] = acc::get_device_id();
+    CALL_GPU_BLAS(cublasXtCreate, (&cublasxt_handle()));
+    CALL_GPU_BLAS(cublasXtDeviceSelect, (cublasxt_handle(), 1, device_id));
+    CALL_GPU_BLAS(cublasXtSetBlockDim, (cublasxt_handle(), 4096));
+}
+
+inline void
+destroy_handle()
+{
+    CALL_GPU_BLAS(cublasXtDestroy, (cublasxt_handle()));
+}
+
+inline void
+zgemm(char transa, char transb, int32_t m, int32_t n, int32_t k, acc_complex_double_t const* alpha,
+      acc_complex_double_t const* a, int32_t lda, acc_complex_double_t const* b, int32_t ldb,
+      acc_complex_double_t const* beta, acc_complex_double_t* c, int32_t ldc)
+{
+    // acc::set_device();
+    CALL_GPU_BLAS(cublasXtZgemm, (cublasxt_handle(), get_gpublasOperation_t(transa), get_gpublasOperation_t(transb), m,
+                                  n, k, alpha, a, lda, b, ldb, beta, c, ldc));
+}
+
+inline void
+dgemm(char transa, char transb, int32_t m, int32_t n, int32_t k, double const* alpha, double const* a, int32_t lda,
+      double const* b, int32_t ldb, double const* beta, double* c, int32_t ldc)
+{
+    // acc::set_device();
+    CALL_GPU_BLAS(cublasXtDgemm, (cublasxt_handle(), get_gpublasOperation_t(transa), get_gpublasOperation_t(transb), m,
+                                  n, k, alpha, a, lda, b, ldb, beta, c, ldc));
+}
+
+inline void
+dtrmm(char side__, char uplo__, char transa__, char diag__, int m__, int n__, double const* alpha__, double const* A__,
+      int lda__, double* B__, int ldb__)
+{
+    ::acc::blas::side_mode_t side   = get_gpublasSideMode_t(side__);
+    ::acc::blas::fill_mode_t uplo   = get_gpublasFillMode_t(uplo__);
+    ::acc::blas::operation_t transa = get_gpublasOperation_t(transa__);
+    ::acc::blas::diagonal_t diag    = get_gpublasDiagonal_t(diag__);
+    // acc::set_device();
+    CALL_GPU_BLAS(cublasXtDtrmm,
+                  (cublasxt_handle(), side, uplo, transa, diag, m__, n__, alpha__, A__, lda__, B__, ldb__, B__, ldb__));
+}
+
+inline void
+ztrmm(char side__, char uplo__, char transa__, char diag__, int m__, int n__, acc_complex_double_t const* alpha__,
+      acc_complex_double_t const* A__, int lda__, acc_complex_double_t* B__, int ldb__)
+{
+    ::acc::blas::side_mode_t side   = get_gpublasSideMode_t(side__);
+    ::acc::blas::fill_mode_t uplo   = get_gpublasFillMode_t(uplo__);
+    ::acc::blas::operation_t transa = get_gpublasOperation_t(transa__);
+    ::acc::blas::diagonal_t diag    = get_gpublasDiagonal_t(diag__);
+    // acc::set_device();
+    CALL_GPU_BLAS(cublasXtZtrmm,
+                  (cublasxt_handle(), side, uplo, transa, diag, m__, n__, alpha__, A__, lda__, B__, ldb__, B__, ldb__));
+}
+
+} // namespace xt
+#endif
+
+} // namespace accblas
+
+#endif
diff --git a/src/gpu/acc_blas_api.hpp b/src/gpu/acc_blas_api.hpp
new file mode 100644
index 000000000..d4824fe06
--- /dev/null
+++ b/src/gpu/acc_blas_api.hpp
@@ -0,0 +1,249 @@
+// Copyright (c) 2013-2017 Anton Kozhevnikov, Thomas Schulthess
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification, are permitted provided that 
+// the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the 
+//    following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions 
+//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/** \file acc_blas_api.hpp
+ *
+ *  \brief Interface to cuBLAS / rocblas related functions.
+ */
+
+#ifndef __ACC_BLAS_API_HPP__
+#define __ACC_BLAS_API_HPP__
+
+#include <utility>
+
+#if defined(__CUDA)
+#include <cublas_v2.h>
+
+#elif defined(__ROCM)
+#include <rocblas.h>
+
+#else
+#error Either __CUDA or __ROCM must be defined!
+#endif
+
+namespace acc {
+namespace blas {
+
+#if defined(__CUDA)
+using handle_t = cublasHandle_t;
+using status_t = cublasStatus_t;
+using operation_t = cublasOperation_t;
+using side_mode_t = cublasSideMode_t;
+using diagonal_t = cublasDiagType_t;
+using fill_mode_t = cublasFillMode_t;
+using complex_float_t = cuComplex;
+using complex_double_t = cuDoubleComplex;
+#endif
+
+#if defined(__ROCM)
+using handle_t = rocblas_handle;
+using status_t = rocblas_status;
+using operation_t = rocblas_operation;
+using side_mode_t = rocblas_side;
+using diagonal_t = rocblas_diagonal;
+using fill_mode_t = rocblas_fill;
+using complex_float_t = rocblas_float_complex;
+using complex_double_t = rocblas_double_complex;
+#endif
+
+namespace operation {
+#if defined(__CUDA)
+constexpr auto None = CUBLAS_OP_N;
+constexpr auto Transpose = CUBLAS_OP_T;
+constexpr auto ConjugateTranspose = CUBLAS_OP_C;
+#endif
+
+#if defined(__ROCM)
+constexpr auto None = rocblas_operation_none;
+constexpr auto Transpose = rocblas_operation_transpose;
+constexpr auto ConjugateTranspose = rocblas_operation_conjugate_transpose;
+#endif
+}  // namespace operation
+
+namespace side {
+#if defined(__CUDA)
+constexpr auto Left = CUBLAS_SIDE_LEFT;
+constexpr auto Right = CUBLAS_SIDE_RIGHT;
+#endif
+
+#if defined(__ROCM)
+constexpr auto Left = rocblas_side_left;
+constexpr auto Right = rocblas_side_right;
+#endif
+}  // namespace side
+
+namespace diagonal {
+#if defined(__CUDA)
+constexpr auto NonUnit = CUBLAS_DIAG_NON_UNIT;
+constexpr auto Unit = CUBLAS_DIAG_UNIT;
+#endif
+
+#if defined(__ROCM)
+constexpr auto NonUnit = rocblas_diagonal_non_unit;
+constexpr auto Unit = rocblas_diagonal_unit;
+#endif
+}  // namespace diagonal
+
+namespace fill {
+#if defined(__CUDA)
+constexpr auto Upper = CUBLAS_FILL_MODE_UPPER;
+constexpr auto Lower = CUBLAS_FILL_MODE_LOWER;
+#endif
+
+#if defined(__ROCM)
+constexpr auto Upper = rocblas_fill_upper;
+constexpr auto Lower = rocblas_fill_lower;
+#endif
+}  // namespace fill
+
+namespace status {
+#if defined(__CUDA)
+constexpr auto Success = CUBLAS_STATUS_SUCCESS;
+#endif
+
+#if defined(__ROCM)
+constexpr auto Success = rocblas_status_success;
+#endif
+}  // namespace status
+
+// =======================================
+// Forwarding functions to the GPU BLAS API
+// =======================================
+template <typename... ARGS>
+inline auto create(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_create_handle(std::forward<ARGS>(args)...);
+#else
+  return cublasCreate(std::forward<ARGS>(args)...);
+#endif
+}
+
+template <typename... ARGS>
+inline auto destroy(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_destroy_handle(std::forward<ARGS>(args)...);
+#else
+  return cublasDestroy(std::forward<ARGS>(args)...);
+#endif
+}
+
+template <typename... ARGS>
+inline auto set_stream(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_set_stream(std::forward<ARGS>(args)...);
+#else
+  return cublasSetStream(std::forward<ARGS>(args)...);
+#endif
+}
+
+template <typename... ARGS>
+inline auto get_stream(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_get_stream(std::forward<ARGS>(args)...);
+#else
+  return cublasGetStream(std::forward<ARGS>(args)...);
+#endif
+}
+
+
+template <typename... ARGS>
+inline auto dgemm(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_dgemm(std::forward<ARGS>(args)...);
+#else
+  return cublasDgemm(std::forward<ARGS>(args)...);
+#endif // __ROCM
+}
+
+template <typename... ARGS>
+inline auto zgemm(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_zgemm(std::forward<ARGS>(args)...);
+#else
+  return cublasZgemm(std::forward<ARGS>(args)...);
+#endif // __ROCM
+}
+
+template <typename... ARGS>
+inline auto dgemv(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_dgemv(std::forward<ARGS>(args)...);
+#else
+  return cublasDgemv(std::forward<ARGS>(args)...);
+#endif // __ROCM
+}
+
+template <typename... ARGS>
+inline auto zgemv(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_zgemv(std::forward<ARGS>(args)...);
+#else
+  return cublasZgemv(std::forward<ARGS>(args)...);
+#endif // __ROCM
+}
+
+template <typename... ARGS>
+inline auto dtrmm(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_dtrmm(std::forward<ARGS>(args)...);
+#else
+  return cublasDtrmm(std::forward<ARGS>(args)...);
+#endif // __ROCM
+}
+
+template <typename... ARGS>
+inline auto ztrmm(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_ztrmm(std::forward<ARGS>(args)...);
+#else
+  return cublasZtrmm(std::forward<ARGS>(args)...);
+#endif // __ROCM
+}
+
+template <typename... ARGS>
+inline auto dger(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_dger(std::forward<ARGS>(args)...);
+#else
+  return cublasDger(std::forward<ARGS>(args)...);
+#endif // __ROCM
+}
+
+template <typename... ARGS>
+inline auto zgeru(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_zgeru(std::forward<ARGS>(args)...);
+#else
+  return cublasZgeru(std::forward<ARGS>(args)...);
+#endif // __ROCM
+}
+
+template <typename... ARGS>
+inline auto zaxpy(ARGS&&... args) -> status_t {
+#if defined(__ROCM)
+  return rocblas_zaxpy(std::forward<ARGS>(args)...);
+#else
+  return cublasZaxpy(std::forward<ARGS>(args)...);
+#endif // __ROCM
+}
+
+}  // namespace blas
+}  // namespace acc
+
+#endif
diff --git a/src/gpu/cublas.cpp b/src/gpu/cublas.cpp
new file mode 100644
index 000000000..3878b704e
--- /dev/null
+++ b/src/gpu/cublas.cpp
@@ -0,0 +1,32 @@
+#ifdef __CUDA
+#include "cublas.hpp"
+
+namespace cublas {
+
+cublasHandle_t&
+null_stream_handle()
+{
+    static cublasHandle_t null_stream_handle_;
+    return null_stream_handle_;
+}
+
+/// Store the cublas handlers associated with cuda streams.
+std::vector<cublasHandle_t>&
+stream_handles()
+{
+    static std::vector<cublasHandle_t> stream_handles_;
+    return stream_handles_;
+}
+
+namespace xt {
+
+cublasXtHandle_t&
+cublasxt_handle()
+{
+    static cublasXtHandle_t handle;
+    return handle;
+}
+} // namespace xt
+
+} // namespace cublas
+#endif
diff --git a/src/gpu/cublas.hpp b/src/gpu/cublas.hpp
index 3c7324611..ef1b1d56c 100644
--- a/src/gpu/cublas.hpp
+++ b/src/gpu/cublas.hpp
@@ -1,20 +1,20 @@
 // Copyright (c) 2013-2017 Anton Kozhevnikov, Thomas Schulthess
 // All rights reserved.
 //
-// Redistribution and use in source and binary forms, with or without modification, are permitted provided that 
+// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
 // the following conditions are met:
 //
-// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the 
+// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
 //    following disclaimer.
-// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions 
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
 //    and the following disclaimer in the documentation and/or other materials provided with the distribution.
 //
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
-// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 /** \file cublas.hpp
@@ -51,8 +51,24 @@ inline void error_message(cublasStatus_t status)
             std::printf("the function failed to launch on the GPU\n");
             break;
         }
+        case CUBLAS_STATUS_INTERNAL_ERROR: {
+            std::printf("cublas internal error\n");
+            break;
+        }
+        case CUBLAS_STATUS_MAPPING_ERROR: {
+            std::printf("cublas mapping error\n");
+            break;
+        }
+        case CUBLAS_STATUS_ALLOC_FAILED: { // distinct text from the MAPPING_ERROR arm
+            std::printf("cublas resource allocation failed\n");
+            break;
+        }
+        case CUBLAS_STATUS_NOT_SUPPORTED: {
+            std::printf("cublas not supported error\n");
+            break;
+        }
         default: {
-            std::printf("cublas status unknown");
+            std::printf("cublas status unknown, error code = %d\n", static_cast<int>(status)); // literal format string
         }
     }
 }
@@ -165,18 +181,9 @@ inline cublasDiagType_t get_cublasDiagType_t(char c)
 #endif
 
 /// Store the default (null) stream handler.
-inline cublasHandle_t& null_stream_handle()
-{
-    static cublasHandle_t null_stream_handle_;
-    return null_stream_handle_;
-}
-
+cublasHandle_t& null_stream_handle();
 /// Store the cublas handlers associated with cuda streams.
-inline std::vector<cublasHandle_t>& stream_handles()
-{
-    static std::vector<cublasHandle_t> stream_handles_;
-    return stream_handles_;
-}
+std::vector<cublasHandle_t>& stream_handles();
 
 inline void create_stream_handles()
 {
@@ -205,15 +212,15 @@ inline cublasHandle_t stream_handle(int id__)
     return (id__ == -1) ? null_stream_handle() : stream_handles()[id__];
 }
 
-inline void zgemv(char transa, int32_t m, int32_t n, cuDoubleComplex* alpha, cuDoubleComplex* a, int32_t lda, 
+inline void zgemv(char transa, int32_t m, int32_t n, cuDoubleComplex* alpha, cuDoubleComplex* a, int32_t lda,
                   cuDoubleComplex* x, int32_t incx, cuDoubleComplex* beta, cuDoubleComplex* y, int32_t incy, int stream_id)
 {
     //acc::set_device();
     CALL_CUBLAS(cublasZgemv, (stream_handle(stream_id), get_cublasOperation_t(transa), m, n, alpha, a, lda, x, incx, beta, y, incy));
 }
 
-inline void zgemm(char transa, char transb, int32_t m, int32_t n, int32_t k, 
-                  cuDoubleComplex const* alpha, cuDoubleComplex const* a, int32_t lda, cuDoubleComplex const* b, 
+inline void zgemm(char transa, char transb, int32_t m, int32_t n, int32_t k,
+                  cuDoubleComplex const* alpha, cuDoubleComplex const* a, int32_t lda, cuDoubleComplex const* b,
                   int32_t ldb, cuDoubleComplex const* beta, cuDoubleComplex* c, int32_t ldc, int stream_id)
 {
     //acc::set_device();
@@ -221,12 +228,12 @@ inline void zgemm(char transa, char transb, int32_t m, int32_t n, int32_t k,
                               m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
 }
 
-inline void dgemm(char transa, char transb, int32_t m, int32_t n, int32_t k, 
-                  double const* alpha, double const* a, int32_t lda, double const* b, 
+inline void dgemm(char transa, char transb, int32_t m, int32_t n, int32_t k,
+                  double const* alpha, double const* a, int32_t lda, double const* b,
                   int32_t ldb, double const* beta, double* c, int32_t ldc, int stream_id)
 {
     //acc::set_device();
-    CALL_CUBLAS(cublasDgemm, (stream_handle(stream_id), get_cublasOperation_t(transa), get_cublasOperation_t(transb), 
+    CALL_CUBLAS(cublasDgemm, (stream_handle(stream_id), get_cublasOperation_t(transa), get_cublasOperation_t(transb),
                               m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
 }
 
@@ -276,7 +283,7 @@ inline void zgeru(int                    m,
                   cuDoubleComplex const* y,
                   int                    incy,
                   cuDoubleComplex*       A,
-                  int                    lda, 
+                  int                    lda,
                   int                    stream_id)
 {
     //acc::set_device();
@@ -296,11 +303,7 @@ inline void zaxpy(int                    n__,
 
 namespace xt {
 
-inline cublasXtHandle_t& cublasxt_handle()
-{
-    static cublasXtHandle_t handle;
-    return handle;
-}
+cublasXtHandle_t& cublasxt_handle();
 
 inline void create_handle()
 {
@@ -316,7 +319,7 @@ inline void destroy_handle()
     CALL_CUBLAS(cublasXtDestroy, (cublasxt_handle()));
 }
 
-inline void zgemm(char transa, char transb, int32_t m, int32_t n, int32_t k, 
+inline void zgemm(char transa, char transb, int32_t m, int32_t n, int32_t k,
                   cuDoubleComplex const* alpha, cuDoubleComplex const* a, int32_t lda, cuDoubleComplex const* b,
                   int32_t ldb, cuDoubleComplex const* beta, cuDoubleComplex* c, int32_t ldc)
 {
@@ -325,8 +328,8 @@ inline void zgemm(char transa, char transb, int32_t m, int32_t n, int32_t k,
                                 m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
 }
 
-inline void dgemm(char transa, char transb, int32_t m, int32_t n, int32_t k, 
-                  double const* alpha, double const* a, int32_t lda, double const* b, 
+inline void dgemm(char transa, char transb, int32_t m, int32_t n, int32_t k,
+                  double const* alpha, double const* a, int32_t lda, double const* b,
                   int32_t ldb, double const* beta, double* c, int32_t ldc)
 {
     //acc::set_device();
diff --git a/src/gpu/cuda_common.hpp b/src/gpu/cuda_common.hpp
index cc6df0c5c..e06627833 100644
--- a/src/gpu/cuda_common.hpp
+++ b/src/gpu/cuda_common.hpp
@@ -27,6 +27,7 @@
 
 #include <stdio.h>
 #include "acc.hpp"
+#include "acc_runtime.hpp"
 
 const double twopi = 6.2831853071795864769;
 
diff --git a/src/gpu/cusolver.cpp b/src/gpu/cusolver.cpp
new file mode 100644
index 000000000..6e2f23819
--- /dev/null
+++ b/src/gpu/cusolver.cpp
@@ -0,0 +1,27 @@
+#ifdef __CUDA
+#include "cusolver.hpp"
+
+namespace cusolver {
+
+/// Return a reference to the function-local static cuSOLVER dense handle.
+cusolverDnHandle_t& cusolver_handle()
+{
+    static cusolverDnHandle_t instance;
+    return instance;
+}
+
+/// Initialise the handle returned by cusolver_handle() via cusolverDnCreate.
+void create_handle()
+{
+    CALL_CUSOLVER(cusolverDnCreate, (&cusolver_handle()));
+}
+
+/// Pass the handle returned by cusolver_handle() to cusolverDnDestroy.
+void destroy_handle()
+{
+    CALL_CUSOLVER(cusolverDnDestroy, (cusolver_handle()));
+}
+
+} // namespace cusolver
+
+#endif // __CUDA
diff --git a/src/gpu/cusolver.hpp b/src/gpu/cusolver.hpp
index 0ee09d882..0a1c4e10a 100644
--- a/src/gpu/cusolver.hpp
+++ b/src/gpu/cusolver.hpp
@@ -62,21 +62,9 @@ inline void error_message(cusolverStatus_t status)
     }                                                                               \
 }
 
-inline cusolverDnHandle_t& cusolver_handle()
-{
-    static cusolverDnHandle_t handle;
-    return handle;
-}
-
-inline void create_handle()
-{
-    CALL_CUSOLVER(cusolverDnCreate, (&cusolver_handle()));
-}
-
-inline void destroy_handle()
-{
-    CALL_CUSOLVER(cusolverDnDestroy, (cusolver_handle()));
-}
+cusolverDnHandle_t& cusolver_handle();
+void create_handle();
+void destroy_handle();
 
 } // namespace cusolver
 
diff --git a/src/gpu/generate_dm_pw.cu b/src/gpu/generate_dm_pw.cu
index 6b38d15ed..e11eadbc9 100644
--- a/src/gpu/generate_dm_pw.cu
+++ b/src/gpu/generate_dm_pw.cu
@@ -24,7 +24,7 @@
 
 #include "gpu/cuda_common.hpp"
 #include "gpu/acc_runtime.hpp"
-#include "gpu/gpublas_interface.hpp"
+#include "gpu/acc_blas.hpp"
 
 __global__ void generate_phase_factors_conj_gpu_kernel
 (
@@ -86,7 +86,7 @@ extern "C" void generate_dm_pw_gpu(int num_atoms__,
     double alpha = 1;
     double beta = 0;
 
-    gpublas::dgemm('N', 'T', nbf__ * (nbf__ + 1) / 2, num_gvec_loc__ * 2, num_atoms__,
+    accblas::dgemm('N', 'T', nbf__ * (nbf__ + 1) / 2, num_gvec_loc__ * 2, num_atoms__,
                   &alpha,
                   dm__, nbf__ * (nbf__ + 1) / 2,
                   phase_factors__, num_gvec_loc__ * 2,
diff --git a/src/gpu/gpublas_interface.hpp b/src/gpu/gpublas_interface.hpp
deleted file mode 100644
index a0f9579bf..000000000
--- a/src/gpu/gpublas_interface.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (c) 2013-2017 Anton Kozhevnikov, Thomas Schulthess
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without modification, are permitted provided that 
-// the following conditions are met:
-//
-// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the 
-//    following disclaimer.
-// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions 
-//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
-// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
-// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-/** \file cublas.hpp
- *
- *  \brief Interface to GPU blas libraries
- */
-
-#ifndef __GPUBLAS_INTERFACE_HPP__
-#define __GPUBLAS_INTERFACE_HPP__
-
-
-#if defined(__GPU) && defined(__CUDA)
-#include "cublas.hpp"
-namespace gpublas = cublas;
-
-#elif defined(__GPU) && defined(__ROCM)
-#include "hipblas_interface.hpp"
-namespace gpublas = hipblas;
-
-#endif
-
-#endif
diff --git a/src/gpu/hipblas_interface.hpp b/src/gpu/hipblas_interface.hpp
deleted file mode 100644
index b6f8317b8..000000000
--- a/src/gpu/hipblas_interface.hpp
+++ /dev/null
@@ -1,303 +0,0 @@
-// Copyright (c) 2013-2017 Anton Kozhevnikov, Thomas Schulthess
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without modification, are permitted provided that 
-// the following conditions are met:
-//
-// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the 
-//    following disclaimer.
-// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions 
-//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 
-// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
-// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
-// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
-// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-/** \file hipblas.hpp
- *
- *  \brief Interface to hipblas related functions.
- */
-
-#ifndef __HIP_BLAS_INTERFACE_HPP__
-#define __HIP_BLAS_INTERFACE_HPP__
-
-#include <unistd.h>
-#include <hip/hip_runtime_api.h>
-#include <hip/hip_complex.h>
-#include <vector>
-#include <hipblas.h>
-#include "acc.hpp"
-#include "hipblas_port.h"
-// #include "blas_lapack.h"
-
-namespace hipblas {
-
-inline void error_message(hipblasStatus_t status)
-{
-    switch (status) {
-        case HIPBLAS_STATUS_NOT_INITIALIZED: {
-            std::printf("the library was not initialized\n");
-            break;
-        }
-        case HIPBLAS_STATUS_INVALID_VALUE: {
-            std::printf("the parameters m,n,k<0\n");
-            break;
-        }
-        case HIPBLAS_STATUS_ARCH_MISMATCH: {
-            std::printf("the device does not support double-precision\n");
-            break;
-        }
-        case HIPBLAS_STATUS_EXECUTION_FAILED: {
-            std::printf("the function failed to launch on the GPU\n");
-            break;
-        }
-        default: {
-            std::printf("hipblas status unknown");
-        }
-    }
-}
-
-inline hipblasOperation_t get_hipblasOperation_t(char c)
-{
-    switch (c) {
-        case 'n':
-        case 'N': {
-            return HIPBLAS_OP_N;
-        }
-        case 't':
-        case 'T': {
-            return HIPBLAS_OP_T;
-        }
-        case 'c':
-        case 'C': {
-            return HIPBLAS_OP_C;
-        }
-        default: {
-            throw std::runtime_error("get_hipblasOperation_t(): wrong operation");
-        }
-    }
-    return HIPBLAS_OP_N; // make compiler happy
-}
-
-inline hipblasSideMode_t get_hipblasSideMode_t(char c)
-{
-    switch (c) {
-        case 'l':
-        case 'L': {
-            return HIPBLAS_SIDE_LEFT;
-        }
-        case 'r':
-        case 'R': {
-            return HIPBLAS_SIDE_RIGHT;
-        }
-        default: {
-            throw std::runtime_error("get_hipblasSideMode_t(): wrong side");
-        }
-    }
-    return HIPBLAS_SIDE_LEFT; //make compiler happy
-}
-
-inline hipblasFillMode_t get_hipblasFillMode_t(char c)
-{
-    switch (c) {
-        case 'u':
-        case 'U': {
-            return HIPBLAS_FILL_MODE_UPPER;
-        }
-        case 'l':
-        case 'L': {
-            return HIPBLAS_FILL_MODE_LOWER;
-        }
-        default: {
-            throw std::runtime_error("get_hipblasFillMode_t(): wrong mode");
-        }
-    }
-    return HIPBLAS_FILL_MODE_UPPER; // make compiler happy
-}
-
-inline hipblasDiagType_t get_hipblasDiagType_t(char c)
-{
-    switch (c) {
-        case 'n':
-        case 'N': {
-            return HIPBLAS_DIAG_NON_UNIT;
-        }
-        case 'u':
-        case 'U': {
-            return HIPBLAS_DIAG_UNIT;
-        }
-        default: {
-            throw std::runtime_error("get_hipblasDiagType_t(): wrong diagonal type");
-        }
-    }
-    return HIPBLAS_DIAG_NON_UNIT; // make compiler happy
-}
-
-#ifdef NDEBUG
-#define CALL_HIPBLAS(func__, args__)                                                                                   \
-    {                                                                                                                  \
-        hipblasStatus_t status;                                                                                        \
-        if ((status = func__ args__) != HIPBLAS_STATUS_SUCCESS) {                                                      \
-            error_message(status);                                                                                     \
-            char nm[1024];                                                                                             \
-            gethostname(nm, 1024);                                                                                     \
-            std::printf("hostname: %s\n", nm);                                                                              \
-            std::printf("Error in %s at line %i of file %s\n", #func__, __LINE__, __FILE__);                                \
-            stack_backtrace();                                                                                         \
-        }                                                                                                              \
-        hipDeviceSynchronize();                                                                                        \
-    }
-#else
-#define CALL_HIPBLAS(func__, args__)                                                                                   \
-    {                                                                                                                  \
-        hipblasStatus_t status;                                                                                        \
-        if ((status = func__ args__) != HIPBLAS_STATUS_SUCCESS) {                                                      \
-            error_message(status);                                                                                     \
-            char nm[1024];                                                                                             \
-            gethostname(nm, 1024);                                                                                     \
-            std::printf("hostname: %s\n", nm);                                                                              \
-            std::printf("Error in %s at line %i of file %s\n", #func__, __LINE__, __FILE__);                                \
-            stack_backtrace();                                                                                         \
-        }                                                                                                              \
-        hipDeviceSynchronize();                                                                                        \
-    }
-#endif
-
-/// Store the default (null) stream handler.
-inline hipblasHandle_t& null_stream_handle()
-{
-    static hipblasHandle_t null_stream_handle_;
-    return null_stream_handle_;
-}
-
-/// Store the hipblas handlers associated with hip streams.
-inline std::vector<hipblasHandle_t>& stream_handles()
-{
-    static std::vector<hipblasHandle_t> stream_handles_;
-    return stream_handles_;
-}
-
-inline void create_stream_handles()
-{
-    CALL_HIPBLAS(hipblasCreate, (&null_stream_handle()));
-
-    stream_handles() = std::vector<hipblasHandle_t>(acc::num_streams());
-    for (int i = 0; i < acc::num_streams(); i++) {
-        CALL_HIPBLAS(hipblasCreate, (&stream_handles()[i]));
-
-        CALL_HIPBLAS(hipblasSetStream, (stream_handles()[i], acc::stream(stream_id(i))));
-    }
-}
-
-inline void destroy_stream_handles()
-{
-    CALL_HIPBLAS(hipblasDestroy, (null_stream_handle()));
-    for (int i = 0; i < acc::num_streams(); i++) {
-        CALL_HIPBLAS(hipblasDestroy, (stream_handles()[i]));
-    }
-}
-
-inline hipblasHandle_t stream_handle(int id)
-{
-    return (id == -1) ? null_stream_handle() : stream_handles()[id];
-}
-
-inline void zgemv(char transa, int32_t m, int32_t n, hipDoubleComplex* alpha, hipDoubleComplex* a, int32_t lda,
-                  hipDoubleComplex* x, int32_t incx, hipDoubleComplex* beta, hipDoubleComplex* y, int32_t incy,
-                  int stream_id)
-{
-    CALL_HIPBLAS(hipblas_port_Zgemv, (stream_handle(stream_id), get_hipblasOperation_t(transa), m, n, alpha, a, lda, x,
-                                      incx, beta, y, incy));
-}
-
-inline void zgemm(char transa, char transb, int32_t m, int32_t n, int32_t k, hipDoubleComplex const* alpha,
-                  hipDoubleComplex const* a, int32_t lda, hipDoubleComplex const* b, int32_t ldb,
-                  hipDoubleComplex const* beta, hipDoubleComplex* c, int32_t ldc, int stream_id)
-{
-    CALL_HIPBLAS(hipblas_port_Zgemm, (stream_handle(stream_id), get_hipblasOperation_t(transa),
-                                      get_hipblasOperation_t(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
-}
-
-inline void dgemm(char transa, char transb, int32_t m, int32_t n, int32_t k, double const* alpha, double const* a,
-                  int32_t lda, double const* b, int32_t ldb, double const* beta, double* c, int32_t ldc, int stream_id)
-{
-    CALL_HIPBLAS(hipblasDgemm, (stream_handle(stream_id), get_hipblasOperation_t(transa),
-                                get_hipblasOperation_t(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
-}
-
-inline void dtrmm(char side, char uplo, char transa, char diag, int m, int n, double const* alpha,
-                  double const* A, int lda, double* B, int ldb)
-{
-    hipblasSideMode_t side_gpu    = get_hipblasSideMode_t(side);
-    hipblasFillMode_t uplo_gpu    = get_hipblasFillMode_t(uplo);
-    hipblasOperation_t transa_gpu = get_hipblasOperation_t(transa);
-    hipblasDiagType_t diag_gpu    = get_hipblasDiagType_t(diag);
-    CALL_HIPBLAS(hipblas_port_Dtrmm,
-                 (null_stream_handle(), side_gpu, uplo_gpu, transa_gpu, diag_gpu, m, n, alpha, A, lda, B, ldb, B, ldb));
-}
-
-inline void ztrmm(char side, char uplo, char transa, char diag, int m, int n,
-                  hipDoubleComplex const* alpha, hipDoubleComplex const* A, int lda, hipDoubleComplex* B,
-                  int ldb)
-{
-    hipblasSideMode_t side_gpu    = get_hipblasSideMode_t(side);
-    hipblasFillMode_t uplo_gpu    = get_hipblasFillMode_t(uplo);
-    hipblasOperation_t transa_gpu = get_hipblasOperation_t(transa);
-    hipblasDiagType_t diag_gpu    = get_hipblasDiagType_t(diag);
-    CALL_HIPBLAS(hipblas_port_Ztrmm,
-                 (null_stream_handle(), side_gpu, uplo_gpu, transa_gpu, diag_gpu, m, n, alpha, A, lda, B, ldb, B, ldb));
-
-    // copy to host, calculate, copy back
-    // int size_A, size_B;
-    // size_B = n * ldb;
-    // if (side == 'l' || side == 'L') {
-    //     if (transa == 'n' || transa == 'N')
-    //         size_A = m * lda;
-    //     else
-    //         size_A = n * lda;
-    // } else {
-    //     if (transa == 'n' || transa == 'N')
-    //         size_A = n * lda;
-    //     else
-    //         size_A = m * lda;
-    // }
-    // std::vector<hipDoubleComplex> A_host(size_A);
-    // std::vector<hipDoubleComplex> B_host(size_B);
-    // acc::copyout(A_host.data(), A, A_host.size());
-    // acc::copyout(B_host.data(), B, B_host.size());
-    // ftn_int mf = m;
-    // ftn_int nf = n;
-    // ftn_int ldaf = lda;
-    // ftn_int ldbf = ldb;
-    // FORTRAN(dtrmm)
-    // (&side, &uplo, &transa, "N", &mf, &nf, const_cast<ftn_double*>((const ftn_double*)alpha),
-    //  ((ftn_double*)A_host.data()), &ldaf, ((ftn_double*)B_host.data()), &ldbf, (ftn_len)1,
-    //  (ftn_len)1, (ftn_len)1, (ftn_len)1);
-    // acc::copyin(const_cast<hipDoubleComplex*>(B), B_host.data(), B_host.size());
-}
-
-inline void dger(int m, int n, double const* alpha, double const* x, int incx, double const* y, int incy, double* A,
-                 int lda, int stream_id)
-{
-    CALL_HIPBLAS(hipblasDger, (stream_handle(stream_id), m, n, alpha, x, incx, y, incy, A, lda));
-}
-
-inline void zgeru(int m, int n, hipDoubleComplex const* alpha, hipDoubleComplex const* x, int incx,
-                  hipDoubleComplex const* y, int incy, hipDoubleComplex* A, int lda, int stream_id)
-{
-    CALL_HIPBLAS(hipblas_port_Zgeru, (stream_handle(stream_id), m, n, alpha, x, incx, y, incy, A, lda));
-}
-
-inline void zaxpy(int n, hipDoubleComplex const* alpha, hipDoubleComplex const* x, int incx,
-                  hipDoubleComplex* y, int incy)
-{
-    CALL_HIPBLAS(hipblas_port_Zaxpy, (null_stream_handle(), n, alpha, x, incx, y, incy));
-}
-
-} // namespace hipblas
-
-#endif
diff --git a/src/gpu/hipblas_port/CMakeLists.txt b/src/gpu/hipblas_port/CMakeLists.txt
deleted file mode 100644
index 27a5af5be..000000000
--- a/src/gpu/hipblas_port/CMakeLists.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-
-if(NOT USE_ROCM)
-    message(FATAL_ERROR "CMake file must not be included without ROCm enabled!")
-endif()
-
-rocm_hip_add_library(hipblas_port SHARED
-    rocblas_port_gemv.hip.cpp hipblas_port.hip.cpp rocblas_port_gemm.hip.cpp 
-    rocblas_port_trmm.hip.cpp rocblas_port_ger.hip.cpp rocblas_port_axpy.hip.cpp
-    FLAGS "-Wno-macro-redefined -std=c++14" INCLUDE_DIRS ${ROCM_INCLUDE_DIRS})
-
-option(BUILD_HIPBLAS_TESTS "Build tests for custom implementation of blas functions in ROCm" OFF)
-if (BUILD_HIPBLAS_TESTS)
-    # download google test
-	set(BUILD_GMOCK OFF)
-	set(INSTALL_GTEST OFF)
-	include(FetchContent) # requires CMake 3.11
-	FetchContent_Declare(
-	  googletest
-	  GIT_REPOSITORY https://github.com/google/googletest.git
-	  GIT_TAG        release-1.8.1
-	)
-	FetchContent_GetProperties(googletest)
-	if(NOT googletest_POPULATED)
-	  FetchContent_Populate(googletest)
-	  add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
-	endif()
-
-
-    add_executable(test_hipblas_port tests/main.cpp tests/gemv_test.cpp tests/gemm_test.cpp tests/trmm_test.cpp tests/ger_test.cpp tests/axpy_test.cpp)
-    target_link_libraries(test_hipblas_port ${ROCM_LIBRARIES} hipblas_port gtest_main)
-	target_include_directories(test_hipblas_port PRIVATE ${CMAKE_CURRENT_LIST_DIR})
-endif()
-
diff --git a/src/gpu/hipblas_port/hipblas_port.h b/src/gpu/hipblas_port/hipblas_port.h
deleted file mode 100644
index 3eaa6e9c4..000000000
--- a/src/gpu/hipblas_port/hipblas_port.h
+++ /dev/null
@@ -1,115 +0,0 @@
-#ifndef _HIPBLAS_PORT_H_
-#define _HIPBLAS_PORT_H_
-
-#include <hip/hip_complex.h>
-#include <hip/hip_runtime_api.h>
-#include <hipblas.h>
-
-/*
- * GEMV
- */
-hipblasStatus_t hipblas_port_Sgemv(hipblasHandle_t handle, hipblasOperation_t trans, int m, int n, const float* alpha,
-                                   const float* A, int lda, const float* x, int incx, const float* beta, float* y,
-                                   int incy);
-
-hipblasStatus_t hipblas_port_Dgemv(hipblasHandle_t handle, hipblasOperation_t trans, int m, int n, const double* alpha,
-                                   const double* A, int lda, const double* x, int incx, const double* beta, double* y,
-                                   int incy);
-
-hipblasStatus_t hipblas_port_Cgemv(hipblasHandle_t handle, hipblasOperation_t trans, int m, int n,
-                                   const hipFloatComplex* alpha, const hipFloatComplex* A, int lda,
-                                   const hipFloatComplex* x, int incx, const hipFloatComplex* beta, hipFloatComplex* y,
-                                   int incy);
-
-hipblasStatus_t hipblas_port_Zgemv(hipblasHandle_t handle, hipblasOperation_t trans, int m, int n,
-                                   const hipDoubleComplex* alpha, const hipDoubleComplex* A, int lda,
-                                   const hipDoubleComplex* x, int incx, const hipDoubleComplex* beta,
-                                   hipDoubleComplex* y, int incy);
-
-/*
- * GEMM
- */
-hipblasStatus_t hipblas_port_Sgemm(hipblasHandle_t handle,  hipblasOperation_t transa, hipblasOperation_t transb,
-                           int m, int n, int k, const float *alpha, 
-                           const float *A, int lda, 
-                           const float *B, int ldb, 
-                           const float *beta, 
-                           float *C, int ldc);
-
-hipblasStatus_t hipblas_port_Dgemm(hipblasHandle_t handle,  hipblasOperation_t transa, hipblasOperation_t transb,
-                           int m, int n, int k, const double *alpha, 
-                           const double *A, int lda, 
-                           const double *B, int ldb, 
-                           const double *beta, 
-                           double *C, int ldc);
-
-hipblasStatus_t hipblas_port_Cgemm(hipblasHandle_t handle,  hipblasOperation_t transa, hipblasOperation_t transb,
-                           int m, int n, int k, const hipFloatComplex *alpha, 
-                           const hipFloatComplex *A, int lda, 
-                           const hipFloatComplex *B, int ldb, 
-                           const hipFloatComplex *beta, 
-                           hipFloatComplex *C, int ldc);
-
-hipblasStatus_t hipblas_port_Zgemm(hipblasHandle_t handle,  hipblasOperation_t transa, hipblasOperation_t transb,
-                           int m, int n, int k, const hipDoubleComplex *alpha, 
-                           const hipDoubleComplex *A, int lda, 
-                           const hipDoubleComplex *B, int ldb, 
-                           const hipDoubleComplex *beta, 
-                           hipDoubleComplex *C, int ldc);
-
-/*
- * TRMM
- */
-
-hipblasStatus_t hipblas_port_Strmm(hipblasHandle_t handle, hipblasSideMode_t side, hipblasFillMode_t uplo,
-                                   hipblasOperation_t trans, hipblasDiagType_t diag, int m, int n, const float* alpha,
-                                   const float* A, int lda, const float* B, int ldb, float* C, int ldc);
-
-hipblasStatus_t hipblas_port_Dtrmm(hipblasHandle_t handle, hipblasSideMode_t side, hipblasFillMode_t uplo,
-                                   hipblasOperation_t trans, hipblasDiagType_t diag, int m, int n, const double* alpha,
-                                   const double* A, int lda, const double* B, int ldb, double* C, int ldc);
-
-hipblasStatus_t hipblas_port_Ctrmm(hipblasHandle_t handle, hipblasSideMode_t side, hipblasFillMode_t uplo,
-                                   hipblasOperation_t trans, hipblasDiagType_t diag, int m, int n,
-                                   const hipFloatComplex* alpha, const hipFloatComplex* A, int lda,
-                                   const hipFloatComplex* B, int ldb, hipFloatComplex* C, int ldc);
-
-hipblasStatus_t hipblas_port_Ztrmm(hipblasHandle_t handle, hipblasSideMode_t side, hipblasFillMode_t uplo,
-                                   hipblasOperation_t trans, hipblasDiagType_t diag, int m, int n,
-                                   const hipDoubleComplex* alpha, const hipDoubleComplex* A, int lda,
-                                   const hipDoubleComplex* B, int ldb, hipDoubleComplex* C, int ldc);
-
-
-
-/*
- * GER
- */
-hipblasStatus_t hipblas_port_Sger(hipblasHandle_t handle, int m, int n, const float* alpha, const float* x, int incx,
-                                  const float* y, int incy, float* A, int lda);
-
-hipblasStatus_t hipblas_port_Dger(hipblasHandle_t handle, int m, int n, const double* alpha, const double* x, int incx,
-                                  const double* y, int incy, double* A, int lda);
-
-hipblasStatus_t hipblas_port_Cgeru(hipblasHandle_t handle, int m, int n, const hipFloatComplex* alpha,
-                                   const hipFloatComplex* x, int incx, const hipFloatComplex* y, int incy,
-                                   hipFloatComplex* A, int lda);
-
-hipblasStatus_t hipblas_port_Zgeru(hipblasHandle_t handle, int m, int n, const hipDoubleComplex* alpha,
-                                   const hipDoubleComplex* x, int incx, const hipDoubleComplex* y, int incy,
-                                   hipDoubleComplex* A, int lda);
-
-/*
- * AXPY
- */
-hipblasStatus_t hipblas_port_Saxpy(hipblasHandle_t handle, int n, const float* alpha, const float* x, int incx,
-                                   float* y, int incy);
-
-hipblasStatus_t hipblas_port_Daxpy(hipblasHandle_t handle, int n, const double* alpha, const double* x, int incx,
-                                   double* y, int incy);
-
-hipblasStatus_t hipblas_port_Caxpy(hipblasHandle_t handle, int n, const hipFloatComplex* alpha,
-                                   const hipFloatComplex* x, int incx, hipFloatComplex* y, int incy);
-
-hipblasStatus_t hipblas_port_Zaxpy(hipblasHandle_t handle, int n, const hipDoubleComplex* alpha,
-                                   const hipDoubleComplex* x, int incx, hipDoubleComplex* y, int incy);
-#endif
diff --git a/src/gpu/hipblas_port/hipblas_port.hip.cpp b/src/gpu/hipblas_port/hipblas_port.hip.cpp
deleted file mode 100644
index 966aced2c..000000000
--- a/src/gpu/hipblas_port/hipblas_port.hip.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-
-#include "rocblas_port.h"
-#include <hipblas.h>
-#include "rocblas_port/port_hip_roc_translation.h"
-
-hipblasStatus_t hipblas_port_Sgemv(hipblasHandle_t handle, hipblasOperation_t trans, int m, int n, const float* alpha,
-                                   const float* A, int lda, const float* x, int incx, const float* beta, float* y,
-                                   int incy)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_sgemv((rocblas_handle)handle, hipOperationToHCCOperation(trans), m, n,
-                                                       alpha, A, lda, x, incx, beta, y, incy));
-}
-
-hipblasStatus_t hipblas_port_Dgemv(hipblasHandle_t handle, hipblasOperation_t trans, int m, int n, const double* alpha,
-                                   const double* A, int lda, const double* x, int incx, const double* beta, double* y,
-                                   int incy)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_dgemv((rocblas_handle)handle, hipOperationToHCCOperation(trans), m, n,
-                                                       alpha, A, lda, x, incx, beta, y, incy));
-}
-
-hipblasStatus_t hipblas_port_Cgemv(hipblasHandle_t handle, hipblasOperation_t trans, int m, int n,
-                                   const hipFloatComplex* alpha, const hipFloatComplex* A, int lda,
-                                   const hipFloatComplex* x, int incx, const hipFloatComplex* beta, hipFloatComplex* y,
-                                   int incy)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_cgemv((rocblas_handle)handle, hipOperationToHCCOperation(trans), m, n,
-                                                       alpha, A, lda, x, incx, beta, y, incy));
-}
-
-hipblasStatus_t hipblas_port_Zgemv(hipblasHandle_t handle, hipblasOperation_t trans, int m, int n,
-                                   const hipDoubleComplex* alpha, const hipDoubleComplex* A, int lda,
-                                   const hipDoubleComplex* x, int incx, const hipDoubleComplex* beta,
-                                   hipDoubleComplex* y, int incy)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_zgemv((rocblas_handle)handle, hipOperationToHCCOperation(trans), m, n,
-                                                       alpha, A, lda, x, incx, beta, y, incy));
-}
-
-hipblasStatus_t hipblas_port_Sgemm(hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m,
-                                   int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb,
-                                   const float* beta, float* C, int ldc)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_sgemm((rocblas_handle)handle, hipOperationToHCCOperation(transa),
-                                                       hipOperationToHCCOperation(transb), m, n, k, alpha, A, lda, B,
-                                                       ldb, beta, C, ldc));
-}
-
-hipblasStatus_t hipblas_port_Dgemm(hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m,
-                                   int n, int k, const double* alpha, const double* A, int lda, const double* B,
-                                   int ldb, const double* beta, double* C, int ldc)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_dgemm((rocblas_handle)handle, hipOperationToHCCOperation(transa),
-                                                       hipOperationToHCCOperation(transb), m, n, k, alpha, A, lda, B,
-                                                       ldb, beta, C, ldc));
-}
-
-hipblasStatus_t hipblas_port_Cgemm(hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m,
-                                   int n, int k, const hipFloatComplex* alpha, const hipFloatComplex* A, int lda,
-                                   const hipFloatComplex* B, int ldb, const hipFloatComplex* beta, hipFloatComplex* C,
-                                   int ldc)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_cgemm((rocblas_handle)handle, hipOperationToHCCOperation(transa),
-                                                       hipOperationToHCCOperation(transb), m, n, k, alpha, A, lda, B,
-                                                       ldb, beta, C, ldc));
-}
-
-hipblasStatus_t hipblas_port_Zgemm(hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m,
-                                   int n, int k, const hipDoubleComplex* alpha, const hipDoubleComplex* A, int lda,
-                                   const hipDoubleComplex* B, int ldb, const hipDoubleComplex* beta,
-                                   hipDoubleComplex* C, int ldc)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_zgemm((rocblas_handle)handle, hipOperationToHCCOperation(transa),
-                                                       hipOperationToHCCOperation(transb), m, n, k, alpha, A, lda, B,
-                                                       ldb, beta, C, ldc));
-}
-
-hipblasStatus_t hipblas_port_Strmm(hipblasHandle_t handle, hipblasSideMode_t side, hipblasFillMode_t uplo,
-                                   hipblasOperation_t trans, hipblasDiagType_t diag, int m, int n, const float* alpha,
-                                   const float* A, int lda, const float* B, int ldb, float* C, int ldc)
-{
-
-    return rocBLASStatusToHIPStatus(rocblas_port_strmm(
-        (rocblas_handle)handle, hipSideToHCCSide(side), hipFillToHCCFill(uplo), hipOperationToHCCOperation(trans),
-        hipDiagonalToHCCDiagonal(diag), m, n, alpha, A, lda, B, ldb, C, ldc));
-}
-
-hipblasStatus_t hipblas_port_Dtrmm(hipblasHandle_t handle, hipblasSideMode_t side, hipblasFillMode_t uplo,
-                                   hipblasOperation_t trans, hipblasDiagType_t diag, int m, int n, const double* alpha,
-                                   const double* A, int lda, const double* B, int ldb, double* C, int ldc)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_dtrmm(
-        (rocblas_handle)handle, hipSideToHCCSide(side), hipFillToHCCFill(uplo), hipOperationToHCCOperation(trans),
-        hipDiagonalToHCCDiagonal(diag), m, n, alpha, A, lda, B, ldb, C, ldc));
-}
-
-hipblasStatus_t hipblas_port_Ctrmm(hipblasHandle_t handle, hipblasSideMode_t side, hipblasFillMode_t uplo,
-                                   hipblasOperation_t trans, hipblasDiagType_t diag, int m, int n,
-                                   const hipFloatComplex* alpha, const hipFloatComplex* A, int lda,
-                                   const hipFloatComplex* B, int ldb, hipFloatComplex* C, int ldc)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_ctrmm(
-        (rocblas_handle)handle, hipSideToHCCSide(side), hipFillToHCCFill(uplo), hipOperationToHCCOperation(trans),
-        hipDiagonalToHCCDiagonal(diag), m, n, alpha, A, lda, B, ldb, C, ldc));
-}
-
-hipblasStatus_t hipblas_port_Ztrmm(hipblasHandle_t handle, hipblasSideMode_t side, hipblasFillMode_t uplo,
-                                   hipblasOperation_t trans, hipblasDiagType_t diag, int m, int n,
-                                   const hipDoubleComplex* alpha, const hipDoubleComplex* A, int lda,
-                                   const hipDoubleComplex* B, int ldb, hipDoubleComplex* C, int ldc)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_ztrmm(
-        (rocblas_handle)handle, hipSideToHCCSide(side), hipFillToHCCFill(uplo), hipOperationToHCCOperation(trans),
-        hipDiagonalToHCCDiagonal(diag), m, n, alpha, A, lda, B, ldb, C, ldc));
-}
-
-
-
-/*
- * GER
- */
-hipblasStatus_t hipblas_port_Sger(hipblasHandle_t handle, int m, int n, const float* alpha, const float* x, int incx,
-                                  const float* y, int incy, float* A, int lda)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_sger((rocblas_handle)handle, m, n, alpha, x, incx, y, incy, A, lda));
-}
-
-hipblasStatus_t hipblas_port_Dger(hipblasHandle_t handle, int m, int n, const double* alpha, const double* x, int incx,
-                                  const double* y, int incy, double* A, int lda)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_dger((rocblas_handle)handle, m, n, alpha, x, incx, y, incy, A, lda));
-}
-
-hipblasStatus_t hipblas_port_Cgeru(hipblasHandle_t handle, int m, int n, const hipFloatComplex* alpha,
-                                   const hipFloatComplex* x, int incx, const hipFloatComplex* y, int incy,
-                                   hipFloatComplex* A, int lda)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_cgeru((rocblas_handle)handle, m, n, alpha, x, incx, y, incy, A, lda));
-}
-
-hipblasStatus_t hipblas_port_Zgeru(hipblasHandle_t handle, int m, int n, const hipDoubleComplex* alpha,
-                                   const hipDoubleComplex* x, int incx, const hipDoubleComplex* y, int incy,
-                                   hipDoubleComplex* A, int lda)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_zgeru((rocblas_handle)handle, m, n, alpha, x, incx, y, incy, A, lda));
-}
-
-/*
- * AXPY
- */
-hipblasStatus_t hipblas_port_Saxpy(hipblasHandle_t handle, int n, const float* alpha, const float* x, int incx,
-                                   float* y, int incy)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_saxpy((rocblas_handle)handle, n, alpha, x, incx, y, incy));
-}
-
-hipblasStatus_t hipblas_port_Daxpy(hipblasHandle_t handle, int n, const double* alpha, const double* x, int incx,
-                                   double* y, int incy)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_daxpy((rocblas_handle)handle, n, alpha, x, incx, y, incy));
-}
-
-hipblasStatus_t hipblas_port_Caxpy(hipblasHandle_t handle, int n, const hipFloatComplex* alpha,
-                                   const hipFloatComplex* x, int incx, hipFloatComplex* y, int incy)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_caxpy((rocblas_handle)handle, n, alpha, x, incx, y, incy));
-}
-
-hipblasStatus_t hipblas_port_Zaxpy(hipblasHandle_t handle, int n, const hipDoubleComplex* alpha,
-                                   const hipDoubleComplex* x, int incx, hipDoubleComplex* y, int incy)
-{
-    return rocBLASStatusToHIPStatus(rocblas_port_zaxpy((rocblas_handle)handle, n, alpha, x, incx, y, incy));
-}
diff --git a/src/gpu/hipblas_port/rocblas_port.h b/src/gpu/hipblas_port/rocblas_port.h
deleted file mode 100644
index a8156caaf..000000000
--- a/src/gpu/hipblas_port/rocblas_port.h
+++ /dev/null
@@ -1,116 +0,0 @@
-#ifndef _ROCBLAS_PORT_H_
-#define _ROCBLAS_PORT_H_
-
-#include <hip/hip_complex.h>
-#include "rocblas_port/rocblas-types.h"
-#include "rocblas_port/handle.h"
-
-extern "C" {
-
-/*
- * GEMV
- */
-rocblas_status rocblas_port_sgemv(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n,
-                                  const float* alpha, const float* A, rocblas_int lda, const float* x, rocblas_int incx,
-                                  const float* beta, float* y, rocblas_int incy);
-
-rocblas_status rocblas_port_dgemv(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n,
-                                  const double* alpha, const double* A, rocblas_int lda, const double* x,
-                                  rocblas_int incx, const double* beta, double* y, rocblas_int incy);
-
-rocblas_status rocblas_port_cgemv(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n,
-                                  const hipFloatComplex* alpha, const hipFloatComplex* A, rocblas_int lda,
-                                  const hipFloatComplex* x, rocblas_int incx, const hipFloatComplex* beta,
-                                  hipFloatComplex* y, rocblas_int incy);
-
-rocblas_status rocblas_port_zgemv(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n,
-                                  const hipDoubleComplex* alpha, const hipDoubleComplex* A, rocblas_int lda,
-                                  const hipDoubleComplex* x, rocblas_int incx, const hipDoubleComplex* beta,
-                                  hipDoubleComplex* y, rocblas_int incy);
-
-/*
- * GEMM
- */
-rocblas_status rocblas_port_sgemm(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb,
-                                  rocblas_int m, rocblas_int n, rocblas_int k, const float* alpha, const float* A,
-                                  rocblas_int lda, const float* B, rocblas_int ldb, const float* beta, float* C,
-                                  rocblas_int ldc);
-
-rocblas_status rocblas_port_dgemm(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb,
-                                  rocblas_int m, rocblas_int n, rocblas_int k, const double* alpha, const double* A,
-                                  rocblas_int lda, const double* B, rocblas_int ldb, const double* beta, double* C,
-                                  rocblas_int ldc);
-
-rocblas_status rocblas_port_cgemm(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb,
-                                  rocblas_int m, rocblas_int n, rocblas_int k, const hipFloatComplex* alpha,
-                                  const hipFloatComplex* A, rocblas_int lda, const hipFloatComplex* B, rocblas_int ldb,
-                                  const hipFloatComplex* beta, hipFloatComplex* C, rocblas_int ldc);
-
-rocblas_status rocblas_port_zgemm(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb,
-                                  rocblas_int m, rocblas_int n, rocblas_int k, const hipDoubleComplex* alpha,
-                                  const hipDoubleComplex* A, rocblas_int lda, const hipDoubleComplex* B,
-                                  rocblas_int ldb, const hipDoubleComplex* beta, hipDoubleComplex* C, rocblas_int ldc);
-
-/*
- * TRMM
- */
-
-rocblas_status rocblas_port_strmm(rocblas_handle handle, rocblas_side side, rocblas_fill uplo, rocblas_operation trans,
-                                  rocblas_diagonal diag, rocblas_int m, rocblas_int n, const float* alpha,
-                                  const float* A, rocblas_int lda, const float* B, rocblas_int ldb, float* C,
-                                  rocblas_int ldc);
-
-rocblas_status rocblas_port_dtrmm(rocblas_handle handle, rocblas_side side, rocblas_fill uplo, rocblas_operation trans,
-                                  rocblas_diagonal diag, rocblas_int m, rocblas_int n, const double* alpha,
-                                  const double* A, rocblas_int lda, const double* B, rocblas_int ldb, double* C,
-                                  rocblas_int ldc);
-
-rocblas_status rocblas_port_ctrmm(rocblas_handle handle, rocblas_side side, rocblas_fill uplo, rocblas_operation trans,
-                                  rocblas_diagonal diag, rocblas_int m, rocblas_int n, const hipFloatComplex* alpha,
-                                  const hipFloatComplex* A, rocblas_int lda, const hipFloatComplex* B, rocblas_int ldb,
-                                  hipFloatComplex* C, rocblas_int ldc);
-
-rocblas_status rocblas_port_ztrmm(rocblas_handle handle, rocblas_side side, rocblas_fill uplo, rocblas_operation trans,
-                                  rocblas_diagonal diag, rocblas_int m, rocblas_int n, const hipDoubleComplex* alpha,
-                                  const hipDoubleComplex* A, rocblas_int lda, const hipDoubleComplex* B,
-                                  rocblas_int ldb, hipDoubleComplex* C, rocblas_int ldc);
-/*
- * GER
- */
-
-rocblas_status rocblas_port_sger(rocblas_handle handle, rocblas_int m, rocblas_int n, const float* alpha,
-                                 const float* x, rocblas_int incx, const float* y, rocblas_int incy, float* A,
-                                 rocblas_int lda);
-
-rocblas_status rocblas_port_dger(rocblas_handle handle, rocblas_int m, rocblas_int n, const double* alpha,
-                                 const double* x, rocblas_int incx, const double* y, rocblas_int incy, double* A,
-                                 rocblas_int lda);
-
-rocblas_status rocblas_port_cgeru(rocblas_handle handle, rocblas_int m, rocblas_int n, const hipFloatComplex* alpha,
-                                  const hipFloatComplex* x, rocblas_int incx, const hipFloatComplex* y,
-                                  rocblas_int incy, hipFloatComplex* A, rocblas_int lda);
-
-rocblas_status rocblas_port_zgeru(rocblas_handle handle, rocblas_int m, rocblas_int n, const hipDoubleComplex* alpha,
-                                  const hipDoubleComplex* x, rocblas_int incx, const hipDoubleComplex* y,
-                                  rocblas_int incy, hipDoubleComplex* A, rocblas_int lda);
-
-/*
- * AXPY
- */
-rocblas_status rocblas_port_saxpy(rocblas_handle handle, rocblas_int n, const float* alpha, const float* x,
-                                  rocblas_int incx, float* y, rocblas_int incy);
-
-rocblas_status rocblas_port_daxpy(rocblas_handle handle, rocblas_int n, const double* alpha, const double* x,
-                                  rocblas_int incx, double* y, rocblas_int incy);
-
-rocblas_status rocblas_port_caxpy(rocblas_handle handle, rocblas_int n, const hipFloatComplex* alpha,
-                                  const hipFloatComplex* x, rocblas_int incx, hipFloatComplex* y,
-                                  rocblas_int incy);
-
-rocblas_status rocblas_port_zaxpy(rocblas_handle handle, rocblas_int n, const hipDoubleComplex* alpha,
-                                  const hipDoubleComplex* x, rocblas_int incx, hipDoubleComplex* y,
-                                  rocblas_int incy);
-
-} // extern "C"
-
-#endif
diff --git a/src/gpu/hipblas_port/rocblas_port/definitions.h b/src/gpu/hipblas_port/rocblas_port/definitions.h
deleted file mode 100644
index 4025a3529..000000000
--- a/src/gpu/hipblas_port/rocblas_port/definitions.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-
-#ifndef DEFINITIONS_H
-#define DEFINITIONS_H
-#include "rocblas-types.h"
-#include "status.h"
-
-/*******************************************************************************
- * Definitions
- * this file to not include any others
- * thereby it can include top-level definitions included by all
- ******************************************************************************/
-
-namespace {
-// half vectors
-typedef _Float16 rocblas_half8 __attribute__((ext_vector_type(8)));
-typedef _Float16 rocblas_half2 __attribute__((ext_vector_type(2)));
-
-#ifndef GOOGLE_TEST // suppress warnings about __device__ when building tests
-extern "C" __device__ rocblas_half2 llvm_fma_v2f16(rocblas_half2,
-                                                   rocblas_half2,
-                                                   rocblas_half2) __asm("llvm.fma.v2f16");
-
-__device__ inline rocblas_half2
-rocblas_fmadd_half2(rocblas_half2 multiplier, rocblas_half2 multiplicand, rocblas_half2 addend)
-{
-    return llvm_fma_v2f16(multiplier, multiplicand, addend);
-}
-#endif
-
-#define RETURN_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK)                         \
-    do                                                                      \
-    {                                                                       \
-        hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK;           \
-        if(TMP_STATUS_FOR_CHECK != hipSuccess)                              \
-        {                                                                   \
-            return get_rocblas_status_for_hip_status(TMP_STATUS_FOR_CHECK); \
-        }                                                                   \
-    } while(0)
-
-#define RETURN_IF_ROCBLAS_ERROR(INPUT_STATUS_FOR_CHECK)               \
-    do                                                                \
-    {                                                                 \
-        rocblas_status TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \
-        if(TMP_STATUS_FOR_CHECK != rocblas_status_success)            \
-        {                                                             \
-            return TMP_STATUS_FOR_CHECK;                              \
-        }                                                             \
-    } while(0)
-
-#define THROW_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK)                         \
-    do                                                                     \
-    {                                                                      \
-        hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK;          \
-        if(TMP_STATUS_FOR_CHECK != hipSuccess)                             \
-        {                                                                  \
-            throw get_rocblas_status_for_hip_status(TMP_STATUS_FOR_CHECK); \
-        }                                                                  \
-    } while(0)
-
-#define THROW_IF_ROCBLAS_ERROR(INPUT_STATUS_FOR_CHECK)                \
-    do                                                                \
-    {                                                                 \
-        rocblas_status TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \
-        if(TMP_STATUS_FOR_CHECK != rocblas_status_success)            \
-        {                                                             \
-            throw TMP_STATUS_FOR_CHECK;                               \
-        }                                                             \
-    } while(0)
-
-#define PRINT_IF_HIP_ERROR(INPUT_STATUS_FOR_CHECK)                \
-    do                                                            \
-    {                                                             \
-        hipError_t TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \
-        if(TMP_STATUS_FOR_CHECK != hipSuccess)                    \
-        {                                                         \
-            fprintf(stderr,                                       \
-                    "hip error code: %d at %s:%d\n",              \
-                    TMP_STATUS_FOR_CHECK,                         \
-                    __FILE__,                                     \
-                    __LINE__);                                    \
-        }                                                         \
-    } while(0)
-
-#define PRINT_IF_ROCBLAS_ERROR(INPUT_STATUS_FOR_CHECK)                \
-    do                                                                \
-    {                                                                 \
-        rocblas_status TMP_STATUS_FOR_CHECK = INPUT_STATUS_FOR_CHECK; \
-        if(TMP_STATUS_FOR_CHECK != rocblas_status_success)            \
-        {                                                             \
-            fprintf(stderr,                                           \
-                    "rocblas error code: %d at %s:%d\n",              \
-                    TMP_STATUS_FOR_CHECK,                             \
-                    __FILE__,                                         \
-                    __LINE__);                                        \
-        }                                                             \
-    } while(0)
-
-}
-#endif // DEFINITIONS_H
diff --git a/src/gpu/hipblas_port/rocblas_port/handle.h b/src/gpu/hipblas_port/rocblas_port/handle.h
deleted file mode 100644
index 08cebcfaa..000000000
--- a/src/gpu/hipblas_port/rocblas_port/handle.h
+++ /dev/null
@@ -1,246 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-
-#ifndef HANDLE_H
-#define HANDLE_H
-
-#include <fstream>
-#include <iostream>
-#include "rocblas-types.h"
-#include "definitions.h"
-#include <hip/hip_runtime_api.h>
-
-/*******************************************************************************
- * \brief rocblas_handle is a structure holding the rocblas library context.
- * It must be initialized using rocblas_create_handle() and the returned handle mus
- * It should be destroyed at the end using rocblas_destroy_handle().
- * Exactly like CUBLAS, ROCBLAS only uses one stream for one API routine
-******************************************************************************/
-namespace {
-struct _rocblas_handle
-{
-    _rocblas_handle();
-    ~_rocblas_handle();
-
-    /*******************************************************************************
-     * Exactly like CUBLAS, ROCBLAS only uses one stream for one API routine
-     ******************************************************************************/
-
-    /*******************************************************************************
-     * set stream:
-        This API assumes user has already created a valid stream
-        Associate the following rocblas API call with this user provided stream
-     ******************************************************************************/
-    rocblas_status set_stream(hipStream_t user_stream)
-    {
-        // TODO: check the user_stream valid or not
-        rocblas_stream = user_stream;
-        return rocblas_status_success;
-    }
-
-    /*******************************************************************************
-     * get stream
-     ******************************************************************************/
-    rocblas_status get_stream(hipStream_t* stream) const
-    {
-        *stream = rocblas_stream;
-        return rocblas_status_success;
-    }
-
-    // trsm get pointers
-    void* get_trsm_Y() const { return trsm_Y; }
-    void* get_trsm_invA() const { return trsm_invA; }
-    void* get_trsm_invA_C() const { return trsm_invA_C; }
-
-    // trsv get pointers
-    void* get_trsv_x() const { return trsv_x; }
-    void* get_trsv_alpha() const { return trsv_alpha; }
-
-    rocblas_int device;
-    hipDeviceProp_t device_properties;
-
-    // rocblas by default take the system default stream 0 users cannot create
-    hipStream_t rocblas_stream = 0;
-
-    // default pointer_mode is on host
-    rocblas_pointer_mode pointer_mode = rocblas_pointer_mode_host;
-
-    // space allocated for trsm
-    void* trsm_Y      = nullptr;
-    void* trsm_invA   = nullptr;
-    void* trsm_invA_C = nullptr;
-
-    // space allocated for trsv
-    void* trsv_x     = nullptr;
-    void* trsv_alpha = nullptr;
-
-    // default logging_mode is no logging
-    static rocblas_layer_mode layer_mode;
-
-    // logging streams
-    static std::ofstream log_trace_ofs;
-    static std::ostream* log_trace_os;
-    static std::ofstream log_bench_ofs;
-    static std::ostream* log_bench_os;
-    static std::ofstream log_profile_ofs;
-    static std::ostream* log_profile_os;
-
-    // static data for startup initialization
-    static struct init
-    {
-        init();
-    } handle_init;
-};
-
-
-// work buffer size constants
-constexpr size_t WORKBUF_TRSM_A_BLKS    = 10;
-constexpr size_t WORKBUF_TRSM_B_CHNK    = 32000;
-constexpr size_t WORKBUF_TRSM_Y_SZ      = 32000 * 128 * sizeof(double);
-constexpr size_t WORKBUF_TRSM_INVA_SZ   = 128 * 128 * 10 * sizeof(double);
-constexpr size_t WORKBUF_TRSM_INVA_C_SZ = 128 * 128 * 10 * sizeof(double) / 2;
-constexpr size_t WORKBUF_TRSV_X_SZ      = 131072 * sizeof(double);
-constexpr size_t WORKBUF_TRSV_ALPHA_SZ  = sizeof(double);
-
-/*******************************************************************************
- * constructor
- ******************************************************************************/
-_rocblas_handle::_rocblas_handle()
-{
-    // default device is active device
-    THROW_IF_HIP_ERROR(hipGetDevice(&device));
-    THROW_IF_HIP_ERROR(hipGetDeviceProperties(&device_properties, device));
-
-    // rocblas by default take the system default stream 0 users cannot create
-
-    // allocate trsm temp buffers
-    THROW_IF_HIP_ERROR(hipMalloc(&trsm_Y, WORKBUF_TRSM_Y_SZ));
-    THROW_IF_HIP_ERROR(hipMalloc(&trsm_invA, WORKBUF_TRSM_INVA_SZ));
-    THROW_IF_HIP_ERROR(hipMalloc(&trsm_invA_C, WORKBUF_TRSM_INVA_C_SZ));
-
-    // allocate trsv temp buffers
-    THROW_IF_HIP_ERROR(hipMalloc(&trsv_x, WORKBUF_TRSV_X_SZ));
-    THROW_IF_HIP_ERROR(hipMalloc(&trsv_alpha, WORKBUF_TRSV_ALPHA_SZ));
-}
-
-/*******************************************************************************
- * destructor
- ******************************************************************************/
-_rocblas_handle::~_rocblas_handle()
-{
-    if(trsm_Y)
-        hipFree(trsm_Y);
-    if(trsm_invA)
-        hipFree(trsm_invA);
-    if(trsm_invA_C)
-        hipFree(trsm_invA_C);
-    if(trsv_x)
-        hipFree(trsv_x);
-    if(trsv_alpha)
-        hipFree(trsv_alpha);
-}
-
-/*******************************************************************************
- * Static handle data
- ******************************************************************************/
-rocblas_layer_mode _rocblas_handle::layer_mode = rocblas_layer_mode_none;
-std::ofstream _rocblas_handle::log_trace_ofs;
-std::ostream* _rocblas_handle::log_trace_os;
-std::ofstream _rocblas_handle::log_bench_ofs;
-std::ostream* _rocblas_handle::log_bench_os;
-std::ofstream _rocblas_handle::log_profile_ofs;
-std::ostream* _rocblas_handle::log_profile_os;
-_rocblas_handle::init _rocblas_handle::handle_init;
-
-/**
- *  @brief Logging function
- *
- *  @details
- *  open_log_stream Open stream log_os for logging.
- *                  If the environment variable with name environment_variable_name
- *                  is not set, then stream log_os to std::cerr.
- *                  Else open a file at the full logfile path contained in
- *                  the environment variable.
- *                  If opening the file suceeds, stream to the file
- *                  else stream to std::cerr.
- *
- *  @param[in]
- *  environment_variable_name   const char*
- *                              Name of environment variable that contains
- *                              the full logfile path.
- *
- *  @parm[out]
- *  log_os      std::ostream*&
- *              Output stream. Stream to std:cerr if environment_variable_name
- *              is not set, else set to stream to log_ofs
- *
- *  @parm[out]
- *  log_ofs     std::ofstream&
- *              Output file stream. If log_ofs->is_open()==true, then log_os
- *              will stream to log_ofs. Else it will stream to std::cerr.
- */
-
-static void open_log_stream(const char* environment_variable_name,
-                            std::ostream*& log_os,
-                            std::ofstream& log_ofs)
-
-{
-    // By default, output to cerr
-    log_os = &std::cerr;
-
-    // if environment variable is set, open file at logfile_pathname contained in the
-    // environment variable
-    auto logfile_pathname = getenv(environment_variable_name);
-    if(logfile_pathname)
-    {
-        log_ofs.open(logfile_pathname, std::ios_base::trunc);
-
-        // if log_ofs is open, then stream to log_ofs, else log_os is already set to std::cerr
-        if(log_ofs.is_open())
-            log_os = &log_ofs;
-    }
-}
-
-/*******************************************************************************
- * Static runtime initialization
- ******************************************************************************/
-_rocblas_handle::init::init()
-{
-    // set layer_mode from value of environment variable ROCBLAS_LAYER
-    auto str_layer_mode = getenv("ROCBLAS_LAYER");
-    if(str_layer_mode)
-    {
-        layer_mode = static_cast<rocblas_layer_mode>(strtol(str_layer_mode, 0, 0));
-
-        // open log_trace file
-        if(layer_mode & rocblas_layer_mode_log_trace)
-            open_log_stream("ROCBLAS_LOG_TRACE_PATH", log_trace_os, log_trace_ofs);
-
-        // open log_bench file
-        if(layer_mode & rocblas_layer_mode_log_bench)
-            open_log_stream("ROCBLAS_LOG_BENCH_PATH", log_bench_os, log_bench_ofs);
-
-        // open log_profile file
-        if(layer_mode & rocblas_layer_mode_log_profile)
-            open_log_stream("ROCBLAS_LOG_PROFILE_PATH", log_profile_os, log_profile_ofs);
-    }
-}
-
-
-}
-
-#endif
diff --git a/src/gpu/hipblas_port/rocblas_port/port_helper_func.h b/src/gpu/hipblas_port/rocblas_port/port_helper_func.h
deleted file mode 100644
index 289968f1f..000000000
--- a/src/gpu/hipblas_port/rocblas_port/port_helper_func.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef _PORT_HELPER_FUNC_H_
-#define _PORT_HELPER_FUNC_H_
-
-#include <hip/hip_complex.h>
-#include "rocblas-types.h"
-
-namespace {
-
-/*
- * Check if real only and cmp value
- */
-template<typename T, typename U>
-__host__ __device__ inline bool rb_port_cmp_and_real_only(const T& a, const U& val) { return a == val; }
-
-template<typename T>
-__host__ __device__ inline bool rb_port_cmp_and_real_only(const hipDoubleComplex& a, const T& val) { return a.x == val && a.y == 0; }
-
-template<typename T>
-__host__ __device__ inline bool rb_port_cmp_and_real_only(const hipFloatComplex& a, const T& val) { return a.x == val && a.y == 0; }
-
-/*
- * Conjugate helper functions
- */
-template<rocblas_operation OP, typename T>
-struct ConjOp {
-    __host__ __device__ static inline T eval(const T& val) { return val; }
-};
-
-template<>
-struct ConjOp<rocblas_operation_conjugate_transpose, hipDoubleComplex> {
-    __host__ __device__ static inline hipDoubleComplex eval(const hipDoubleComplex& val) {
-        return hipDoubleComplex(val.x, -val.y);
-    }
-};
-
-template<>
-struct ConjOp<rocblas_operation_conjugate_transpose, hipFloatComplex> {
-    __host__ __device__ static inline hipFloatComplex eval(const hipFloatComplex& val) {
-        return hipFloatComplex(val.x, -val.y);
-    }
-};
-
-/*
- * Swap of leading dimension / increment for transposed matrices
- */
-template<rocblas_operation OP>
-struct MatrixDim {
-    __host__ __device__ static inline rocblas_int ld(const rocblas_int& ld, const rocblas_int& inc) { return inc; }
-    __host__ __device__ static inline rocblas_int inc(const rocblas_int& ld, const rocblas_int& inc) { return ld; }
-};
-
-template<>
-struct MatrixDim<rocblas_operation_none> {
-    __host__ __device__ static inline rocblas_int ld(const rocblas_int& ld, const rocblas_int& inc) { return ld; }
-    __host__ __device__ static inline rocblas_int inc(const rocblas_int& ld, const rocblas_int& inc) { return inc; }
-};
-
-
-}
-
-#endif
diff --git a/src/gpu/hipblas_port/rocblas_port/port_hip_roc_translation.h b/src/gpu/hipblas_port/rocblas_port/port_hip_roc_translation.h
deleted file mode 100644
index fab9a91ca..000000000
--- a/src/gpu/hipblas_port/rocblas_port/port_hip_roc_translation.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-#ifndef _PORT_HIP_ROC_TRANSLATION_
-#define _PORT_HIP_ROC_TRANSLATION_
-
-#include <hipblas.h>
-
-namespace {
-
-rocblas_operation_ hipOperationToHCCOperation(hipblasOperation_t op)
-{
-    switch (op) {
-        case HIPBLAS_OP_N:
-            return rocblas_operation_none;
-        case HIPBLAS_OP_T:
-            return rocblas_operation_transpose;
-        case HIPBLAS_OP_C:
-            return rocblas_operation_conjugate_transpose;
-        default:
-            throw "Non existent OP";
-    }
-}
-
-hipblasOperation_t HCCOperationToHIPOperation(rocblas_operation_ op)
-{
-    switch (op) {
-        case rocblas_operation_none:
-            return HIPBLAS_OP_N;
-        case rocblas_operation_transpose:
-            return HIPBLAS_OP_T;
-        case rocblas_operation_conjugate_transpose:
-            return HIPBLAS_OP_C;
-        default:
-            throw "Non existent OP";
-    }
-}
-
-rocblas_fill_ hipFillToHCCFill(hipblasFillMode_t fill)
-{
-    switch (fill) {
-        case HIPBLAS_FILL_MODE_UPPER:
-            return rocblas_fill_upper;
-        case HIPBLAS_FILL_MODE_LOWER:
-            return rocblas_fill_lower;
-        case HIPBLAS_FILL_MODE_FULL:
-            return rocblas_fill_full;
-        default:
-            throw "Non existent FILL";
-    }
-}
-
-hipblasFillMode_t HCCFillToHIPFill(rocblas_fill_ fill)
-{
-    switch (fill) {
-        case rocblas_fill_upper:
-            return HIPBLAS_FILL_MODE_UPPER;
-        case rocblas_fill_lower:
-            return HIPBLAS_FILL_MODE_LOWER;
-        case rocblas_fill_full:
-            return HIPBLAS_FILL_MODE_FULL;
-        default:
-            throw "Non existent FILL";
-    }
-}
-
-rocblas_diagonal_ hipDiagonalToHCCDiagonal(hipblasDiagType_t diagonal)
-{
-    switch (diagonal) {
-        case HIPBLAS_DIAG_NON_UNIT:
-            return rocblas_diagonal_non_unit;
-        case HIPBLAS_DIAG_UNIT:
-            return rocblas_diagonal_unit;
-        default:
-            throw "Non existent DIAGONAL";
-    }
-}
-
-hipblasDiagType_t HCCDiagonalToHIPDiagonal(rocblas_diagonal_ diagonal)
-{
-    switch (diagonal) {
-        case rocblas_diagonal_non_unit:
-            return HIPBLAS_DIAG_NON_UNIT;
-        case rocblas_diagonal_unit:
-            return HIPBLAS_DIAG_UNIT;
-        default:
-            throw "Non existent DIAGONAL";
-    }
-}
-
-rocblas_side_ hipSideToHCCSide(hipblasSideMode_t side)
-{
-    switch (side) {
-        case HIPBLAS_SIDE_LEFT:
-            return rocblas_side_left;
-        case HIPBLAS_SIDE_RIGHT:
-            return rocblas_side_right;
-        case HIPBLAS_SIDE_BOTH:
-            return rocblas_side_both;
-        default:
-            throw "Non existent SIDE";
-    }
-}
-
-hipblasSideMode_t HCCSideToHIPSide(rocblas_side_ side)
-{
-    switch (side) {
-        case rocblas_side_left:
-            return HIPBLAS_SIDE_LEFT;
-        case rocblas_side_right:
-            return HIPBLAS_SIDE_RIGHT;
-        case rocblas_side_both:
-            return HIPBLAS_SIDE_BOTH;
-        default:
-            throw "Non existent SIDE";
-    }
-}
-
-rocblas_pointer_mode HIPPointerModeToRocblasPointerMode(hipblasPointerMode_t mode)
-{
-    switch (mode) {
-        case HIPBLAS_POINTER_MODE_HOST:
-            return rocblas_pointer_mode_host;
-
-        case HIPBLAS_POINTER_MODE_DEVICE:
-            return rocblas_pointer_mode_device;
-
-        default:
-            throw "Non existent PointerMode";
-    }
-}
-
-hipblasPointerMode_t RocblasPointerModeToHIPPointerMode(rocblas_pointer_mode mode)
-{
-    switch (mode) {
-        case rocblas_pointer_mode_host:
-            return HIPBLAS_POINTER_MODE_HOST;
-
-        case rocblas_pointer_mode_device:
-            return HIPBLAS_POINTER_MODE_DEVICE;
-
-        default:
-            throw "Non existent PointerMode";
-    }
-}
-
-rocblas_datatype HIPDatatypeToRocblasDatatype(hipblasDatatype_t type)
-{
-    switch (type) {
-        case HIPBLAS_R_16F:
-            return rocblas_datatype_f16_r;
-
-        case HIPBLAS_R_32F:
-            return rocblas_datatype_f32_r;
-
-        case HIPBLAS_R_64F:
-            return rocblas_datatype_f64_r;
-
-        case HIPBLAS_C_16F:
-            return rocblas_datatype_f16_c;
-
-        case HIPBLAS_C_32F:
-            return rocblas_datatype_f32_c;
-
-        case HIPBLAS_C_64F:
-            return rocblas_datatype_f64_c;
-
-        default:
-            throw "Non existant DataType";
-    }
-}
-
-hipblasDatatype_t RocblasDatatypeToHIPDatatype(rocblas_datatype type)
-{
-    switch (type) {
-        case rocblas_datatype_f16_r:
-            return HIPBLAS_R_16F;
-
-        case rocblas_datatype_f32_r:
-            return HIPBLAS_R_32F;
-
-        case rocblas_datatype_f64_r:
-            return HIPBLAS_R_64F;
-
-        case rocblas_datatype_f16_c:
-            return HIPBLAS_C_16F;
-
-        case rocblas_datatype_f32_c:
-            return HIPBLAS_C_32F;
-
-        case rocblas_datatype_f64_c:
-            return HIPBLAS_C_64F;
-
-        default:
-            throw "Non existant DataType";
-    }
-}
-
-rocblas_gemm_algo HIPGemmAlgoToRocblasGemmAlgo(hipblasGemmAlgo_t algo)
-{
-    switch (algo) {
-        case HIPBLAS_GEMM_DEFAULT:
-            return rocblas_gemm_algo_standard;
-
-        default:
-            throw "Non existant GemmAlgo";
-    }
-}
-
-hipblasGemmAlgo_t RocblasGemmAlgoToHIPGemmAlgo(rocblas_gemm_algo algo)
-{
-    switch (algo) {
-        case rocblas_gemm_algo_standard:
-            return HIPBLAS_GEMM_DEFAULT;
-
-        default:
-            throw "Non existant GemmAlgo";
-    }
-}
-
-hipblasStatus_t rocBLASStatusToHIPStatus(rocblas_status_ error)
-{
-    switch (error) {
-        case rocblas_status_success:
-            return HIPBLAS_STATUS_SUCCESS;
-        case rocblas_status_invalid_handle:
-            return HIPBLAS_STATUS_NOT_INITIALIZED;
-        case rocblas_status_not_implemented:
-            return HIPBLAS_STATUS_NOT_SUPPORTED;
-        case rocblas_status_invalid_pointer:
-            return HIPBLAS_STATUS_INVALID_VALUE;
-        case rocblas_status_invalid_size:
-            return HIPBLAS_STATUS_INVALID_VALUE;
-        case rocblas_status_memory_error:
-            return HIPBLAS_STATUS_ALLOC_FAILED;
-        case rocblas_status_internal_error:
-            return HIPBLAS_STATUS_INTERNAL_ERROR;
-        default:
-            throw "Unimplemented status";
-    }
-}
-
-}
-
-#endif
diff --git a/src/gpu/hipblas_port/rocblas_port/reduction.h b/src/gpu/hipblas_port/rocblas_port/reduction.h
deleted file mode 100644
index e4b4d0455..000000000
--- a/src/gpu/hipblas_port/rocblas_port/reduction.h
+++ /dev/null
@@ -1,313 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-#ifndef REDUCTION_H_
-#define REDUCTION_H_
-
-#include "rocblas-types.h"
-#include "definitions.h"
-#include "handle.h"
-
-/*
- * ===========================================================================
- *    This file provide common device function used in various BLAS routines
- * ===========================================================================
- */
-
-// BLAS Level 1 includes routines and functions performing vector-vector
-// operations. Most BLAS 1 routines are about reduction: compute the norm,
-// calculate the dot production of two vectors, find the maximum/minimum index
-// of the element of the vector. As you may observed, although the computation
-// type is different, the core algorithm is the same: scan all element of the
-// vector(s) and reduce to one single result.
-//
-// The reduction algorithm on GPU is called [parallel
-// reduction](https://raw.githubusercontent.com/mateuszbuda/GPUExample/master/reduce3.png)
-// which is adopted in rocBLAS. At the beginning, all the threads in the thread
-// block participate. After each step of reduction (like a tree), the number of
-// participating threads decrease by half. At the end of the parallel reduction,
-// only one thread (usually thread 0) owns the result in its thread block.
-//
-// Classically, the BLAS 1 reduction needs more than one GPU kernel to finish,
-// because the lack of global synchronization of thread blocks without exiting
-// the kernel. The first kernels gather partial results, write into a temporary
-// working buffer. The second kernel finishes the final reduction.
-//
-// For example, BLAS 1 routine i*amax is to find index of the maximum absolute
-// value element of a vector. In this routine:
-//
-// Kernel 1: launch many thread block as needed. Each thread block works on a
-// subset of the vector. Each thread block use the parallel reduction to find a
-// local index with the maximum absolute value of the subset. There are
-// number-of-the-thread-blocks local results.The results are written into a
-// temporary working buffer. The working buffer has number-of-the-thread-blocks
-// elements.
-//
-// Kernel 2: launch only one thread block which reads the temporary work buffer and
-// reduces to final result still with the parallel reduction.
-//
-// As you may see, if there is a mechanism to synchronize all the thread blocks
-// after local index is obtained in kernel 1 (without ending the kernel), then
-// Kernel 2's computation can be merged into Kernel 1. One such mechanism is called
-// atomic operation. However, atomic operation is new and is not used in rocBLAS
-// yet. rocBLAS still use the classic standard parallel reduction right now.
-
-namespace {
-// Recursively compute reduction
-template <rocblas_int k, typename REDUCE, typename T>
-struct rocblas_reduction_s
-{
-    __forceinline__ __device__ void operator()(rocblas_int tx, T* x)
-    {
-        // Reduce the lower half with the upper half
-        if(tx < k)
-            REDUCE{}(x[tx], x[tx + k]);
-        __syncthreads();
-
-        // Recurse down with k / 2
-        rocblas_reduction_s<k / 2, REDUCE, T>{}(tx, x);
-    }
-};
-
-// leaf node for terminating recursion
-template <typename REDUCE, typename T>
-struct rocblas_reduction_s<0, REDUCE, T>
-{
-    __forceinline__ __device__ void operator()(rocblas_int tx, T* x) {}
-};
-
-/*! \brief general parallel reduction
-
-    \details
-
-    @param[in]
-    n         rocblas_int. assume a power of 2
-    @param[in]
-    T         element type of vector x
-    @param[in]
-    REDUCE    reduction functor
-    @param[in]
-    tx        rocblas_int. thread id
-    @param[inout]
-    x         pointer storing vector x on the GPU.
-              usually x is stored in shared memory;
-              x[0] store the final result.
-    ********************************************************************/
-template <rocblas_int NB, typename REDUCE, typename T>
-__attribute__((flatten)) __device__ void rocblas_reduction(rocblas_int tx, T* x)
-{
-    static_assert(NB > 1 && !(NB & (NB - 1)), "NB must be a power of 2");
-    __syncthreads();
-    rocblas_reduction_s<NB / 2, REDUCE, T>{}(tx, x);
-}
-
-/*! \brief parallel reduction: sum
-
-    \details
-
-    @param[in]
-    n         rocblas_int. assume a power of 2
-    @param[in]
-    tx        rocblas_int. thread id
-    @param[inout]
-    x         pointer storing vector x on the GPU.
-              usually x is stored in shared memory;
-              x[0] store the final result.
-    ********************************************************************/
-struct rocblas_reduce_sum
-{
-    template <typename T>
-    __forceinline__ __device__ void operator()(T& __restrict__ a, const T& __restrict__ b)
-    {
-        a += b;
-    }
-};
-
-template <rocblas_int NB, typename T>
-__attribute__((flatten)) __device__ void rocblas_sum_reduce(rocblas_int tx, T* x)
-{
-    rocblas_reduction<NB, rocblas_reduce_sum>(tx, x);
-}
-// end sum_reduce
-
-// Identity finalizer
-struct rocblas_finalize_identity
-{
-    template <typename T>
-    __forceinline__ __host__ __device__ T&& operator()(T&& x)
-    {
-        return std::forward<T>(x); // Perfect identity, preserving valueness
-    }
-};
-
-// Emulates value initialization T{}. Allows specialization for certain types.
-template <typename T>
-struct default_value
-{
-    __forceinline__ __host__ __device__ constexpr T operator()() const { return {}; }
-};
-
-// kennel 1 writes partial results per thread block in workspace; number of partial results is
-// blocks
-template <rocblas_int NB,
-          typename FETCH,
-          typename REDUCE = rocblas_reduce_sum,
-          typename Ti,
-          typename To>
-__global__ void
-rocblas_reduction_kernel_part1(rocblas_int n, const Ti* x, rocblas_int incx, To* workspace)
-{
-    ssize_t tx  = hipThreadIdx_x;
-    ssize_t tid = hipBlockIdx_x * hipBlockDim_x + tx;
-    __shared__ To tmp[NB];
-
-    // bound
-    if(tid < n)
-        tmp[tx] = FETCH{}(x[tid * incx], tid);
-    else
-        tmp[tx] = default_value<To>{}(); // pad with default value
-
-    rocblas_reduction<NB, REDUCE>(tx, tmp);
-
-    if(tx == 0)
-        workspace[hipBlockIdx_x] = tmp[0];
-}
-
-// kernel 2 gathers all the partial results in workspace and finishes the final reduction;
-// number of threads (NB) loop blocks
-template <rocblas_int NB,
-          typename REDUCE   = rocblas_reduce_sum,
-          typename FINALIZE = rocblas_finalize_identity,
-          typename To,
-          typename Tr>
-__global__ void rocblas_reduction_kernel_part2(rocblas_int nblocks, To* workspace, Tr* result)
-{
-    rocblas_int tx = hipThreadIdx_x;
-    __shared__ To tmp[NB];
-
-    if(tx < nblocks)
-    {
-        tmp[tx] = workspace[tx];
-
-        // bound, loop
-        for(rocblas_int i = tx + NB; i < nblocks; i += NB)
-            REDUCE{}(tmp[tx], workspace[i]);
-    }
-    else
-    { // pad with default value
-        tmp[tx] = default_value<To>{}();
-    }
-
-    if(nblocks < 32)
-    {
-        // no need parallel reduction
-        __syncthreads();
-
-        if(tx == 0)
-            for(rocblas_int i = 1; i < nblocks; i++)
-                REDUCE{}(tmp[0], tmp[i]);
-    }
-    else
-    {
-        // parallel reduction
-        rocblas_reduction<NB, REDUCE>(tx, tmp);
-    }
-
-    // Store result on device or in workspace
-    if(tx == 0)
-        *result = FINALIZE{}(tmp[0]);
-}
-
-// At least two kernels are needed to finish the reduction
-// kennel 1 write partial result per thread block in workspace, blocks partial results
-// kernel 2 gathers all the partial result in workspace and finishes the final reduction.
-template <rocblas_int NB,
-          typename FETCH,
-          typename REDUCE   = rocblas_reduce_sum,
-          typename FINALIZE = rocblas_finalize_identity,
-          typename Ti,
-          typename To,
-          typename Tr>
-rocblas_status rocblas_reduction_kernel(rocblas_handle __restrict__ handle,
-                                        rocblas_int n,
-                                        const Ti* x,
-                                        rocblas_int incx,
-                                        Tr* result,
-                                        To* workspace,
-                                        rocblas_int blocks)
-{
-    hipLaunchKernelGGL((rocblas_reduction_kernel_part1<NB, FETCH, REDUCE>),
-                       blocks,
-                       NB,
-                       0,
-                       handle->rocblas_stream,
-                       n,
-                       x,
-                       incx,
-                       workspace);
-
-    if(handle->pointer_mode == rocblas_pointer_mode_device)
-    {
-        hipLaunchKernelGGL((rocblas_reduction_kernel_part2<NB, REDUCE, FINALIZE>),
-                           1,
-                           NB,
-                           0,
-                           handle->rocblas_stream,
-                           blocks,
-                           workspace,
-                           result);
-    }
-    else
-    {
-        // If in host pointer mode, workspace is converted to Tr* and the result is
-        // placed there, and then copied from device to host. If To is a class type,
-        // it must be a standard layout type and its first member must be of type Tr.
-        static_assert(std::is_standard_layout<To>(), "To must be a standard layout type");
-
-        if(blocks > 1)
-        {
-            hipLaunchKernelGGL((rocblas_reduction_kernel_part2<NB, REDUCE, FINALIZE>),
-                               1,
-                               NB,
-                               0,
-                               handle->rocblas_stream,
-                               blocks,
-                               workspace,
-                               (Tr*)workspace);
-        }
-
-        if(std::is_same<FINALIZE, rocblas_finalize_identity>() || blocks > 1)
-        {
-            // If FINALIZE is trivial or kernel part2 was called, result is in the
-            // beginning of workspace[0], and can be copied directly.
-            RETURN_IF_HIP_ERROR(hipMemcpy(result, workspace, sizeof(Tr), hipMemcpyDeviceToHost));
-        }
-        else
-        {
-            // If FINALIZE is not trivial and kernel part2 was not called, then
-            // workspace[0] needs to be finalized on host.
-            To res;
-            RETURN_IF_HIP_ERROR(hipMemcpy(&res, workspace, sizeof(To), hipMemcpyDeviceToHost));
-            *result = FINALIZE{}(res);
-        }
-    }
-
-    return rocblas_status_success;
-}
-
-}
-
-#endif
diff --git a/src/gpu/hipblas_port/rocblas_port/rocblas-types.h b/src/gpu/hipblas_port/rocblas_port/rocblas-types.h
deleted file mode 100644
index 6c38fde07..000000000
--- a/src/gpu/hipblas_port/rocblas_port/rocblas-types.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-
-/*! \file
- * \brief rocblas-types.h defines data types used by rocblas
- */
-
-#ifndef _ROCBLAS_TYPES_H_
-#define _ROCBLAS_TYPES_H_
-
-#include <stddef.h>
-#include <stdint.h>
-#include <hip/hip_vector_types.h>
-
-namespace {
-
-// integer types
-/*! \brief To specify whether int32 or int64 is used
- */
-#if defined(rocblas_ILP64)
-typedef int64_t rocblas_int;
-typedef int64_t rocblas_long;
-#else
-typedef int32_t rocblas_int;
-typedef int64_t rocblas_long;
-#endif
-// complex types
-typedef float2 rocblas_float_complex;
-typedef double2 rocblas_double_complex;
-// half types
-typedef uint16_t rocblas_half;
-typedef float2 rocblas_half_complex;
-
-typedef struct _rocblas_handle* rocblas_handle;
-
-/* ============================================================================================ */
-
-/*! parameter constants.
- *  numbering is consistent with CBLAS, ACML and most standard C BLAS libraries
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*! \brief Used to specify whether the matrix is to be transposed or not. */
-typedef enum rocblas_operation_ {
-    rocblas_operation_none      = 111, /**< Operate with the matrix. */
-    rocblas_operation_transpose = 112, /**< Operate with the transpose of the matrix. */
-    rocblas_operation_conjugate_transpose =
-        113 /**< Operate with the conjugate transpose of the matrix. */
-} rocblas_operation;
-
-/*! \brief Used by the Hermitian, symmetric and triangular matrix
- * routines to specify whether the upper or lower triangle is being referenced.
- */
-typedef enum rocblas_fill_ {
-    rocblas_fill_upper = 121, /**< Upper triangle. */
-    rocblas_fill_lower = 122, /**< Lower triangle. */
-    rocblas_fill_full  = 123
-} rocblas_fill;
-
-/*! \brief It is used by the triangular matrix routines to specify whether the
- * matrix is unit triangular.
- */
-typedef enum rocblas_diagonal_ {
-    rocblas_diagonal_non_unit = 131, /**< Non-unit triangular. */
-    rocblas_diagonal_unit     = 132, /**< Unit triangular. */
-} rocblas_diagonal;
-
-/*! \brief Indicates the side matrix A is located relative to matrix B during multiplication. */
-typedef enum rocblas_side_ {
-    rocblas_side_left = 141,  /**< Multiply general matrix by symmetric,
-                        Hermitian or triangular matrix on the left. */
-    rocblas_side_right = 142, /**< Multiply general matrix by symmetric,
-                        Hermitian or triangular matrix on the right. */
-    rocblas_side_both = 143
-} rocblas_side;
-
-/* ============================================================================================ */
-/**
- *   @brief rocblas status codes definition
- */
-typedef enum rocblas_status_ {
-    rocblas_status_success         = 0, /**< success */
-    rocblas_status_invalid_handle  = 1, /**< handle not initialized, invalid or null */
-    rocblas_status_not_implemented = 2, /**< function is not implemented */
-    rocblas_status_invalid_pointer = 3, /**< invalid pointer parameter */
-    rocblas_status_invalid_size    = 4, /**< invalid size parameter */
-    rocblas_status_memory_error    = 5, /**< failed internal memory allocation, copy or dealloc */
-    rocblas_status_internal_error  = 6, /**< other internal library failure */
-} rocblas_status;
-
-/*! \brief Indicates the precision width of data stored in a blas type. */
-typedef enum rocblas_datatype_ {
-    rocblas_datatype_f16_r = 150,
-    rocblas_datatype_f32_r = 151,
-    rocblas_datatype_f64_r = 152,
-    rocblas_datatype_f16_c = 153,
-    rocblas_datatype_f32_c = 154,
-    rocblas_datatype_f64_c = 155,
-    rocblas_datatype_i8_r  = 160,
-    rocblas_datatype_u8_r  = 161,
-    rocblas_datatype_i32_r = 162,
-    rocblas_datatype_u32_r = 163,
-    rocblas_datatype_i8_c  = 164,
-    rocblas_datatype_u8_c  = 165,
-    rocblas_datatype_i32_c = 166,
-    rocblas_datatype_u32_c = 167,
-} rocblas_datatype;
-
-/*! \brief Indicates the pointer is device pointer or host pointer */
-typedef enum rocblas_pointer_mode_ {
-    rocblas_pointer_mode_host   = 0,
-    rocblas_pointer_mode_device = 1
-} rocblas_pointer_mode;
-
-/*! \brief Indicates if layer is active with bitmask*/
-typedef enum rocblas_layer_mode_ {
-    rocblas_layer_mode_none        = 0b0000000000,
-    rocblas_layer_mode_log_trace   = 0b0000000001,
-    rocblas_layer_mode_log_bench   = 0b0000000010,
-    rocblas_layer_mode_log_profile = 0b0000000100,
-} rocblas_layer_mode;
-
-/*! \brief Indicates if layer is active with bitmask*/
-typedef enum rocblas_gemm_algo_ {
-    rocblas_gemm_algo_standard = 0b0000000000,
-} rocblas_gemm_algo;
-
-#ifdef __cplusplus
-}
-#endif
-
-}
-
-#endif
diff --git a/src/gpu/hipblas_port/rocblas_port/status.h b/src/gpu/hipblas_port/rocblas_port/status.h
deleted file mode 100644
index f0627eff6..000000000
--- a/src/gpu/hipblas_port/rocblas_port/status.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-#ifndef STATUS_H_
-#define STATUS_H_
-
-#include <hip/hip_runtime_api.h>
-#include "rocblas-types.h"
-#include "status.h"
-
-/*******************************************************************************
- * \brief convert hipError_t to rocblas_status
- * TODO - enumerate library calls to hip runtime, enumerate possible errors from those calls
- ******************************************************************************/
-
-namespace {
-inline rocblas_status get_rocblas_status_for_hip_status(hipError_t status)
-{
-    switch(status)
-    {
-    // success
-    case hipSuccess:
-        return rocblas_status_success;
-
-    // internal hip memory allocation
-    case hipErrorMemoryAllocation:
-    case hipErrorLaunchOutOfResources:
-        return rocblas_status_memory_error;
-
-    // user-allocated hip memory
-    case hipErrorInvalidDevicePointer: // hip memory
-        return rocblas_status_invalid_pointer;
-
-    // user-allocated device, stream, event
-    case hipErrorInvalidDevice:
-    case hipErrorInvalidResourceHandle:
-        return rocblas_status_invalid_handle;
-
-    // library using hip incorrectly
-    case hipErrorInvalidValue:
-        return rocblas_status_internal_error;
-
-    // hip runtime failing
-    case hipErrorNoDevice: // no hip devices
-    case hipErrorUnknown:
-    default: return rocblas_status_internal_error;
-    }
-}
-
-}
-#endif
diff --git a/src/gpu/hipblas_port/rocblas_port/utility.h b/src/gpu/hipblas_port/rocblas_port/utility.h
deleted file mode 100644
index 006b48096..000000000
--- a/src/gpu/hipblas_port/rocblas_port/utility.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-
-#ifndef UTILITY_H
-#define UTILITY_H
-
-#include "rocblas-types.h"
-
-namespace {
-
-#ifndef GOOGLE_TEST
-
-// Load a scalar. If the argument is a pointer, dereference it; otherwise copy
-// it. Allows the same kernels to be used for host and device scalars.
-
-// For host scalars
-template <typename T>
-__forceinline__ __device__ __host__ T load_scalar(T x)
-{
-    return x;
-}
-
-// For device scalars
-template <typename T>
-__forceinline__ __device__ __host__ T load_scalar(const T* xp)
-{
-    return *xp;
-}
-
-// For rocblas_half2, we broadcast a fp16 across two halves
-template <>
-__forceinline__ __device__ __host__ rocblas_half2 load_scalar(const rocblas_half2* xp)
-{
-    auto x = *reinterpret_cast<const _Float16*>(xp);
-    return {x, x};
-}
-
-#endif // GOOGLE_TEST
-
-inline bool isAligned(const void* pointer, size_t byte_count)
-{
-    return reinterpret_cast<uintptr_t>(pointer) % byte_count == 0;
-}
-
-// clang-format off
-// return letter N,T,C in place of rocblas_operation enum
-constexpr auto rocblas_transpose_letter(rocblas_operation trans)
-{
-    switch(trans)
-    {
-    case rocblas_operation_none:                return 'N';
-    case rocblas_operation_transpose:           return 'T';
-    case rocblas_operation_conjugate_transpose: return 'C';
-    default:                                    return ' ';
-    }
-}
-
-// return letter L, R, B in place of rocblas_side enum
-constexpr auto rocblas_side_letter(rocblas_side side)
-{
-    switch(side)
-    {
-    case rocblas_side_left:  return 'L';
-    case rocblas_side_right: return 'R';
-    case rocblas_side_both:  return 'B';
-    default:                 return ' ';
-    }
-}
-
-// return letter U, L, B in place of rocblas_fill enum
-constexpr auto rocblas_fill_letter(rocblas_fill fill)
-{
-    switch(fill)
-    {
-    case rocblas_fill_upper: return 'U';
-    case rocblas_fill_lower: return 'L';
-    case rocblas_fill_full:  return 'F';
-    default:                 return ' ';
-    }
-}
-
-// return letter N, U in place of rocblas_diagonal enum
-constexpr auto rocblas_diag_letter(rocblas_diagonal diag)
-{
-    switch(diag)
-    {
-    case rocblas_diagonal_non_unit: return 'N';
-    case rocblas_diagonal_unit:     return 'U';
-    default:                        return ' ';
-    }
-}
-
-// return precision string for rocblas_datatype
-constexpr auto rocblas_datatype_string(rocblas_datatype type)
-{
-    switch(type)
-    {
-    case rocblas_datatype_f16_r: return "f16_r";
-    case rocblas_datatype_f32_r: return "f32_r";
-    case rocblas_datatype_f64_r: return "f64_r";
-    case rocblas_datatype_f16_c: return "f16_k";
-    case rocblas_datatype_f32_c: return "f32_c";
-    case rocblas_datatype_f64_c: return "f64_c";
-    case rocblas_datatype_i8_r:  return "i8_r";
-    case rocblas_datatype_u8_r:  return "u8_r";
-    case rocblas_datatype_i32_r: return "i32_r";
-    case rocblas_datatype_u32_r: return "u32_r";
-    case rocblas_datatype_i8_c:  return "i8_c";
-    case rocblas_datatype_u8_c:  return "u8_c";
-    case rocblas_datatype_i32_c: return "i32_c";
-    case rocblas_datatype_u32_c: return "u32_c";
-    default:                     return "invalid";
-    }
-}
-
-// return precision string for data type
-template <typename> constexpr char rocblas_precision_string                [] = "invalid";
-template <> constexpr char rocblas_precision_string<rocblas_half          >[] = "f16_r";
-template <> constexpr char rocblas_precision_string<float                 >[] = "f32_r";
-template <> constexpr char rocblas_precision_string<double                >[] = "f64_r";
-template <> constexpr char rocblas_precision_string<int8_t                >[] = "i8_r";
-template <> constexpr char rocblas_precision_string<uint8_t               >[] = "u8_r";
-template <> constexpr char rocblas_precision_string<int32_t               >[] = "i32_r";
-template <> constexpr char rocblas_precision_string<uint32_t              >[] = "u32_r";
-template <> constexpr char rocblas_precision_string<rocblas_float_complex >[] = "f32_c";
-template <> constexpr char rocblas_precision_string<rocblas_double_complex>[] = "f64_c";
-#if 0 // Not implemented
-template <> constexpr char rocblas_precision_string<rocblas_half_complex  >[] = "f16_c";
-template <> constexpr char rocblas_precision_string<rocblas_i8_complex    >[] = "i8_c";
-template <> constexpr char rocblas_precision_string<rocblas_u8_complex    >[] = "u8_c";
-template <> constexpr char rocblas_precision_string<rocblas_i32_complex   >[] = "i32_c";
-template <> constexpr char rocblas_precision_string<rocblas_u32_complex   >[] = "u32_c";
-#endif
-
-}
-// clang-format on
-#endif
diff --git a/src/gpu/hipblas_port/rocblas_port_axpy.hip.cpp b/src/gpu/hipblas_port/rocblas_port_axpy.hip.cpp
deleted file mode 100644
index 56fc22c77..000000000
--- a/src/gpu/hipblas_port/rocblas_port_axpy.hip.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_complex.h>
-#include "rocblas_port/rocblas-types.h"
-#include "rocblas_port/status.h"
-#include "rocblas_port/definitions.h"
-#include "rocblas_port/handle.h"
-#include "rocblas_port/utility.h"
-#include "rocblas_port/reduction.h"
-#include "rocblas_port/port_helper_func.h"
-
-namespace {
-
-template <typename U, typename T>
-__global__ void rocblas_axpy_kernel(const rocblas_int n, U alpha_device_host, const T* __restrict__ x,
-                                   const rocblas_int incx, T* y, const rocblas_int incy)
-{
-    auto alpha = load_scalar(alpha_device_host);
-
-    auto row = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
-
-    if (row < n)
-        y[row * incy] += alpha * x[row * incx];
-}
-
-template <typename T>
-rocblas_status rocblas_axpy(rocblas_handle handle, rocblas_int n, const T* alpha, const T* x, rocblas_int incx, T* y,
-                            rocblas_int incy)
-{
-    if (!handle)
-        return rocblas_status_invalid_handle;
-    if (!alpha)
-        return rocblas_status_invalid_pointer;
-    if (!n)
-        return rocblas_status_success;
-    if (!x || !y)
-        return rocblas_status_invalid_pointer;
-
-    dim3 threads(256);
-    dim3 grid(n / 256 + (n % 256 != 0));
-    hipStream_t rocblas_stream = handle->rocblas_stream;
-
-    if (handle->pointer_mode == rocblas_pointer_mode_device) {
-        hipLaunchKernelGGL((rocblas_axpy_kernel), grid, threads, 0, rocblas_stream, n, alpha, x, incx, y, incy);
-    } else {
-        hipLaunchKernelGGL((rocblas_axpy_kernel), grid, threads, 0, rocblas_stream, n, *alpha, x, incx, y, incy);
-    }
-    return rocblas_status_success;
-}
-
-} // namespace
-
-/*
- * ===========================================================================
- *    C wrapper
- * ===========================================================================
- */
-
-extern "C" {
-
-rocblas_status rocblas_port_saxpy(rocblas_handle handle, rocblas_int n, const float* alpha, const float* x,
-                                  rocblas_int incx, float* y, rocblas_int incy)
-{
-    return rocblas_axpy(handle, n, alpha, x, incx, y, incy);
-}
-
-rocblas_status rocblas_port_daxpy(rocblas_handle handle, rocblas_int n, const double* alpha, const double* x,
-                                  rocblas_int incx, double* y, rocblas_int incy)
-{
-    return rocblas_axpy(handle, n, alpha, x, incx, y, incy);
-}
-
-rocblas_status rocblas_port_caxpy(rocblas_handle handle, rocblas_int n, const hipFloatComplex* alpha,
-                                  const hipFloatComplex* x, rocblas_int incx, hipFloatComplex* y, rocblas_int incy)
-{
-    return rocblas_axpy(handle, n, alpha, x, incx, y, incy);
-}
-rocblas_status rocblas_port_zaxpy(rocblas_handle handle, rocblas_int n, const hipDoubleComplex* alpha,
-                                  const hipDoubleComplex* x, rocblas_int incx, hipDoubleComplex* y, rocblas_int incy)
-{
-    return rocblas_axpy(handle, n, alpha, x, incx, y, incy);
-}
-
-} // extern "C"
diff --git a/src/gpu/hipblas_port/rocblas_port_gemm.hip.cpp b/src/gpu/hipblas_port/rocblas_port_gemm.hip.cpp
deleted file mode 100644
index b3b7092ad..000000000
--- a/src/gpu/hipblas_port/rocblas_port_gemm.hip.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_complex.h>
-#include <utility>
-#include <tuple>
-#include "rocblas_port/rocblas-types.h"
-#include "rocblas_port/status.h"
-#include "rocblas_port/definitions.h"
-#include "rocblas_port/handle.h"
-#include "rocblas_port/utility.h"
-#include "rocblas_port/reduction.h"
-#include "rocblas_port/port_helper_func.h"
-
-namespace {
-
-
-template<typename T>
-struct CreateReal {
-    template<typename U>
-    __device__ __host__ static inline T eval(const U& val) {
-        return T(val);
-    }
-};
-
-template<>
-struct CreateReal<hipFloatComplex> {
-    template<typename U>
-    __device__ __host__ static inline hipFloatComplex eval(const U& val) {
-        return hipFloatComplex((float)val, 0.f);
-    }
-};
-
-template<>
-struct CreateReal<hipDoubleComplex> {
-    template<typename U>
-    __device__ __host__ static inline hipDoubleComplex eval(const U& val) {
-        return hipDoubleComplex((double)val, 0.);
-    }
-};
-
-
-
-/*
- * Load matrix value
- */
-// transposed
-template<rocblas_operation OP>
-struct MatrixLoadGemm {
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int row,
-                                    const rocblas_int col)
-    {
-        return M[col + row * ld];
-    }
-};
-
-// Normal
-template<>
-struct MatrixLoadGemm<rocblas_operation_none> {
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int row, const rocblas_int col)
-    {
-        return M[row + col * ld];
-    }
-};
-
-
-
-template <rocblas_operation OP>
-struct MatrixRowsGemm {
-    template <typename T, typename U>
-    __host__ __device__ static inline T eval(const T rows, const U cols) {
-        // transpose or hermitian.
-        return cols;
-    }
-};
-
-template <>
-struct MatrixRowsGemm<rocblas_operation_none> {
-    template <typename T, typename U>
-    __host__ __device__ static inline T eval(const T rows, const U cols) {
-        // transpose or hermitian.
-        return rows;
-    }
-};
-
-template <rocblas_operation OP>
-struct MatrixColsGemm {
-    template <typename T, typename U>
-    __host__ __device__ static inline T eval(const T rows, const U cols) {
-        // transpose or hermitian.
-        return rows;
-    }
-};
-
-template <>
-struct MatrixColsGemm<rocblas_operation_none> {
-    template <typename T, typename U>
-    __host__ __device__ static inline T eval(const T rows, const U cols) {
-        // transpose or hermitian.
-        return cols;
-    }
-};
-
-
-/*
- *
- */
-template <rocblas_operation OP_A, rocblas_operation OP_B, typename U, typename T>
-__global__ void gemm_kernel(rocblas_int m, rocblas_int n, rocblas_int k, U alpha_device_host,
-                                   const T* __restrict__ A, rocblas_int lda, const T* __restrict__ B, rocblas_int ldb,
-                                   U beta_device_host, T* C, rocblas_int ldc)
-{
-
-    const int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
-    if (tx >= m)
-        return;
-
-    const auto alpha = load_scalar(alpha_device_host);
-    const auto beta = load_scalar(beta_device_host);
-    const int row_C = tx;
-    const int col_C = hipBlockIdx_y;
-
-    const int cols_A = k;
-
-    T res = CreateReal<T>::eval(0);
-    for (int col = 0; col < cols_A; ++col) {
-        res += ConjOp<OP_A, T>::eval(MatrixLoadGemm<OP_A>::eval(A, lda, row_C, col)) *
-               ConjOp<OP_B, T>::eval(MatrixLoadGemm<OP_B>::eval(B, ldb, col, col_C));
-    }
-
-    C[row_C + col_C * ldc] = alpha * res + beta * C[row_C + col_C * ldc];
-}
-
-template <rocblas_operation OP_A, rocblas_operation OP_B, typename T>
-rocblas_status rocblas_gemm(rocblas_handle handle, rocblas_int m, rocblas_int n, rocblas_int k, const T* alpha,
-                            const T* A, rocblas_int lda, const T* B, rocblas_int ldb, const T* beta, T* C,
-                            rocblas_int ldc)
-{
-    if (!handle)
-        return rocblas_status_invalid_handle;
-    if (!alpha)
-        return rocblas_status_invalid_pointer;
-
-    if (!A || !B || !C)
-        return rocblas_status_invalid_pointer;
-
-    if (!m || !n || !k)
-        return rocblas_status_success;
-
-    hipStream_t rocblas_stream = handle->rocblas_stream;
-
-    dim3 threads(256);
-    dim3 grid((m + threads.x - 1) / threads.x, n);
-
-    if (handle->pointer_mode == rocblas_pointer_mode_device) {
-        hipLaunchKernelGGL(gemm_kernel<OP_A, OP_B>,
-                           grid, threads, 0, rocblas_stream, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-    } else {
-        if (rb_port_cmp_and_real_only(*alpha, 0))
-            return rocblas_status_success;
-        hipLaunchKernelGGL(gemm_kernel<OP_A, OP_B>,
-                           grid, threads, 0, rocblas_stream, m, n, k, *alpha, A, lda, B, ldb, *beta, C, ldc);
-    }
-    return rocblas_status_success;
-}
-
-template <rocblas_operation OP_A, typename T>
-rocblas_status rocblas_select_op_b(rocblas_handle handle, rocblas_operation transb,
-                                        rocblas_int m, rocblas_int n, rocblas_int k, const T* alpha, const T* A,
-                                        rocblas_int lda, const T* B, rocblas_int ldb, const T* beta, T* C,
-                                        rocblas_int ldc)
-{
-    if (transb == rocblas_operation_none)
-        return rocblas_gemm<OP_A, rocblas_operation_none>(handle, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-    else if (transb == rocblas_operation_transpose)
-        return rocblas_gemm<OP_A, rocblas_operation_transpose>(handle, m, n, k, alpha, A, lda, B, ldb, beta, C,
-                                                               ldc);
-    else if (transb == rocblas_operation_conjugate_transpose)
-        return rocblas_gemm<OP_A, rocblas_operation_conjugate_transpose>(handle, m, n, k, alpha, A, lda, B, ldb,
-                                                                         beta, C, ldc);
-    else
-        return rocblas_status_not_implemented;
-}
-
-template <typename T>
-rocblas_status rocblas_select(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb,
-                                        rocblas_int m, rocblas_int n, rocblas_int k, const T* alpha, const T* A,
-                                        rocblas_int lda, const T* B, rocblas_int ldb, const T* beta, T* C,
-                                        rocblas_int ldc)
-{
-    if (transa == rocblas_operation_none)
-        return rocblas_select_op_b<rocblas_operation_none>(handle, transb, m, n, k, alpha, A, lda, B, ldb, beta, C,
-                                                           ldc);
-    else if (transa == rocblas_operation_transpose)
-        return rocblas_select_op_b<rocblas_operation_transpose>(handle, transb, m, n, k, alpha, A, lda, B, ldb, beta, C,
-                                                                ldc);
-    else if (transa == rocblas_operation_conjugate_transpose)
-        return rocblas_select_op_b<rocblas_operation_conjugate_transpose>(handle, transb, m, n, k, alpha, A, lda, B,
-                                                                          ldb, beta, C, ldc);
-    else
-        return rocblas_status_not_implemented;
-}
-
-} // namespace
-
-/*
- * ===========================================================================
- *    C wrapper
- * ===========================================================================
- */
-
-extern "C" {
-
-rocblas_status rocblas_port_sgemm(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb,
-                                  rocblas_int m, rocblas_int n, rocblas_int k, const float* alpha, const float* A,
-                                  rocblas_int lda, const float* B, rocblas_int ldb, const float* beta, float* C,
-                                  rocblas_int ldc)
-{
-    return rocblas_select(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-rocblas_status rocblas_port_dgemm(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb,
-                                  rocblas_int m, rocblas_int n, rocblas_int k, const double* alpha, const double* A,
-                                  rocblas_int lda, const double* B, rocblas_int ldb, const double* beta, double* C,
-                                  rocblas_int ldc)
-{
-    return rocblas_select(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-rocblas_status rocblas_port_cgemm(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb,
-                                  rocblas_int m, rocblas_int n, rocblas_int k, const hipFloatComplex* alpha,
-                                  const hipFloatComplex* A, rocblas_int lda, const hipFloatComplex* B, rocblas_int ldb,
-                                  const hipFloatComplex* beta, hipFloatComplex* C, rocblas_int ldc)
-{
-    return rocblas_select(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-
-rocblas_status rocblas_port_zgemm(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb,
-                                  rocblas_int m, rocblas_int n, rocblas_int k, const hipDoubleComplex* alpha,
-                                  const hipDoubleComplex* A, rocblas_int lda, const hipDoubleComplex* B,
-                                  rocblas_int ldb, const hipDoubleComplex* beta, hipDoubleComplex* C, rocblas_int ldc)
-{
-    return rocblas_select(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-}
-} // extern "C"
diff --git a/src/gpu/hipblas_port/rocblas_port_gemv.hip.cpp b/src/gpu/hipblas_port/rocblas_port_gemv.hip.cpp
deleted file mode 100644
index dac6acd0a..000000000
--- a/src/gpu/hipblas_port/rocblas_port_gemv.hip.cpp
+++ /dev/null
@@ -1,325 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_complex.h>
-#include "rocblas_port/rocblas-types.h"
-#include "rocblas_port/status.h"
-#include "rocblas_port/definitions.h"
-#include "rocblas_port/handle.h"
-#include "rocblas_port/utility.h"
-#include "rocblas_port/reduction.h"
-#include "rocblas_port/port_helper_func.h"
-
-namespace {
-
-template <rocblas_int DIM_X, rocblas_int DIM_Y, typename T, typename U>
-__global__ void gemvn_kernel(rocblas_int m, rocblas_int n, U alpha_device_host, const T* __restrict__ A,
-                             rocblas_int lda, const T* __restrict__ x, rocblas_int incx, U beta_device_host, T* y,
-                             rocblas_int incy)
-{
-    auto alpha              = load_scalar(alpha_device_host);
-    auto beta               = load_scalar(beta_device_host);
-    rocblas_int num_threads = hipBlockDim_x * hipBlockDim_y * hipBlockDim_z;
-
-    if (DIM_X * DIM_Y != num_threads)
-        return; // need to launch exactly the same number of threads as template parameters indicate
-
-    rocblas_int thread_id = hipThreadIdx_x + hipThreadIdx_y * hipBlockDim_x;
-
-    // threads are all configurated locally
-    rocblas_int tx = thread_id % DIM_X;
-    rocblas_int ty = thread_id / DIM_X;
-
-    rocblas_int ind;
-
-    __shared__ T sdata[DIM_X * 4 * DIM_Y];
-
-    T res_A[4]; // micor tile is 4 * 4
-    T res_x[4];
-
-    res_A[0] = res_x[0] = T(0.0);
-    res_A[1] = res_x[0] = T(0.0);
-    res_A[2] = res_x[0] = T(0.0);
-    res_A[3] = res_x[0] = T(0.0);
-
-    ind = hipBlockIdx_x * DIM_X * 4 + tx;
-
-    rocblas_int n_tail = n % (4 * DIM_Y);
-    rocblas_int col    = ty * 4;
-
-    for (col = ty * 4; col < (n - n_tail); col += 4 * DIM_Y) {
-        res_x[0] = x[(col + 0) * incx];
-        res_x[1] = x[(col + 1) * incx];
-        res_x[2] = x[(col + 2) * incx];
-        res_x[3] = x[(col + 3) * incx];
-
-        if (ind < m) {
-            res_A[0] += A[ind + (col + 0) * lda] * res_x[0];
-            res_A[0] += A[ind + (col + 1) * lda] * res_x[1];
-            res_A[0] += A[ind + (col + 2) * lda] * res_x[2];
-            res_A[0] += A[ind + (col + 3) * lda] * res_x[3];
-        }
-
-        if (ind + DIM_X < m) {
-            res_A[1] += A[ind + DIM_X + (col + 0) * lda] * res_x[0];
-            res_A[1] += A[ind + DIM_X + (col + 1) * lda] * res_x[1];
-            res_A[1] += A[ind + DIM_X + (col + 2) * lda] * res_x[2];
-            res_A[1] += A[ind + DIM_X + (col + 3) * lda] * res_x[3];
-        }
-
-        if (ind + 2 * DIM_X < m) {
-            res_A[2] += A[ind + 2 * DIM_X + (col + 0) * lda] * res_x[0];
-            res_A[2] += A[ind + 2 * DIM_X + (col + 1) * lda] * res_x[1];
-            res_A[2] += A[ind + 2 * DIM_X + (col + 2) * lda] * res_x[2];
-            res_A[2] += A[ind + 2 * DIM_X + (col + 3) * lda] * res_x[3];
-        }
-
-        if (ind + 3 * DIM_X < m) {
-            res_A[3] += A[ind + 3 * DIM_X + (col + 0) * lda] * res_x[0];
-            res_A[3] += A[ind + 3 * DIM_X + (col + 1) * lda] * res_x[1];
-            res_A[3] += A[ind + 3 * DIM_X + (col + 2) * lda] * res_x[2];
-            res_A[3] += A[ind + 3 * DIM_X + (col + 3) * lda] * res_x[3];
-        }
-    }
-
-    // if n  is not multiple of (DIM_Y * 4)
-    if (n_tail > 0) {
-        res_x[0] = (col + 0 < n) ? x[(col + 0) * incx] : T(0);
-        res_x[1] = (col + 1 < n) ? x[(col + 1) * incx] : T(0);
-        res_x[2] = (col + 2 < n) ? x[(col + 2) * incx] : T(0);
-        res_x[3] = (col + 3 < n) ? x[(col + 3) * incx] : T(0);
-
-        if (ind < m) {
-            res_A[0] += A[ind + (col + 0) * lda * (col + 0 < n)] * res_x[0];
-            res_A[0] += A[ind + (col + 1) * lda * (col + 1 < n)] * res_x[1];
-            res_A[0] += A[ind + (col + 2) * lda * (col + 2 < n)] * res_x[2];
-            res_A[0] += A[ind + (col + 3) * lda * (col + 3 < n)] * res_x[3];
-        }
-
-        if (ind + DIM_X < m) {
-            res_A[1] += A[ind + DIM_X + (col + 0) * lda * (col + 0 < n)] * res_x[0];
-            res_A[1] += A[ind + DIM_X + (col + 1) * lda * (col + 1 < n)] * res_x[1];
-            res_A[1] += A[ind + DIM_X + (col + 2) * lda * (col + 2 < n)] * res_x[2];
-            res_A[1] += A[ind + DIM_X + (col + 3) * lda * (col + 3 < n)] * res_x[3];
-        }
-
-        if (ind + 2 * DIM_X < m) {
-            res_A[2] += A[ind + 2 * DIM_X + (col + 0) * lda * (col + 0 < n)] * res_x[0];
-            res_A[2] += A[ind + 2 * DIM_X + (col + 1) * lda * (col + 1 < n)] * res_x[1];
-            res_A[2] += A[ind + 2 * DIM_X + (col + 2) * lda * (col + 2 < n)] * res_x[2];
-            res_A[2] += A[ind + 2 * DIM_X + (col + 3) * lda * (col + 3 < n)] * res_x[3];
-        }
-
-        if (ind + 3 * DIM_X < m) {
-            res_A[3] += A[ind + 3 * DIM_X + (col + 0) * lda * (col + 0 < n)] * res_x[0];
-            res_A[3] += A[ind + 3 * DIM_X + (col + 1) * lda * (col + 1 < n)] * res_x[1];
-            res_A[3] += A[ind + 3 * DIM_X + (col + 2) * lda * (col + 2 < n)] * res_x[2];
-            res_A[3] += A[ind + 3 * DIM_X + (col + 3) * lda * (col + 3 < n)] * res_x[3];
-        }
-    }
-
-    sdata[tx + ty * DIM_X * 4]             = res_A[0];
-    sdata[tx + DIM_X + ty * DIM_X * 4]     = res_A[1];
-    sdata[tx + 2 * DIM_X + ty * DIM_X * 4] = res_A[2];
-    sdata[tx + 3 * DIM_X + ty * DIM_X * 4] = res_A[3];
-
-    __syncthreads();
-
-    ind = hipBlockIdx_x * DIM_X * 4 + thread_id;
-    if (thread_id < DIM_X * 4) {
-        for (rocblas_int i = 1; i < DIM_Y; i++)
-            sdata[thread_id] += sdata[thread_id + DIM_X * 4 * i];
-
-        if (ind < m)
-            y[ind * incy] = alpha * sdata[thread_id] + beta * y[ind * incy];
-    }
-}
-
-template <rocblas_int NB_X, rocblas_operation OP, typename T, typename U>
-__global__ void gemvc_kernel(rocblas_int m, rocblas_int n, U alpha_device_host, const T* __restrict__ A,
-                             rocblas_int lda, const T* __restrict__ x, rocblas_int incx, U beta_device_host, T* y,
-                             rocblas_int incy)
-{
-    auto alpha     = load_scalar(alpha_device_host);
-    auto beta      = load_scalar(beta_device_host);
-    rocblas_int tx = hipThreadIdx_x;
-
-    if (tx < m)
-        A += tx;
-
-    rocblas_int col = hipBlockIdx_x;
-    A += col * lda;
-
-    T res(0);
-
-    __shared__ T sdata[NB_X];
-
-    // partial sums
-    rocblas_int m_full = (m / NB_X) * NB_X;
-
-    for (rocblas_int i = 0; i < m_full; i += NB_X)
-        res += ConjOp<OP, T>::eval(A[i]) * x[(tx + i) * incx];
-
-    if (tx + m_full < m)
-        res += ConjOp<OP, T>::eval(A[m_full]) * x[(tx + m_full) * incx];
-
-    sdata[tx] = res;
-
-    // tree reduction of partial sums,
-    if (NB_X > 16) {
-        rocblas_sum_reduce<NB_X>(tx, sdata);
-    } else {
-        __syncthreads();
-
-        if (tx == 0) {
-            for (rocblas_int i = 1; i < m && i < NB_X; i++)
-                sdata[0] += sdata[i];
-        }
-
-        __syncthreads();
-    }
-
-    if (tx == 0)
-        y[col * incy] = alpha * sdata[0] + beta * y[col * incy];
-}
-
-
-template <typename T>
-rocblas_status rocblas_gemv(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n,
-                            const T* alpha, const T* A, rocblas_int lda, const T* x, rocblas_int incx, const T* beta,
-                            T* y, rocblas_int incy)
-{
-    if (!handle)
-        return rocblas_status_invalid_handle;
-    if (!alpha || !beta)
-        return rocblas_status_invalid_pointer;
-
-    if (!A || !x || !y)
-        return rocblas_status_invalid_pointer;
-
-    if (m < 0 || n < 0 || lda < m || lda < 1 || !incx || !incy)
-        return rocblas_status_invalid_size;
-
-    /*
-     * Quick return if possible. Not Argument error
-     */
-    if (!m || !n)
-        return rocblas_status_success;
-
-    hipStream_t rocblas_stream = handle->rocblas_stream;
-
-    if (transA == rocblas_operation_none) {
-        // GEMVN_DIM_Y must be at least 4, 8 * 8 is very slow only 40Gflop/s
-        static constexpr int GEMVN_DIM_X = 32;
-        static constexpr int GEMVN_DIM_Y = 16;
-        rocblas_int blocks               = (m - 1) / (GEMVN_DIM_X * 4) + 1;
-
-        dim3 gemvn_grid(blocks);
-        dim3 gemvn_threads(GEMVN_DIM_X, GEMVN_DIM_Y);
-
-        if (incx < 0)
-            x -= ssize_t(incx) * (n - 1);
-        if (incy < 0)
-            y -= ssize_t(incy) * (m - 1);
-
-        if (handle->pointer_mode == rocblas_pointer_mode_device) {
-            hipLaunchKernelGGL((gemvn_kernel<GEMVN_DIM_X, GEMVN_DIM_Y>), gemvn_grid, gemvn_threads, 0, rocblas_stream,
-                               m, n, alpha, A, lda, x, incx, beta, y, incy);
-        } else {
-            if (rb_port_cmp_and_real_only(*alpha, 0.0) && rb_port_cmp_and_real_only(*beta, 1))
-                return rocblas_status_success;
-
-            hipLaunchKernelGGL((gemvn_kernel<GEMVN_DIM_X, GEMVN_DIM_Y>), gemvn_grid, gemvn_threads, 0, rocblas_stream,
-                               m, n, *alpha, A, lda, x, incx, *beta, y, incy);
-        }
-    } else {
-        // transpose
-        // number of columns on the y-dim of the grid, using gemvc because gemvt(transpose) is a
-        // instance of gemvc (conjugate)
-        static constexpr int NB = 256;
-        dim3 gemvc_grid(n);
-        dim3 gemvc_threads(NB);
-
-        if (incx < 0)
-            x -= ssize_t(incx) * (m - 1);
-        if (incy < 0)
-            y -= ssize_t(incy) * (n - 1);
-
-        if (handle->pointer_mode == rocblas_pointer_mode_device) {
-            if (transA == rocblas_operation_transpose)
-                hipLaunchKernelGGL(gemvc_kernel<NB, rocblas_operation_transpose>, gemvc_grid, gemvc_threads, 0,
-                                   rocblas_stream, m, n, alpha, A, lda, x, incx, beta, y, incy);
-            else
-                hipLaunchKernelGGL(gemvc_kernel<NB, rocblas_operation_conjugate_transpose>, gemvc_grid, gemvc_threads,
-                                   0, rocblas_stream, m, n, alpha, A, lda, x, incx, beta, y, incy);
-        } else {
-            if (rb_port_cmp_and_real_only(*alpha, 0) && rb_port_cmp_and_real_only(*beta, 1))
-                return rocblas_status_success;
-
-            if (transA == rocblas_operation_transpose)
-                hipLaunchKernelGGL(gemvc_kernel<NB, rocblas_operation_transpose>, gemvc_grid, gemvc_threads, 0,
-                                   rocblas_stream, m, n, *alpha, A, lda, x, incx, *beta, y, incy);
-            else
-                hipLaunchKernelGGL(gemvc_kernel<NB, rocblas_operation_conjugate_transpose>, gemvc_grid, gemvc_threads,
-                                   0, rocblas_stream, m, n, *alpha, A, lda, x, incx, *beta, y, incy);
-        }
-    }
-    return rocblas_status_success;
-}
-
-} // namespace
-
-/*
- * ===========================================================================
- *    C wrapper
- * ===========================================================================
- */
-
-extern "C" {
-
-rocblas_status rocblas_port_sgemv(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n,
-                                  const float* alpha, const float* A, rocblas_int lda, const float* x, rocblas_int incx,
-                                  const float* beta, float* y, rocblas_int incy)
-{
-    return rocblas_gemv(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-rocblas_status rocblas_port_dgemv(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n,
-                                  const double* alpha, const double* A, rocblas_int lda, const double* x,
-                                  rocblas_int incx, const double* beta, double* y, rocblas_int incy)
-{
-    return rocblas_gemv(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-rocblas_status rocblas_port_cgemv(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n,
-                                  const hipFloatComplex* alpha, const hipFloatComplex* A, rocblas_int lda,
-                                  const hipFloatComplex* x, rocblas_int incx, const hipFloatComplex* beta,
-                                  hipFloatComplex* y, rocblas_int incy)
-{
-    return rocblas_gemv(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-rocblas_status rocblas_port_zgemv(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n,
-                                  const hipDoubleComplex* alpha, const hipDoubleComplex* A, rocblas_int lda,
-                                  const hipDoubleComplex* x, rocblas_int incx, const hipDoubleComplex* beta,
-                                  hipDoubleComplex* y, rocblas_int incy)
-{
-    return rocblas_gemv(handle, transA, m, n, alpha, A, lda, x, incx, beta, y, incy);
-}
-
-} // extern "C"
diff --git a/src/gpu/hipblas_port/rocblas_port_ger.hip.cpp b/src/gpu/hipblas_port/rocblas_port_ger.hip.cpp
deleted file mode 100644
index 85d17de9f..000000000
--- a/src/gpu/hipblas_port/rocblas_port_ger.hip.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_complex.h>
-#include "rocblas_port/rocblas-types.h"
-#include "rocblas_port/status.h"
-#include "rocblas_port/definitions.h"
-#include "rocblas_port/handle.h"
-#include "rocblas_port/utility.h"
-#include "rocblas_port/reduction.h"
-#include "rocblas_port/port_helper_func.h"
-
-namespace {
-
-template <typename T, typename U>
-__global__ void rocblas_ger_kernel(const rocblas_int rows_A, const rocblas_int cols_A, const T* __restrict__ x,
-                                   const rocblas_int incx, const T* __restrict__ y, rocblas_int incy,
-                                   U alpha_device_host, T* A, const rocblas_int lda)
-{
-    auto alpha = load_scalar(alpha_device_host);
-
-    auto row = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
-    auto col = hipBlockIdx_y;
-
-    if (row < rows_A)
-        A[row + col * lda] += alpha * x[row * incx] * y[col * incy];
-}
-
-template <typename T>
-rocblas_status rocblas_ger(rocblas_handle handle, rocblas_int m, rocblas_int n, const T* alpha, const T* x, rocblas_int incx,
-            const T* y, rocblas_int incy, T* A, rocblas_int lda)
-{
-    if (!handle)
-        return rocblas_status_invalid_handle;
-    if (!alpha)
-        return rocblas_status_invalid_pointer;
-    if (!n || !m)
-        return rocblas_status_success;
-    if (!A || !x || !y)
-        return rocblas_status_invalid_pointer;
-
-    dim3 threads(256);
-    dim3 grid(m / 256 + (m % 256 != 0), n);
-    hipStream_t rocblas_stream = handle->rocblas_stream;
-
-    if (handle->pointer_mode == rocblas_pointer_mode_device) {
-            hipLaunchKernelGGL((rocblas_ger_kernel), grid, threads, 0, rocblas_stream,
-                               m, n, x, incx, y, incy, alpha, A, lda);
-    } else {
-            hipLaunchKernelGGL((rocblas_ger_kernel), grid, threads, 0, rocblas_stream,
-                               m, n, x, incx, y, incy, *alpha, A, lda);
-    }
-    return rocblas_status_success;
-}
-
-} // namespace
-
-/*
- * ===========================================================================
- *    C wrapper
- * ===========================================================================
- */
-
-extern "C" {
-
-rocblas_status rocblas_port_sger(rocblas_handle handle, rocblas_int m, rocblas_int n, const float* alpha,
-                                 const float* x, rocblas_int incx, const float* y, rocblas_int incy, float* A,
-                                 rocblas_int lda)
-{
-    return rocblas_ger(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-rocblas_status rocblas_port_dger(rocblas_handle handle, rocblas_int m, rocblas_int n, const double* alpha,
-                                 const double* x, rocblas_int incx, const double* y, rocblas_int incy, double* A,
-                                 rocblas_int lda)
-{
-    return rocblas_ger(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-rocblas_status rocblas_port_cgeru(rocblas_handle handle, rocblas_int m, rocblas_int n, const hipFloatComplex* alpha,
-                                  const hipFloatComplex* x, rocblas_int incx, const hipFloatComplex* y,
-                                  rocblas_int incy, hipFloatComplex* A, rocblas_int lda)
-{
-    return rocblas_ger(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-rocblas_status rocblas_port_zgeru(rocblas_handle handle, rocblas_int m, rocblas_int n, const hipDoubleComplex* alpha,
-                                  const hipDoubleComplex* x, rocblas_int incx, const hipDoubleComplex* y,
-                                  rocblas_int incy, hipDoubleComplex* A, rocblas_int lda)
-{
-    return rocblas_ger(handle, m, n, alpha, x, incx, y, incy, A, lda);
-}
-
-} // extern "C"
diff --git a/src/gpu/hipblas_port/rocblas_port_trmm.hip.cpp b/src/gpu/hipblas_port/rocblas_port_trmm.hip.cpp
deleted file mode 100644
index 47bae7b2a..000000000
--- a/src/gpu/hipblas_port/rocblas_port_trmm.hip.cpp
+++ /dev/null
@@ -1,438 +0,0 @@
-/* ************************************************************************
- * Copyright 2016 Advanced Micro Devices, Inc.
- * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- * documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
- * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
- * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ************************************************************************ */
-
-#include <hip/hip_runtime.h>
-#include <hip/hip_complex.h>
-#include <utility>
-#include <tuple>
-#include "rocblas_port/rocblas-types.h"
-#include "rocblas_port/status.h"
-#include "rocblas_port/definitions.h"
-#include "rocblas_port/handle.h"
-#include "rocblas_port/utility.h"
-#include "rocblas_port/reduction.h"
-#include "rocblas_port/port_helper_func.h"
-#include <exception>
-
-namespace {
-
-
-template<typename T>
-struct CreateReal {
-    template<typename U>
-    __device__ __host__ static inline T eval(const U& val) {
-        return T(val);
-    }
-};
-
-template<>
-struct CreateReal<hipFloatComplex> {
-    template<typename U>
-    __device__ __host__ static inline hipFloatComplex eval(const U& val) {
-        return hipFloatComplex((float)val, 0.f);
-    }
-};
-
-template<>
-struct CreateReal<hipDoubleComplex> {
-    template<typename U>
-    __device__ __host__ static inline hipDoubleComplex eval(const U& val) {
-        return hipDoubleComplex((double)val, 0.);
-    }
-};
-
-
-template<rocblas_fill MATRIX_TYPE, rocblas_diagonal DIAG_TYPE, rocblas_operation OP>
-struct MatrixLoad;
-
-
-/*
- * FULL Matrix
- */
-template<rocblas_diagonal DIAG_TYPE>
-struct MatrixLoad<rocblas_fill_full, DIAG_TYPE, rocblas_operation_none> {
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col)
-    {
-        return M[row * inc + col * ld];
-    }
-};
-
-//transposed
-template<rocblas_diagonal DIAG_TYPE, rocblas_operation OP>
-struct MatrixLoad<rocblas_fill_full, DIAG_TYPE, OP> {
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col)
-    {
-        return M[col * inc + row * ld];
-    }
-};
-
-/*
- * Lower Tri Matrix
- */
-// non-unit diag
-template<>
-struct MatrixLoad<rocblas_fill_lower, rocblas_diagonal_non_unit, rocblas_operation_none> {
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col)
-    {
-        if (col > row) return CreateReal<T>::eval(0);
-        return M[row * inc + col * ld];
-    }
-};
-
-// transposed non-unit diag
-template<rocblas_operation OP>
-struct MatrixLoad<rocblas_fill_lower, rocblas_diagonal_non_unit, OP> {
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col)
-    {
-        if (row > col) return CreateReal<T>::eval(0);
-        return M[col * inc + row * ld];
-    }
-};
-
-// unit diag
-template<>
-struct MatrixLoad<rocblas_fill_lower, rocblas_diagonal_unit, rocblas_operation_none> {
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col)
-    {
-        if (col == row) return CreateReal<T>::eval(1);
-        if (col > row) return CreateReal<T>::eval(0);
-        return M[row * inc + col * ld];
-    }
-};
-
-// transposed unit diag
-template<rocblas_operation OP>
-struct MatrixLoad<rocblas_fill_lower, rocblas_diagonal_unit, OP> {
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col)
-    {
-        if (col == row) return CreateReal<T>::eval(1);
-        if (row > col) return CreateReal<T>::eval(0);
-        return M[col * inc + row * ld];
-    }
-};
-
-/*
- * Upper Tri Matrix
- */
-// non-unit diag
-template<>
-struct MatrixLoad<rocblas_fill_upper, rocblas_diagonal_non_unit, rocblas_operation_none> {
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col)
-    {
-        if (col < row) return CreateReal<T>::eval(0);
-        return M[row * inc + col * ld];
-    }
-};
-// transposed non-unit diag
-template<rocblas_operation OP>
-struct MatrixLoad<rocblas_fill_upper, rocblas_diagonal_non_unit, OP> {
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col)
-    {
-        if (row < col) return CreateReal<T>::eval(0);
-        return M[col * inc + row * ld];
-    }
-};
-
-// unit diag
-template <>
-struct MatrixLoad<rocblas_fill_upper, rocblas_diagonal_unit, rocblas_operation_none>
-{
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col)
-    {
-        if (col == row)
-            return CreateReal<T>::eval(1);
-        if (col < row)
-            return CreateReal<T>::eval(0);
-        return M[row * inc + col * ld];
-    }
-};
-
-// transposed unit diag
-template <rocblas_operation OP>
-struct MatrixLoad<rocblas_fill_upper, rocblas_diagonal_unit, OP>
-{
-    template <typename T>
-    __device__ static inline T eval(const T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col)
-    {
-        if (col == row)
-            return CreateReal<T>::eval(1);
-        if (row < col)
-            return CreateReal<T>::eval(0);
-        return M[col * inc + row * ld];
-    }
-};
-
-
-template <rocblas_operation OP>
-struct MatrixRows {
-    template <typename T, typename U>
-    __host__ __device__ static inline T eval(const T rows, const U cols) {
-        // transpose or hermitian.
-        return cols;
-    }
-};
-
-template <>
-struct MatrixRows<rocblas_operation_none> {
-    template <typename T, typename U>
-    __host__ __device__ static inline T eval(const T rows, const U cols) {
-        // transpose or hermitian.
-        return rows;
-    }
-};
-
-template <rocblas_operation OP>
-struct MatrixCols {
-    template <typename T, typename U>
-    __host__ __device__ static inline T eval(const T rows, const U cols) {
-        // transpose or hermitian.
-        return rows;
-    }
-};
-
-template <>
-struct MatrixCols<rocblas_operation_none> {
-    template <typename T, typename U>
-    __host__ __device__ static inline T eval(const T rows, const U cols) {
-        // transpose or hermitian.
-        return cols;
-    }
-};
-
-template <rocblas_operation OP>
-struct MatrixStore
-{
-    template <typename T, typename U>
-    __device__ static inline T eval(T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col, const U& val)
-    {
-        return M[col * inc + row * ld] = val;;
-    }
-};
-
-template <>
-struct MatrixStore<rocblas_operation_none>
-{
-    template <typename T, typename U>
-    __device__ static inline void eval(T* M, const rocblas_int ld, const rocblas_int inc, const rocblas_int row,
-                                    const rocblas_int col, const U& val)
-    {
-        M[row * inc + col * ld] = val;
-    }
-};
-
-/*
- *
- */
-template <rocblas_operation OP_A_ELEMENT, rocblas_operation OP_A, rocblas_operation OP_B, rocblas_operation OP_C,
-          rocblas_fill FILL_A, rocblas_fill FILL_B, rocblas_diagonal DIAG_A, rocblas_diagonal DIAG_B, typename U,
-          typename T>
-__global__ void trmmn_kernel_a_t_h(rocblas_int m, rocblas_int n, U alpha_device_host, const T* __restrict__ A,
-                                   rocblas_int lda, const T* __restrict__ B, rocblas_int ldb, T* C, rocblas_int ldc)
-{
-
-    const int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
-    if (tx >= m)
-        return;
-
-    const auto alpha     = load_scalar(alpha_device_host);
-    const int row_op_C = MatrixRows<OP_C>::eval(tx, hipBlockIdx_y);
-    const int col_op_C = MatrixCols<OP_C>::eval(tx, hipBlockIdx_y);
-
-    const int rows_B = MatrixRows<OP_B>::eval(m, n);
-
-    T res(0);
-    for (int col = 0; col < rows_B; ++col) {
-        res += ConjOp<OP_A_ELEMENT, T>::eval(MatrixLoad<FILL_A, DIAG_A, OP_A>::eval(A, lda, 1, row_op_C, col)) *
-               MatrixLoad<FILL_B, DIAG_B, OP_B>::eval(B, ldb, 1, col, col_op_C);
-    }
-
-    MatrixStore<OP_C>::eval(C, ldc, 1, row_op_C, col_op_C, res*alpha);
-}
-
-
-
-template <rocblas_operation OP_A_ELEMENT,rocblas_operation OP_A, rocblas_operation OP_B, rocblas_operation OP_C, rocblas_fill FILL_A,
-          rocblas_diagonal DIAG_A, typename T>
-rocblas_status rocblas_trmm(rocblas_handle handle, rocblas_int m, rocblas_int n, const T* alpha, const T* A,
-                            rocblas_int lda, const T* B, rocblas_int ldb, T* C, rocblas_int ldc)
-{
-    if (!handle)
-        return rocblas_status_invalid_handle;
-    if (!alpha)
-        return rocblas_status_invalid_pointer;
-
-    if (!A || !B || !C)
-        return rocblas_status_invalid_pointer;
-
-    if (!m || !n)
-        return rocblas_status_success;
-
-    hipStream_t rocblas_stream = handle->rocblas_stream;
-
-    dim3 threads(256);
-    dim3 grid((m + threads.x - 1) / threads.x, n);
-
-    if (handle->pointer_mode == rocblas_pointer_mode_device) {
-        hipLaunchKernelGGL(trmmn_kernel_a_t_h<OP_A_ELEMENT, OP_A, OP_B, OP_C, FILL_A, rocblas_fill_full, DIAG_A,
-                                              rocblas_diagonal_non_unit>,
-                           grid, threads, 0, rocblas_stream, m, n, alpha, A, lda, B, ldb, C, ldc);
-    } else {
-        if (rb_port_cmp_and_real_only(*alpha, 0))
-            return rocblas_status_success;
-        hipLaunchKernelGGL(trmmn_kernel_a_t_h<OP_A_ELEMENT, OP_A, OP_B, OP_C, FILL_A, rocblas_fill_full, DIAG_A,
-                                              rocblas_diagonal_non_unit>,
-                           grid, threads, 0, rocblas_stream, m, n, *alpha, A, lda, B, ldb, C, ldc);
-    }
-    return rocblas_status_success;
-}
-
-template <rocblas_operation OP_A_ELEMENT, rocblas_operation OP_A, rocblas_operation OP_B, rocblas_operation OP_C,
-          rocblas_fill FILL_A, typename T>
-rocblas_status rocblas_trmm_select_diag(rocblas_handle handle, rocblas_diagonal diag, rocblas_int m, rocblas_int n,
-                                        const T* alpha, const T* A, rocblas_int lda, const T* B, rocblas_int ldb, T* C,
-                                        rocblas_int ldc)
-{
-    if (diag == rocblas_diagonal_unit) {
-        return rocblas_trmm<OP_A_ELEMENT, OP_A, OP_B, OP_C, FILL_A, rocblas_diagonal_unit>(handle, m, n, alpha, A, lda,
-                                                                                           B, ldb, C, ldc);
-    } else {
-        return rocblas_trmm<OP_A_ELEMENT, OP_A, OP_B, OP_C, FILL_A, rocblas_diagonal_non_unit>(handle, m, n, alpha, A,
-                                                                                               lda, B, ldb, C, ldc);
-    }
-}
-
-template <rocblas_operation OP_A_ELEMENT, rocblas_operation OP_A, rocblas_operation OP_B, rocblas_operation OP_C,
-          typename T>
-rocblas_status rocblas_trmm_select_fill(rocblas_handle handle, rocblas_fill uplo, rocblas_diagonal diag, rocblas_int m,
-                                        rocblas_int n, const T* alpha, const T* A, rocblas_int lda, const T* B,
-                                        rocblas_int ldb, T* C, rocblas_int ldc)
-{
-    if (uplo == rocblas_fill_lower) {
-        return rocblas_trmm_select_diag<OP_A_ELEMENT, OP_A, OP_B, OP_C, rocblas_fill_lower>(handle, diag, m, n, alpha,
-                                                                                            A, lda, B, ldb, C, ldc);
-    } else if (uplo == rocblas_fill_upper) {
-        return rocblas_trmm_select_diag<OP_A_ELEMENT, OP_A, OP_B, OP_C, rocblas_fill_upper>(handle, diag, m, n, alpha,
-                                                                                            A, lda, B, ldb, C, ldc);
-    } else {
-        return rocblas_trmm_select_diag<OP_A_ELEMENT, OP_A, OP_B, OP_C, rocblas_fill_full>(handle, diag, m, n, alpha,
-                                                                                            A, lda, B, ldb, C, ldc);
-    }
-}
-
-template <typename T>
-rocblas_status rocblas_trmm_select(rocblas_handle handle, rocblas_side side, rocblas_fill uplo, rocblas_operation trans,
-                                   rocblas_diagonal diag, rocblas_int m, rocblas_int n, const T* alpha, const T* A,
-                                   rocblas_int lda, const T* B, rocblas_int ldb, T* C, rocblas_int ldc)
-{
-    if (side == rocblas_side_left) {
-        if (trans == rocblas_operation_none) {
-            return rocblas_trmm_select_fill<rocblas_operation_none, rocblas_operation_none, rocblas_operation_none,
-                                            rocblas_operation_none>(handle, uplo, diag, m, n, alpha, A, lda, B, ldb, C,
-                                                                    ldc);
-        } else if (trans == rocblas_operation_transpose) {
-            return rocblas_trmm_select_fill<rocblas_operation_none, rocblas_operation_transpose, rocblas_operation_none,
-                                            rocblas_operation_none>(handle, uplo, diag, m, n, alpha, A, lda, B, ldb, C,
-                                                                    ldc);
-        } else {
-            return rocblas_trmm_select_fill<rocblas_operation_conjugate_transpose,
-                                            rocblas_operation_conjugate_transpose, rocblas_operation_none,
-                                            rocblas_operation_none>(handle, uplo, diag, m, n, alpha, A, lda, B, ldb, C,
-                                                                    ldc);
-        }
-    } else {
-        // Use the following identities:
-        // B*A = (AT*BT)T
-        // B*AT = (A*BT)T
-        if (trans == rocblas_operation_none) {
-            return rocblas_trmm_select_fill<rocblas_operation_none, rocblas_operation_transpose,
-                                            rocblas_operation_transpose, rocblas_operation_transpose>(
-                handle, uplo, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-        } else if (trans == rocblas_operation_transpose) {
-            return rocblas_trmm_select_fill<rocblas_operation_none, rocblas_operation_none, rocblas_operation_transpose,
-                                            rocblas_operation_transpose>(handle, uplo, diag, m, n, alpha, A, lda, B,
-                                                                         ldb, C, ldc);
-        } else {
-            return rocblas_trmm_select_fill<rocblas_operation_conjugate_transpose, rocblas_operation_none,
-                                            rocblas_operation_transpose, rocblas_operation_transpose>(
-                handle, uplo, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-        }
-    }
-}
-
-} // namespace
-
-/*
- * ===========================================================================
- *    C wrapper
- * ===========================================================================
- */
-
-extern "C" {
-
-rocblas_status rocblas_port_strmm(rocblas_handle handle, rocblas_side side, rocblas_fill uplo, rocblas_operation trans,
-                                  rocblas_diagonal diag, rocblas_int m, rocblas_int n, const float* alpha,
-                                  const float* A, rocblas_int lda, const float* B, rocblas_int ldb, float* C,
-                                  rocblas_int ldc)
-{
-    return rocblas_trmm_select<float>(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-}
-
-rocblas_status rocblas_port_dtrmm(rocblas_handle handle, rocblas_side side, rocblas_fill uplo, rocblas_operation trans,
-                                  rocblas_diagonal diag, rocblas_int m, rocblas_int n, const double* alpha,
-                                  const double* A, rocblas_int lda, const double* B, rocblas_int ldb, double* C,
-                                  rocblas_int ldc)
-{
-    return rocblas_trmm_select<double>(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-}
-
-rocblas_status rocblas_port_ctrmm(rocblas_handle handle, rocblas_side side, rocblas_fill uplo, rocblas_operation trans,
-                                  rocblas_diagonal diag, rocblas_int m, rocblas_int n, const hipFloatComplex* alpha,
-                                  const hipFloatComplex* A, rocblas_int lda, const hipFloatComplex* B, rocblas_int ldb,
-                                  hipFloatComplex* C, rocblas_int ldc)
-{
-    return rocblas_trmm_select<hipFloatComplex>(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-}
-
-rocblas_status rocblas_port_ztrmm(rocblas_handle handle, rocblas_side side, rocblas_fill uplo, rocblas_operation trans,
-                                  rocblas_diagonal diag, rocblas_int m, rocblas_int n, const hipDoubleComplex* alpha,
-                                  const hipDoubleComplex* A, rocblas_int lda, const hipDoubleComplex* B,
-                                  rocblas_int ldb, hipDoubleComplex* C, rocblas_int ldc)
-{
-    return rocblas_trmm_select<hipDoubleComplex>(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-}
-
-} // extern "C"
diff --git a/src/gpu/hipblas_port/tests/axpy_test.cpp b/src/gpu/hipblas_port/tests/axpy_test.cpp
deleted file mode 100644
index e324088c3..000000000
--- a/src/gpu/hipblas_port/tests/axpy_test.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-#ifdef __CUDA
-#include <cuda_runtime.h>
-#include <cuComplex.h>
-#include <cublas_v2.h>
-#else
-#include <hip/hip_runtime_api.h>
-#include <hip/hip_complex.h>
-#include <hipblas.h>
-#include "hipblas_port.h"
-#endif
-
-
-#include <vector>
-#include "gtest/gtest.h"
-// #define CATCH_CONFIG_MAIN
-// #include "catch.hpp"
-
-using testing::Types;
-
-#ifdef __CUDA
-#define GPU_PREFIX(val) cuda##val
-#else
-#define GPU_PREFIX(val) hip##val
-#endif
-
-#ifdef __CUDA
-#define BLAS_PREFIX(val) cu##val
-#else
-#define BLAS_PREFIX(val) hip##val
-#endif
-
-#ifdef __CUDA
-#define GPU_PREFIX_CAPS(val) CU##val
-#else
-#define GPU_PREFIX_CAPS(val) HIP##val
-#endif
-
-template<typename T>
-struct create_real {
-    template<typename U>
-    static inline T eval(const U& val) {
-        return T(val);
-    }
-};
-
-template<>
-struct create_real<BLAS_PREFIX(FloatComplex)> {
-    template<typename U>
-    static inline BLAS_PREFIX(FloatComplex) eval(const U& val) {
-        BLAS_PREFIX(FloatComplex) c;
-        c.x = val;
-        c.y = 0;
-        return c;
-    }
-};
-
-template<>
-struct create_real<BLAS_PREFIX(DoubleComplex)> {
-    template<typename U>
-    static inline BLAS_PREFIX(DoubleComplex) eval(const U& val) {
-        BLAS_PREFIX(DoubleComplex) c;
-        c.x = val;
-        c.y = 0;
-        return c;
-    }
-};
-
-template <typename T>
-struct create_complex
-{
-    template <typename U1, typename U2>
-    static inline T eval(const U1& val1, const U2& val2)
-    {
-        T c;
-        c.x = val1;
-        c.y = val2;
-        return c;
-    }
-};
-
-template<typename T>
-inline double get_real_double(const T& val) {
-    return double(val);
-}
-
-template<>
-inline double get_real_double<BLAS_PREFIX(FloatComplex)>(const BLAS_PREFIX(FloatComplex)& val) {
-    return double(val.x);
-}
-
-template<>
-inline double get_real_double<BLAS_PREFIX(DoubleComplex)>(const BLAS_PREFIX(DoubleComplex)& val) {
-    return double(val.x);
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_axpy(BLAS_PREFIX(blasHandle_t) handle, int n, const float* alpha, const float* x, int incx,
-                                      float* y, int incy)
-{
-
-#ifdef __CUDA
-    return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
-#else
-    return BLAS_PREFIX(blas_port_Saxpy)(handle, n, alpha, x, incx, y, incy);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_axpy(BLAS_PREFIX(blasHandle_t) handle, int n, const double* alpha, const double* x, int incx,
-                                      double* y, int incy)
-{
-
-#ifdef __CUDA
-    return BLAS_PREFIX(blasDaxpy)(handle, n, alpha, x, incx, y, incy);
-#else
-    return BLAS_PREFIX(blas_port_Daxpy)(handle, n, alpha, x, incx, y, incy);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_axpy(BLAS_PREFIX(blasHandle_t) handle, int n, const BLAS_PREFIX(FloatComplex)* alpha,
-                                      const BLAS_PREFIX(FloatComplex)* x, int incx, BLAS_PREFIX(FloatComplex)* y, int incy)
-{
-
-#ifdef __CUDA
-    return BLAS_PREFIX(blasCaxpy)(handle, n, alpha, x, incx, y, incy);
-#else
-    return BLAS_PREFIX(blas_port_Caxpy)(handle, n, alpha, x, incx, y, incy);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_axpy(BLAS_PREFIX(blasHandle_t) handle, int n, const BLAS_PREFIX(DoubleComplex)* alpha,
-                                      const BLAS_PREFIX(DoubleComplex)* x, int incx, BLAS_PREFIX(DoubleComplex)* y, int incy)
-{
-
-#ifdef __CUDA
-    return BLAS_PREFIX(blasZaxpy)(handle, n, alpha, x, incx, y, incy);
-#else
-    return BLAS_PREFIX(blas_port_Zaxpy)(handle, n, alpha, x, incx, y, incy);
-#endif
-}
-
-template <typename T>
-class AxpyTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-
-        /*     1
-         * x = *
-         *     2
-         *     *
-         */
-        x = {create_real<T>::eval(1), create_real<T>::eval(-10000), create_real<T>::eval(2),
-             create_real<T>::eval(-10000)};
-        GPU_PREFIX(Malloc)((void**)&x_device, x.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(x_device, x.data(), x.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     7
-         * y = *
-         *     8
-         *     *
-         */
-        y = {create_real<T>::eval(7), create_real<T>::eval(-10000), create_real<T>::eval(8),
-             create_real<T>::eval(-10000)};
-        GPU_PREFIX(Malloc)((void**)&y_device, y.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(y_device, y.data(), y.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(2);
-
-        y_result.resize(y.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(x_device);
-        GPU_PREFIX(Free)(y_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    std::vector<T> x, y, y_result;
-    T* x_device;
-    T* y_device;
-    T alpha;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-
-
-typedef Types<float, double, BLAS_PREFIX(FloatComplex), BLAS_PREFIX(DoubleComplex)> AxpyValueTypes;
-
-TYPED_TEST_CASE(AxpyTest, AxpyValueTypes);
-
-TYPED_TEST(AxpyTest, Strided) {
-    BLAS_PREFIX(blasStatus_t) status =
-        call_axpy(this->handle, 2, &(this->alpha), this->x_device, 2, this->y_device, 2);
-    EXPECT_TRUE(status == GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(Memcpy)(this->y_result.data(), this->y_device, this->y_result.size() * sizeof(typename TestFixture::value_type), GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->y_result[0]), 9.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->y_result[2]), 12.);
-
-}
diff --git a/src/gpu/hipblas_port/tests/gemm_test.cpp b/src/gpu/hipblas_port/tests/gemm_test.cpp
deleted file mode 100644
index 5223d87fd..000000000
--- a/src/gpu/hipblas_port/tests/gemm_test.cpp
+++ /dev/null
@@ -1,769 +0,0 @@
-#ifdef __CUDA
-#include <cuda_runtime.h>
-#include <cuComplex.h>
-#include <cublas_v2.h>
-#else
-#include <hip/hip_runtime_api.h>
-#include <hip/hip_complex.h>
-#include <hipblas.h>
-#include "hipblas_port.h"
-#endif
-#include <vector>
-#include "gtest/gtest.h"
-// #define CATCH_CONFIG_MAIN
-// #include "catch.hpp"
-
-using testing::Types;
-#ifdef __CUDA
-#define GPU_PREFIX(val) cuda##val
-#else
-#define GPU_PREFIX(val) hip##val
-#endif
-
-#ifdef __CUDA
-#define BLAS_PREFIX(val) cu##val
-#else
-#define BLAS_PREFIX(val) hip##val
-#endif
-
-#ifdef __CUDA
-#define GPU_PREFIX_CAPS(val) CU##val
-#else
-#define GPU_PREFIX_CAPS(val) HIP##val
-#endif
-
-template<typename T>
-struct create_real {
-    template<typename U>
-    static inline T eval(const U& val) {
-        return T(val);
-    }
-};
-
-template<>
-struct create_real<BLAS_PREFIX(FloatComplex)> {
-    template<typename U>
-    static inline BLAS_PREFIX(FloatComplex) eval(const U& val) {
-        BLAS_PREFIX(FloatComplex) c;
-        c.x = val;
-        c.y = 0;
-        return c;
-    }
-};
-
-template<>
-struct create_real<BLAS_PREFIX(DoubleComplex)> {
-    template<typename U>
-    static inline BLAS_PREFIX(DoubleComplex) eval(const U& val) {
-        BLAS_PREFIX(DoubleComplex) c;
-        c.x = val;
-        c.y = 0;
-        return c;
-    }
-};
-
-template <typename T>
-struct create_complex
-{
-    template <typename U1, typename U2>
-    static inline T eval(const U1& val1, const U2& val2)
-    {
-        T c;
-        c.x = val1;
-        c.y = val2;
-        return c;
-    }
-};
-
-template<typename T>
-inline double get_real_double(const T& val) {
-    return double(val);
-}
-
-template<>
-inline double get_real_double<BLAS_PREFIX(FloatComplex)>(const BLAS_PREFIX(FloatComplex)& val) {
-    return double(val.x);
-}
-
-template<>
-inline double get_real_double<BLAS_PREFIX(DoubleComplex)>(const BLAS_PREFIX(DoubleComplex)& val) {
-    return double(val.x);
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_gemm(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasOperation_t) transa, BLAS_PREFIX(blasOperation_t) transb,
-                                      int m, int n, int k, const float* alpha, const float* A, int lda, const float* B,
-                                      int ldb, const float* beta, float* C, int ldc)
-{
-#ifdef __CUDA
-    return BLAS_PREFIX(blasSgemm)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-#else
-    return BLAS_PREFIX(blas_port_Sgemm)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t)
-    call_gemm(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasOperation_t) transa,
-              BLAS_PREFIX(blasOperation_t) transb, int m, int n, int k, const double* alpha, const double* A, int lda,
-              const double* B, int ldb, const double* beta, double* C, int ldc)
-{
-#ifdef __CUDA
-    return BLAS_PREFIX(blasDgemm)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-#else
-    return BLAS_PREFIX(blas_port_Dgemm)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t)
-    call_gemm(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasOperation_t) transa,
-              BLAS_PREFIX(blasOperation_t) transb, int m, int n, int k, const BLAS_PREFIX(FloatComplex) * alpha,
-              const BLAS_PREFIX(FloatComplex) * A, int lda, const BLAS_PREFIX(FloatComplex) * B, int ldb,
-              const BLAS_PREFIX(FloatComplex) * beta, BLAS_PREFIX(FloatComplex) * C, int ldc)
-{
-#ifdef __CUDA
-    return BLAS_PREFIX(blasCgemm)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-#else
-    return BLAS_PREFIX(blas_port_Cgemm)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t)
-    call_gemm(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasOperation_t) transa,
-              BLAS_PREFIX(blasOperation_t) transb, int m, int n, int k, const BLAS_PREFIX(DoubleComplex) * alpha,
-              const BLAS_PREFIX(DoubleComplex) * A, int lda, const BLAS_PREFIX(DoubleComplex) * B, int ldb,
-              const BLAS_PREFIX(DoubleComplex) * beta, BLAS_PREFIX(DoubleComplex) * C, int ldc)
-{
-#ifdef __CUDA
-    return BLAS_PREFIX(blasZgemm)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-#else
-    return BLAS_PREFIX(blas_port_Zgemm)(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
-#endif
-}
-
-template <typename T>
-class GemmRealTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*     1 3
-         * A = 2 4
-         *     * *
-         */
-        A = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(-10000),
-             create_real<T>::eval(3), create_real<T>::eval(4), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1 3
-         * B = 2 4
-         *     * *
-         */
-        B = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(-10000),
-             create_real<T>::eval(3), create_real<T>::eval(4), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&B_device, B.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(B_device, B.data(), B.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1 3
-         * C = 2 4
-         *     * *
-         */
-        C = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(-10000),
-             create_real<T>::eval(3), create_real<T>::eval(4), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&C_device, C.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(C_device, C.data(), C.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(1);
-        beta  = create_real<T>::eval(2);
-
-        C_result.resize(C.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(B_device);
-        GPU_PREFIX(Free)(C_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    std::vector<T> A, B, C, C_result;
-    T* A_device;
-    T* B_device;
-    T* C_device;
-    T alpha, beta;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-typedef Types<float, double, BLAS_PREFIX(FloatComplex), BLAS_PREFIX(DoubleComplex)> GemmRealTypes;
-
-TYPED_TEST_CASE(GemmRealTest, GemmRealTypes);
-
-TYPED_TEST(GemmRealTest, AN_BN)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_OP_N), 2, 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 9.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 14.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 21.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 30.);
-}
-
-TYPED_TEST(GemmRealTest, AT_BN)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_OP_N), 2, 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 7.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 15.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 17.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 33.);
-}
-
-TYPED_TEST(GemmRealTest, AN_BT)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_OP_T), 2, 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 12.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 18.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 20.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 28.);
-}
-
-TYPED_TEST(GemmRealTest, AT_BT)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_OP_T), 2, 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 9.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 19.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 16.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 30.);
-}
-
-TYPED_TEST(GemmRealTest, AC_BC)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_C), GPU_PREFIX_CAPS(BLAS_OP_C), 2, 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 9.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 19.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 16.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 30.);
-}
-
-template <typename T>
-class GemmComplexTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*     1 3
-         * A = 2 4
-         *     * *
-         */
-        A = {create_complex<T>::eval(1, 1), create_complex<T>::eval(1, 2), create_complex<T>::eval(1, -10000),
-             create_complex<T>::eval(1, 3), create_complex<T>::eval(1, 4), create_complex<T>::eval(1, -20000)};
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1 3
-         * B = 2 4
-         *     * *
-         */
-        B = {create_complex<T>::eval(1, 1), create_complex<T>::eval(1, 2), create_complex<T>::eval(1, -10000),
-             create_complex<T>::eval(1, 3), create_complex<T>::eval(1, 4), create_complex<T>::eval(1, -20000)};
-        GPU_PREFIX(Malloc)((void**)&B_device, B.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(B_device, B.data(), B.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1 3
-         * C = 2 4
-         *     * *
-         */
-        C = {create_complex<T>::eval(1, 1), create_complex<T>::eval(1, 2), create_complex<T>::eval(1, -10000),
-             create_complex<T>::eval(1, 3), create_complex<T>::eval(1, 4), create_complex<T>::eval(1, -20000)};
-        GPU_PREFIX(Malloc)((void**)&C_device, C.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(C_device, C.data(), C.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_complex<T>::eval(1, 0);
-        beta  = create_complex<T>::eval(2, 0);
-
-        C_result.resize(C.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(B_device);
-        GPU_PREFIX(Free)(C_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    std::vector<T> A, B, C, C_result;
-    T* A_device;
-    T* B_device;
-    T* C_device;
-    T alpha, beta;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-typedef Types<BLAS_PREFIX(FloatComplex), BLAS_PREFIX(DoubleComplex)> GemmComplexTypes;
-
-TYPED_TEST_CASE(GemmComplexTest, GemmComplexTypes);
-
-TYPED_TEST(GemmComplexTest, AN_BN)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_OP_N), 2, 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).x, -3.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).x, -6.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).x, -11.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).x, -18.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).y, 9.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).y, 13.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).y, 17.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).y, 21.);
-}
-
-TYPED_TEST(GemmComplexTest, AT_BN)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_OP_N), 2, 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).x, -1.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).x, -7.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).x, -7.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).x, -21.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).y, 8.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).y, 14.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).y, 16.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).y, 22.);
-}
-
-TYPED_TEST(GemmComplexTest, AN_BT)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_OP_T), 2, 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).x, -6.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).x, -10.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).x, -10.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).x, -16.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).y, 10.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).y, 14.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).y, 16.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).y, 20.);
-}
-
-TYPED_TEST(GemmComplexTest, AT_BT)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_OP_T), 2, 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).x, -3.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).x, -11.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).x, -6.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).x, -18.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).y, 9.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).y, 15.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).y, 15.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).y, 21.);
-}
-
-TYPED_TEST(GemmComplexTest, AC_BC)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_C), GPU_PREFIX_CAPS(BLAS_OP_C), 2, 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).x, -3.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).x, -11.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).x, -6.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).x, -18.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).y, -5.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).y, -7.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).y, -3.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).y, -5.);
-}
-
-/*
- * Non-squared test
- */
-
-template <typename T>
-class GemmRealNonSquaredTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*     1 3 5
-         * A = 2 4 6
-         *     * * *
-         */
-        A = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(-10000),
-             create_real<T>::eval(3), create_real<T>::eval(4), create_real<T>::eval(-10000),
-             create_real<T>::eval(5), create_real<T>::eval(6), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1 4
-         * B = 2 5
-         *     3 6
-         *     * *
-         */
-        B = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(3), create_real<T>::eval(-10000),
-             create_real<T>::eval(4), create_real<T>::eval(5), create_real<T>::eval(6), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&B_device, B.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(B_device, B.data(), B.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     * * *
-         * C = * * *
-         *     * * *
-         */
-        C = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(-10000),
-             create_real<T>::eval(3), create_real<T>::eval(4), create_real<T>::eval(-20000),
-             create_real<T>::eval(3), create_real<T>::eval(4), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&C_device, C.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(C_device, C.data(), C.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(1);
-        beta  = create_real<T>::eval(0);
-
-        C_result.resize(C.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(B_device);
-        GPU_PREFIX(Free)(C_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    std::vector<T> A, B, C, C_result;
-    T* A_device;
-    T* B_device;
-    T* C_device;
-    T alpha, beta;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-TYPED_TEST_CASE(GemmRealNonSquaredTest, GemmRealTypes);
-
-TYPED_TEST(GemmRealNonSquaredTest, AN_BN)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_OP_N), 2, 2, 3, &(this->alpha),
-                       this->A_device, 3, this->B_device, 4, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 22.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 28.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 49.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 64.);
-}
-
-TYPED_TEST(GemmRealNonSquaredTest, AT_BT)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_OP_T), 3, 3, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 4, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 9.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 19.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[2]), 29.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 12.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 26.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[5]), 40.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[6]), 15.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[7]), 33.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[8]), 51.);
-}
-
-/*
- * Non-squared test 2
- */
-
-template <typename T>
-class GemmRealNonSquaredTest2 : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*     1 3 5
-         * A = 2 4 6
-         *     * * *
-         */
-        A = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(-10000),
-             create_real<T>::eval(3), create_real<T>::eval(4), create_real<T>::eval(-10000),
-             create_real<T>::eval(5), create_real<T>::eval(6), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1 3 5
-         * B = 2 4 6
-         */
-        B = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(3),
-             create_real<T>::eval(4), create_real<T>::eval(5), create_real<T>::eval(6)};
-        GPU_PREFIX(Malloc)((void**)&B_device, B.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(B_device, B.data(), B.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     * * *
-         * C = * * *
-         *     * * *
-         */
-        C.resize(9, create_real<T>::eval(-1000));
-        GPU_PREFIX(Malloc)((void**)&C_device, C.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(C_device, C.data(), C.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(1);
-        beta  = create_real<T>::eval(0);
-
-        C_result.resize(C.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(B_device);
-        GPU_PREFIX(Free)(C_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    std::vector<T> A, B, C, C_result;
-    T* A_device;
-    T* B_device;
-    T* C_device;
-    T alpha, beta;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-TYPED_TEST_CASE(GemmRealNonSquaredTest2, GemmRealTypes);
-
-TYPED_TEST(GemmRealNonSquaredTest2, AN_BT)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_OP_T), 2, 2, 3, &(this->alpha),
-                       this->A_device, 3, this->B_device, 2, &(this->beta), this->C_device, 2);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 35.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 44.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[2]), 44.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 56.);
-}
-
-TYPED_TEST(GemmRealNonSquaredTest2, AT_BN)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_OP_N), 3, 3, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 2, &(this->beta), this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 5.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 11.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[2]), 17.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 11.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 25.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[5]), 39.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[6]), 17.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[7]), 39.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[8]), 61.);
-}
-
-template <typename T>
-class GemmSiriusTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*
-         * B * A
-         */
-        m      = 412;
-        n      = 15;
-        k      = 15;
-        lda    = m;
-        ldb    = k;
-        ldc    = m;
-        size_A = k * lda;
-        size_B = n * ldb;
-        size_C = n * ldc;
-
-        A.resize(size_A, create_real<T>::eval(-1000));
-        B.resize(size_B, create_real<T>::eval(-1000));
-        C.resize(size_C, create_real<T>::eval(-1000));
-
-        for (int i = 0; i < size_A; ++i) {
-            A[i] = create_real<T>::eval(i + 1);
-        }
-
-        for (int i = 0; i < size_B; ++i) {
-            B[i] = create_real<T>::eval(i + 1);
-        }
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        GPU_PREFIX(Malloc)((void**)&B_device, B.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(B_device, B.data(), B.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        GPU_PREFIX(Malloc)((void**)&C_device, C.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(C_device, C.data(), C.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(1);
-        beta  = create_real<T>::eval(0);
-
-        C_result.resize(C.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(B_device);
-        GPU_PREFIX(Free)(C_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    int m, n, k, ldb, lda, ldc, size_B, size_A, size_C;
-    std::vector<T> A, B, C, C_result;
-    T* A_device;
-    T* B_device;
-    T* C_device;
-    T alpha;
-    T beta;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-TYPED_TEST_CASE(GemmSiriusTest, GemmRealTypes);
-
-TYPED_TEST(GemmSiriusTest, SIRIUS_N_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_gemm(this->handle, GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_OP_N), this->m, this->n, this->k,
-                       &(this->alpha), this->A_device, this->lda, this->B_device, this->ldb, &(this->beta),
-                       this->C_device, this->ldc);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 461560.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[411]), 510880.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[414]), 1111375.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[825]), 1760380.);
-}
-
diff --git a/src/gpu/hipblas_port/tests/gemv_test.cpp b/src/gpu/hipblas_port/tests/gemv_test.cpp
deleted file mode 100644
index 2cf75b12a..000000000
--- a/src/gpu/hipblas_port/tests/gemv_test.cpp
+++ /dev/null
@@ -1,341 +0,0 @@
-#ifdef __CUDA
-#include <cuda_runtime.h>
-#include <cuComplex.h>
-#include <cublas_v2.h>
-#else
-#include <hip/hip_runtime_api.h>
-#include <hip/hip_complex.h>
-#include <hipblas.h>
-#include "hipblas_port.h"
-#endif
-#include <vector>
-#include "gtest/gtest.h"
-// #define CATCH_CONFIG_MAIN
-// #include "catch.hpp"
-
-using testing::Types;
-#ifdef __CUDA
-#define GPU_PREFIX(val) cuda##val
-#else
-#define GPU_PREFIX(val) hip##val
-#endif
-
-#ifdef __CUDA
-#define BLAS_PREFIX(val) cu##val
-#else
-#define BLAS_PREFIX(val) hip##val
-#endif
-
-#ifdef __CUDA
-#define GPU_PREFIX_CAPS(val) CU##val
-#else
-#define GPU_PREFIX_CAPS(val) HIP##val
-#endif
-
-template<typename T>
-struct create_real {
-    template<typename U>
-    static inline T eval(const U& val) {
-        return T(val);
-    }
-};
-
-template<>
-struct create_real<BLAS_PREFIX(FloatComplex)> {
-    template<typename U>
-    static inline BLAS_PREFIX(FloatComplex) eval(const U& val) {
-        BLAS_PREFIX(FloatComplex) c;
-        c.x = val;
-        c.y = 0;
-        return c;
-    }
-};
-
-template<>
-struct create_real<BLAS_PREFIX(DoubleComplex)> {
-    template<typename U>
-    static inline BLAS_PREFIX(DoubleComplex) eval(const U& val) {
-        BLAS_PREFIX(DoubleComplex) c;
-        c.x = val;
-        c.y = 0;
-        return c;
-    }
-};
-
-template <typename T>
-struct create_complex
-{
-    template <typename U1, typename U2>
-    static inline T eval(const U1& val1, const U2& val2)
-    {
-        T c;
-        c.x = val1;
-        c.y = val2;
-        return c;
-    }
-};
-
-template<typename T>
-inline double get_real_double(const T& val) {
-    return double(val);
-}
-
-template<>
-inline double get_real_double<BLAS_PREFIX(FloatComplex)>(const BLAS_PREFIX(FloatComplex)& val) {
-    return double(val.x);
-}
-
-template<>
-inline double get_real_double<BLAS_PREFIX(DoubleComplex)>(const BLAS_PREFIX(DoubleComplex)& val) {
-    return double(val.x);
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_gemv(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasOperation_t) trans, int m, int n, const float* alpha,
-                                   const float* A, int lda, const float* x, int incx, const float* beta, float* y,
-                                   int incy) {
-#ifdef __CUDA
-    return BLAS_PREFIX(blasSgemv)(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-#else
-    return BLAS_PREFIX(blas_port_Sgemv)(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_gemv(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasOperation_t) trans, int m, int n, const double* alpha,
-                                   const double* A, int lda, const double* x, int incx, const double* beta, double* y,
-                                   int incy) {
-#ifdef __CUDA
-    return BLAS_PREFIX(blasDgemv)(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-#else
-    return BLAS_PREFIX(blas_port_Dgemv)(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_gemv(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasOperation_t) trans, int m, int n,
-                               const BLAS_PREFIX(FloatComplex)* alpha, const BLAS_PREFIX(FloatComplex)* A, int lda,
-                               const BLAS_PREFIX(FloatComplex)* x, int incx, const BLAS_PREFIX(FloatComplex)* beta, BLAS_PREFIX(FloatComplex)* y,
-                               int incy)
-{
-#ifdef __CUDA
-    return BLAS_PREFIX(blasCgemv)(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-#else
-    return BLAS_PREFIX(blas_port_Cgemv)(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_gemv(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasOperation_t) trans, int m, int n,
-                               const BLAS_PREFIX(DoubleComplex)* alpha, const BLAS_PREFIX(DoubleComplex)* A, int lda,
-                               const BLAS_PREFIX(DoubleComplex)* x, int incx, const BLAS_PREFIX(DoubleComplex)* beta, BLAS_PREFIX(DoubleComplex)* y,
-                               int incy)
-{
-#ifdef __CUDA
-    return BLAS_PREFIX(blasZgemv)(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-#else
-    return BLAS_PREFIX(blas_port_Zgemv)(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
-#endif
-}
-
-template <typename T>
-class GemvRealTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*     1 4
-         * A = 2 5
-         *     3 6
-         *     * *
-         */
-        A = {create_real<T>::eval(1),      create_real<T>::eval(2),     create_real<T>::eval(3),
-                                  create_real<T>::eval(-10000), create_real<T>::eval(4),     create_real<T>::eval(5),
-                                  create_real<T>::eval(6),      create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1
-         * x = *
-         *     2
-         *     *
-         *     3
-         *     *
-         */
-        x = {create_real<T>::eval(1), create_real<T>::eval(-10000),
-                                  create_real<T>::eval(2), create_real<T>::eval(-10000),
-                                  create_real<T>::eval(3), create_real<T>::eval(-10000)};
-        GPU_PREFIX(Malloc)((void**)&x_device, x.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(x_device, x.data(), x.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1
-         * y = *
-         *     1
-         *     *
-         *     1
-         *     *
-         */
-        y = {create_real<T>::eval(1), create_real<T>::eval(-10000),
-                                  create_real<T>::eval(1), create_real<T>::eval(-10000),
-                                  create_real<T>::eval(1), create_real<T>::eval(-10000)};
-        GPU_PREFIX(Malloc)((void**)&y_device, y.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(y_device, y.data(), y.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(1);
-        beta = create_real<T>::eval(2);
-
-        y_result.resize(y.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(x_device);
-        GPU_PREFIX(Free)(y_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    std::vector<T> A, x, y, y_result;
-    T* A_device;
-    T* x_device;
-    T* y_device;
-    T alpha, beta;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-
-
-typedef Types<float, double, BLAS_PREFIX(FloatComplex), BLAS_PREFIX(DoubleComplex)> GemvValueTypes;
-
-TYPED_TEST_CASE(GemvRealTest, GemvValueTypes);
-
-TYPED_TEST(GemvRealTest, OP_NONE) {
-    BLAS_PREFIX(blasStatus_t) status =
-        call_gemv(this->handle, GPU_PREFIX_CAPS(BLAS_OP_N), 3, 2, &(this->alpha), this->A_device, 4, this->x_device, 2, &(this->beta), this->y_device, 2);
-    EXPECT_TRUE(status == GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(Memcpy)(this->y_result.data(), this->y_device, this->y_result.size() * sizeof(typename TestFixture::value_type), GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->y_result[0]), 11.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->y_result[2]), 14.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->y_result[4]), 17.);
-}
-
-TYPED_TEST(GemvRealTest, OP_T) {
-    BLAS_PREFIX(blasStatus_t) status =
-        call_gemv(this->handle, GPU_PREFIX_CAPS(BLAS_OP_T), 3, 2, &(this->alpha), this->A_device, 4, this->x_device, 2, &(this->beta), this->y_device, 2);
-    EXPECT_TRUE(status == GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(Memcpy)(this->y_result.data(), this->y_device, this->y_result.size() * sizeof(typename TestFixture::value_type), GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->y_result[0]), 16.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->y_result[2]), 34.);
-}
-
-TYPED_TEST(GemvRealTest, OP_C) {
-    BLAS_PREFIX(blasStatus_t) status =
-        call_gemv(this->handle, GPU_PREFIX_CAPS(BLAS_OP_C), 3, 2, &(this->alpha), this->A_device, 4, this->x_device, 2, &(this->beta), this->y_device, 2);
-    EXPECT_TRUE(status == GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(Memcpy)(this->y_result.data(), this->y_device, this->y_result.size() * sizeof(typename TestFixture::value_type), GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->y_result[0]), 16.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->y_result[2]), 34.);
-}
-
-
-template <typename T>
-class GemvComplexTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*     1 4
-         * A = 2 5
-         *     3 6
-         *     * *
-         */
-        A = {create_complex<T>::eval(1, 1), create_complex<T>::eval(1, 2), create_complex<T>::eval(1, 3), create_complex<T>::eval(1, -10000), create_complex<T>::eval(1, 4), create_complex<T>::eval(1, 5), create_complex<T>::eval(1, 6), create_complex<T>::eval(1, -20000)};
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1
-         * x = *
-         *     2
-         *     *
-         *     3
-         *     *
-         */
-        x = {create_complex<T>::eval(1, 1), create_complex<T>::eval(1, -10000), create_complex<T>::eval(1, 2), create_complex<T>::eval(1, -10000), create_complex<T>::eval(1, 3), create_complex<T>::eval(1, -10000)};
-        GPU_PREFIX(Malloc)((void**)&x_device, x.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(x_device, x.data(), x.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1
-         * y = *
-         *     1
-         *     *
-         *     1
-         *     *
-         */
-        y = {create_complex<T>::eval(1, 1), create_complex<T>::eval(1, -10000), create_complex<T>::eval(1, 1), create_complex<T>::eval(1, -10000), create_complex<T>::eval(1, 1), create_complex<T>::eval(1, -10000)};
-        GPU_PREFIX(Malloc)((void**)&y_device, y.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(y_device, y.data(), y.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(1);
-        beta  = create_real<T>::eval(2);
-
-        y_result.resize(y.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(x_device);
-        GPU_PREFIX(Free)(y_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    std::vector<T> A, x, y, y_result;
-    T* A_device;
-    T* x_device;
-    T* y_device;
-    T alpha, beta;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-typedef Types<BLAS_PREFIX(FloatComplex), BLAS_PREFIX(DoubleComplex)> GemvComplexValueTypes;
-
-TYPED_TEST_CASE(GemvComplexTest, GemvComplexValueTypes);
-
-TYPED_TEST(GemvComplexTest, OP_NONE) {
-    BLAS_PREFIX(blasStatus_t) status =
-        call_gemv(this->handle, GPU_PREFIX_CAPS(BLAS_OP_N), 3, 2, &(this->alpha), this->A_device, 4, this->x_device, 2, &(this->beta), this->y_device, 2);
-    EXPECT_TRUE(status == GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(Memcpy)(this->y_result.data(), this->y_device, this->y_result.size() * sizeof(typename TestFixture::value_type), GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->y_result[0]).y, 10.);
-    EXPECT_DOUBLE_EQ((this->y_result[2]).y, 12.);
-    EXPECT_DOUBLE_EQ((this->y_result[4]).y, 14.);
-}
-
-TYPED_TEST(GemvComplexTest, OP_T) {
-    BLAS_PREFIX(blasStatus_t) status =
-        call_gemv(this->handle, GPU_PREFIX_CAPS(BLAS_OP_T), 3, 2, &(this->alpha), this->A_device, 4, this->x_device, 2, &(this->beta), this->y_device, 2);
-    EXPECT_TRUE(status == GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(Memcpy)(this->y_result.data(), this->y_device, this->y_result.size() * sizeof(typename TestFixture::value_type), GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->y_result[0]).y, 14.);
-    EXPECT_DOUBLE_EQ((this->y_result[2]).y, 23.);
-}
-
-TYPED_TEST(GemvComplexTest, OP_C) {
-    BLAS_PREFIX(blasStatus_t) status =
-        call_gemv(this->handle, GPU_PREFIX_CAPS(BLAS_OP_C), 3, 2, &(this->alpha), this->A_device, 4, this->x_device, 2, &(this->beta), this->y_device, 2);
-    EXPECT_TRUE(status == GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(Memcpy)(this->y_result.data(), this->y_device, this->y_result.size() * sizeof(typename TestFixture::value_type), GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->y_result[0]).y, 2.);
-    EXPECT_DOUBLE_EQ((this->y_result[2]).y, -7.);
-}
-
-
diff --git a/src/gpu/hipblas_port/tests/ger_test.cpp b/src/gpu/hipblas_port/tests/ger_test.cpp
deleted file mode 100644
index c47e8c247..000000000
--- a/src/gpu/hipblas_port/tests/ger_test.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-#ifdef __CUDA
-#include <cuda_runtime.h>
-#include <cuComplex.h>
-#include <cublas_v2.h>
-#else
-#include <hip/hip_runtime_api.h>
-#include <hip/hip_complex.h>
-#include <hipblas.h>
-#include "hipblas_port.h"
-#endif
-#include <vector>
-#include "gtest/gtest.h"
-// #define CATCH_CONFIG_MAIN
-// #include "catch.hpp"
-
-using testing::Types;
-#ifdef __CUDA
-#define GPU_PREFIX(val) cuda##val
-#else
-#define GPU_PREFIX(val) hip##val
-#endif
-
-#ifdef __CUDA
-#define BLAS_PREFIX(val) cu##val
-#else
-#define BLAS_PREFIX(val) hip##val
-#endif
-
-#ifdef __CUDA
-#define GPU_PREFIX_CAPS(val) CU##val
-#else
-#define GPU_PREFIX_CAPS(val) HIP##val
-#endif
-
-template<typename T>
-struct create_real {
-    template<typename U>
-    static inline T eval(const U& val) {
-        return T(val);
-    }
-};
-
-template<>
-struct create_real<BLAS_PREFIX(FloatComplex)> {
-    template<typename U>
-    static inline BLAS_PREFIX(FloatComplex) eval(const U& val) {
-        BLAS_PREFIX(FloatComplex) c;
-        c.x = val;
-        c.y = 0;
-        return c;
-    }
-};
-
-template<>
-struct create_real<BLAS_PREFIX(DoubleComplex)> {
-    template<typename U>
-    static inline BLAS_PREFIX(DoubleComplex) eval(const U& val) {
-        BLAS_PREFIX(DoubleComplex) c;
-        c.x = val;
-        c.y = 0;
-        return c;
-    }
-};
-
-template <typename T>
-struct create_complex
-{
-    template <typename U1, typename U2>
-    static inline T eval(const U1& val1, const U2& val2)
-    {
-        T c;
-        c.x = val1;
-        c.y = val2;
-        return c;
-    }
-};
-
-template<typename T>
-inline double get_real_double(const T& val) {
-    return double(val);
-}
-
-template<>
-inline double get_real_double<BLAS_PREFIX(FloatComplex)>(const BLAS_PREFIX(FloatComplex)& val) {
-    return double(val.x);
-}
-
-template<>
-inline double get_real_double<BLAS_PREFIX(DoubleComplex)>(const BLAS_PREFIX(DoubleComplex)& val) {
-    return double(val.x);
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_ger(BLAS_PREFIX(blasHandle_t) handle, int m, int n, const float* alpha, const float* x, int incx,
-                                     const float* y, int incy, float* A, int lda)
-{
-
-#ifdef __CUDA
-    return BLAS_PREFIX(blasSger)(handle, m, n, alpha, x, incx, y, incy, A, lda);
-#else
-    return BLAS_PREFIX(blas_port_Sger)(handle, m, n, alpha, x, incx, y, incy, A, lda);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_ger(BLAS_PREFIX(blasHandle_t) handle, int m, int n, const double* alpha, const double* x,
-                                     int incx, const double* y, int incy, double* A, int lda)
-{
-
-#ifdef __CUDA
-    return BLAS_PREFIX(blasDger)(handle, m, n, alpha, x, incx, y, incy, A, lda);
-#else
-    return BLAS_PREFIX(blas_port_Dger)(handle, m, n, alpha, x, incx, y, incy, A, lda);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_ger(BLAS_PREFIX(blasHandle_t) handle, int m, int n, const BLAS_PREFIX(FloatComplex)* alpha,
-                                     const BLAS_PREFIX(FloatComplex)* x, int incx, const BLAS_PREFIX(FloatComplex)* y, int incy,
-                                     BLAS_PREFIX(FloatComplex)* A, int lda)
-{
-
-#ifdef __CUDA
-    return BLAS_PREFIX(blasCgeru)(handle, m, n, alpha, x, incx, y, incy, A, lda);
-#else
-    return BLAS_PREFIX(blas_port_Cgeru)(handle, m, n, alpha, x, incx, y, incy, A, lda);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t) call_ger(BLAS_PREFIX(blasHandle_t) handle, int m, int n, const BLAS_PREFIX(DoubleComplex)* alpha,
-                                     const BLAS_PREFIX(DoubleComplex)* x, int incx, const BLAS_PREFIX(DoubleComplex)* y, int incy,
-                                     BLAS_PREFIX(DoubleComplex)* A, int lda)
-{
-
-#ifdef __CUDA
-    return BLAS_PREFIX(blasZgeru)(handle, m, n, alpha, x, incx, y, incy, A, lda);
-#else
-    return BLAS_PREFIX(blas_port_Zgeru)(handle, m, n, alpha, x, incx, y, incy, A, lda);
-#endif
-}
-
-template <typename T>
-class GerTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*     1 4
-         * A = 2 5
-         *     3 6
-         *     * *
-         */
-        A = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(3), create_real<T>::eval(-10000),
-             create_real<T>::eval(4), create_real<T>::eval(5), create_real<T>::eval(6), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1
-         * x = *
-         *     2
-         *     *
-         *     3
-         *     *
-         */
-        x = {create_real<T>::eval(1), create_real<T>::eval(-10000),
-                                  create_real<T>::eval(2), create_real<T>::eval(-10000),
-                                  create_real<T>::eval(3), create_real<T>::eval(-10000)};
-        GPU_PREFIX(Malloc)((void**)&x_device, x.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(x_device, x.data(), x.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     7
-         * y = *
-         *     8
-         *     *
-         */
-        y = {create_real<T>::eval(7), create_real<T>::eval(-10000),
-                                  create_real<T>::eval(8), create_real<T>::eval(-10000)};
-        GPU_PREFIX(Malloc)((void**)&y_device, y.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(y_device, y.data(), y.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(2);
-
-        A_result.resize(A.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(x_device);
-        GPU_PREFIX(Free)(y_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    std::vector<T> A, x, y, A_result;
-    T* A_device;
-    T* x_device;
-    T* y_device;
-    T alpha;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-
-
-typedef Types<float, double, BLAS_PREFIX(FloatComplex), BLAS_PREFIX(DoubleComplex)> GemvValueTypes;
-
-TYPED_TEST_CASE(GerTest, GemvValueTypes);
-
-TYPED_TEST(GerTest, NON_SQUARED) {
-    BLAS_PREFIX(blasStatus_t) status =
-        call_ger(this->handle, 3, 2, &(this->alpha), this->x_device, 2, this->y_device, 2, this->A_device, 4);
-    EXPECT_TRUE(status == GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(Memcpy)(this->A_result.data(), this->A_device, this->A_result.size() * sizeof(typename TestFixture::value_type), GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->A_result[0]), 15.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->A_result[1]), 30.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->A_result[2]), 45.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->A_result[4]), 20.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->A_result[5]), 37.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->A_result[6]), 54.);
-}
diff --git a/src/gpu/hipblas_port/tests/main.cpp b/src/gpu/hipblas_port/tests/main.cpp
deleted file mode 100644
index b95674848..000000000
--- a/src/gpu/hipblas_port/tests/main.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "gtest/gtest.h"
-
-int main(int argc, char *argv[])
-{
-    ::testing::InitGoogleTest(&argc, argv);
-    return RUN_ALL_TESTS();
-}
-
diff --git a/src/gpu/hipblas_port/tests/trmm_test.cpp b/src/gpu/hipblas_port/tests/trmm_test.cpp
deleted file mode 100644
index 637f93268..000000000
--- a/src/gpu/hipblas_port/tests/trmm_test.cpp
+++ /dev/null
@@ -1,951 +0,0 @@
-#ifdef __CUDA
-#include <cuda_runtime.h>
-#include <cuComplex.h>
-#include <cublas_v2.h>
-#else
-#include <hip/hip_runtime_api.h>
-#include <hip/hip_complex.h>
-#include <hipblas.h>
-#include "hipblas_port.h"
-#endif
-#include <vector>
-#include "gtest/gtest.h"
-// #define CATCH_CONFIG_MAIN
-// #include "catch.hpp"
-
-using testing::Types;
-#ifdef __CUDA
-#define GPU_PREFIX(val) cuda##val
-#else
-#define GPU_PREFIX(val) hip##val
-#endif
-
-#ifdef __CUDA
-#define BLAS_PREFIX(val) cu##val
-#else
-#define BLAS_PREFIX(val) hip##val
-#endif
-
-#ifdef __CUDA
-#define GPU_PREFIX_CAPS(val) CU##val
-#else
-#define GPU_PREFIX_CAPS(val) HIP##val
-#endif
-
-template <typename T>
-struct create_real
-{
-    template <typename U>
-    static inline T eval(const U& val)
-    {
-        return T(val);
-    }
-};
-
-template <>
-struct create_real<BLAS_PREFIX(FloatComplex)>
-{
-    template <typename U>
-    static inline BLAS_PREFIX(FloatComplex) eval(const U& val)
-    {
-        BLAS_PREFIX(FloatComplex) c;
-        c.x = val;
-        c.y = 0;
-        return c;
-    }
-};
-
-template <>
-struct create_real<BLAS_PREFIX(DoubleComplex)>
-{
-    template <typename U>
-    static inline BLAS_PREFIX(DoubleComplex) eval(const U& val)
-    {
-        BLAS_PREFIX(DoubleComplex) c;
-        c.x = val;
-        c.y = 0;
-        return c;
-    }
-};
-
-template <typename T>
-struct create_complex
-{
-    template <typename U1, typename U2>
-    static inline T eval(const U1& val1, const U2& val2)
-    {
-        T c;
-        c.x = val1;
-        c.y = val2;
-        return c;
-    }
-};
-template <typename T>
-inline double get_real_double(const T& val)
-{
-    return double(val);
-}
-
-template <>
-inline double get_real_double<BLAS_PREFIX(FloatComplex)>(const BLAS_PREFIX(FloatComplex) & val)
-{
-    return double(val.x);
-}
-
-template <>
-inline double get_real_double<BLAS_PREFIX(DoubleComplex)>(const BLAS_PREFIX(DoubleComplex) & val)
-{
-    return double(val.x);
-}
-
-inline BLAS_PREFIX(blasStatus_t)
-    call_trmm(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasSideMode_t) side, BLAS_PREFIX(blasFillMode_t) uplo,
-              BLAS_PREFIX(blasOperation_t) trans, BLAS_PREFIX(blasDiagType_t) diag, int m, int n, const float* alpha,
-              const float* A, int lda, const float* B, int ldb, float* C, int ldc)
-{
-#ifdef __CUDA
-    return BLAS_PREFIX(blasStrmm)(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-#else
-    return BLAS_PREFIX(blas_port_Strmm)(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t)
-    call_trmm(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasSideMode_t) side, BLAS_PREFIX(blasFillMode_t) uplo,
-              BLAS_PREFIX(blasOperation_t) trans, BLAS_PREFIX(blasDiagType_t) diag, int m, int n, const double* alpha,
-              const double* A, int lda, const double* B, int ldb, double* C, int ldc)
-{
-#ifdef __CUDA
-    return BLAS_PREFIX(blasDtrmm)(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-#else
-    return BLAS_PREFIX(blas_port_Dtrmm)(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t)
-    call_trmm(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasSideMode_t) side, BLAS_PREFIX(blasFillMode_t) uplo,
-              BLAS_PREFIX(blasOperation_t) trans, BLAS_PREFIX(blasDiagType_t) diag, int m, int n,
-              const BLAS_PREFIX(FloatComplex) * alpha, const BLAS_PREFIX(FloatComplex) * A, int lda,
-              const BLAS_PREFIX(FloatComplex) * B, int ldb, BLAS_PREFIX(FloatComplex) * C, int ldc)
-{
-#ifdef __CUDA
-    return BLAS_PREFIX(blasCtrmm)(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-#else
-    return BLAS_PREFIX(blas_port_Ctrmm)(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-#endif
-}
-
-inline BLAS_PREFIX(blasStatus_t)
-    call_trmm(BLAS_PREFIX(blasHandle_t) handle, BLAS_PREFIX(blasSideMode_t) side, BLAS_PREFIX(blasFillMode_t) uplo,
-              BLAS_PREFIX(blasOperation_t) trans, BLAS_PREFIX(blasDiagType_t) diag, int m, int n,
-              const BLAS_PREFIX(DoubleComplex) * alpha, const BLAS_PREFIX(DoubleComplex) * A, int lda,
-              const BLAS_PREFIX(DoubleComplex) * B, int ldb, BLAS_PREFIX(DoubleComplex) * C, int ldc)
-{
-#ifdef __CUDA
-    return BLAS_PREFIX(blasZtrmm)(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-#else
-    return BLAS_PREFIX(blas_port_Ztrmm)(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc);
-#endif
-}
-
-template <typename T>
-class TrmmRealTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*     1 3
-         * A = 2 4
-         *     * *
-         */
-        A = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(-10000),
-             create_real<T>::eval(3), create_real<T>::eval(4), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1 3
-         * B = 2 4
-         *     * *
-         */
-        B = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(-10000),
-             create_real<T>::eval(3), create_real<T>::eval(4), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&B_device, B.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(B_device, B.data(), B.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1 3
-         * C = 2 4
-         *     * *
-         */
-        C = {create_real<T>::eval(1), create_real<T>::eval(2), create_real<T>::eval(-10000),
-             create_real<T>::eval(3), create_real<T>::eval(4), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&C_device, C.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(C_device, C.data(), C.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(1);
-
-        C_result.resize(C.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(B_device);
-        GPU_PREFIX(Free)(C_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    std::vector<T> A, B, C, C_result;
-    T* A_device;
-    T* B_device;
-    T* C_device;
-    T alpha;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-// typedef Types<float> TrmmRealTypes;
-typedef Types<float, double, BLAS_PREFIX(FloatComplex), BLAS_PREFIX(DoubleComplex)> TrmmRealTypes;
-
-TYPED_TEST_CASE(TrmmRealTest, TrmmRealTypes);
-
-TYPED_TEST(TrmmRealTest, LEFT_FULL_NU_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_FULL),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 7.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 10.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 15.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 22.);
-}
-
-TYPED_TEST(TrmmRealTest, LEFT_LOWER_NU_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_LOWER),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 1.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 10.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 3.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 22.);
-}
-
-TYPED_TEST(TrmmRealTest, LEFT_UPPER_NU_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_UPPER),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 7.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 8.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 15.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 16.);
-}
-
-TYPED_TEST(TrmmRealTest, LEFT_UPPER_U_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_UPPER),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 7.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 2.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 15.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 4.);
-}
-
-TYPED_TEST(TrmmRealTest, LEFT_LOWER_U_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_LOWER),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 1.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 4.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 3.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 10.);
-}
-
-TYPED_TEST(TrmmRealTest, LEFT_UPPER_U_T)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_UPPER),
-                       GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_DIAG_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 1.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 5.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 3.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 13.);
-}
-
-TYPED_TEST(TrmmRealTest, LEFT_LOWER_U_T)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_LOWER),
-                       GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_DIAG_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 5.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 2.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 11.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 4.);
-}
-
-TYPED_TEST(TrmmRealTest, LEFT_UPPER_NU_T)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_UPPER),
-                       GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 1.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 11.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 3.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 25.);
-}
-
-TYPED_TEST(TrmmRealTest, LEFT_LOWER_NU_T)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_LOWER),
-                       GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 5.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 8.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 11.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 16.);
-}
-
-TYPED_TEST(TrmmRealTest, LEFT_FULL_NU_T)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_FULL),
-                       GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_DIAG_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 5.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 11.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 11.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 25.);
-}
-
-/*
- * RIGHT SIDE
- */
-TYPED_TEST(TrmmRealTest, RIGHT_FULL_NU_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_FULL),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 7.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 10.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 15.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 22.);
-}
-
-TYPED_TEST(TrmmRealTest, RIGHT_LOWER_NU_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_LOWER),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 7.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 10.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 12.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 16.);
-}
-
-TYPED_TEST(TrmmRealTest, RIGHT_UPPER_NU_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_UPPER),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 1.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 2.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 15.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 22.);
-}
-
-TYPED_TEST(TrmmRealTest, RIGHT_UPPER_U_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_UPPER),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 1.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 2.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 6.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 10.);
-}
-
-TYPED_TEST(TrmmRealTest, RIGHT_LOWER_U_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_LOWER),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 7.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 10.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 3.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 4.);
-}
-
-TYPED_TEST(TrmmRealTest, RIGHT_UPPER_U_T)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_UPPER),
-                       GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_DIAG_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 10.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 14.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 3.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 4.);
-}
-
-TYPED_TEST(TrmmRealTest, RIGHT_LOWER_U_T)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_LOWER),
-                       GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_DIAG_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 1.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 2.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 5.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 8.);
-}
-
-TYPED_TEST(TrmmRealTest, RIGHT_UPPER_NU_T)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_UPPER),
-                       GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 10.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 14.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 12.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 16.);
-}
-
-TYPED_TEST(TrmmRealTest, RIGHT_LOWER_NU_T)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_LOWER),
-                       GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 1.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 2.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 14.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 20.);
-}
-
-TYPED_TEST(TrmmRealTest, RIGHT_FULL_NU_T)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_FULL),
-                       GPU_PREFIX_CAPS(BLAS_OP_T), GPU_PREFIX_CAPS(BLAS_DIAG_UNIT), 2, 2, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 10.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[1]), 14.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[3]), 14.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[4]), 20.);
-}
-
-/**************************
- * Complex only
- **************************/
-
-template <typename T>
-class TrmmComplexLeftTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*     1 3
-         * A = 2 4
-         *     * *
-         */
-        A = {create_complex<T>::eval(1, 1), create_complex<T>::eval(2, 2), create_real<T>::eval(-10000),
-             create_complex<T>::eval(3, 3), create_complex<T>::eval(4, 4), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1 3 5
-         * B = 2 4 6
-         *     * * *
-         */
-        B = {create_complex<T>::eval(1, 1), create_complex<T>::eval(2, 2), create_real<T>::eval(-10000),
-             create_complex<T>::eval(3, 3), create_complex<T>::eval(4, 4), create_real<T>::eval(-20000),
-             create_complex<T>::eval(5, 5), create_complex<T>::eval(6, 6), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&B_device, B.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(B_device, B.data(), B.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     * * *
-         * C = * * *
-         *     * * *
-         */
-        C = {create_real<T>::eval(-1000), create_real<T>::eval(-1000), create_real<T>::eval(-10000),
-             create_real<T>::eval(-1000), create_real<T>::eval(-1000), create_real<T>::eval(-20000),
-             create_real<T>::eval(-1000), create_real<T>::eval(-1000), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&C_device, C.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(C_device, C.data(), C.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(1);
-
-        C_result.resize(C.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(B_device);
-        GPU_PREFIX(Free)(C_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    std::vector<T> A, B, C, C_result;
-    T* A_device;
-    T* B_device;
-    T* C_device;
-    T alpha;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-typedef Types<BLAS_PREFIX(FloatComplex), BLAS_PREFIX(DoubleComplex)> TrmmComplexTypes;
-
-TYPED_TEST_CASE(TrmmComplexLeftTest, TrmmComplexTypes);
-
-TYPED_TEST(TrmmComplexLeftTest, COMPLEX_LEFT_FULL_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_FULL),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 3, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[0]).y, 14.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[1]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).y, 20.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[3]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).y, 30.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[4]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).y, 44.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[6]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[6]).y, 46.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[7]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[7]).y, 68.);
-}
-
-TYPED_TEST(TrmmComplexLeftTest, COMPLEX_LEFT_FULL_C)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_LEFT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_FULL),
-                       GPU_PREFIX_CAPS(BLAS_OP_C), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 3, &(this->alpha),
-                       this->A_device, 3, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).x, 10.);
-    EXPECT_DOUBLE_EQ((this->C_result[0]).y, 0.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[1]).x, 22.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).y, 0.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[3]).x, 22.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).y, 0.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[4]).x, 50.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).y, 0.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[6]).x, 34.);
-    EXPECT_DOUBLE_EQ((this->C_result[6]).y, 0.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[7]).x, 78.);
-    EXPECT_DOUBLE_EQ((this->C_result[7]).y, 0.);
-}
-
-template <typename T>
-class TrmmComplexRightTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*     1 4 7
-         * A = 2 5 8
-         *     3 6 9
-         *     * * *
-         */
-        A = {
-            create_complex<T>::eval(1, 1), create_complex<T>::eval(2, 2), create_complex<T>::eval(3, 3),
-            create_real<T>::eval(-10000),  create_complex<T>::eval(4, 4), create_complex<T>::eval(5, 5),
-            create_complex<T>::eval(6, 6), create_real<T>::eval(-20000),  create_complex<T>::eval(7, 7),
-            create_complex<T>::eval(8, 8), create_complex<T>::eval(9, 9), create_real<T>::eval(-30000),
-        };
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     1 3 5
-         * B = 2 4 6
-         *     * * *
-         */
-        B = {create_complex<T>::eval(1, 1), create_complex<T>::eval(2, 2), create_real<T>::eval(-10000),
-             create_complex<T>::eval(3, 3), create_complex<T>::eval(4, 4), create_real<T>::eval(-20000),
-             create_complex<T>::eval(5, 5), create_complex<T>::eval(6, 6), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&B_device, B.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(B_device, B.data(), B.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        /*     * * *
-         * C = * * *
-         *     * * *
-         */
-        C = {create_real<T>::eval(-1000), create_real<T>::eval(-1000), create_real<T>::eval(-10000),
-             create_real<T>::eval(-1000), create_real<T>::eval(-1000), create_real<T>::eval(-20000),
-             create_real<T>::eval(-1000), create_real<T>::eval(-1000), create_real<T>::eval(-20000)};
-        GPU_PREFIX(Malloc)((void**)&C_device, C.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(C_device, C.data(), C.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(1);
-
-        C_result.resize(C.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(B_device);
-        GPU_PREFIX(Free)(C_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    std::vector<T> A, B, C, C_result;
-    T* A_device;
-    T* B_device;
-    T* C_device;
-    T alpha;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-TYPED_TEST_CASE(TrmmComplexRightTest, TrmmComplexTypes);
-
-TYPED_TEST(TrmmComplexRightTest, COMPLEX_RIGHT_FULL_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_FULL),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 3, &(this->alpha),
-                       this->A_device, 4, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[0]).y, 44.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[1]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).y, 56.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[3]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).y, 98.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[4]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).y, 128.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[6]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[6]).y, 152.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[7]).x, 0.);
-    EXPECT_DOUBLE_EQ((this->C_result[7]).y, 200.);
-}
-
-TYPED_TEST(TrmmComplexRightTest, COMPLEX_RIGHT_FULL_C)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_FULL),
-                       GPU_PREFIX_CAPS(BLAS_OP_C), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), 2, 3, &(this->alpha),
-                       this->A_device, 4, this->B_device, 3, this->C_device, 3);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ((this->C_result[0]).x, 96.);
-    EXPECT_DOUBLE_EQ((this->C_result[0]).y, 0.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[1]).x, 120.);
-    EXPECT_DOUBLE_EQ((this->C_result[1]).y, 0.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[3]).x, 114.);
-    EXPECT_DOUBLE_EQ((this->C_result[3]).y, 0.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[4]).x, 144.);
-    EXPECT_DOUBLE_EQ((this->C_result[4]).y, 0.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[6]).x, 132.);
-    EXPECT_DOUBLE_EQ((this->C_result[6]).y, 0.);
-
-    EXPECT_DOUBLE_EQ((this->C_result[7]).x, 168.);
-    EXPECT_DOUBLE_EQ((this->C_result[7]).y, 0.);
-}
-
-template <typename T>
-class TrmmSiriusTest : public ::testing::Test
-{
-  protected:
-    void SetUp() override
-    {
-        /*
-         * B * A
-         */
-        m      = 412;
-        n      = 15;
-        ldb    = m;
-        lda    = 60;
-        size_B = ldb * n;
-        size_A = n * lda;
-
-        A.resize(size_A, create_real<T>::eval(-1000));
-        B.resize(size_B, create_real<T>::eval(-1000));
-        C.resize(size_B, create_real<T>::eval(-1000));
-
-        {
-            int i = 1;
-            for (int col = 0; col < n; ++col) {
-                for (int row = 0; row < n; ++row) {
-                    if (col >= row)
-                        A[row + col * lda] = create_real<T>::eval(i);
-
-                    ++i;
-                }
-            }
-        }
-
-        for (int i = 0; i < size_B; ++i) {
-            B[i] = create_real<T>::eval(i + 1);
-        }
-
-        GPU_PREFIX(Malloc)((void**)&A_device, A.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(A_device, A.data(), A.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        GPU_PREFIX(Malloc)((void**)&B_device, B.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(B_device, B.data(), B.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        GPU_PREFIX(Malloc)((void**)&C_device, C.size() * sizeof(T));
-        GPU_PREFIX(Memcpy)(C_device, C.data(), C.size() * sizeof(T), GPU_PREFIX(MemcpyHostToDevice));
-
-        BLAS_PREFIX(blasCreate)(&handle);
-
-        alpha = create_real<T>::eval(1);
-
-        C_result.resize(C.size());
-    }
-
-    void TearDown() override
-    {
-        GPU_PREFIX(Free)(A_device);
-        GPU_PREFIX(Free)(B_device);
-        GPU_PREFIX(Free)(C_device);
-        BLAS_PREFIX(blasDestroy)(handle);
-    }
-
-    int m, n, ldb, lda, size_B, size_A;
-    std::vector<T> A, B, C, C_result;
-    T* A_device;
-    T* B_device;
-    T* C_device;
-    T alpha;
-    BLAS_PREFIX(blasHandle_t) handle;
-    using value_type = T;
-};
-
-TYPED_TEST_CASE(TrmmSiriusTest, TrmmRealTypes);
-
-TYPED_TEST(TrmmSiriusTest, LEFT_LOWER_NU_N)
-{
-    BLAS_PREFIX(blasStatus_t)
-    status = call_trmm(this->handle, GPU_PREFIX_CAPS(BLAS_SIDE_RIGHT), GPU_PREFIX_CAPS(BLAS_FILL_MODE_UPPER),
-                       GPU_PREFIX_CAPS(BLAS_OP_N), GPU_PREFIX_CAPS(BLAS_DIAG_NON_UNIT), this->m, this->n,
-                       &(this->alpha), this->A_device, this->lda, this->B_device, this->ldb, this->C_device, this->ldb);
-    ASSERT_EQ(status, GPU_PREFIX_CAPS(BLAS_STATUS_SUCCESS));
-    GPU_PREFIX(DeviceSynchronize());
-    ASSERT_EQ(GPU_PREFIX(GetLastError)(), GPU_PREFIX(Success));
-    GPU_PREFIX(Memcpy)
-    (this->C_result.data(), this->C_device, this->C_result.size() * sizeof(typename TestFixture::value_type),
-     GPU_PREFIX(MemcpyDeviceToHost));
-
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[0]), 1.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[411]), 412.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[412]), 7037.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[413]), 7070.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[414]), 7103.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[824]), 40472.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[825]), 40568.);
-    EXPECT_DOUBLE_EQ(get_real_double<typename TestFixture::value_type>(this->C_result[826]), 40664.);
-}
-
diff --git a/src/gpu/magma.hpp b/src/gpu/magma.hpp
index ea2ef434b..0949a805b 100644
--- a/src/gpu/magma.hpp
+++ b/src/gpu/magma.hpp
@@ -27,7 +27,6 @@
 
 #include <stdio.h>
 #include <assert.h>
-#include <cuda.h>
 #include <magma.h>
 #include <magma_z.h>
 #include <magma_d.h>
diff --git a/src/gpu/spherical_harmonics.cu b/src/gpu/spherical_harmonics.cu
index 3f8d26e38..159f5534d 100644
--- a/src/gpu/spherical_harmonics.cu
+++ b/src/gpu/spherical_harmonics.cu
@@ -22,6 +22,7 @@
  *  \brief CUDA kernels to generate spherical harminics.
  */
 
+#include <cmath>
 #include "gpu/cuda_common.hpp"
 #include "gpu/acc_runtime.hpp"
 
@@ -56,8 +57,8 @@ __global__ void spherical_harmonics_ylm_gpu_kernel(int lmax__, int ntp__, double
         }
         for (int m = 0; m <= lmax__ - 2; m++) {
             for (int l = m + 2; l <= lmax__; l++) {
-                double alm = std::sqrt(static_cast<double>((2 * l - 1) * (2 * l + 1)) / (l * l - m * m));
-                double blm = std::sqrt(static_cast<double>((l - 1 - m) * (l - 1 + m)) / ((2 * l - 3) * (2 * l - 1)));
+                double alm = sqrt(static_cast<double>((2 * l - 1) * (2 * l + 1)) / (l * l - m * m));
+                double blm = sqrt(static_cast<double>((l - 1 - m) * (l - 1 + m)) / ((2 * l - 3) * (2 * l - 1)));
                 ylm[lmidx(l, m)].x = alm * (cost * ylm[lmidx(l - 1, m)].x - blm * ylm[lmidx(l - 2, m)].x);
                 ylm[lmidx(l, m)].y = 0;
             }
@@ -111,19 +112,19 @@ __global__ void spherical_harmonics_rlm_gpu_kernel(int lmax__, int ntp__, double
         }
         for (int m = 0; m <= lmax__ - 2; m++) {
             for (int l = m + 2; l <= lmax__; l++) {
-                double alm = std::sqrt(static_cast<double>((2 * l - 1) * (2 * l + 1)) / (l * l - m * m));
-                double blm = std::sqrt(static_cast<double>((l - 1 - m) * (l - 1 + m)) / ((2 * l - 3) * (2 * l - 1)));
+                double alm = sqrt(static_cast<double>((2 * l - 1) * (2 * l + 1)) / (l * l - m * m));
+                double blm = sqrt(static_cast<double>((l - 1 - m) * (l - 1 + m)) / ((2 * l - 3) * (2 * l - 1)));
                 rlm[lmidx(l, m)] = alm * (cost * rlm[lmidx(l - 1, m)] - blm * rlm[lmidx(l - 2, m)]);
             }
         }
 
-        double c0 = std::cos(phi);
+        double c0 = cos(phi);
         double c1 = 1;
-        double s0 = -std::sin(phi);
+        double s0 = -sin(phi);
         double s1 = 0;
         double c2 = 2 * c0;
 
-        double const t = std::sqrt(2.0);
+        double const t = sqrt(2.0);
 
         for (int m = 1; m <= lmax__; m++) {
             double c = c2 * c1 - c0;
diff --git a/src/hamiltonian/hamiltonian_k.cpp b/src/hamiltonian/hamiltonian_k.cpp
index 8a8483857..4ced2a300 100644
--- a/src/hamiltonian/hamiltonian_k.cpp
+++ b/src/hamiltonian/hamiltonian_k.cpp
@@ -28,6 +28,7 @@
 #include "hamiltonian/non_local_operator.hpp"
 #include "potential/potential.hpp"
 #include "SDDK/wave_functions.hpp"
+#include "SDDK/omp.hpp"
 #include "k_point/k_point.hpp"
 #include "utils/profiler.hpp"
 #include <chrono>
@@ -301,30 +302,51 @@ Hamiltonian_k::set_fv_h_o(sddk::dmatrix<double_complex>& h__, sddk::dmatrix<doub
     /* current processing unit */
     auto pu = H0_.ctx().processing_unit();
 
-    mdarray<double_complex, 3> alm_row;
-    mdarray<double_complex, 3> alm_col;
-    mdarray<double_complex, 3> halm_col;
+    auto la = linalg_t::none;
+    auto mt = memory_t::none;
+    auto mt1 = memory_t::none;
+    int nb = 0;
+    switch (pu) {
+        case device_t::CPU: {
+            la = linalg_t::blas;
+            mt = memory_t::host;
+            mt1 = memory_t::host;
+            nb = 1;
+            break;
+        }
+        case device_t::GPU: {
+            la = linalg_t::cublasxt;
+            mt = memory_t::host_pinned;
+            mt1 = memory_t::device;
+            nb = 1;
+            break;
+        }
+    }
+
+    sddk::mdarray<double_complex, 3> alm_row(kp.num_gkvec_row(), max_mt_aw, nb, H0_.ctx().mem_pool(mt));
+    sddk::mdarray<double_complex, 3> alm_col(kp.num_gkvec_col(), max_mt_aw, nb, H0_.ctx().mem_pool(mt));
+    sddk::mdarray<double_complex, 3> halm_col(kp.num_gkvec_col(), max_mt_aw, nb, H0_.ctx().mem_pool(mt));
+
+    H0_.ctx().print_memory_usage(__FILE__, __LINE__);
 
     h__.zero();
     o__.zero();
-    switch (pu) { // TODO: replace with allocations from memory pool
+    switch (pu) {
         case device_t::GPU: {
-            alm_row = mdarray<double_complex, 3>(kp.num_gkvec_row(), max_mt_aw, 2, memory_t::host_pinned);
-            alm_col = mdarray<double_complex, 3>(kp.num_gkvec_col(), max_mt_aw, 2, memory_t::host_pinned);
-            halm_col = mdarray<double_complex, 3>(kp.num_gkvec_col(), max_mt_aw, 2, memory_t::host_pinned);
-            alm_row.allocate(memory_t::device);
-            alm_col.allocate(memory_t::device);
-            halm_col.allocate(memory_t::device);
-            //h__.allocate(memory_t::device).zero(memory_t::device);
-            //o__.allocate(memory_t::device).zero(memory_t::device);
-            h__.zero(memory_t::device);
-            o__.zero(memory_t::device);
+    //        alm_row = mdarray<double_complex, 3>(kp.num_gkvec_row(), max_mt_aw, 2, H0_.ctx().mem_pool(memory_t::host_pinned));
+    //        alm_col = mdarray<double_complex, 3>(kp.num_gkvec_col(), max_mt_aw, 2, H0_.ctx().mem_pool(memory_t::host_pinned));
+    //        halm_col = mdarray<double_complex, 3>(kp.num_gkvec_col(), max_mt_aw, 2, H0_.ctx().mem_pool(memory_t::host_pinned));
+            alm_row.allocate(H0_.ctx().mem_pool(memory_t::device));
+            alm_col.allocate(H0_.ctx().mem_pool(memory_t::device));
+            halm_col.allocate(H0_.ctx().mem_pool(memory_t::device));
+    //        h__.zero(memory_t::device);
+    //        o__.zero(memory_t::device);
             break;
         }
         case device_t::CPU: {
-            alm_row = mdarray<double_complex, 3>(kp.num_gkvec_row(), max_mt_aw, 1);
-            alm_col = mdarray<double_complex, 3>(kp.num_gkvec_col(), max_mt_aw, 1);
-            halm_col = mdarray<double_complex, 3>(kp.num_gkvec_col(), max_mt_aw, 1);
+    //        alm_row = mdarray<double_complex, 3>(kp.num_gkvec_row(), max_mt_aw, 1, H0_.ctx().mem_pool(memory_t::host));
+    //        alm_col = mdarray<double_complex, 3>(kp.num_gkvec_col(), max_mt_aw, 1, H0_.ctx().mem_pool(memory_t::host));
+    //        halm_col = mdarray<double_complex, 3>(kp.num_gkvec_col(), max_mt_aw, 1, H0_.ctx().mem_pool(memory_t::host));
             break;
         }
     }
@@ -346,6 +368,7 @@ Hamiltonian_k::set_fv_h_o(sddk::dmatrix<double_complex>& h__, sddk::dmatrix<doub
         }
 
         int s = (pu == device_t::GPU) ? (iblk % 2) : 0;
+        s = 0;
 
         if (H0_.ctx().control().print_checksum_) {
             alm_row.zero();
@@ -362,9 +385,16 @@ Hamiltonian_k::set_fv_h_o(sddk::dmatrix<double_complex>& h__, sddk::dmatrix<doub
                 auto& type = atom.type();
                 int naw = type.mt_aw_basis_size();
 
-                mdarray<double_complex, 2> alm_row_atom;
-                mdarray<double_complex, 2> alm_col_atom;
-                mdarray<double_complex, 2> halm_col_atom;
+                //sddk::mdarray<double_complex, 2> alm_row_atom(alm_row.at(memory_t::host, 0, offsets[ia], s),
+                //                                              kp.num_gkvec_row(), naw);
+                //sddk::mdarray<double_complex, 2> alm_col_atom(alm_col.at(memory_t::host, 0, offsets[ia], s),
+                //                                              kp.num_gkvec_col(), naw);
+                //sddk::mdarray<double_complex, 2> halm_col_atom(halm_col.at(memory_t::host, 0, offsets[ia], s),
+                //                                               kp.num_gkvec_col(), naw);
+
+                sddk::mdarray<double_complex, 2> alm_row_atom;
+                sddk::mdarray<double_complex, 2> alm_col_atom;
+                sddk::mdarray<double_complex, 2> halm_col_atom;
 
                 switch (pu) {
                     case device_t::CPU: {
@@ -423,7 +453,7 @@ Hamiltonian_k::set_fv_h_o(sddk::dmatrix<double_complex>& h__, sddk::dmatrix<doub
             }
             acc::sync_stream(stream_id(tid));
         }
-        acc::sync_stream(stream_id(omp_get_max_threads()));
+        //acc::sync_stream(stream_id(omp_get_max_threads()));
 
         if (H0_.ctx().control().print_checksum_) {
             double_complex z1 = alm_row.checksum();
@@ -433,46 +463,39 @@ Hamiltonian_k::set_fv_h_o(sddk::dmatrix<double_complex>& h__, sddk::dmatrix<doub
             utils::print_checksum("alm_col", z2);
             utils::print_checksum("halm_col", z3);
         }
-        auto la = linalg_t::none;
-        auto mt = memory_t::none;
-        switch (pu) {
-            case device_t::CPU: {
-                la = linalg_t::blas;
-                mt = memory_t::host;
-                break;
-            }
-            case device_t::GPU: {
-                la = linalg_t::gpublas;
-                mt = memory_t::device;
-                break;
-            }
-        }
 
         linalg(la).gemm('N', 'T',kp.num_gkvec_row(), kp.num_gkvec_col(), num_mt_aw,
                          &linalg_const<double_complex>::one(),
-                         alm_row.at(mt, 0, 0, s), alm_row.ld(),
-                         alm_col.at(mt, 0, 0, s), alm_col.ld(),
+                         alm_row.at(mt1, 0, 0, s), alm_row.ld(),
+                         alm_col.at(mt1, 0, 0, s), alm_col.ld(),
                          &linalg_const<double_complex>::one(),
                          o__.at(mt), o__.ld());
 
         linalg(la).gemm('N', 'T', kp.num_gkvec_row(), kp.num_gkvec_col(), num_mt_aw,
                          &linalg_const<double_complex>::one(),
-                         alm_row.at(mt, 0, 0, s), alm_row.ld(),
-                         halm_col.at(mt, 0, 0, s), halm_col.ld(),
+                         alm_row.at(mt1, 0, 0, s), alm_row.ld(),
+                         halm_col.at(mt1, 0, 0, s), halm_col.ld(),
                          &linalg_const<double_complex>::one(),
                          h__.at(mt), h__.ld());
     }
 
-    if (pu == device_t::GPU) { // TODO: if solver is cusolver, this is an additional copy, not necessary
-        acc::copyout(h__.at(memory_t::host), h__.ld(), h__.at(memory_t::device), h__.ld(), kp.num_gkvec_row(), kp.num_gkvec_col());
-        acc::copyout(o__.at(memory_t::host), o__.ld(), o__.at(memory_t::device), o__.ld(), kp.num_gkvec_row(), kp.num_gkvec_col());
-        //h__.deallocate(memory_t::device);
-        //o__.deallocate(memory_t::device);
-    }
+    // TODO: fix the logic of matrices setup
+    // problem: for magma we start on CPU, for cusolver - on GPU
+    // one solution: start from gpu for magma as well
+    // add starting pointer type in the Eigensolver() class
+
+    //if (pu == device_t::GPU) {
+    //    acc::copyout(h__.at(memory_t::host), h__.ld(), h__.at(memory_t::device), h__.ld(), kp.num_gkvec_row(),
+    //        kp.num_gkvec_col());
+    //    acc::copyout(o__.at(memory_t::host), o__.ld(), o__.at(memory_t::device), o__.ld(), kp.num_gkvec_row(),
+    //        kp.num_gkvec_col());
+    //}
     PROFILE_STOP("sirius::Hamiltonian_k::set_fv_h_o|zgemm");
     std::chrono::duration<double> tval = std::chrono::high_resolution_clock::now() - t1;
-    if (kp.comm().rank() == 0 && H0_.ctx().control().print_performance_) {
-        kp.message(1, __function_name__, "effective zgemm performance: %12.6f GFlops",
+    auto pp = utils::get_env<int>("SIRIUS_PRINT_PERFORMANCE");
+
+    if (kp.comm().rank() == 0 && (H0_.ctx().control().print_performance_ || (pp && *pp))) {
+        kp.message((pp && *pp) ? 0 : 1, __function_name__, "effective zgemm performance: %12.6f GFlops\n",
                2 * 8e-9 * kp.num_gkvec() * kp.num_gkvec() * uc.mt_aw_basis_size() / tval.count());
     }
 
@@ -481,6 +504,14 @@ Hamiltonian_k::set_fv_h_o(sddk::dmatrix<double_complex>& h__, sddk::dmatrix<doub
 
     /* setup lo-lo block */
     set_fv_h_o_lo_lo(h__, o__);
+
+    ///*  copy back to GPU */ // TODO: optimize the copies
+    //if (pu == device_t::GPU) {
+    //    acc::copyin(h__.at(memory_t::device), h__.ld(), h__.at(memory_t::host), h__.ld(), kp.gklo_basis_size_row(),
+    //        kp.gklo_basis_size_col());
+    //    acc::copyin(o__.at(memory_t::device), o__.ld(), o__.at(memory_t::host), o__.ld(), kp.gklo_basis_size_row(),
+    //        kp.gklo_basis_size_col());
+    //}
 }
 
 /* alm_row comes in already conjugated */
diff --git a/src/hamiltonian/non_local_operator.cpp b/src/hamiltonian/non_local_operator.cpp
index 73047e812..6db519f93 100644
--- a/src/hamiltonian/non_local_operator.cpp
+++ b/src/hamiltonian/non_local_operator.cpp
@@ -22,6 +22,7 @@
  *  \brief Contains implementation of sirius::Non_local_operator class.
  */
 
+#include "SDDK/omp.hpp"
 #include "non_local_operator.hpp"
 #include "beta_projectors/beta_projectors.hpp"
 
diff --git a/src/input.hpp b/src/input.hpp
index 4374e598f..686001a99 100644
--- a/src/input.hpp
+++ b/src/input.hpp
@@ -193,7 +193,7 @@ struct Unit_cell_input
     "mixer" : {
       "beta" : (float) beta,
       "beta0" : beta0,
-      "linear_mix_rms_tol" : 
+      "linear_mix_rms_tol" :
     }
     \endcode
  */
@@ -260,6 +260,9 @@ struct Iterative_solver_input
     /// Tolerance for the residual L2 norm.
     double residual_tolerance_{1e-6};
 
+    /// Relative tolerance for the residual L2 norm. (0 means this criterion is effectively not used)
+    double relative_tolerance_{0};
+
     /// Additional tolerance for empty states.
     /** Setting this variable to 0 will treat empty states with the same tolerance as occupied states. */
     double empty_states_tolerance_{0};
@@ -298,6 +301,7 @@ struct Iterative_solver_input
             subspace_size_          = section.value("subspace_size", subspace_size_);
             energy_tolerance_       = section.value("energy_tolerance", energy_tolerance_);
             residual_tolerance_     = section.value("residual_tolerance", residual_tolerance_);
+            relative_tolerance_     = section.value("relative_tolerance", relative_tolerance_);
             empty_states_tolerance_ = section.value("empty_states_tolerance", empty_states_tolerance_);
             converge_by_energy_     = section.value("converge_by_energy", converge_by_energy_);
             min_num_res_            = section.value("min_num_res", min_num_res_);
@@ -504,6 +508,9 @@ struct Parameters_input
     /// Shift in the k-point grid.
     std::vector<int> shiftk_{0, 0, 0};
 
+    /// Optional k-point coordinates.
+    std::vector<vector3d<double>> vk_;
+
     /// Number of SCF iterations.
     int num_dft_iter_{100};
 
@@ -586,6 +593,13 @@ struct Parameters_input
             ngridk_         = section.value("ngridk", ngridk_);
             shiftk_         = section.value("shiftk", shiftk_);
             num_dft_iter_   = section.value("num_dft_iter", num_dft_iter_);
+            auto vk         = section.value("vk", std::vector<std::vector<double>>{});
+            for (auto& vki : vk) {
+                if (vki.size() != 3) {
+                    throw std::runtime_error("parameters.vk expected to be of size 3");
+                }
+                vk_.emplace_back(vector3d<double>(vki));
+            }
             energy_tol_     = section.value("energy_tol", energy_tol_);
             /* potential_tol is obsolete */
             density_tol_    = section.value("potential_tol", density_tol_);
@@ -611,6 +625,41 @@ struct Parameters_input
     }
 };
 
+struct NLCG_input
+{
+    /// CG max iterations
+    int maxiter_{100};
+    /// CG restart
+    int restart_{20};
+    /// backtracking search, step parameter
+    double tau_{0.1};
+    /// temperature in Kelvin
+    double T_{300};
+    /// scalar preconditioning of pseudo Hamiltonian
+    double kappa_{0.3};
+    /// CG tolerance
+    double tol_{1e-9};
+    /// smearing
+    std::string smearing_{"FD"};
+    /// Main processing unit to run on.
+    std::string processing_unit_{""};
+
+    void read(json const& parser)
+    {
+        if (parser.count("nlcg")) {
+            auto section     = parser["nlcg"];
+            maxiter_         = section.value("maxiter", maxiter_);
+            restart_         = section.value("restart", restart_);
+            tau_             = section.value("tau", tau_);
+            T_               = section.value("T", T_);
+            kappa_           = section.value("kappa", kappa_);
+            tol_             = section.value("tol", tol_);
+            smearing_        = section.value("smearing", smearing_);
+            processing_unit_ = section.value("processing_unit", processing_unit_);
+        }
+    }
+};
+
 /// Settings control the internal parameters related to the numerical implementation.
 /** Changing of setting parameters will have an impact on the final result. */
 struct Settings_input
diff --git a/src/k_point/generate_spinor_wave_functions.cpp b/src/k_point/generate_spinor_wave_functions.cpp
index dd523a2df..817739b3b 100644
--- a/src/k_point/generate_spinor_wave_functions.cpp
+++ b/src/k_point/generate_spinor_wave_functions.cpp
@@ -43,15 +43,15 @@ void K_point::generate_spinor_wave_functions()
         int nbnd = (ctx_.num_mag_dims() == 3) ? ctx_.num_bands() : nfv;
 
         if (ctx_.processing_unit() == device_t::GPU) {
-            fv_states().allocate(spin_range(0), memory_t::device);
+            fv_states().allocate(spin_range(0), ctx_.mem_pool(memory_t::device));
             fv_states().copy_to(spin_range(0), memory_t::device, 0, nfv);
-            sv_eigen_vectors_[0].allocate(memory_t::device).copy_to(memory_t::device);
+            sv_eigen_vectors_[0].allocate(ctx_.mem_pool(memory_t::device)).copy_to(memory_t::device);
             if (ctx_.num_mag_dims() == 1) {
-                sv_eigen_vectors_[1].allocate(memory_t::device).copy_to(memory_t::device);
+                sv_eigen_vectors_[1].allocate(ctx_.mem_pool(memory_t::device)).copy_to(memory_t::device);
             }
             if (is_device_memory(ctx_.preferred_memory_t())) {
                 for (int ispn = 0; ispn < ctx_.num_spins(); ispn++) {
-                    spinor_wave_functions().allocate(spin_range(ispn), memory_t::device);
+                    spinor_wave_functions().allocate(spin_range(ispn), ctx_.mem_pool(memory_t::device));
                     spinor_wave_functions().copy_to(spin_range(ispn), memory_t::device, 0, nbnd);
                 }
             }
diff --git a/src/k_point/k_point.cpp b/src/k_point/k_point.cpp
index 51fae8316..24e4ce296 100644
--- a/src/k_point/k_point.cpp
+++ b/src/k_point/k_point.cpp
@@ -71,8 +71,8 @@ K_point::initialize()
      */
     int nst = ctx_.num_bands();
 
-    auto mem_type_evp  = (ctx_.std_evp_solver_type() == ev_solver_t::magma) ? memory_t::host_pinned : memory_t::host;
-    auto mem_type_gevp = (ctx_.gen_evp_solver_type() == ev_solver_t::magma) ? memory_t::host_pinned : memory_t::host;
+    auto mem_type_evp  = ctx_.std_evp_solver().host_memory_t();
+    auto mem_type_gevp = ctx_.gen_evp_solver().host_memory_t();
 
     /* build a full list of G+k vectors for all MPI ranks */
     generate_gkvec(ctx_.gk_cutoff());
@@ -113,8 +113,13 @@ K_point::initialize()
             }
             if (ctx_.iterative_solver_input().type_ == "exact") {
                 /* ELPA needs a full matrix of eigen-vectors as it uses it as a work space */
-                fv_eigen_vectors_ = dmatrix<double_complex>(gklo_basis_size(), gklo_basis_size(), ctx_.blacs_grid(), bs,
-                                                            bs, mem_type_gevp);
+                if (ctx_.gen_evp_solver().type() == ev_solver_t::elpa) {
+                    fv_eigen_vectors_ = dmatrix<double_complex>(gklo_basis_size(), gklo_basis_size(),
+                                                                ctx_.blacs_grid(), bs, bs, mem_type_gevp);
+                } else{
+                    fv_eigen_vectors_ = dmatrix<double_complex>(gklo_basis_size(), ctx_.num_fv_states(),
+                                                                ctx_.blacs_grid(), bs, bs, mem_type_gevp);
+                }
             } else {
                 int ncomp = ctx_.iterative_solver_input().num_singular_;
                 if (ncomp < 0) {
@@ -152,16 +157,16 @@ K_point::initialize()
                                    [this](int ia) { return unit_cell_.atom(ia).mt_basis_size(); }, ctx_.num_fv_states(),
                                    ctx_.preferred_memory_t()));
 
-            spinor_wave_functions_ = std::unique_ptr<Wave_functions>(
-                new Wave_functions(gkvec_partition(), unit_cell_.num_atoms(),
-                                   [this](int ia) { return unit_cell_.atom(ia).mt_basis_size(); }, nst,
-                                   ctx_.preferred_memory_t(), ctx_.num_spins()));
+            spinor_wave_functions_ = std::make_shared<Wave_functions>(
+                gkvec_partition(), unit_cell_.num_atoms(),
+                [this](int ia) { return unit_cell_.atom(ia).mt_basis_size(); }, nst, ctx_.preferred_memory_t(),
+                ctx_.num_spins());
         } else {
             throw std::runtime_error("not implemented");
         }
     } else {
-        spinor_wave_functions_ = std::unique_ptr<Wave_functions>(
-            new Wave_functions(gkvec_partition(), nst, ctx_.preferred_memory_t(), ctx_.num_spins()));
+        spinor_wave_functions_ =
+            std::make_shared<Wave_functions>(gkvec_partition(), nst, ctx_.preferred_memory_t(), ctx_.num_spins());
         if (ctx_.hubbard_correction()) {
             auto r = unit_cell_.num_wf_with_U();
             const int num_sc = ctx_.num_mag_dims() == 3 ? 2 : 1;
@@ -216,7 +221,7 @@ K_point::orthogonalize_hubbard_orbitals(Wave_functions& phi__)
         if (ctx_.hubbard_input().orthogonalize_hubbard_orbitals_ ) {
             dmatrix<double_complex> Z(nwfu, nwfu);
 
-            auto ev_solver = Eigensolver_factory(ev_solver_t::lapack);
+            auto ev_solver = Eigensolver_factory("lapack", nullptr);
 
             std::vector<double> eigenvalues(nwfu, 0.0);
 
diff --git a/src/k_point/k_point.hpp b/src/k_point/k_point.hpp
index ba67de4d7..5bbd6c909 100644
--- a/src/k_point/k_point.hpp
+++ b/src/k_point/k_point.hpp
@@ -85,7 +85,7 @@ class K_point
     std::unique_ptr<Wave_functions> fv_states_{nullptr};
 
     /// Two-component (spinor) wave functions describing the bands.
-    std::unique_ptr<Wave_functions> spinor_wave_functions_{nullptr};
+    std::shared_ptr<Wave_functions> spinor_wave_functions_{nullptr};
 
     /// Two-component (spinor) hubbard wave functions where the S matrix is applied (if ppus).
     std::unique_ptr<Wave_functions> hubbard_wave_functions_{nullptr}; // TODO: remove in future
@@ -460,6 +460,11 @@ class K_point
         return *spinor_wave_functions_;
     }
 
+    inline std::shared_ptr<Wave_functions> spinor_wave_functions_ptr()
+    {
+        return spinor_wave_functions_;
+    }
+
     inline Wave_functions& hubbard_wave_functions()
     {
         assert(hubbard_wave_functions_ != nullptr);
diff --git a/src/lapw/matching_coefficients.hpp b/src/lapw/matching_coefficients.hpp
index c82b28531..743653c3e 100644
--- a/src/lapw/matching_coefficients.hpp
+++ b/src/lapw/matching_coefficients.hpp
@@ -33,19 +33,22 @@ namespace sirius {
 
 /** The following matching conditions must be fulfilled:
  *  \f[
- *      \frac{\partial^j}{\partial r^j} \sum_{L \nu} A_{L \nu}^{\bf k}({\bf G})u_{\ell \nu}(r)
- *       Y_{L}(\hat {\bf r}) \bigg|_{R^{MT}}  = \frac{\partial^j}{\partial r^j} \frac{4 \pi}{\sqrt \Omega}
- *       e^{i{\bf (G+k)\tau}} \sum_{L}i^{\ell} j_{\ell}(|{\bf G+k}|r) Y_{L}^{*}(\widehat {\bf G+k}) Y_{L}(\hat {\bf r})
- * \bigg|_{R^{MT}} \f] where \f$ L = \{ \ell, m \} \f$. Dropping sum over L we arrive to the following system of linear
- * equations: \f[ \sum_{\nu} \frac{\partial^j u_{\ell \nu}(r)}{\partial r^j} \bigg|_{R^{MT}} A_{L \nu}^{\bf k}({\bf G})
- * = \frac{4 \pi}{\sqrt \Omega} e^{i{\bf (G+k)\tau}} i^{\ell} \frac{\partial^j j_{\ell}(|{\bf G+k}|r)}{\partial r^j}
+ *   \frac{\partial^j}{\partial r^j} \sum_{L \nu} A_{L \nu}^{\bf k}({\bf G})u_{\ell \nu}(r)
+ *   Y_{L}(\hat {\bf r}) \bigg|_{R^{MT}}  = \frac{\partial^j}{\partial r^j} \frac{4 \pi}{\sqrt \Omega}
+ *   e^{i{\bf (G+k)\tau}} \sum_{L}i^{\ell} j_{\ell}(|{\bf G+k}|r) Y_{L}^{*}(\widehat {\bf G+k}) Y_{L}(\hat {\bf r})
+ *   \bigg|_{R^{MT}}
+ *  \f]
+ *  where \f$ L = \{ \ell, m \} \f$. Dropping sum over L we arrive to the following system of linear equations:
+ *  \f[ \sum_{\nu} \frac{\partial^j u_{\ell \nu}(r)}{\partial r^j} \bigg|_{R^{MT}} A_{L \nu}^{\bf k}({\bf G})
+ *   = \frac{4 \pi}{\sqrt \Omega} e^{i{\bf (G+k)\tau}} i^{\ell} \frac{\partial^j j_{\ell}(|{\bf G+k}|r)}{\partial r^j}
  *      \bigg|_{R^{MT}} Y_{L}^{*}(\widehat {\bf G+k})
  *  \f]
  *  The matching coefficients are then equal to:
  *  \f[
- *      A_{L \nu}^{\bf k}({\bf G}) = \sum_{j} \bigg[ \frac{\partial^j u_{\ell \nu}(r)}{\partial r^j} \bigg|_{R^{MT}}
- * \bigg]_{\nu j}^{-1} \frac{\partial^j j_{\ell}(|{\bf G+k}|r)}{\partial r^j} \bigg|_{R^{MT}} \frac{4 \pi}{\sqrt \Omega}
- * i^{\ell} e^{i{\bf (G+k)\tau}} Y_{L}^{*}(\widehat {\bf G+k}) \f]
+ *   A_{L \nu}^{\bf k}({\bf G}) = \sum_{j} \bigg[ \frac{\partial^j u_{\ell \nu}(r)}{\partial r^j} \bigg|_{R^{MT}}
+ *   \bigg]_{\nu j}^{-1} \frac{\partial^j j_{\ell}(|{\bf G+k}|r)}{\partial r^j} \bigg|_{R^{MT}}
+ *   \frac{4 \pi}{\sqrt \Omega} i^{\ell} e^{i{\bf (G+k)\tau}} Y_{L}^{*}(\widehat {\bf G+k})
+ *  \f]
  */
 class Matching_coefficients // TODO: compute on GPU
 {
diff --git a/src/linalg/eigenproblem.hpp b/src/linalg/eigenproblem.hpp
index 2da02e291..f401ff900 100644
--- a/src/linalg/eigenproblem.hpp
+++ b/src/linalg/eigenproblem.hpp
@@ -25,15 +25,12 @@
 #ifndef __EIGENPROBLEM_HPP__
 #define __EIGENPROBLEM_HPP__
 
-#include <omp.h>
 #include "utils/profiler.hpp"
 #include "linalg.hpp"
+#include "SDDK/omp.hpp"
 
 #if defined(__ELPA)
-#include <elpa_constants.h>
-extern "C" {
-#include "elpa.h"
-}
+#include "elpa.hpp"
 #endif
 
 #if defined(__GPU) && defined(__MAGMA)
@@ -48,145 +45,12 @@ using namespace sddk;
 
 //TODO use ELPA functions to transform to standard eigen-problem
 
-/// Type of eigen-value solver.
-enum class ev_solver_t
-{
-    /// LAPACK
-    lapack,
-
-    /// ScaLAPACK
-    scalapack,
-
-    /// ELPA 1-stage solver
-    elpa1,
-
-    /// ELPA 2-stage solver
-    elpa2,
-
-    /// MAGMA with CPU pointers
-    magma,
-
-    /// MAGMA with GPU pointers
-    magma_gpu,
-
-    /// PLASMA
-    plasma,
-
-    /// CUDA eigen-solver
-    cusolver
-};
-
-inline ev_solver_t get_ev_solver_t(std::string name__)
-{
-    std::transform(name__.begin(), name__.end(), name__.begin(), ::tolower);
-
-    static const std::map<std::string, ev_solver_t> map_to_type = {
-        {"lapack", ev_solver_t::lapack}, {"scalapack", ev_solver_t::scalapack}, {"elpa1", ev_solver_t::elpa1},
-        {"elpa2", ev_solver_t::elpa2},   {"magma", ev_solver_t::magma},         {"magma_gpu", ev_solver_t::magma_gpu},
-        {"plasma", ev_solver_t::plasma}, {"cusolver", ev_solver_t::cusolver}};
-
-    if (map_to_type.count(name__) == 0) {
-        std::stringstream s;
-        s << "wrong label of eigen-solver : " << name__;
-        TERMINATE(s);
-    }
-
-    return map_to_type.at(name__);
-}
-
-const std::string error_msg_not_implemented = "solver is not implemented";
-
-class Eigensolver
-{
-  protected:
-    /// Memory pool for CPU work buffers.
-    memory_pool mp_h_;
-    /// Memory pool for CPU work buffers using pinned memory.
-    memory_pool mp_hp_;
-    /// Memory pool for GPU work buffers.
-    memory_pool mp_d_;
-
-  public:
-    Eigensolver()
-        : mp_h_(memory_pool(memory_t::host))
-        , mp_hp_(memory_pool(memory_t::host_pinned))
-        , mp_d_(memory_pool(memory_t::device))
-    {
-    }
-
-    virtual ~Eigensolver()
-    {
-    }
-
-    /// Solve a standard eigen-value problem for all eigen-pairs.
-    virtual int solve(ftn_int matrix_size__, dmatrix<double>& A__, double* eval__, dmatrix<double>& Z__)
-    {
-        TERMINATE(error_msg_not_implemented);
-        return -1;
-    }
-    /// Solve a standard eigen-value problem for all eigen-pairs.
-    virtual int solve(ftn_int matrix_size__, dmatrix<double_complex>& A__, double* eval__, dmatrix<double_complex>& Z__)
-    {
-        TERMINATE(error_msg_not_implemented);
-        return -1;
-    }
-
-    /// Solve a standard eigen-value problem for N lowest eigen-pairs.
-    virtual int solve(ftn_int matrix_size__, ftn_int nev__, dmatrix<double>& A__, double* eval__, dmatrix<double>& Z__)
-    {
-        TERMINATE(error_msg_not_implemented);
-        return -1;
-    }
-
-    /// Solve a standard eigen-value problem for N lowest eigen-pairs.
-    virtual int solve(ftn_int matrix_size__, ftn_int nev__, dmatrix<double_complex>& A__, double* eval__,
-                      dmatrix<double_complex>& Z__)
-    {
-        TERMINATE(error_msg_not_implemented);
-        return -1;
-    }
-
-    /// Solve a generalized eigen-value problem for all eigen-pairs.
-    virtual int solve(ftn_int matrix_size__, dmatrix<double>& A__, dmatrix<double>& B__, double* eval__,
-                      dmatrix<double>& Z__)
-    {
-        TERMINATE(error_msg_not_implemented);
-        return -1;
-    }
-
-    /// Solve a generalized eigen-value problem for all eigen-pairs.
-    virtual int solve(ftn_int matrix_size__, dmatrix<double_complex>& A__, dmatrix<double_complex>& B__, double* eval__,
-                      dmatrix<double_complex>& Z__)
-    {
-        TERMINATE(error_msg_not_implemented);
-        return -1;
-    }
-
-    /// Solve a generalized eigen-value problem for N lowest eigen-pairs.
-    virtual int solve(ftn_int matrix_size__, ftn_int nev__, dmatrix<double>& A__, dmatrix<double>& B__, double* eval__,
-                      dmatrix<double>& Z__)
-    {
-        TERMINATE(error_msg_not_implemented);
-        return -1;
-    }
-
-    /// Solve a generalized eigen-value problem for N lowest eigen-pairs.
-    virtual int solve(ftn_int matrix_size__, ftn_int nev__, dmatrix<double_complex>& A__, dmatrix<double_complex>& B__,
-                      double* eval__, dmatrix<double_complex>& Z__)
-    {
-        TERMINATE(error_msg_not_implemented);
-        return -1;
-    }
-
-    virtual bool is_parallel() = 0;
-};
-
 class Eigensolver_lapack : public Eigensolver
 {
   public:
-    inline bool is_parallel()
+    Eigensolver_lapack()
+        : Eigensolver(ev_solver_t::lapack, nullptr, false, memory_t::host, memory_t::host)
     {
-        return false;
     }
 
     /// Solve a standard eigen-value problem for all eigen-pairs.
@@ -274,8 +138,8 @@ class Eigensolver_lapack : public Eigensolver
         if (m != nev__) {
             std::stringstream s;
             s << "not all eigen-values are found" << std::endl
-              << "target number of eign-values: " << nev__ << std::endl
-              << "number of eign-values found: " << m;
+              << "target number of eigen-values: " << nev__ << std::endl
+              << "number of eigen-values found: " << m;
             WARNING(s);
             return 1;
         }
@@ -328,8 +192,8 @@ class Eigensolver_lapack : public Eigensolver
         if (m != nev__) {
             std::stringstream s;
             s << "not all eigen-values are found" << std::endl
-              << "target number of eign-values: " << nev__ << std::endl
-              << "number of eign-values found: " << m;
+              << "target number of eigen-values: " << nev__ << std::endl
+              << "number of eigen-values found: " << m;
             WARNING(s);
             return 1;
         }
@@ -377,8 +241,8 @@ class Eigensolver_lapack : public Eigensolver
         if (m != nev__) {
             std::stringstream s;
             s << "not all eigen-values are found" << std::endl
-              << "target number of eign-values: " << nev__ << std::endl
-              << "number of eign-values found: " << m;
+              << "target number of eigen-values: " << nev__ << std::endl
+              << "number of eigen-values found: " << m;
             WARNING(s);
             return 1;
         }
@@ -428,8 +292,8 @@ class Eigensolver_lapack : public Eigensolver
         if (m != nev__) {
             std::stringstream s;
             s << "not all eigen-values are found" << std::endl
-              << "target number of eign-values: " << nev__ << std::endl
-              << "number of eign-values found: " << m;
+              << "target number of eigen-values: " << nev__ << std::endl
+              << "number of eigen-values found: " << m;
             WARNING(s);
             return 1;
         }
@@ -505,18 +369,14 @@ class Eigensolver_elpa : public Eigensolver
     }
   public:
     Eigensolver_elpa(int stage__)
-        : stage_(stage__)
+        : Eigensolver(ev_solver_t::elpa, nullptr, true, memory_t::host, memory_t::host)
+        , stage_(stage__)
     {
         if (!(stage_ == 1 || stage_ == 2)) {
             TERMINATE("wrong type of ELPA solver");
         }
     }
 
-    inline bool is_parallel()
-    {
-        return true;
-    }
-
     /// Solve a generalized eigen-value problem for N lowest eigen-pairs.
     int solve(ftn_int matrix_size__, ftn_int nev__, dmatrix<double>& A__, dmatrix<double>& B__, double* eval__,
               dmatrix<double>& Z__)
@@ -590,6 +450,12 @@ class Eigensolver_elpa : public Eigensolver
         elpa_set_integer(handle, "process_row", A__.blacs_grid().comm_row().rank(), &error);
         elpa_set_integer(handle, "process_col", A__.blacs_grid().comm_col().rank(), &error);
         elpa_setup(handle);
+        elpa_set_integer(handle, "omp_threads", nt, &error);
+        if (error != ELPA_OK) {
+            TERMINATE("can't set elpa threads");
+        }
+        elpa_set_integer(handle, "gpu", 1, &error);
+
         if (stage_ == 1) {
             elpa_set_integer(handle, "solver", ELPA_SOLVER_1STAGE, &error);
         } else {
@@ -641,6 +507,12 @@ class Eigensolver_elpa : public Eigensolver
         elpa_set_integer(handle, "process_row", A__.blacs_grid().comm_row().rank(), &error);
         elpa_set_integer(handle, "process_col", A__.blacs_grid().comm_col().rank(), &error);
         elpa_setup(handle);
+        elpa_set_integer(handle, "omp_threads", nt, &error);
+        if (error != ELPA_OK) {
+            TERMINATE("can't set elpa threads");
+        }
+        elpa_set_integer(handle, "gpu", 1, &error);
+
         if (stage_ == 1) {
             elpa_set_integer(handle, "solver", ELPA_SOLVER_1STAGE, &error);
         } else {
@@ -650,7 +522,8 @@ class Eigensolver_elpa : public Eigensolver
 
         auto w = mp_h_.get_unique_ptr<double>(matrix_size__);
 
-        elpa_eigenvectors_dc(handle, A__.at(memory_t::host), w.get(), Z__.at(memory_t::host), &error);
+        using CT = double _Complex;
+        elpa_eigenvectors_dc(handle, (CT*)A__.at(memory_t::host), w.get(), (CT*)Z__.at(memory_t::host), &error);
 
         elpa_deallocate(handle, &error);
 
@@ -684,13 +557,9 @@ class Eigensolver_elpa : public Eigensolver
 {
   public:
     Eigensolver_elpa(int stage__)
+        : Eigensolver(ev_solver_t::elpa, nullptr, true, memory_t::host, memory_t::host)
     {
     }
-
-    inline bool is_parallel()
-    {
-        return true;
-    }
 };
 #endif
 
@@ -702,9 +571,9 @@ class Eigensolver_scalapack : public Eigensolver
     double const abstol_{1e-12};
 
   public:
-    inline bool is_parallel()
+    Eigensolver_scalapack()
+        : Eigensolver(ev_solver_t::scalapack, nullptr, true, memory_t::host, memory_t::host)
     {
-        return true;
     }
 
     /// Solve a standard eigen-value problem for all eigen-pairs.
@@ -1134,9 +1003,9 @@ class Eigensolver_scalapack : public Eigensolver
 class Eigensolver_scalapack : public Eigensolver
 {
   public:
-    inline bool is_parallel()
+    Eigensolver_scalapack()
+        : Eigensolver(ev_solver_t::scalapack, nullptr, true, memory_t::host, memory_t::host)
     {
-        return true;
     }
 };
 #endif
@@ -1146,15 +1015,17 @@ class Eigensolver_magma: public Eigensolver
 {
   public:
 
-    inline bool is_parallel()
+    Eigensolver_magma()
+        : Eigensolver(ev_solver_t::magma, nullptr, false, memory_t::host_pinned, memory_t::host)
     {
-        return false;
     }
 
     /// Solve a generalized eigen-value problem for N lowest eigen-pairs.
     int solve(ftn_int matrix_size__, ftn_int nev__, dmatrix<double>& A__, dmatrix<double>& B__, double* eval__,
               dmatrix<double>& Z__)
     {
+        PROFILE("Eigensolver_magma|dsygvdx");
+
         int nt = omp_get_max_threads();
         int lda = A__.ld();
         int ldb = B__.ld();
@@ -1200,6 +1071,8 @@ class Eigensolver_magma: public Eigensolver
     int solve(ftn_int matrix_size__, ftn_int nev__, dmatrix<double_complex>& A__, dmatrix<double_complex>& B__,
               double* eval__, dmatrix<double_complex>& Z__)
     {
+        PROFILE("Eigensolver_magma|zhegvdx");
+
         int nt = omp_get_max_threads();
         int lda = A__.ld();
         int ldb = B__.ld();
@@ -1337,9 +1210,9 @@ class Eigensolver_magma_gpu: public Eigensolver
 {
   public:
 
-    inline bool is_parallel()
+    Eigensolver_magma_gpu() /* tagged magma_gpu (not magma), matching the label the factory dispatches on */
+        : Eigensolver(ev_solver_t::magma_gpu, nullptr, false, memory_t::host_pinned, memory_t::device)
     {
-        return false;
     }
 
     ///// Solve a generalized eigen-value problem for N lowest eigen-pairs.
@@ -1536,17 +1409,18 @@ class Eigensolver_magma_gpu: public Eigensolver
 class Eigensolver_magma: public Eigensolver
 {
   public:
-    inline bool is_parallel()
+    Eigensolver_magma()
+        : Eigensolver(ev_solver_t::magma, nullptr, false, memory_t::host_pinned, memory_t::host)
     {
-        return false;
     }
 };
+
 class Eigensolver_magma_gpu: public Eigensolver
 {
   public:
-    inline bool is_parallel()
+    Eigensolver_magma_gpu() /* tagged magma_gpu (not magma), matching the label the factory dispatches on */
+        : Eigensolver(ev_solver_t::magma_gpu, nullptr, false, memory_t::host_pinned, memory_t::device)
     {
-        return false;
     }
 };
 #endif
@@ -1555,34 +1429,41 @@ class Eigensolver_magma_gpu: public Eigensolver
 class Eigensolver_cuda: public Eigensolver
 {
   public:
-    inline bool is_parallel()
+    Eigensolver_cuda(memory_pool* mpd__)
+        : Eigensolver(ev_solver_t::cusolver, mpd__, false, memory_t::host_pinned, memory_t::device)
     {
-        return false;
     }
 
     int solve(ftn_int matrix_size__, int nev__, dmatrix<double_complex>& A__, double* eval__,
               dmatrix<double_complex>& Z__)
     {
-        PROFILE("Eigensolver_cuda|zheevd");
+        PROFILE("Eigensolver_cuda|zheevdx");
 
         cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR;
         cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
+        cusolverEigRange_t range = CUSOLVER_EIG_RANGE_I;
 
-        auto w = mp_d_.get_unique_ptr<double>(matrix_size__);
+        auto w = mp_d_->get_unique_ptr<double>(matrix_size__);
         acc::copyin(A__.at(memory_t::device), A__.ld(), A__.at(memory_t::host), A__.ld(), matrix_size__, matrix_size__);
 
         int lwork;
-        CALL_CUSOLVER(cusolverDnZheevd_bufferSize, (cusolver::cusolver_handle(), jobz, uplo, matrix_size__,
+        int h_meig;
+        auto vl = -std::numeric_limits<double>::infinity();
+        auto vu = std::numeric_limits<double>::infinity();
+
+        CALL_CUSOLVER(cusolverDnZheevdx_bufferSize, (cusolver::cusolver_handle(), jobz, range, uplo, matrix_size__,
                                                     reinterpret_cast<cuDoubleComplex*>(A__.at(memory_t::device)), A__.ld(),
-                                                    w.get(), &lwork));
+                                                    vl, vu, 1, nev__, &h_meig, w.get(), &lwork));
 
-        auto work = mp_d_.get_unique_ptr<double_complex>(lwork);
+        auto work = mp_d_->get_unique_ptr<double_complex>(lwork);
 
         int info;
-        auto dinfo = mp_d_.get_unique_ptr<int>(1);
-        CALL_CUSOLVER(cusolverDnZheevd, (cusolver::cusolver_handle(), jobz, uplo, matrix_size__,
+        auto dinfo = mp_d_->get_unique_ptr<int>(1);
+        CALL_CUSOLVER(cusolverDnZheevdx, (cusolver::cusolver_handle(), jobz, range, uplo, matrix_size__,
                                          reinterpret_cast<cuDoubleComplex*>(A__.at(memory_t::device)), A__.ld(),
-                                         w.get(), reinterpret_cast<cuDoubleComplex*>(work.get()), lwork, dinfo.get()));
+                                         vl, vu, 1, nev__, &h_meig, w.get(), reinterpret_cast<cuDoubleComplex*>(work.get()), lwork,
+                                         dinfo.get()));
+
         acc::copyout(&info, dinfo.get(), 1);
         if (!info) {
             acc::copyout(eval__, w.get(), nev__);
@@ -1599,25 +1480,30 @@ class Eigensolver_cuda: public Eigensolver
     int solve(ftn_int matrix_size__, int nev__, dmatrix<double>& A__, double* eval__,
               dmatrix<double>& Z__)
     {
-        PROFILE("Eigensolver_cuda|dsyevd");
+        PROFILE("Eigensolver_cuda|dsyevdx");
 
         cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR;
         cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
+        cusolverEigRange_t range = CUSOLVER_EIG_RANGE_I;
 
-        auto w = mp_d_.get_unique_ptr<double>(matrix_size__);
+        auto w = mp_d_->get_unique_ptr<double>(matrix_size__);
         acc::copyin(A__.at(memory_t::device), A__.ld(), A__.at(memory_t::host), A__.ld(), matrix_size__, matrix_size__);
 
         int lwork;
-        CALL_CUSOLVER(cusolverDnDsyevd_bufferSize, (cusolver::cusolver_handle(), jobz, uplo, matrix_size__,
-                                                    A__.at(memory_t::device), A__.ld(),
+        int h_meig;
+        auto vl = -std::numeric_limits<double>::infinity();
+        auto vu = std::numeric_limits<double>::infinity();
+
+        CALL_CUSOLVER(cusolverDnDsyevdx_bufferSize, (cusolver::cusolver_handle(), jobz, range, uplo, matrix_size__,
+                                                    A__.at(memory_t::device), A__.ld(), vl, vu, 1, nev__, &h_meig,
                                                     w.get(), &lwork));
 
-        auto work = mp_d_.get_unique_ptr<double>(lwork);
+        auto work = mp_d_->get_unique_ptr<double>(lwork);
 
         int info;
-        auto dinfo = mp_d_.get_unique_ptr<int>(1);
-        CALL_CUSOLVER(cusolverDnDsyevd, (cusolver::cusolver_handle(), jobz, uplo, matrix_size__,
-                                         A__.at(memory_t::device), A__.ld(),
+        auto dinfo = mp_d_->get_unique_ptr<int>(1);
+        CALL_CUSOLVER(cusolverDnDsyevdx, (cusolver::cusolver_handle(), jobz, range, uplo, matrix_size__,
+                                         A__.at(memory_t::device), A__.ld(), vl, vu, 1, nev__, &h_meig,
                                          w.get(), work.get(), lwork, dinfo.get()));
         acc::copyout(&info, dinfo.get(), 1);
         if (!info) {
@@ -1635,31 +1521,36 @@ class Eigensolver_cuda: public Eigensolver
     int solve(ftn_int matrix_size__, int nev__, dmatrix<double_complex>& A__, dmatrix<double_complex>& B__, double* eval__,
               dmatrix<double_complex>& Z__)
     {
-        PROFILE("Eigensolver_cuda|zhegvd");
+        PROFILE("Eigensolver_cuda|zhegvdx");
 
         cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; // A*x = (lambda)*B*x
         cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR;
         cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
+        cusolverEigRange_t range = CUSOLVER_EIG_RANGE_I;
 
-        auto w = mp_d_.get_unique_ptr<double>(matrix_size__);
+        auto w = mp_d_->get_unique_ptr<double>(matrix_size__);
         acc::copyin(A__.at(memory_t::device), A__.ld(), A__.at(memory_t::host), A__.ld(), matrix_size__, matrix_size__);
         acc::copyin(B__.at(memory_t::device), B__.ld(), B__.at(memory_t::host), B__.ld(), matrix_size__, matrix_size__);
 
         int lwork;
-        CALL_CUSOLVER(cusolverDnZhegvd_bufferSize, (cusolver::cusolver_handle(), itype, jobz, uplo, matrix_size__,
+        int h_meig;
+        auto vl = -std::numeric_limits<double>::infinity();
+        auto vu = std::numeric_limits<double>::infinity();
+
+        CALL_CUSOLVER(cusolverDnZhegvdx_bufferSize, (cusolver::cusolver_handle(), itype, jobz, range, uplo, matrix_size__,
                                                     reinterpret_cast<cuDoubleComplex*>(A__.at(memory_t::device)), A__.ld(),
                                                     reinterpret_cast<cuDoubleComplex*>(B__.at(memory_t::device)), B__.ld(),
-                                                    w.get(), &lwork));
-
-        auto work = mp_d_.get_unique_ptr<double_complex>(lwork);
+                                                    vl, vu, 1, nev__, &h_meig, w.get(), &lwork));
 
+        auto work = mp_d_->get_unique_ptr<double_complex>(lwork);
 
         int info;
-        auto dinfo = mp_d_.get_unique_ptr<int>(1);
-        CALL_CUSOLVER(cusolverDnZhegvd, (cusolver::cusolver_handle(), itype, jobz, uplo, matrix_size__,
+        auto dinfo = mp_d_->get_unique_ptr<int>(1);
+        CALL_CUSOLVER(cusolverDnZhegvdx, (cusolver::cusolver_handle(), itype, jobz, range, uplo, matrix_size__,
                                          reinterpret_cast<cuDoubleComplex*>(A__.at(memory_t::device)), A__.ld(),
                                          reinterpret_cast<cuDoubleComplex*>(B__.at(memory_t::device)), B__.ld(),
-                                         w.get(), reinterpret_cast<cuDoubleComplex*>(work.get()), lwork, dinfo.get()));
+                                         vl, vu, 1, nev__, &h_meig, w.get(), reinterpret_cast<cuDoubleComplex*>(work.get()), 
+                                         lwork, dinfo.get()));
         acc::copyout(&info, dinfo.get(), 1);
         if (!info) {
             acc::copyout(eval__, w.get(), nev__);
@@ -1678,30 +1569,35 @@ class Eigensolver_cuda: public Eigensolver
     int solve(ftn_int matrix_size__, int nev__, dmatrix<double>& A__, dmatrix<double>& B__, double* eval__,
               dmatrix<double>& Z__)
     {
-        PROFILE("Eigensolver_cuda|dsygvd");
+        PROFILE("Eigensolver_cuda|dsygvdx");
 
         cusolverEigType_t itype = CUSOLVER_EIG_TYPE_1; // A*x = (lambda)*B*x
         cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_VECTOR;
         cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
+        cusolverEigRange_t range = CUSOLVER_EIG_RANGE_I;
 
-        auto w = mp_d_.get_unique_ptr<double>(matrix_size__);
+        auto w = mp_d_->get_unique_ptr<double>(matrix_size__);
         acc::copyin(A__.at(memory_t::device), A__.ld(), A__.at(memory_t::host), A__.ld(), matrix_size__, matrix_size__);
         acc::copyin(B__.at(memory_t::device), B__.ld(), B__.at(memory_t::host), B__.ld(), matrix_size__, matrix_size__);
 
         int lwork;
-        CALL_CUSOLVER(cusolverDnDsygvd_bufferSize, (cusolver::cusolver_handle(), itype, jobz, uplo, matrix_size__,
+        int h_meig;
+        auto vl = -std::numeric_limits<double>::infinity();
+        auto vu = std::numeric_limits<double>::infinity();
+
+        CALL_CUSOLVER(cusolverDnDsygvdx_bufferSize, (cusolver::cusolver_handle(), itype, jobz, range, uplo, matrix_size__,
                                                     A__.at(memory_t::device), A__.ld(),
                                                     B__.at(memory_t::device), B__.ld(),
-                                                    w.get(), &lwork));
+                                                    vl, vu, 1, nev__, &h_meig, w.get(), &lwork));
 
-        auto work = mp_d_.get_unique_ptr<double>(lwork);
+        auto work = mp_d_->get_unique_ptr<double>(lwork);
 
         int info;
-        auto dinfo = mp_d_.get_unique_ptr<int>(1);
-        CALL_CUSOLVER(cusolverDnDsygvd, (cusolver::cusolver_handle(), itype, jobz, uplo, matrix_size__,
+        auto dinfo = mp_d_->get_unique_ptr<int>(1);
+        CALL_CUSOLVER(cusolverDnDsygvdx, (cusolver::cusolver_handle(), itype, jobz, range, uplo, matrix_size__,
                                          A__.at(memory_t::device), A__.ld(),
                                          B__.at(memory_t::device), B__.ld(),
-                                         w.get(), work.get(), lwork, dinfo.get()));
+                                         vl, vu, 1, nev__, &h_meig, w.get(), work.get(), lwork, dinfo.get()));
         acc::copyout(&info, dinfo.get(), 1);
         if (!info) {
             acc::copyout(eval__, w.get(), nev__);
@@ -1719,51 +1615,53 @@ class Eigensolver_cuda: public Eigensolver
 class Eigensolver_cuda: public Eigensolver
 {
   public:
-    inline bool is_parallel()
+    Eigensolver_cuda(memory_pool* mpd__)
+        : Eigensolver(ev_solver_t::cusolver, mpd__, false, memory_t::host_pinned, memory_t::device)
     {
-        return false;
     }
 };
 #endif
 
-inline std::unique_ptr<Eigensolver> Eigensolver_factory(ev_solver_t ev_solver_type__)
-{
-    Eigensolver* ptr;
-    switch (ev_solver_type__) {
-        case ev_solver_t::lapack: {
-            ptr = new Eigensolver_lapack();
-            break;
-        }
-        case ev_solver_t::scalapack: {
-            ptr = new Eigensolver_scalapack();
-            break;
-        }
-        case ev_solver_t::elpa1: {
-            ptr = new Eigensolver_elpa(1);
-            break;
-        }
-        case ev_solver_t::elpa2: {
-            ptr = new Eigensolver_elpa(2);
-            break;
-        }
-        case ev_solver_t::magma: {
-            ptr = new Eigensolver_magma();
-            break;
-        }
-        case ev_solver_t::magma_gpu: {
-            ptr = new Eigensolver_magma_gpu();
-            break;
-        }
-        case ev_solver_t::cusolver: {
-            ptr = new Eigensolver_cuda();
-            break;
-        }
-        default: {
-            TERMINATE("not implemented");
-        }
-    }
-    return std::unique_ptr<Eigensolver>(ptr);
-}
+//inline std::unique_ptr<Eigensolver> Eigensolver_factory(std::string name__, memory_pool* mpd__)
+//{
+//    std::transform(name__.begin(), name__.end(), name__.begin(), ::tolower);
+//
+//    Eigensolver* ptr;
+//    switch (get_ev_solver_t(name__)) {
+//        case ev_solver_t::lapack: {
+//            ptr = new Eigensolver_lapack();
+//            break;
+//        }
+//        case ev_solver_t::scalapack: {
+//            ptr = new Eigensolver_scalapack();
+//            break;
+//        }
+//        case ev_solver_t::elpa: {
+//            if (name__ == "elpa1") {
+//                ptr = new Eigensolver_elpa(1);
+//            } else {
+//                ptr = new Eigensolver_elpa(2);
+//            }
+//            break;
+//        }
+//        case ev_solver_t::magma: {
+//            ptr = new Eigensolver_magma();
+//            break;
+//        }
+//        case ev_solver_t::magma_gpu: {
+//            ptr = new Eigensolver_magma_gpu();
+//            break;
+//        }
+//        case ev_solver_t::cusolver: {
+//            ptr = new Eigensolver_cuda(mpd__);
+//            break;
+//        }
+//        default: {
+//            TERMINATE("not implemented");
+//        }
+//    }
+//    return std::unique_ptr<Eigensolver>(ptr);
+//}
 
 //== #ifdef __PLASMA
 //== extern "C" void plasma_zheevd_wrapper(int32_t matrix_size, void* a, int32_t lda, void* z,
diff --git a/src/linalg/eigensolver.cpp b/src/linalg/eigensolver.cpp
new file mode 100644
index 000000000..abc9bf381
--- /dev/null
+++ b/src/linalg/eigensolver.cpp
@@ -0,0 +1,68 @@
+// Copyright (c) 2013-2019 Anton Kozhevnikov, Thomas Schulthess
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
+// the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
+//    following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
+//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/** \file eigensolver.cpp
+ *
+ *  \brief Contains implementation of eigensolver factory.
+ */
+
+#include "eigensolver.hpp"
+#include "eigenproblem.hpp"
+
+std::unique_ptr<Eigensolver> Eigensolver_factory(std::string name__, memory_pool* mpd__)
+{
+    /* the label is lower-cased in place before it is translated to a solver type */
+    std::transform(name__.begin(), name__.end(), name__.begin(), ::tolower);
+
+    /* owning handle; remains empty unless one of the cases below creates a solver */
+    std::unique_ptr<Eigensolver> solver;
+
+    switch (get_ev_solver_t(name__)) {
+        case ev_solver_t::lapack: {
+            solver.reset(new Eigensolver_lapack());
+            break;
+        }
+        case ev_solver_t::scalapack: {
+            solver.reset(new Eigensolver_scalapack());
+            break;
+        }
+        case ev_solver_t::elpa: {
+            /* "elpa1" requests stage 1; every other label mapped to ELPA requests stage 2 */
+            solver.reset(new Eigensolver_elpa((name__ == "elpa1") ? 1 : 2));
+            break;
+        }
+        case ev_solver_t::magma: {
+            solver.reset(new Eigensolver_magma());
+            break;
+        }
+        case ev_solver_t::magma_gpu: {
+            solver.reset(new Eigensolver_magma_gpu());
+            break;
+        }
+        case ev_solver_t::cusolver: {
+            solver.reset(new Eigensolver_cuda(mpd__));
+            break;
+        }
+        default: {
+            TERMINATE("not implemented");
+        }
+    }
+    return solver;
+}
+
diff --git a/src/linalg/eigensolver.hpp b/src/linalg/eigensolver.hpp
new file mode 100644
index 000000000..7ac91c5df
--- /dev/null
+++ b/src/linalg/eigensolver.hpp
@@ -0,0 +1,213 @@
+// Copyright (c) 2013-2019 Anton Kozhevnikov, Thomas Schulthess
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
+// the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
+//    following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
+//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/** \file eigensolver.hpp
+ *
+ *  \brief Contains definition of the eigen-solver interface and declaration of the eigen-solver factory.
+ */
+
+#ifndef __EIGENSOLVER_HPP__
+#define __EIGENSOLVER_HPP__
+
+#include "SDDK/memory.hpp"
+#include "SDDK/dmatrix.hpp"
+
+using double_complex = std::complex<double>; ///< Shorthand for a double-precision complex number.
+
+/// Type of eigen-value solver (selected by a string label, see get_ev_solver_t()).
+enum class ev_solver_t
+{
+    /// LAPACK (label "lapack")
+    lapack,
+
+    /// ScaLAPACK (label "scalapack")
+    scalapack,
+
+    /// ELPA solver (labels "elpa1" and "elpa2")
+    elpa,
+
+    /// MAGMA with CPU pointers (label "magma")
+    magma,
+
+    /// MAGMA with GPU pointers (label "magma_gpu")
+    magma_gpu,
+
+    /// PLASMA (label "plasma")
+    plasma,
+
+    /// CUDA eigen-solver (label "cusolver")
+    cusolver
+};
+
+/// Map a solver label (compared in lower case) to the corresponding ev_solver_t value.
+inline ev_solver_t get_ev_solver_t(std::string name__)
+{
+    static const std::map<std::string, ev_solver_t> label_to_type = {
+        {"lapack", ev_solver_t::lapack}, {"scalapack", ev_solver_t::scalapack}, {"elpa1", ev_solver_t::elpa},
+        {"elpa2", ev_solver_t::elpa},   {"magma", ev_solver_t::magma},         {"magma_gpu", ev_solver_t::magma_gpu},
+        {"plasma", ev_solver_t::plasma}, {"cusolver", ev_solver_t::cusolver}};
+
+    std::transform(name__.begin(), name__.end(), name__.begin(), ::tolower);
+
+    auto const entry = label_to_type.find(name__);
+    if (entry == label_to_type.end()) {
+        std::stringstream s;
+        s << "wrong label of eigen-solver : " << name__;
+        TERMINATE(s);
+    }
+    return entry->second;
+}
+
+/// Common interface of all eigen-value solvers.
+/** The solve() overloads of this base class only terminate with a "solver is not implemented" message. */
+class Eigensolver
+{
+  protected:
+    /// Which kind of solver this object is.
+    ev_solver_t ev_solver_type_;
+    /// Message passed to TERMINATE() by the default solve() overloads.
+    const std::string error_msg_not_implemented = "solver is not implemented";
+    /// Memory pool for CPU work buffers.
+    sddk::memory_pool mp_h_;
+    /// Memory pool for CPU work buffers that live in pinned memory.
+    sddk::memory_pool mp_hp_;
+    /// Memory pool for GPU work buffers.
+    std::shared_ptr<sddk::memory_pool> mp_d_{nullptr};
+    /// True if the solver is MPI parallel.
+    bool is_parallel_{false};
+    /// Type of host memory needed by the solver.
+    /** Some solvers, for example MAGMA, require host pinned memory. */
+    sddk::memory_t host_memory_t_{sddk::memory_t::none};
+    /// Type of memory that holds the input data.
+    /** CPU solvers start from host memory, MAGMA can start from host or device memory, cuSolver starts from
+     *  device memory. */
+    sddk::memory_t data_memory_t_{sddk::memory_t::none};
+
+  public:
+    /// Constructor.
+    /** A non-null mpd__ is wrapped without ownership (no-op deleter); otherwise a host-memory pool is created. */
+    Eigensolver(ev_solver_t type__, sddk::memory_pool* mpd__, bool is_parallel__, sddk::memory_t host_memory_t__,
+                sddk::memory_t data_memory_t__)
+        : ev_solver_type_(type__)
+        , mp_h_(sddk::memory_t::host)
+        , mp_hp_(sddk::memory_t::host_pinned)
+        , is_parallel_(is_parallel__)
+        , host_memory_t_(host_memory_t__)
+        , data_memory_t_(data_memory_t__)
+    {
+        if (mpd__ == nullptr) {
+            mp_d_.reset(new sddk::memory_pool(sddk::memory_t::host));
+        } else {
+            mp_d_.reset(mpd__, [](sddk::memory_pool*) {});
+        }
+    }
+
+    /// Destructor.
+    virtual ~Eigensolver() = default;
+
+    /// Standard eigen-value problem, all eigen-pairs (real matrix).
+    virtual int solve(ftn_int matrix_size__, sddk::dmatrix<double>& A__, double* eval__, sddk::dmatrix<double>& Z__)
+    {
+        TERMINATE(error_msg_not_implemented);
+        return -1;
+    }
+    /// Standard eigen-value problem, all eigen-pairs (complex matrix).
+    virtual int solve(ftn_int matrix_size__, sddk::dmatrix<double_complex>& A__, double* eval__,
+                      sddk::dmatrix<double_complex>& Z__)
+    {
+        TERMINATE(error_msg_not_implemented);
+        return -1;
+    }
+
+    /// Standard eigen-value problem, N lowest eigen-pairs (real matrix).
+    virtual int solve(ftn_int matrix_size__, ftn_int nev__, sddk::dmatrix<double>& A__, double* eval__,
+                      sddk::dmatrix<double>& Z__)
+    {
+        TERMINATE(error_msg_not_implemented);
+        return -1;
+    }
+
+    /// Standard eigen-value problem, N lowest eigen-pairs (complex matrix).
+    virtual int solve(ftn_int matrix_size__, ftn_int nev__, sddk::dmatrix<double_complex>& A__, double* eval__,
+                      sddk::dmatrix<double_complex>& Z__)
+    {
+        TERMINATE(error_msg_not_implemented);
+        return -1;
+    }
+
+    /// Generalized eigen-value problem, all eigen-pairs (real matrices).
+    virtual int solve(ftn_int matrix_size__, sddk::dmatrix<double>& A__, sddk::dmatrix<double>& B__, double* eval__,
+                      sddk::dmatrix<double>& Z__)
+    {
+        TERMINATE(error_msg_not_implemented);
+        return -1;
+    }
+
+    /// Generalized eigen-value problem, all eigen-pairs (complex matrices).
+    virtual int solve(ftn_int matrix_size__, sddk::dmatrix<double_complex>& A__, sddk::dmatrix<double_complex>& B__,
+                      double* eval__, sddk::dmatrix<double_complex>& Z__)
+    {
+        TERMINATE(error_msg_not_implemented);
+        return -1;
+    }
+
+    /// Generalized eigen-value problem, N lowest eigen-pairs (real matrices).
+    virtual int solve(ftn_int matrix_size__, ftn_int nev__, sddk::dmatrix<double>& A__, sddk::dmatrix<double>& B__,
+                      double* eval__, sddk::dmatrix<double>& Z__)
+    {
+        TERMINATE(error_msg_not_implemented);
+        return -1;
+    }
+
+    /// Generalized eigen-value problem, N lowest eigen-pairs (complex matrices).
+    virtual int solve(ftn_int matrix_size__, ftn_int nev__, sddk::dmatrix<double_complex>& A__,
+                      sddk::dmatrix<double_complex>& B__, double* eval__, sddk::dmatrix<double_complex>& Z__)
+    {
+        TERMINATE(error_msg_not_implemented);
+        return -1;
+    }
+
+    /// True for an MPI-parallel solver, false for a sequential one.
+    bool is_parallel() const
+    {
+        return is_parallel_;
+    }
+
+    /// Type of host memory required by the solver.
+    sddk::memory_t host_memory_t() const
+    {
+        return host_memory_t_;
+    }
+
+    /// Type of memory in which the solver expects its input.
+    sddk::memory_t data_memory_t() const
+    {
+        return data_memory_t_;
+    }
+
+    /// Type of the eigen-solver.
+    ev_solver_t type() const
+    {
+        return ev_solver_type_;
+    }
+};
+
+/// Create the eigen-solver that corresponds to the label name__ (implemented in eigensolver.cpp).
+std::unique_ptr<Eigensolver> Eigensolver_factory(std::string name__, sddk::memory_pool* mpd__);
+#endif
diff --git a/src/linalg/elpa.h b/src/linalg/elpa.h
deleted file mode 100644
index 9927613b3..000000000
--- a/src/linalg/elpa.h
+++ /dev/null
@@ -1,695 +0,0 @@
-/** \file elpa.h
- *
- *  \brief Interface to ELPA library.
- */
-
-using elpa_t = void*;
-
-int elpa_init(int);
-int elpa_uninit(int*);
-
-elpa_t elpa_allocate(int *error);
-void elpa_deallocate(elpa_t handle, int *error);
-int elpa_setup(elpa_t handle);
-void elpa_set_integer(elpa_t handle, const char *name, int value, int *error);
-void elpa_eigenvectors_d(elpa_t handle, double *a, double *ev, double *q, int *error);
-void elpa_eigenvectors_f(elpa_t handle, float *a, float *ev, float *q, int *error);
-void elpa_eigenvectors_dc(elpa_t handle, std::complex<double> *a, double *ev, std::complex<double> *q, int *error);
-void elpa_eigenvectors_fc(elpa_t handle, std::complex<float> *a, float *ev, std::complex<float> *q, int *error);
-
- /*! \brief C interface to driver function "elpa_solve_evp_real_double"
- *
- *  \param  na                        Order of matrix a
- *  \param  nev                       Number of eigenvalues needed.
- *                                    The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                         Distributed matrix for which eigenvalues are to be computed.
- *                                    Distribution is like in Scalapack.
- *                                    The full matrix must be set (not only one half like in scalapack).
- *  \param lda                        Leading dimension of a
- *  \param ev(na)                     On output: eigenvalues of a, every processor gets the complete set
- *  \param q                          On output: Eigenvectors of a
- *                                    Distribution is like in Scalapack.
- *                                    Must be always dimensioned to the full size (corresponding to (na,na))
- *                                    even if only a part of the eigenvalues is needed.
- *  \param ldq                        Leading dimension of q
- *  \param nblk                       blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols                 distributed number of matrix columns
- *  \param mpi_comm_rows              MPI-Communicator for rows
- *  \param mpi_comm_cols              MPI-Communicator for columns
- *  \param mpi_coll_all               MPI communicator for the total processor set
- *  \param THIS_REAL_ELPA_KERNEL_API  specify used ELPA2 kernel via API
- *  \param useQR                      use QR decomposition 1 = yes, 0 = no
- *  \param useGPU                     use GPU (1=yes, 0=No)
- *  \param method                     choose whether to use ELPA 1stage or 2stage solver
- *                                    possible values: "1stage" => use ELPA 1stage solver
- *                                                      "2stage" => use ELPA 2stage solver
- *                                                       "auto"   => (at the moment) use ELPA 2stage solver
- *
- *  \result                     int: 1 if error occured, otherwise 0
- */
- int elpa_solve_evp_real_double(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR, int useGPU, char *method);
- /*! \brief C interface to driver function "elpa_solve_evp_real_single"
- *
- *  \param  na                        Order of matrix a
- *  \param  nev                       Number of eigenvalues needed.
- *                                    The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                         Distributed matrix for which eigenvalues are to be computed.
- *                                    Distribution is like in Scalapack.
- *                                    The full matrix must be set (not only one half like in scalapack).
- *  \param lda                        Leading dimension of a
- *  \param ev(na)                     On output: eigenvalues of a, every processor gets the complete set
- *  \param q                          On output: Eigenvectors of a
- *                                    Distribution is like in Scalapack.
- *                                    Must be always dimensioned to the full size (corresponding to (na,na))
- *                                    even if only a part of the eigenvalues is needed.
- *  \param ldq                        Leading dimension of q
- *  \param nblk                       blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols                 distributed number of matrix columns
- *  \param mpi_comm_rows              MPI-Communicator for rows
- *  \param mpi_comm_cols              MPI-Communicator for columns
- *  \param mpi_coll_all               MPI communicator for the total processor set
- *  \param THIS_REAL_ELPA_KERNEL_API  specify used ELPA2 kernel via API
- *  \param useQR                      use QR decomposition 1 = yes, 0 = no
- *  \param useGPU                     use GPU (1=yes, 0=No)
- *  \param method                     choose whether to use ELPA 1stage or 2stage solver
- *                                    possible values: "1stage" => use ELPA 1stage solver
- *                                                      "2stage" => use ELPA 2stage solver
- *                                                       "auto"   => (at the moment) use ELPA 2stage solver
- *
- *  \result                     int: 1 if error occured, otherwise 0
- */
- int elpa_solve_evp_real_single(int na, int nev, float *a, int lda, float *ev, float *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR, int useGPU, char *method);
- /*! \brief C interface to driver function "elpa_solve_evp_complex_double"
- *
- *  \param  na                           Order of matrix a
- *  \param  nev                          Number of eigenvalues needed.
- *                                       The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                            Distributed matrix for which eigenvalues are to be computed.
- *                                       Distribution is like in Scalapack.
- *                                       The full matrix must be set (not only one half like in scalapack).
- *  \param lda                           Leading dimension of a
- *  \param ev(na)                        On output: eigenvalues of a, every processor gets the complete set
- *  \param q                             On output: Eigenvectors of a
- *                                       Distribution is like in Scalapack.
- *                                       Must be always dimensioned to the full size (corresponding to (na,na))
- *                                       even if only a part of the eigenvalues is needed.
- *  \param ldq                           Leading dimension of q
- *  \param nblk                          blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols                    distributed number of matrix columns
- *  \param mpi_comm_rows                 MPI-Communicator for rows
- *  \param mpi_comm_cols                 MPI-Communicator for columns
- *  \param mpi_coll_all                  MPI communicator for the total processor set
- *  \param THIS_COMPLEX_ELPA_KERNEL_API  specify used ELPA2 kernel via API
- *  \param useGPU                        use GPU (1=yes, 0=No)
- *  \param method                        choose whether to use ELPA 1stage or 2stage solver
- *                                       possible values: "1stage" => use ELPA 1stage solver
- *                                                        "2stage" => use ELPA 2stage solver
- *                                                         "auto"   => (at the moment) use ELPA 2stage solver
- *
- *  \result                     int: 1 if error occured, otherwise 0
- */
- int elpa_solve_evp_complex_double(int na, int nev, std::complex<double> *a, int lda, double *ev, std::complex<double> *q, int ldq, int nblk, int matrixCols,
-                                   int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API, int useGPU, char *method);
- /*! \brief C interface to driver function "elpa_solve_evp_complex_single"
- *
- *  \param  na                           Order of matrix a
- *  \param  nev                          Number of eigenvalues needed.
- *                                       The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                            Distributed matrix for which eigenvalues are to be computed.
- *                                       Distribution is like in Scalapack.
- *                                       The full matrix must be set (not only one half like in scalapack).
- *  \param lda                           Leading dimension of a
- *  \param ev(na)                        On output: eigenvalues of a, every processor gets the complete set
- *  \param q                             On output: Eigenvectors of a
- *                                       Distribution is like in Scalapack.
- *                                       Must be always dimensioned to the full size (corresponding to (na,na))
- *                                       even if only a part of the eigenvalues is needed.
- *  \param ldq                           Leading dimension of q
- *  \param nblk                          blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols                    distributed number of matrix columns
- *  \param mpi_comm_rows                 MPI-Communicator for rows
- *  \param mpi_comm_cols                 MPI-Communicator for columns
- *  \param mpi_coll_all                  MPI communicator for the total processor set
- *  \param THIS_COMPLEX_ELPA_KERNEL_API  specify used ELPA2 kernel via API
- *  \param useGPU                        use GPU (1=yes, 0=No)
- *  \param method                        choose whether to use ELPA 1stage or 2stage solver
- *                                       possible values: "1stage" => use ELPA 1stage solver
- *                                                        "2stage" => use ELPA 2stage solver
- *                                                         "auto"   => (at the moment) use ELPA 2stage solver
- *
- *  \result                     int: 1 if error occured, otherwise 0
- */
- int elpa_solve_evp_complex_single(int na, int nev, std::complex<float> *a, int lda, float *ev, std::complex<float> *q, int ldq, int nblk, int matrixCols,
-                                   int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API, int useGPU, char *method);
- /*! \brief C old, deprecated interface, will be deleted. Use "elpa_get_communicators"
- *
- * \param mpi_comm_word    MPI global communicator (in)
- * \param my_prow          Row coordinate of the calling process in the process grid (in)
- * \param my_pcol          Column coordinate of the calling process in the process grid (in)
- * \param mpi_comm_rows    Communicator for communicating within rows of processes (out)
- * \result int             integer error value of mpi_comm_split function
- */
- int get_elpa_row_col_comms(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols);
- /*! \brief C old, deprecated interface, will be deleted. Use "elpa_get_communicators"
- *
- * \param mpi_comm_word    MPI global communicator (in)
- * \param my_prow          Row coordinate of the calling process in the process grid (in)
- * \param my_pcol          Column coordinate of the calling process in the process grid (in)
- * \param mpi_comm_rows    Communicator for communicating within rows of processes (out)
- * \result int             integer error value of mpi_comm_split function
- */
- int get_elpa_communicators(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols);
- /*! \brief C interface to create ELPA communicators
- *
- * \param mpi_comm_word    MPI global communicator (in)
- * \param my_prow          Row coordinate of the calling process in the process grid (in)
- * \param my_pcol          Column coordinate of the calling process in the process grid (in)
- * \param mpi_comm_rows    Communicator for communicating within rows of processes (out)
- * \result int             integer error value of mpi_comm_split function
- */
- int elpa_get_communicators(int mpi_comm_world, int my_prow, int my_pcol, int *mpi_comm_rows, int *mpi_comm_cols);
-  /*! \brief C interface to solve the double-precision real eigenvalue problem with 1-stage solver
-  *
- *  \param  na                   Order of matrix a
- *  \param  nev                  Number of eigenvalues needed.
- *                               The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                    Distributed matrix for which eigenvalues are to be computed.
- *                               Distribution is like in Scalapack.
- *                               The full matrix must be set (not only one half like in scalapack).
- *  \param lda                   Leading dimension of a
- *  \param ev(na)                On output: eigenvalues of a, every processor gets the complete set
- *  \param q                     On output: Eigenvectors of a
- *                               Distribution is like in Scalapack.
- *                               Must be always dimensioned to the full size (corresponding to (na,na))
- *                               even if only a part of the eigenvalues is needed.
- *  \param ldq                   Leading dimension of q
- *  \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols           distributed number of matrix columns
- *  \param mpi_comm_rows        MPI-Communicator for rows
- *  \param mpi_comm_cols        MPI-Communicator for columns
- *  \param useGPU               use GPU (1=yes, 0=No)
- *
- *  \result                     int: 1 if error occured, otherwise 0
-*/
- int elpa_solve_evp_real_1stage_double_precision(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int useGPU);
-  /*! \brief C interface to solve the single-precision real eigenvalue problem with 1-stage solver
-  *
- *  \param  na                   Order of matrix a
- *  \param  nev                  Number of eigenvalues needed.
- *                               The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                    Distributed matrix for which eigenvalues are to be computed.
- *                               Distribution is like in Scalapack.
- *                               The full matrix must be set (not only one half like in scalapack).
- *  \param lda                   Leading dimension of a
- *  \param ev(na)                On output: eigenvalues of a, every processor gets the complete set
- *  \param q                     On output: Eigenvectors of a
- *                               Distribution is like in Scalapack.
- *                               Must be always dimensioned to the full size (corresponding to (na,na))
- *                               even if only a part of the eigenvalues is needed.
- *  \param ldq                   Leading dimension of q
- *  \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols           distributed number of matrix columns
- *  \param mpi_comm_rows        MPI-Communicator for rows
- *  \param mpi_comm_cols        MPI-Communicator for columns
- *  \param useGPU               use GPU (1=yes, 0=No)
- *
- *  \result                     int: 1 if error occured, otherwise 0
-*/
- int elpa_solve_evp_real_1stage_single_precision(int na, int nev, float *a, int lda, float *ev, float *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int useGPU);
- /*! \brief C interface to solve the double-precision complex eigenvalue problem with 1-stage solver
- *
- *  \param  na                   Order of matrix a
- *  \param  nev                  Number of eigenvalues needed.
- *                               The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                    Distributed matrix for which eigenvalues are to be computed.
- *                               Distribution is like in Scalapack.
- *                               The full matrix must be set (not only one half like in scalapack).
- *  \param lda                   Leading dimension of a
- *  \param ev(na)                On output: eigenvalues of a, every processor gets the complete set
- *  \param q                     On output: Eigenvectors of a
- *                               Distribution is like in Scalapack.
- *                               Must be always dimensioned to the full size (corresponding to (na,na))
- *                               even if only a part of the eigenvalues is needed.
- *  \param ldq                   Leading dimension of q
- *  \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols           distributed number of matrix columns
- *  \param mpi_comm_rows        MPI-Communicator for rows
- *  \param mpi_comm_cols        MPI-Communicator for columns
- *  \param useGPU               use GPU (1=yes, 0=No)
- *
- *  \result                     int: 1 if error occured, otherwise 0
- */
- int elpa_solve_evp_complex_1stage_double_precision(int na, int nev, std::complex<double> *a, int lda, double *ev, std::complex<double> *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int useGPU);
- /*! \brief C interface to solve the single-precision complex eigenvalue problem with 1-stage solver
- *
- *  \param  na                   Order of matrix a
- *  \param  nev                  Number of eigenvalues needed.
- *                               The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                    Distributed matrix for which eigenvalues are to be computed.
- *                               Distribution is like in Scalapack.
- *                               The full matrix must be set (not only one half like in scalapack).
- *  \param lda                   Leading dimension of a
- *  \param ev(na)                On output: eigenvalues of a, every processor gets the complete set
- *  \param q                     On output: Eigenvectors of a
- *                               Distribution is like in Scalapack.
- *                               Must be always dimensioned to the full size (corresponding to (na,na))
- *                               even if only a part of the eigenvalues is needed.
- *  \param ldq                   Leading dimension of q
- *  \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols           distributed number of matrix columns
- *  \param mpi_comm_rows        MPI-Communicator for rows
- *  \param mpi_comm_cols        MPI-Communicator for columns
- *  \param useGPU               use GPU (1=yes, 0=No)
- *
- *  \result                     int: 1 if error occured, otherwise 0
- */
- int elpa_solve_evp_complex_1stage_single_precision(int na, int nev,  std::complex<float> *a, int lda, float *ev, std::complex<float> *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int useGPU);
- /*
- \brief  C interface to solve double-precision tridiagonal eigensystem with divide and conquer method
- \details
-
- *\param na                    Matrix dimension
- *\param nev                   number of eigenvalues/vectors to be computed
- *\param d                     array d(na) on input diagonal elements of tridiagonal matrix, on
- *                             output the eigenvalues in ascending order
- *\param e                     array e(na) on input subdiagonal elements of matrix, on exit destroyed
- *\param q                     on exit : matrix q(ldq,matrixCols) contains the eigenvectors
- *\param ldq                   leading dimension of matrix q
- *\param nblk                  blocksize of cyclic distribution, must be the same in both directions!
- *\param matrixCols            columns of matrix q
- *\param mpi_comm_rows         MPI communicator for rows
- *\param mpi_comm_cols         MPI communicator for columns
- *\param wantDebug             give more debug information if 1, else 0
- *\result success              int 1 on success, else 0
- */
- int elpa_solve_tridi_double(int na, int nev, double *d, double *e, double *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int wantDebug);
- /*
- \brief  C interface to solve single-precision tridiagonal eigensystem with divide and conquer method
- \details
-
- \param na                    Matrix dimension
- \param nev                   number of eigenvalues/vectors to be computed
- \param d                     array d(na) on input diagonal elements of tridiagonal matrix, on
-                              output the eigenvalues in ascending order
- \param e                     array e(na) on input subdiagonal elements of matrix, on exit destroyed
- \param q                     on exit : matrix q(ldq,matrixCols) contains the eigenvectors
- \param ldq                   leading dimension of matrix q
- \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
- \param matrixCols            columns of matrix q
- \param mpi_comm_rows         MPI communicator for rows
- \param mpi_comm_cols         MPI communicator for columns
- \param wantDebug             give more debug information if 1, else 0
- \result success              int 1 on success, else 0
- */
- int elpa_solve_tridi_single(int na, int nev, float *d, float *e, float *q, int ldq, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int wantDebug);
- /*
- \brief  C interface for elpa_mult_at_b_real_double: Performs C : = A**T * B for double-precision matrices
-         where   A is a square matrix (na,na) which is optionally upper or lower triangular
-                 B is a (na,ncb) matrix
-                 C is a (na,ncb) matrix where optionally only the upper or lower
-                   triangle may be computed
- \details
- \param  uplo_a               'U' if A is upper triangular
-                              'L' if A is lower triangular
-                              anything else if A is a full matrix
-                              Please note: This pertains to the original A (as set in the calling program)
-                                           whereas the transpose of A is used for calculations
-                              If uplo_a is 'U' or 'L', the other triangle is not used at all,
-                              i.e. it may contain arbitrary numbers
- \param uplo_c                'U' if only the upper diagonal part of C is needed
-                              'L' if only the upper diagonal part of C is needed
-                              anything else if the full matrix C is needed
-                              Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
-                                            written to a certain extent, i.e. one shouldn't rely on the content there!
- \param na                    Number of rows/columns of A, number of rows of B and C
- \param ncb                   Number of columns  of B and C
- \param a                     matrix a
- \param lda                   leading dimension of matrix a
- \param ldaCols               columns of matrix a
- \param b                     matrix b
- \param ldb                   leading dimension of matrix b
- \param ldbCols               columns of matrix b
- \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
- \param  mpi_comm_rows        MPI communicator for rows
- \param  mpi_comm_cols        MPI communicator for columns
- \param c                     matrix c
- \param ldc                   leading dimension of matrix c
- \param ldcCols               columns of matrix c
- \result success              int report success (1) or failure (0)
- */
- int elpa_mult_at_b_real_double(char uplo_a, char uplo_c, int na, int ncb, double *a, int lda, int ldaCols, double *b, int ldb, int ldbCols, int nlbk, int mpi_comm_rows, int mpi_comm_cols, double *c, int ldc, int ldcCols);
- /*
- \brief  C interface for elpa_mult_at_b_real_single: Performs C : = A**T * B for single-precision matrices
-         where   A is a square matrix (na,na) which is optionally upper or lower triangular
-                 B is a (na,ncb) matrix
-                 C is a (na,ncb) matrix where optionally only the upper or lower
-                   triangle may be computed
- \details
- \param  uplo_a               'U' if A is upper triangular
-                              'L' if A is lower triangular
-                              anything else if A is a full matrix
-                              Please note: This pertains to the original A (as set in the calling program)
-                                           whereas the transpose of A is used for calculations
-                              If uplo_a is 'U' or 'L', the other triangle is not used at all,
-                              i.e. it may contain arbitrary numbers
- \param uplo_c                'U' if only the upper diagonal part of C is needed
-                              'L' if only the upper diagonal part of C is needed
-                              anything else if the full matrix C is needed
-                              Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
-                                            written to a certain extent, i.e. one shouldn't rely on the content there!
- \param na                    Number of rows/columns of A, number of rows of B and C
- \param ncb                   Number of columns  of B and C
- \param a                     matrix a
- \param lda                   leading dimension of matrix a
- \param ldaCols               columns of matrix a
- \param b                     matrix b
- \param ldb                   leading dimension of matrix b
- \param ldbCols               columns of matrix b
- \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
- \param  mpi_comm_rows        MPI communicator for rows
- \param  mpi_comm_cols        MPI communicator for columns
- \param c                     matrix c
- \param ldc                   leading dimension of matrix c
- \result success              int report success (1) or failure (0)
- */
- int elpa_mult_at_b_real_single(char uplo_a, char uplo_c, int na, int ncb, float *a, int lda, int ldaCols, float *b, int ldb, int ldbCols, int nlbk, int mpi_comm_rows, int mpi_comm_cols, float *c, int ldc, int ldcCols);
- /*
- \brief C interface for elpa_mult_ah_b_complex_double: Performs C : = A**H * B for double-precision matrices
-         where   A is a square matrix (na,na) which is optionally upper or lower triangular
-                 B is a (na,ncb) matrix
-                 C is a (na,ncb) matrix where optionally only the upper or lower
-                   triangle may be computed
- \details
-
- \param  uplo_a               'U' if A is upper triangular
-                              'L' if A is lower triangular
-                              anything else if A is a full matrix
-                              Please note: This pertains to the original A (as set in the calling program)
-                                           whereas the transpose of A is used for calculations
-                              If uplo_a is 'U' or 'L', the other triangle is not used at all,
-                              i.e. it may contain arbitrary numbers
- \param uplo_c                'U' if only the upper diagonal part of C is needed
-                              'L' if only the upper diagonal part of C is needed
-                              anything else if the full matrix C is needed
-                              Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
-                                            written to a certain extent, i.e. one shouldn't rely on the content there!
- \param na                    Number of rows/columns of A, number of rows of B and C
- \param ncb                   Number of columns  of B and C
- \param a                     matrix a
- \param lda                   leading dimension of matrix a
- \param b                     matrix b
- \param ldb                   leading dimension of matrix b
- \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
- \param  mpi_comm_rows        MPI communicator for rows
- \param  mpi_comm_cols        MPI communicator for columns
- \param c                     matrix c
- \param ldc                   leading dimension of matrix c
- \result success              int reports success (1) or failure (0)
- */
- int elpa_mult_ah_b_complex_double(char uplo_a, char uplo_c, int na, int ncb, std::complex<double> *a, int lda, int ldaCols, std::complex<double> *b, int ldb, int ldbCols, int nblk, int mpi_comm_rows, int mpi_comm_cols, std::complex<double> *c, int ldc, int ldcCols);
- /*
- \brief C interface for elpa_mult_ah_b_complex_single: Performs C : = A**H * B for single-precision matrices
-         where   A is a square matrix (na,na) which is optionally upper or lower triangular
-                 B is a (na,ncb) matrix
-                 C is a (na,ncb) matrix where optionally only the upper or lower
-                   triangle may be computed
- \details
-
- \param  uplo_a               'U' if A is upper triangular
-                              'L' if A is lower triangular
-                              anything else if A is a full matrix
-                              Please note: This pertains to the original A (as set in the calling program)
-                                           whereas the transpose of A is used for calculations
-                              If uplo_a is 'U' or 'L', the other triangle is not used at all,
-                              i.e. it may contain arbitrary numbers
- \param uplo_c                'U' if only the upper diagonal part of C is needed
-                              'L' if only the upper diagonal part of C is needed
-                              anything else if the full matrix C is needed
-                              Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
-                                            written to a certain extent, i.e. one shouldn't rely on the content there!
- \param na                    Number of rows/columns of A, number of rows of B and C
- \param ncb                   Number of columns  of B and C
- \param a                     matrix a
- \param lda                   leading dimension of matrix a
- \param b                     matrix b
- \param ldb                   leading dimension of matrix b
- \param nblk                  blocksize of cyclic distribution, must be the same in both directions!
- \param  mpi_comm_rows        MPI communicator for rows
- \param  mpi_comm_cols        MPI communicator for columns
- \param c                     matrix c
- \param ldc                   leading dimension of matrix c
- \result success              int reports success (1) or failure (0)
- */
- int elpa_mult_ah_b_complex_single(char uplo_a, char uplo_c, int na, int ncb, std::complex<float> *a, int lda, int ldaCols, std::complex<float> *b, int ldb, int ldbCols, int nblk, int mpi_comm_rows, int mpi_comm_cols, std::complex<float> *c, int ldc, int ldcCols);
- /*
- \brief  C interface to elpa_invert_trm_real_double: Inverts a real double-precision upper triangular matrix
- \details
- \param  na                   Order of matrix
- \param  a(lda,matrixCols)    Distributed matrix which should be inverted
-                              Distribution is like in Scalapack.
-                              Only upper triangle is needs to be set.
-                              The lower triangle is not referenced.
- \param  lda                  Leading dimension of a
- \param                       matrixCols  local columns of matrix a
- \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
- \param  mpi_comm_rows        MPI communicator for rows
- \param  mpi_comm_cols        MPI communicator for columns
- \param wantDebug             int more debug information on failure if 1, else 0
- \result succes               int reports success (1) or failure (0)
- */
- int elpa_invert_trm_real_double(int na, double *a, int lda, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int wantDebug);
- /*
- \brief  C interface to elpa_invert_trm_real_single: Inverts a real single-precision upper triangular matrix
- \details
- \param  na                   Order of matrix
- \param  a(lda,matrixCols)    Distributed matrix which should be inverted
-                              Distribution is like in Scalapack.
-                              Only upper triangle is needs to be set.
-                              The lower triangle is not referenced.
- \param  lda                  Leading dimension of a
- \param                       matrixCols  local columns of matrix a
- \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
- \param  mpi_comm_rows        MPI communicator for rows
- \param  mpi_comm_cols        MPI communicator for columns
- \param wantDebug             int more debug information on failure if 1, else 0
- \result succes               int reports success (1) or failure (0)
- */
- int elpa_invert_trm_real_single(int na, double *a, int lda, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int wantDebug);
- /*
- \brief  C interface to elpa_invert_trm_complex_double: Inverts a double-precision complex upper triangular matrix
- \details
- \param  na                   Order of matrix
- \param  a(lda,matrixCols)    Distributed matrix which should be inverted
-                              Distribution is like in Scalapack.
-                              Only upper triangle is needs to be set.
-                              The lower triangle is not referenced.
- \param  lda                  Leading dimension of a
- \param                       matrixCols  local columns of matrix a
- \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
- \param  mpi_comm_rows        MPI communicator for rows
- \param  mpi_comm_cols        MPI communicator for columns
- \param wantDebug             int more debug information on failure if 1, else 0
- \result succes               int reports success (1) or failure (0)
- */
- int elpa_invert_trm_complex_double(int na, std::complex<double> *a, int lda, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int wantDebug);
- /*
- \brief  C interface to elpa_invert_trm_complex_single: Inverts a single-precision complex upper triangular matrix
- \details
- \param  na                   Order of matrix
- \param  a(lda,matrixCols)    Distributed matrix which should be inverted
-                              Distribution is like in Scalapack.
-                              Only upper triangle is needs to be set.
-                              The lower triangle is not referenced.
- \param  lda                  Leading dimension of a
- \param                       matrixCols  local columns of matrix a
- \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
- \param  mpi_comm_rows        MPI communicator for rows
- \param  mpi_comm_cols        MPI communicator for columns
- \param wantDebug             int more debug information on failure if 1, else 0
- \result succes               int reports success (1) or failure (0)
- */
- int elpa_invert_trm_complex_single(int na, std::complex<float> *a, int lda, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int wantDebug);
- /*
- \brief  elpa_cholesky_real_double: Cholesky factorization of a double-precision real symmetric matrix
- \details
-
- *\param  na                   Order of matrix
- *\param  a(lda,matrixCols)    Distributed matrix which should be factorized.
- *                             Distribution is like in Scalapack.
- *                             Only upper triangle is needs to be set.
- *                             On return, the upper triangle contains the Cholesky factor
- *                             and the lower triangle is set to 0.
- *\param  lda                  Leading dimension of a
- *\param  matrixCols           local columns of matrix a
- *\param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
- *\param  mpi_comm_rows        MPI communicator for rows
- *\param  mpi_comm_cols        MPI communicator for columns
- *\param wantDebug             int more debug information on failure if 1, else 0
- *\result succes               int reports success (1) or failure (0)
- */
- int elpa_cholesky_real_double(int na, double *a, int lda, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int wantDebug);
- /*
- \brief  elpa_cholesky_real_single: Cholesky factorization of a single-precision real symmetric matrix
- \details
-
- \param  na                   Order of matrix
- \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
-                              Distribution is like in Scalapack.
-                              Only upper triangle is needs to be set.
-                              On return, the upper triangle contains the Cholesky factor
-                              and the lower triangle is set to 0.
- \param  lda                  Leading dimension of a
- \param                       matrixCols  local columns of matrix a
- \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
- \param  mpi_comm_rows        MPI communicator for rows
- \param  mpi_comm_cols        MPI communicator for columns
- \param wantDebug             int more debug information on failure if 1, else 0
- \result succes               int reports success (1) or failure (0)
- */
- int elpa_cholesky_real_single(int na, float *a, int lda, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int wantDebug);
- /*
- \brief  C interface elpa_cholesky_complex_double: Cholesky factorization of a double-precision complex hermitian matrix
- \details
- \param  na                   Order of matrix
- \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
-                              Distribution is like in Scalapack.
-                              Only upper triangle is needs to be set.
-                              On return, the upper triangle contains the Cholesky factor
-                              and the lower triangle is set to 0.
- \param  lda                  Leading dimension of a
- \param                       matrixCols  local columns of matrix a
- \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
- \param  mpi_comm_rows        MPI communicator for rows
- \param  mpi_comm_cols        MPI communicator for columns
- \param wantDebug             int more debug information on failure, if 1, else 0
- \result succes               int reports success (1) or failure (0)
- */
- int elpa_cholesky_complex_double(int na, std::complex<double> *a, int lda, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int wantDebug);
- /*
- \brief  C interface elpa_cholesky_complex_single: Cholesky factorization of a single-precision complex hermitian matrix
- \details
- \param  na                   Order of matrix
- \param  a(lda,matrixCols)    Distributed matrix which should be factorized.
-                              Distribution is like in Scalapack.
-                              Only upper triangle is needs to be set.
-                              On return, the upper triangle contains the Cholesky factor
-                              and the lower triangle is set to 0.
- \param  lda                  Leading dimension of a
- \param                       matrixCols  local columns of matrix a
- \param  nblk                 blocksize of cyclic distribution, must be the same in both directions!
- \param  mpi_comm_rows        MPI communicator for rows
- \param  mpi_comm_cols        MPI communicator for columns
- \param wantDebug             int more debug information on failure, if 1, else 0
- \result succes               int reports success (1) or failure (0)
- */
- int elpa_cholesky_complex_single(int na, std::complex<float> *a, int lda, int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int wantDebug);
- /*! \brief C interface to solve the double-precision real eigenvalue problem with 2-stage solver
- *
- *  \param  na                        Order of matrix a
- *  \param  nev                       Number of eigenvalues needed.
- *                                    The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                         Distributed matrix for which eigenvalues are to be computed.
- *                                    Distribution is like in Scalapack.
- *                                    The full matrix must be set (not only one half like in scalapack).
- *  \param lda                        Leading dimension of a
- *  \param ev(na)                     On output: eigenvalues of a, every processor gets the complete set
- *  \param q                          On output: Eigenvectors of a
- *                                    Distribution is like in Scalapack.
- *                                    Must be always dimensioned to the full size (corresponding to (na,na))
- *                                    even if only a part of the eigenvalues is needed.
- *  \param ldq                        Leading dimension of q
- *  \param nblk                       blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols                 distributed number of matrix columns
- *  \param mpi_comm_rows              MPI-Communicator for rows
- *  \param mpi_comm_cols              MPI-Communicator for columns
- *  \param mpi_coll_all               MPI communicator for the total processor set
- *  \param THIS_REAL_ELPA_KERNEL_API  specify used ELPA2 kernel via API
- *  \param useQR                      use QR decomposition 1 = yes, 0 = no
- *  \param useGPU                     use GPU (1=yes, 0=No)
- *
- *  \result                     int: 1 if error occured, otherwise 0
- */
- int elpa_solve_evp_real_2stage_double_precision(int na, int nev, double *a, int lda, double *ev, double *q, int ldq, int nblk, 
- int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR, int useGPU);
- /*! \brief C interface to solve the single-precision real eigenvalue problem with 2-stage solver
- *
- *  \param  na                        Order of matrix a
- *  \param  nev                       Number of eigenvalues needed.
- *                                    The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                         Distributed matrix for which eigenvalues are to be computed.
- *                                    Distribution is like in Scalapack.
- *                                    The full matrix must be set (not only one half like in scalapack).
- *  \param lda                        Leading dimension of a
- *  \param ev(na)                     On output: eigenvalues of a, every processor gets the complete set
- *  \param q                          On output: Eigenvectors of a
- *                                    Distribution is like in Scalapack.
- *                                    Must be always dimensioned to the full size (corresponding to (na,na))
- *                                    even if only a part of the eigenvalues is needed.
- *  \param ldq                        Leading dimension of q
- *  \param nblk                       blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols                 distributed number of matrix columns
- *  \param mpi_comm_rows              MPI-Communicator for rows
- *  \param mpi_comm_cols              MPI-Communicator for columns
- *  \param mpi_coll_all               MPI communicator for the total processor set
- *  \param THIS_REAL_ELPA_KERNEL_API  specify used ELPA2 kernel via API
- *  \param useQR                      use QR decomposition 1 = yes, 0 = no
- *  \param useGPU                     use GPU (1=yes, 0=No)
- *
- *  \result                     int: 1 if error occured, otherwise 0
- */
- int elpa_solve_evp_real_2stage_single_precision(int na, int nev, float *a, int lda, float *ev, float *q, int ldq, int nblk, 
- int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR, int useGPU);
- /*! \brief C interface to solve the double-precision complex eigenvalue problem with 2-stage solver
- *
- *  \param  na                        Order of matrix a
- *  \param  nev                       Number of eigenvalues needed.
- *                                    The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                         Distributed matrix for which eigenvalues are to be computed.
- *                                    Distribution is like in Scalapack.
- *                                    The full matrix must be set (not only one half like in scalapack).
- *  \param lda                        Leading dimension of a
- *  \param ev(na)                     On output: eigenvalues of a, every processor gets the complete set
- *  \param q                          On output: Eigenvectors of a
- *                                    Distribution is like in Scalapack.
- *                                    Must be always dimensioned to the full size (corresponding to (na,na))
- *                                    even if only a part of the eigenvalues is needed.
- *  \param ldq                        Leading dimension of q
- *  \param nblk                       blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols                 distributed number of matrix columns
- *  \param mpi_comm_rows              MPI-Communicator for rows
- *  \param mpi_comm_cols              MPI-Communicator for columns
- *  \param mpi_coll_all               MPI communicator for the total processor set
- *  \param THIS_COMPLEX_ELPA_KERNEL_API  specify used ELPA2 kernel via API
- *  \param useGPU                     use GPU (1=yes, 0=No)
- *
- *  \result                     int: 1 if error occured, otherwise 0
- */
- int elpa_solve_evp_complex_2stage_double_precision(int na, int nev, std::complex<double> *a, int lda, double *ev, std::complex<double> *q, int ldq, 
- int nblk, int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API, int useGPU);
- /*! \brief C interface to solve the single-precision complex eigenvalue problem with 2-stage solver
- *
- *  \param  na                        Order of matrix a
- *  \param  nev                       Number of eigenvalues needed.
- *                                    The smallest nev eigenvalues/eigenvectors are calculated.
- *  \param  a                         Distributed matrix for which eigenvalues are to be computed.
- *                                    Distribution is like in Scalapack.
- *                                    The full matrix must be set (not only one half like in scalapack).
- *  \param lda                        Leading dimension of a
- *  \param ev(na)                     On output: eigenvalues of a, every processor gets the complete set
- *  \param q                          On output: Eigenvectors of a
- *                                    Distribution is like in Scalapack.
- *                                    Must be always dimensioned to the full size (corresponding to (na,na))
- *                                    even if only a part of the eigenvalues is needed.
- *  \param ldq                        Leading dimension of q
- *  \param nblk                       blocksize of cyclic distribution, must be the same in both directions!
- *  \param matrixCols                 distributed number of matrix columns
- *  \param mpi_comm_rows              MPI-Communicator for rows
- *  \param mpi_comm_cols              MPI-Communicator for columns
- *  \param mpi_coll_all               MPI communicator for the total processor set
- *  \param THIS_REAL_ELPA_KERNEL_API  specify used ELPA2 kernel via API
- *  \param useGPU                     use GPU (1=yes, 0=No)
- *
- *  \result                     int: 1 if error occured, otherwise 0
- */
- int elpa_solve_evp_complex_2stage_single_precision(int na, int nev, std::complex<float> *a, int lda, float *ev, std::complex<float> *q, int ldq, int nblk, 
- int matrixCols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API, int useGPU);
diff --git a/src/linalg/elpa.hpp b/src/linalg/elpa.hpp
new file mode 100644
index 000000000..90c40c0fb
--- /dev/null
+++ b/src/linalg/elpa.hpp
@@ -0,0 +1,53 @@
+// Copyright (c) 2013-2020 Anton Kozhevnikov, Thomas Schulthess
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without modification, are permitted provided that
+// the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
+//    following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions
+//    and the following disclaimer in the documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef __ELPA_HPP__
+#define __ELPA_HPP__
+
+/** \file elpa.hpp
+ *
+ *  \brief Interface to ELPA library: declares the handle types and includes the ELPA C headers with C linkage.
+ */
+
+extern "C" {
+/* elpa_struct is only forward-declared in this file; elpa_t is a pointer to that incomplete type. */
+struct elpa_struct;
+typedef struct elpa_struct* elpa_t;
+/* Same pattern for the autotune handle: pointer to a struct that is never defined in this file. */
+struct elpa_autotune_struct;
+typedef struct elpa_autotune_struct* elpa_autotune_t;
+/* `complex` is mapped to `_Complex` only while elpa_generated.h is read (presumably the header uses the C99 `complex` spelling -- TODO confirm). */
+#include <elpa/elpa_constants.h>
+#define complex _Complex
+#include <elpa/elpa_generated.h>
+#undef complex
+
+}
+
+// Disabled alternative, kept for reference: handles declared as void* and `complex` defined to nothing.
+//using elpa_t = void*;
+//using elpa_autotune_t = void*;
+//
+//extern "C" {
+//#define complex
+//#include <elpa/elpa_generated.h>
+//#undef complex
+//}
+
+#endif
diff --git a/src/linalg/linalg.hpp b/src/linalg/linalg.hpp
index 17914f546..53b830f20 100644
--- a/src/linalg/linalg.hpp
+++ b/src/linalg/linalg.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2013-2016 Anton Kozhevnikov, Thomas Schulthess
+// Copyright (c) 2013-2020 Anton Kozhevnikov, Thomas Schulthess
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without modification, are permitted provided that
@@ -27,7 +27,7 @@
 
 #include <stdint.h>
 #ifdef __GPU
-#include "gpu/gpublas_interface.hpp"
+#include "gpu/acc_blas.hpp"
 #endif
 #ifdef __MAGMA
 #include "gpu/magma.hpp"
@@ -225,7 +225,7 @@ inline void linalg::gemm<ftn_double>(char transa, char transb, ftn_int m, ftn_in
         }
         case linalg_t::gpublas: {
 #if defined(__GPU)
-            gpublas::dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, sid());
+            accblas::dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, sid());
 #else
             throw std::runtime_error("not compiled with GPU blas support!");
 #endif
@@ -233,7 +233,7 @@ inline void linalg::gemm<ftn_double>(char transa, char transb, ftn_int m, ftn_in
         }
         case linalg_t::cublasxt: {
 #if defined(__GPU) && defined(__CUDA)
-            gpublas::xt::dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+            accblas::xt::dgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 #else
             throw std::runtime_error("not compiled with cublasxt");
 #endif
@@ -268,7 +268,7 @@ inline void linalg::gemm<ftn_double_complex>(char transa, char transb, ftn_int m
         }
         case linalg_t::gpublas: {
 #if defined(__GPU)
-            gpublas::zgemm(transa, transb, m, n, k, reinterpret_cast<acc_complex_double_t const*>(alpha),
+            accblas::zgemm(transa, transb, m, n, k, reinterpret_cast<acc_complex_double_t const*>(alpha),
                           reinterpret_cast<acc_complex_double_t const*>(A), lda, reinterpret_cast<acc_complex_double_t const*>(B),
                           ldb, reinterpret_cast<acc_complex_double_t const*>(beta),
                           reinterpret_cast<acc_complex_double_t*>(C), ldc, sid());
@@ -280,7 +280,7 @@ inline void linalg::gemm<ftn_double_complex>(char transa, char transb, ftn_int m
         }
         case linalg_t::cublasxt: {
 #if defined(__GPU) && defined(__CUDA)
-            gpublas::xt::zgemm(transa, transb, m, n, k, reinterpret_cast<acc_complex_double_t const*>(alpha),
+            accblas::xt::zgemm(transa, transb, m, n, k, reinterpret_cast<acc_complex_double_t const*>(alpha),
                               reinterpret_cast<acc_complex_double_t const*>(A), lda,
                               reinterpret_cast<acc_complex_double_t const*>(B), ldb,
                               reinterpret_cast<acc_complex_double_t const*>(beta),
@@ -399,7 +399,7 @@ inline void linalg::ger<ftn_double>(ftn_int m, ftn_int n, ftn_double const* alph
         }
         case linalg_t::gpublas: {
 #if defined(__GPU)
-            gpublas::dger(m, n, alpha, x, incx, y, incy, A, lda, sid());
+            accblas::dger(m, n, alpha, x, incx, y, incy, A, lda, sid());
 #else
             throw std::runtime_error("not compiled with GPU blas support!");
 #endif
@@ -428,7 +428,7 @@ inline void linalg::trmm<ftn_double>(char side, char uplo, char transa, ftn_int
         }
         case  linalg_t::gpublas: {
 #if defined(__GPU)
-            gpublas::dtrmm(side, uplo, transa, 'N', m, n, alpha, A, lda, B, ldb, sid());
+            accblas::dtrmm(side, uplo, transa, 'N', m, n, alpha, A, lda, B, ldb, sid());
 #else
             throw std::runtime_error("not compiled with GPU blas support!");
 #endif
@@ -436,7 +436,7 @@ inline void linalg::trmm<ftn_double>(char side, char uplo, char transa, ftn_int
         }
         case linalg_t::cublasxt: {
 #if defined(__GPU) && defined(__CUDA)
-            gpublas::xt::dtrmm(side, uplo, transa, 'N', m, n, alpha, A, lda, B, ldb);
+            accblas::xt::dtrmm(side, uplo, transa, 'N', m, n, alpha, A, lda, B, ldb);
 #else
             throw std::runtime_error("not compiled with cublasxt");
 #endif
@@ -463,7 +463,7 @@ inline void linalg::trmm<ftn_double_complex>(char side, char uplo, char transa,
         }
         case  linalg_t::gpublas: {
 #if defined(__GPU)
-            gpublas::ztrmm(side, uplo, transa, 'N', m, n, reinterpret_cast<acc_complex_double_t const*>(alpha),
+            accblas::ztrmm(side, uplo, transa, 'N', m, n, reinterpret_cast<acc_complex_double_t const*>(alpha),
                           reinterpret_cast<acc_complex_double_t const*>(A), lda,
                           reinterpret_cast<acc_complex_double_t*>(B), ldb, sid());
 #else
@@ -473,7 +473,7 @@ inline void linalg::trmm<ftn_double_complex>(char side, char uplo, char transa,
         }
         case linalg_t::cublasxt: {
 #if defined(__GPU) && defined(__CUDA)
-            gpublas::xt::ztrmm(side, uplo, transa, 'N', m, n, reinterpret_cast<acc_complex_double_t const*>(alpha),
+            accblas::xt::ztrmm(side, uplo, transa, 'N', m, n, reinterpret_cast<acc_complex_double_t const*>(alpha),
                               reinterpret_cast<acc_complex_double_t const*>(A), lda, reinterpret_cast<acc_complex_double_t*>(B), ldb);
 #else
             throw std::runtime_error("not compiled with cublasxt");
diff --git a/src/nlcglib/adaptor.cpp b/src/nlcglib/adaptor.cpp
new file mode 100644
index 000000000..71353f050
--- /dev/null
+++ b/src/nlcglib/adaptor.cpp
@@ -0,0 +1,378 @@
+#ifdef __NLCGLIB
+#include <stdexcept>
+
+#include "adaptor.hpp"
+#include "apply_hamiltonian.hpp"
+#include "hamiltonian/local_operator.hpp"
+#include "hamiltonian/hamiltonian.hpp"
+#include "dft/energy.hpp"
+#include "SDDK/wf_inner.hpp"
+
+using namespace nlcglib;
+
+namespace sirius {
+
+/// Wrap the plane-wave coefficients of the local wave functions as an nlcglib matrix (pointers are passed on).
+/** One buffer is created per (local k-point, spin) and labelled by (global k-point index, spin). A memory of
+ *  memory_type::none selects ctx.preferred_memory_t(). Throws if device storage is requested but missing. */
+std::shared_ptr<Matrix> make_vector(const std::vector<std::shared_ptr<sddk::Wave_functions>>& wfct,
+                                    const Simulation_context& ctx,
+                                    const K_point_set& kset,
+                                    nlcglib::memory_type memory = nlcglib::memory_type::none)
+{
+    /* SIRIUS memory kind -> nlcglib memory kind; pinned host memory is reported as host memory */
+    const std::map<memory_t, nlcglib::memory_type> memtype = {{memory_t::device, nlcglib::memory_type::device},
+                                                              {memory_t::host, nlcglib::memory_type::host},
+                                                              {memory_t::host_pinned, nlcglib::memory_type::host}};
+    /* nlcglib memory kind -> SIRIUS memory kind; every key appears once so that the lookup is well defined */
+    const std::map<nlcglib::memory_type, memory_t> memtype_lookup = {{nlcglib::memory_type::none, memory_t::none},
+                                                                     {nlcglib::memory_type::device, memory_t::device},
+                                                                     {nlcglib::memory_type::host, memory_t::host}};
+
+    memory_t target_memory = memtype_lookup.at(memory);
+    if (target_memory == memory_t::none) {
+        target_memory = ctx.preferred_memory_t();
+    }
+
+    std::vector<Matrix::buffer_t> data;
+    std::vector<std::pair<int, int>> kpoint_indices;
+    int num_spins                     = ctx.num_spins();
+    int nb                            = ctx.num_bands();
+    for (auto i = 0u; i < wfct.size(); ++i) {
+        auto gidk = kset.spl_num_kpoints(i); // global k-point index
+        for (int ispn = 0; ispn < num_spins; ++ispn) {
+            auto& array = wfct[i]->pw_coeffs(ispn).prime();
+            int lda              = array.size(0);
+            MPI_Comm comm        = wfct[i]->comm().mpi_comm();
+            /* the buffer points straight into device memory, which therefore must already exist */
+            if (is_device_memory(target_memory) && !array.on_device()) {
+                throw std::runtime_error("Error: expected device storage, but got nullptr");
+            }
+            kpoint_indices.emplace_back(gidk, ispn);
+            data.emplace_back(std::array<int, 2>{1, lda},   /* stride */
+                              std::array<int, 2>{lda, nb},  /* size */
+                              array.at(target_memory), /* pointer */
+                              memtype.at(target_memory),
+                              comm /* mpi communicator */);
+        }
+    }
+    return std::make_shared<Matrix>(std::move(data), std::move(kpoint_indices), kset.comm().mpi_comm());
+} // make_vector
+
+Matrix::buffer_t Matrix::get(int i)
+{
+    return data[i];
+}
+
+const Matrix::buffer_t Matrix::get(int i) const
+{
+    return data[i];
+}
+
+/// Set up per-k-point storage: hphis and sphis are newly allocated, cphis refer to the k-point wave functions.
+Energy::Energy(K_point_set& kset, Density& density, Potential& potential)
+    : kset(kset)
+    , density(density)
+    , potential(potential)
+{
+    // initialize hphi and sphi and allocate (device) memory
+    int nk    = kset.spl_num_kpoints().local_size();
+    auto& ctx = kset.ctx();
+    hphis.resize(nk);
+    sphis.resize(nk);
+    cphis.resize(nk);
+    for (int i = 0; i < nk; ++i) {
+        auto global_kpoint_index = kset.spl_num_kpoints(i);
+        auto& kp = *kset[global_kpoint_index];
+        int num_wf                        = ctx.num_bands();
+        sddk::memory_t preferred_memory_t = ctx.preferred_memory_t();
+        int num_spins                     = ctx.num_spins();
+        // new wave functions that receive the result of apply_hamiltonian() in compute()
+        hphis[i] = std::make_shared<sddk::Wave_functions>(kp.gkvec_partition(), num_wf, preferred_memory_t, num_spins);
+        hphis[i]->allocate(sddk::spin_range(num_spins), ctx.preferred_memory_t());
+        sphis[i] = std::make_shared<sddk::Wave_functions>(kp.gkvec_partition(), num_wf, preferred_memory_t, num_spins);
+        sphis[i]->allocate(sddk::spin_range(num_spins), ctx.preferred_memory_t());
+        cphis[i] = kp.spinor_wave_functions_ptr();
+        // additionally take plane-wave coefficient storage from the device memory pool (num_sc components)
+        if (is_device_memory(ctx.preferred_memory_t())) {
+            const int num_sc = (ctx.num_mag_dims() == 3) ? 2 : 1;
+            auto& mpd = ctx.mem_pool(memory_t::device);
+            for (int ispn = 0; ispn < num_sc; ispn++) {
+                hphis[i]->pw_coeffs(ispn).allocate(mpd);
+                sphis[i]->pw_coeffs(ispn).allocate(mpd);
+            }
+        }
+    }
+    // cphis[i] are not allocated here; compute() allocates and copies them to the device when needed
+}
+
+/// Recompute density, potential, H|psi>, S|psi>, the band energies and the total energy for the current state.
+/** Sequence:
+ *   1. generate the density from the k-point set and the potential from the density (symmetrized on request);
+ *   2. if the preferred memory is device memory, allocate the k-point wave functions there and copy them over;
+ *   3. call apply_hamiltonian() for every local k-point, which fills hphis[i] and sphis[i];
+ *   4. store Re<psi_j|hphi_j> as energy of band j, synchronize the band energies and evaluate etot. */
+void Energy::compute()
+{
+    auto& ctx = kset.ctx();
+    int num_spins = ctx.num_spins();
+    int num_bands = ctx.num_bands();
+    int nk = kset.spl_num_kpoints().local_size();
+
+    /* rebuild the density from the k-point set */
+    density.generate(kset, true /* add core */, false /* transform to rg */);
+
+    if (ctx.use_symmetry()) {
+        density.symmetrize();
+        density.symmetrize_density_matrix();
+    }
+
+    /* NOTE(review): the argument 1 presumably selects the transform to the real-space grid -- confirm */
+    density.fft_transform(1);
+    /* rebuild the potential from the updated density */
+    potential.generate(density);
+
+    if (ctx.use_symmetry()) {
+        potential.symmetrize();
+    }
+    potential.fft_transform(1);
+
+
+    /* compute H@X and new band energies */
+    memory_t mem{memory_t::host};
+    linalg_t la{linalg_t::blas};
+    if (ctx.processing_unit() == device_t::GPU) {
+        mem = memory_t::device;
+        la  = linalg_t::gpublas;
+    }
+    auto H0 = Hamiltonian0(potential);
+    // apply Hamiltonian
+    for (int i = 0; i < nk; ++i) {
+        auto& kp = *kset[kset.spl_num_kpoints(i)];
+        /* NOTE(review): band_energies is never read or written below -- candidate for removal */
+        std::vector<double> band_energies(num_bands);
+
+        if (is_device_memory(ctx.preferred_memory_t())) {
+            auto& mpd        = ctx.mem_pool(memory_t::device);
+            for (int ispn = 0; ispn < num_spins; ispn++) {
+                /* device storage for the coefficients is taken from the device memory pool */
+                cphis[i]->pw_coeffs(ispn).allocate(mpd);
+                // copy to device
+                int num_wf = cphis[i]->num_wf();
+                cphis[i]->pw_coeffs(ispn).copy_to(memory_t::device, 0, num_wf);
+            }
+        }
+
+        /* the wave functions copied above must be the very object that the Hamiltonian is applied to */
+        assert(cphis[i] == kp.spinor_wave_functions_ptr());
+        apply_hamiltonian(H0, kp, *hphis[i], kp.spinor_wave_functions(), sphis[i]);
+        // compute band energies
+        for (int ispn = 0; ispn < num_spins; ++ispn) {
+            for (int jj = 0; jj < num_bands; ++jj) {
+                /* NOTE(review): a 1x1 matrix is created and device-allocated for every single band, also when */
+                /* mem is host memory -- consider one allocation outside the loops, guarded by the memory type  */
+                dmatrix<std::complex<double>> dmat(1, 1, memory_t::host);
+                dmat.allocate(memory_t::device);
+                sddk::inner(mem, la, ispn,
+                            /* bra */ kp.spinor_wave_functions(), jj, 1,
+                            /* ket */ *hphis[i], jj, 1,
+                            /* out */ dmat, 0, 0);
+                // deal with memory...
+                // assert(std::abs(dmat(0, 0).imag()) < 1e-10);
+                kp.band_energy(jj, ispn, dmat(0, 0).real());
+            }
+        }
+    }
+    kset.sync_band_energies();
+
+    // evaluate total energy
+    double eewald = ewald_energy(ctx, ctx.gvec(), ctx.unit_cell());
+    this->etot    = total_energy(ctx, kset, density, potential, eewald);
+}
+
+
+void Energy::set_occupation_numbers(const std::vector<std::vector<double>>& fn)
+{
+    auto nk      = kset.spl_num_kpoints().local_size();
+    const int ns = kset.ctx().num_spins();
+    if (nk * ns != int(fn.size())) {
+        throw std::runtime_error("set_occupation_numbers: wrong number of k-points");
+    }
+
+    for (auto i = 0u; i < fn.size(); ++i) {
+        int ik   = i / ns;
+        int ispn = i % ns;
+        auto& kp = *kset[kset.spl_num_kpoints(ik)];
+        // BEWARE: nothing is allocated, it must be done outside.
+        for (auto j = 0u; j < fn[i].size(); ++j) {
+            kp.band_occupancy(j, ispn, fn[i][j]);
+        }
+    }
+}
+
+int Energy::occupancy()
+{
+    return kset.ctx().max_occupancy();
+}
+
+int Energy::nelectrons()
+{
+    return kset.unit_cell().num_electrons();
+}
+
+std::shared_ptr<nlcglib::MatrixBaseZ> Energy::get_hphi()
+{
+    return make_vector(this->hphis, this->kset.ctx(), this->kset);
+}
+
+std::shared_ptr<nlcglib::MatrixBaseZ> Energy::get_sphi()
+{
+    return make_vector(this->sphis, this->kset.ctx(), this->kset);
+}
+
+std::shared_ptr<nlcglib::MatrixBaseZ> Energy::get_C(nlcglib::memory_type memory = nlcglib::memory_type::none)
+{
+    return make_vector(this->cphis, this->kset.ctx(), this->kset, memory);
+}
+
+/// Collect the band occupancies of the local k-points, one vector per (k-point, spin); containers are moved out.
+std::shared_ptr<nlcglib::VectorBaseZ> Energy::get_fn()
+{
+    auto nk      = kset.spl_num_kpoints().local_size();
+    const int ns = kset.ctx().num_spins();
+    int nbands = kset.ctx().num_bands();
+    std::vector<std::vector<double>> fn;
+    std::vector<std::pair<int, int>> kindices;
+    for (int ik = 0; ik < nk; ++ik) {
+        auto gidk = kset.spl_num_kpoints(ik); // global k-point index
+        auto& kp = *kset[gidk];
+        for (int ispn = 0; ispn < ns; ++ispn) {
+            std::vector<double> fn_local(nbands);
+            for (int i = 0; i < nbands; ++i) {
+                fn_local[i] = kp.band_occupancy(i, ispn);
+            }
+            fn.push_back(std::move(fn_local));
+            kindices.emplace_back(gidk, ispn);
+        }
+    }
+    return std::make_shared<Array1d>(std::move(fn), std::move(kindices), kset.comm().mpi_comm());
+}
+
+/// Set the band occupancies of the local k-points from fn and synchronize them over the k-point set.
+/** fn[ik * ns + ispn][i] is the occupancy of band i for local k-point ik and spin ispn; sizes are asserted. */
+void Energy::set_fn(const std::vector<std::vector<double>>& fn)
+{
+    auto nk      = kset.spl_num_kpoints().local_size();
+    const int ns = kset.ctx().num_spins();
+    const int nbands   = kset.ctx().num_bands();
+    /* max_occ is only read inside assert(), so it is guarded by NDEBUG, the macro that switches assert() */
+    #ifndef NDEBUG
+    const double max_occ = ns == 1 ? 2.0 : 1.0;
+    #endif
+
+    assert(static_cast<int>(fn.size()) == nk*ns);
+    for (int ik = 0; ik < nk; ++ik) {
+        auto& kp = *kset[kset.spl_num_kpoints(ik)]; // k-point addressed by its global index
+        for (int ispn = 0; ispn < ns; ++ispn) {
+            const auto& fn_loc = fn[ik * ns + ispn];
+            assert(static_cast<int>(fn_loc.size()) == nbands);
+            for (int i = 0; i < nbands; ++i) {
+                assert(fn_loc[i] >= 0 && fn_loc[i] <= max_occ);
+                kp.band_occupancy(i, ispn, fn_loc[i]);
+            }
+        }
+    }
+    kset.sync_band_occupancies();
+}
+
+/// Collect the band energies of the local k-points, one vector per (k-point, spin); containers are moved out.
+std::shared_ptr<nlcglib::VectorBaseZ> Energy::get_ek()
+{
+    auto nk      = kset.spl_num_kpoints().local_size();
+    const int ns = kset.ctx().num_spins();
+    int nbands   = kset.ctx().num_bands();
+    std::vector<std::vector<double>> ek;
+    std::vector<std::pair<int, int>> kindices;
+    for (int ik = 0; ik < nk; ++ik) {
+        auto gidk = kset.spl_num_kpoints(ik); // global k-point index
+        auto& kp  = *kset[gidk];
+        for (int ispn = 0; ispn < ns; ++ispn) {
+            std::vector<double> ek_local(nbands);
+            for (int i = 0; i < nbands; ++i) {
+                ek_local[i] = kp.band_energy(i, ispn);
+            }
+            ek.push_back(std::move(ek_local));
+            kindices.emplace_back(gidk, ispn);
+        }
+    }
+    return std::make_shared<Array1d>(std::move(ek), std::move(kindices), kset.comm().mpi_comm());
+}
+
+/// Collect gkvec_cart(i).length() per (k-point, spin); despite the name no kinetic energy is formed here.
+std::shared_ptr<nlcglib::VectorBaseZ> Energy::get_gkvec_ekin()
+{
+    auto nk      = kset.spl_num_kpoints().local_size();
+    const int ns = kset.ctx().num_spins();
+    std::vector<std::vector<double>> gkvec_cart;
+    std::vector<std::pair<int, int>> kindices;
+    for (int ik = 0; ik < nk; ++ik) {
+        auto gidk   = kset.spl_num_kpoints(ik); // global k-point index
+        auto& gkvec = (*kset[gidk]).gkvec();
+        /* NOTE(review): count() entries are read with index_domain_t::global indices -- confirm if distributed */
+        for (int ispn = 0; ispn < ns; ++ispn) {
+            int gkvec_count = gkvec.count();
+            std::vector<double> gkvec_local(gkvec_count);
+            for (int i = 0; i < gkvec_count; ++i) {
+                gkvec_local[i] = gkvec.gkvec_cart<index_domain_t::global>(i).length();
+            }
+            gkvec_cart.push_back(std::move(gkvec_local));
+            kindices.emplace_back(gidk, ispn);
+        }
+    }
+    return std::make_shared<Array1d>(std::move(gkvec_cart), std::move(kindices), kset.comm().mpi_comm());
+}
+
+/// Collect the weight of every local k-point, repeated for each spin index; containers are moved out.
+std::shared_ptr<nlcglib::ScalarBaseZ> Energy::get_kpoint_weights()
+{
+    auto nk = kset.spl_num_kpoints().local_size();
+    const int ns = kset.ctx().num_spins();
+    std::vector<double> weights;
+    std::vector<std::pair<int, int>> kindices;
+    for (int ik = 0; ik < nk; ++ik) {
+        auto gidk = kset.spl_num_kpoints(ik); // global k-point index
+        auto& kp  = *kset[gidk];
+
+        // also return weights for every spin index
+        for (int ispn = 0; ispn < ns; ++ispn) {
+            weights.push_back(kp.weight());
+            kindices.emplace_back(gidk, ispn);
+        }
+    }
+    return std::make_shared<Scalar>(std::move(weights), std::move(kindices), kset.comm().mpi_comm());
+}
+
+double Energy::get_total_energy()
+{
+    return etot;
+}
+
+void Energy::set_wfct(nlcglib::MatrixBaseZ& vector)
+{
+    throw std::runtime_error("not implemented.");
+}
+
+Array1d::buffer_t Array1d::get(int i)
+{
+    // call 1d constructor
+    return buffer_t(data[i].size(), data[i].data(), nlcglib::memory_type::host);
+}
+
+const Array1d::buffer_t Array1d::get(int i) const
+{
+    // call 1d constructor
+    return buffer_t(data[i].size(), const_cast<double*>(data[i].data()), nlcglib::memory_type::host);
+}
+
+} // namespace sirius
+#endif
diff --git a/src/nlcglib/adaptor.hpp b/src/nlcglib/adaptor.hpp
new file mode 100644
index 000000000..2844af9e3
--- /dev/null
+++ b/src/nlcglib/adaptor.hpp
@@ -0,0 +1,200 @@
+#ifndef NLCGLIB_ADAPTOR_H
+#define NLCGLIB_ADAPTOR_H
+
+#include <memory>
+#include <nlcglib/interface.hpp>
+#include <cmath>
+
+#include "k_point/k_point_set.hpp"
+#include "density/density.hpp"
+#include "potential/potential.hpp"
+#include "SDDK/wave_functions.hpp"
+
+namespace sirius {
+
+/// nlcglib matrix interface on top of a list of buffers, one buffer per entry of indices.
+class Matrix : public nlcglib::MatrixBaseZ
+{
+  public:
+    Matrix(const std::vector<buffer_t>& data, const std::vector<kindex_t>& indices, MPI_Comm mpi_comm = MPI_COMM_SELF)
+        : data(data)
+        , indices(indices)
+        , mpi_comm(mpi_comm)
+    {
+    }
+
+    Matrix(std::vector<buffer_t>&& data, std::vector<kindex_t>&& indices, MPI_Comm mpi_comm = MPI_COMM_SELF)
+        : data{std::move(data)}
+        , indices{std::move(indices)}
+        , mpi_comm(mpi_comm)
+    { /* the arguments are rvalue references, not forwarding references, hence std::move */
+    }
+
+    buffer_t get(int i) override;
+    const buffer_t get(int i) const override;
+
+    int size() const override
+    {
+        return data.size();
+    }
+
+    MPI_Comm mpicomm(int i) const override
+    {
+        return data[i].mpi_comm;
+    }
+
+    MPI_Comm mpicomm() const override
+    {
+        return mpi_comm;
+    }
+
+    kindex_t kpoint_index(int i) const override
+    {
+        return indices[i];
+    }
+
+  private:
+    std::vector<buffer_t> data;
+    std::vector<kindex_t> indices;
+    MPI_Comm mpi_comm;
+};
+
+/// nlcglib vector interface over one std::vector<double> per entry of indices. TODO: Array1d owns data...
+class Array1d : public nlcglib::VectorBaseZ
+{
+  public:
+    Array1d(const std::vector<std::vector<double>>& data, const std::vector<kindex_t>& indices, MPI_Comm mpi_comm = MPI_COMM_SELF)
+        : data(data)
+        , indices(indices)
+        , mpi_comm(mpi_comm)
+    {
+    }
+
+    Array1d(std::vector<std::vector<double>>&& data, std::vector<kindex_t>&& indices, MPI_Comm mpi_comm = MPI_COMM_SELF)
+        : data{std::move(data)}
+        , indices{std::move(indices)}
+        , mpi_comm(mpi_comm)
+    {
+    }
+
+    buffer_t get(int i) override;
+    const buffer_t get(int i) const override;
+
+    int size() const override
+    {
+        return data.size();
+    }
+
+    MPI_Comm mpicomm(int i) const override
+    {
+        // this object is never distributed
+        return MPI_COMM_SELF;
+    }
+
+    MPI_Comm mpicomm() const override
+    {
+        return mpi_comm;
+    }
+
+    kindex_t kpoint_index(int i) const override
+    {
+        assert(i < static_cast<int>(indices.size()));
+        return indices[i];
+    }
+
+  private:
+    std::vector<std::vector<double>> data;
+    std::vector<kindex_t> indices;
+    MPI_Comm mpi_comm;
+};
+
+/// nlcglib scalar interface: one double per entry of indices, stored by value.
+class Scalar : public nlcglib::ScalarBaseZ
+{
+  public:
+    Scalar(const std::vector<double>& data__, const std::vector<kindex_t>& indices__,
+           MPI_Comm mpi_comm = MPI_COMM_SELF)
+        : data(data__)
+        , indices(indices__)
+        , mpi_comm(mpi_comm)
+    {
+    }
+
+    Scalar(std::vector<double>&& data__, std::vector<kindex_t>&& indices__, MPI_Comm mpi_comm = MPI_COMM_SELF)
+        : data{std::move(data__)}
+        , indices{std::move(indices__)}
+        , mpi_comm(mpi_comm)
+    {
+    }
+
+    buffer_t get(int i) override
+    {
+        return data[i];
+    }
+
+    const buffer_t get(int i) const override
+    {
+        return data[i];
+    }
+
+    int size() const override
+    {
+        return data.size();
+    }
+
+    MPI_Comm mpicomm(int i) const override
+    {
+        // this object is never distributed
+        return MPI_COMM_SELF;
+    }
+
+    MPI_Comm mpicomm() const override
+    {
+        return mpi_comm;
+    }
+
+    kindex_t kpoint_index(int i) const override
+    {
+        return indices[i];
+    }
+
+  private:
+    std::vector<double> data;
+    std::vector<kindex_t> indices;
+    MPI_Comm mpi_comm;
+};
+
+/// Kohn-Sham energy
+class Energy : public nlcglib::EnergyBase
+{
+  public:
+    Energy(K_point_set& kset, Density& density, Potential& potential);
+
+    void set_occupation_numbers(const std::vector<std::vector<double>>& fn) override;
+    void set_wfct(nlcglib::MatrixBaseZ& vector) override;
+    int nelectrons() override;
+    int occupancy() override;
+    void compute() override;
+    double get_total_energy() override;
+    std::shared_ptr<nlcglib::MatrixBaseZ> get_hphi() override;
+    std::shared_ptr<nlcglib::MatrixBaseZ> get_sphi() override;
+    std::shared_ptr<nlcglib::MatrixBaseZ> get_C(nlcglib::memory_type) override;
+    std::shared_ptr<nlcglib::VectorBaseZ> get_fn() override;
+    void set_fn(const std::vector<std::vector<double>>& fn) override;
+    std::shared_ptr<nlcglib::VectorBaseZ> get_ek() override;
+    std::shared_ptr<nlcglib::VectorBaseZ> get_gkvec_ekin() override;
+    std::shared_ptr<nlcglib::ScalarBaseZ> get_kpoint_weights() override;
+
+  private:
+    K_point_set& kset;
+    Density& density;
+    Potential& potential;
+    std::vector<std::shared_ptr<sddk::Wave_functions>> hphis;
+    std::vector<std::shared_ptr<sddk::Wave_functions>> sphis;
+    std::vector<std::shared_ptr<sddk::Wave_functions>> cphis;
+    double etot{std::nan("1")};
+};
+
+} // namespace sirius
+
+#endif /* NLCGLIB_ADAPTOR_H */
diff --git a/src/nlcglib/apply_hamiltonian.hpp b/src/nlcglib/apply_hamiltonian.hpp
new file mode 100644
index 000000000..9b409f041
--- /dev/null
+++ b/src/nlcglib/apply_hamiltonian.hpp
@@ -0,0 +1,60 @@
+#ifndef APPLY_HAMILTONIAN_H
+#define APPLY_HAMILTONIAN_H
+
+#include "potential/potential.hpp"
+#include "hamiltonian/hamiltonian.hpp"
+#include "density/density.hpp"
+#include "SDDK/wave_functions.hpp"
+#include <memory>
+#include <complex>
+
+namespace sirius {
+
+/// Apply the Hamiltonian (and S-operator) of one k-point to all wave functions of wf.
+/** \param [in]  H0      Object from which the k-point Hamiltonian is obtained as H0(kp).
+ *  \param [in]  kp      K-point for which the operators are applied.
+ *  \param [out] wf_out  Result of the H application; must match wf in num_wf() and num_sc().
+ *  \param [in]  wf      Input wave functions.
+ *  \param [out] swf     Result of the S application; its raw pointer is forwarded to apply_h_s().
+ *
+ *  Throws std::runtime_error if wf and wf_out differ in the number of wave functions or spin components.
+ *  \note The exception text refers to "Hamiltonian::apply_ref (python bindings)"; it is kept unchanged.
+ *
+ *  No allocation and no host <-> device copy is done here; the caller has to provide all wave functions
+ *  in the memory space in which the Hamiltonian is applied.
+ *
+ *  The definition lives in a header, so the function is inline: otherwise every translation unit that
+ *  includes this file would emit its own external definition and the link step would fail. */
+inline void apply_hamiltonian(Hamiltonian0& H0, K_point& kp, Wave_functions& wf_out, Wave_functions& wf,
+                              std::shared_ptr<Wave_functions>& swf)
+{
+    /////////////////////////////////////////////////////////////
+    // // TODO: Hubbard needs manual call to copy to device // //
+    /////////////////////////////////////////////////////////////
+
+    int num_wf = wf.num_wf();
+    int num_sc = wf.num_sc();
+    /* input and output must have the same layout */
+    if (num_wf != wf_out.num_wf() || wf_out.num_sc() != num_sc) {
+        throw std::runtime_error("Hamiltonian::apply_ref (python bindings): num_sc or num_wf do not match");
+    }
+
+    /* Hamiltonian for this particular k-point */
+    auto H    = H0(kp);
+    auto& ctx = H0.ctx();
+
+    /* apply H to all wave functions */
+    /* presumably N is the first wave function and n the number of wave functions to process -- confirm */
+    int N = 0;
+    int n = num_wf;
+    for (int ispn_step = 0; ispn_step < ctx.num_spin_dims(); ispn_step++) {
+        // spin_range: 2 for non-collinear magnetism, otherwise ispn_step
+        auto spin_range = sddk::spin_range((ctx.num_mag_dims() == 3) ? 2 : ispn_step);
+        H.apply_h_s<std::complex<double>>(spin_range, N, n, wf, &wf_out, swf.get());
+    }
+}
+
+
+}  // sirius
+
+#endif /* APPLY_HAMILTONIAN_H */
diff --git a/src/options.json b/src/options.json
index ecc5e5d84..f8e849577 100644
--- a/src/options.json
+++ b/src/options.json
@@ -65,9 +65,15 @@
         },
         "residual_tolerance" :
         {
-            "description" : "Tolerance for the residual L2 norm." ,
-          "usage" :  "residual_tolerance (1e-6)" ,
-          "default_value" :  0.000001
+            "description" : "Absolute tolerance for the residual L2 norm." ,
+            "usage" :  "residual_tolerance (1e-6)" ,
+            "default_value" :  0.000001
+        },
+        "relative_tolerance" :
+        {
+            "description" : "Relative tolerance for the residual L2 norm." ,
+            "usage" : "relative_tolerance (0.1)" ,
+            "default_value" : 0.0
         },
         "empty_state_tolerance" :
         {
diff --git a/src/potential/potential.hpp b/src/potential/potential.hpp
index 54ebe97d7..a7ad740f2 100644
--- a/src/potential/potential.hpp
+++ b/src/potential/potential.hpp
@@ -346,7 +346,7 @@ class Potential : public Field4D
 
         /* create list of XC functionals */
         for (auto& xc_label : ctx_.xc_functionals()) {
-            xc_func_.push_back(std::move(XC_functional(ctx_.spfft(), ctx_.unit_cell().lattice_vectors(), xc_label, ctx_.num_spins())));
+            xc_func_.emplace_back(ctx_.spfft(), ctx_.unit_cell().lattice_vectors(), xc_label, ctx_.num_spins());
         }
 
         using pf = Periodic_function<double>;
@@ -486,6 +486,13 @@ class Potential : public Field4D
                 }
             }
         }
+
+        // VDWXC depends on unit cell, which might have changed.
+        for (auto& xc : xc_func_) {
+            if (xc.is_vdw()) {
+                xc.vdw_update_unit_cell(ctx_.spfft(), ctx_.unit_cell().lattice_vectors());
+            }
+        }
     }
 
     /// Solve Poisson equation for a single atom.
@@ -958,6 +965,14 @@ class Potential : public Field4D
     /// Generate plane-wave coefficients of the potential in the interstitial region.
     void generate_pw_coefs();
 
+    void insert_xc_functionals(const std::vector<std::string>& labels__)
+    {
+        /* create list of XC functionals */
+        for (auto& xc_label : labels__) {
+            xc_func_.emplace_back(ctx_.spfft(), ctx_.unit_cell().lattice_vectors(), xc_label, ctx_.num_spins());
+        }
+    }
+
     /// Calculate D operator from potential and augmentation charge.
     /** The following real symmetric matrix is computed:
      *  \f[
diff --git a/src/potential/xc.cpp b/src/potential/xc.cpp
index 0023ab137..d6aa5ca39 100644
--- a/src/potential/xc.cpp
+++ b/src/potential/xc.cpp
@@ -27,6 +27,7 @@
 #include "potential.hpp"
 #include "typedefs.hpp"
 #include "utils/profiler.hpp"
+#include "SDDK/omp.hpp"
 
 namespace sirius {
 
@@ -1045,7 +1046,7 @@ void Potential::xc(Density const& density__)
 {
     PROFILE("sirius::Potential::xc");
 
-    if (ctx_.xc_functionals().size() == 0) {
+    if (xc_func_.size() == 0) {
         xc_potential_->zero();
         xc_energy_density_->zero();
         for (int i = 0; i < ctx_.num_mag_dims(); i++) {
diff --git a/src/potential/xc_functional.hpp b/src/potential/xc_functional.hpp
index fc8625266..87b80021e 100644
--- a/src/potential/xc_functional.hpp
+++ b/src/potential/xc_functional.hpp
@@ -39,7 +39,7 @@
 namespace sirius {
 
 /// Interface class to Libxc.
-    class XC_functional : public XC_functional_base
+class XC_functional : public XC_functional_base
 {
     private:
         // I can not use a generic void pointer because xc_func_type is a structure
@@ -55,12 +55,12 @@ namespace sirius {
         XC_functional& operator=(const XC_functional& src) = delete;
 
     public:
-    /* we need the context because libvdwxc asks for lattice vectors and fft parameters */
 
-    XC_functional(spfft::Transform const& fft__, const matrix3d<double>& lattice_vectors__,
-                  const std::string libxc_name__, int num_spins__)
-        :  XC_functional_base(libxc_name__, num_spins__)
-        {
+      /* we need the context because libvdwxc asks for lattice vectors and fft parameters */
+      XC_functional(spfft::Transform const& fft__, const matrix3d<double>& lattice_vectors__,
+                    const std::string libxc_name__, int num_spins__)
+          : XC_functional_base(libxc_name__, num_spins__)
+    {
 
 #if defined(__USE_VDWXC)
             /* return immediately if the functional_base class is initialized */
@@ -200,6 +200,21 @@ namespace sirius {
             return false;
 #endif
         }
+
+    void vdw_update_unit_cell(spfft::Transform const& fft__, const matrix3d<double>& lattice_vectors__)
+    {
+        #ifdef __USE_VDWXC
+        if(is_vdw()) {
+            double v1[3] = {lattice_vectors__(0, 0), lattice_vectors__(1, 0), lattice_vectors__(2, 0)};
+            double v2[3] = {lattice_vectors__(0, 1), lattice_vectors__(1, 1), lattice_vectors__(2, 1)};
+            double v3[3] = {lattice_vectors__(0, 2), lattice_vectors__(1, 2), lattice_vectors__(2, 2)};
+
+            vdwxc_set_unit_cell(handler_vdw_, fft__.dim_x(), fft__.dim_y(), fft__.dim_z(), v1[0], v1[1], v1[2], v2[0],
+                                v2[1], v2[2], v3[0], v3[1], v3[2]);
+        }
+        #endif
+    }
+
         int kind() const
         {
 
diff --git a/src/radial/radial_integrals.cpp b/src/radial/radial_integrals.cpp
index 7d45f48f6..8aecbd594 100644
--- a/src/radial/radial_integrals.cpp
+++ b/src/radial/radial_integrals.cpp
@@ -23,7 +23,6 @@
  */
 
 #include "radial_integrals.hpp"
-#include <omp.h>
 
 namespace sirius {
 
diff --git a/src/radial/radial_solver.hpp b/src/radial/radial_solver.hpp
index 3323ae40f..823e9601c 100644
--- a/src/radial/radial_solver.hpp
+++ b/src/radial/radial_solver.hpp
@@ -663,10 +663,10 @@ class Radial_solver
         int nn{0};
 
         for (int j = 0; j <= dme__; j++) {
-            p.push_back(std::move(std::vector<double>(nr)));
-            q.push_back(std::move(std::vector<double>(nr)));
-            dpdr.push_back(std::move(std::vector<double>(nr)));
-            dqdr.push_back(std::move(std::vector<double>(nr)));
+            p.push_back(std::vector<double>(nr));
+            q.push_back(std::vector<double>(nr));
+            dpdr.push_back(std::vector<double>(nr));
+            dqdr.push_back(std::vector<double>(nr));
 
             if (j) {
                 if (rel__ == relativity_t::none || rel__ == relativity_t::zora) {
diff --git a/src/simulation_context.cpp b/src/simulation_context.cpp
index 62438d2fc..bfea559dd 100644
--- a/src/simulation_context.cpp
+++ b/src/simulation_context.cpp
@@ -28,6 +28,7 @@
 #include "symmetry/find_lat_sym.hpp"
 #include "utils/profiler.hpp"
 #include "utils/env.hpp"
+#include "SDDK/omp.hpp"
 
 namespace sirius {
 
@@ -457,6 +458,7 @@ void Simulation_context::initialize()
         if (num_mag_dims() == 3) {
             nbnd *= 2;
         }
+        /* if number of bands was not set by the host code, set it here */
         if (num_bands() < 0) {
             num_bands(nbnd);
         }
@@ -497,12 +499,29 @@ void Simulation_context::initialize()
         if (evsn[i] == "") {
             /* conditions for sequential diagonalization */
             if (comm_band().size() == 1 || npc == 1 || npr == 1 || !is_scalapack) {
-                if (is_cuda) {
-                    evsn[i] = "cusolver";
-                } else if (is_magma && num_bands() > 200) {
-                    evsn[i] = "magma";
+                if (full_potential()) {
+                    if (is_magma) {
+                        evsn[i] = "magma";
+                    } else if (is_cuda) {
+                        evsn[i] = "cusolver";
+                    } else {
+                        evsn[i] = "lapack";
+                    }
+                    //if (is_cuda) {
+                    //    evsn[i] = "cusolver";
+                    //} else if (is_magma) {
+                    //    evsn[i] = "magma";
+                    //} else {
+                    //    evsn[i] = "lapack";
+                    //}
                 } else {
-                    evsn[i] = "lapack";
+                    if (is_cuda) {
+                        evsn[i] = "cusolver";
+                    } else if (is_magma && num_bands() > 200) {
+                        evsn[i] = "magma";
+                    } else {
+                        evsn[i] = "lapack";
+                    }
                 }
             } else {
                 if (is_scalapack) {
@@ -518,8 +537,8 @@ void Simulation_context::initialize()
     std_evp_solver_name(evsn[0]);
     gen_evp_solver_name(evsn[1]);
 
-    std_evp_solver_ = Eigensolver_factory(std_evp_solver_type());
-    gen_evp_solver_ = Eigensolver_factory(gen_evp_solver_type());
+    std_evp_solver_ = Eigensolver_factory(std_evp_solver_name(), &mem_pool(memory_t::device));
+    gen_evp_solver_ = Eigensolver_factory(gen_evp_solver_name(), &mem_pool(memory_t::device));
 
     auto& std_solver = std_evp_solver();
     auto& gen_solver = gen_evp_solver();
@@ -672,7 +691,7 @@ void Simulation_context::print_info() const
 
     std::string evsn[] = {"standard eigen-value solver        : ", "generalized eigen-value solver     : "};
 
-    ev_solver_t evst[] = {std_evp_solver_type(), gen_evp_solver_type()};
+    ev_solver_t evst[] = {std_evp_solver().type(), gen_evp_solver().type()};
     for (int i = 0; i < 2; i++) {
         std::printf("%s", evsn[i].c_str());
         switch (evst[i]) {
@@ -687,12 +706,8 @@ void Simulation_context::print_info() const
             }
 #endif
 #if defined(__ELPA)
-            case ev_solver_t::elpa1: {
-                std::printf("ELPA1\n");
-                break;
-            }
-            case ev_solver_t::elpa2: {
-                std::printf("ELPA2\n");
+            case ev_solver_t::elpa: {
+                std::printf("ELPA\n");
                 break;
             }
 #endif
@@ -1242,7 +1257,8 @@ void Simulation_context::generate_phase_factors(int iat__, mdarray<double_comple
 
 void Simulation_context::print_memory_usage(const char *file__, int line__)
 {
-    if (comm().rank() == 0 && control().print_memory_usage_ && control().verbosity_ >= 1) {
+    auto pmu = utils::get_env<int>("SIRIUS_PRINT_MEMORY_USAGE");
+    if (comm().rank() == 0 && ((control().print_memory_usage_ && control().verbosity_ >= 1) || (pmu && *pmu))) {
         sirius::print_memory_usage(file__, line__);
 
         std::vector<std::string> labels = {"host"};
diff --git a/src/simulation_context.hpp b/src/simulation_context.hpp
index 902fa509a..8120269c7 100644
--- a/src/simulation_context.hpp
+++ b/src/simulation_context.hpp
@@ -453,22 +453,22 @@ class Simulation_context : public Simulation_parameters
         return start_time_tag_;
     }
 
-    inline ev_solver_t std_evp_solver_type() const
+    inline Eigensolver& std_evp_solver()
     {
-        return get_ev_solver_t(std_evp_solver_name());
+        return* std_evp_solver_;
     }
 
-    inline ev_solver_t gen_evp_solver_type() const
+    inline Eigensolver const& std_evp_solver() const
     {
-        return get_ev_solver_t(gen_evp_solver_name());
+        return* std_evp_solver_;
     }
 
-    inline Eigensolver& std_evp_solver()
+    inline Eigensolver& gen_evp_solver()
     {
-        return* std_evp_solver_;
+        return* gen_evp_solver_;
     }
 
-    inline Eigensolver& gen_evp_solver()
+    inline Eigensolver const& gen_evp_solver() const
     {
         return* gen_evp_solver_;
     }
diff --git a/src/simulation_parameters.cpp b/src/simulation_parameters.cpp
index 305bfc88b..8e78e4708 100644
--- a/src/simulation_parameters.cpp
+++ b/src/simulation_parameters.cpp
@@ -49,6 +49,8 @@ void Simulation_parameters::import(std::string const& str__)
     settings_input_.read(dict);
     /* read hubbard parameters */
     hubbard_input_.read(dict);
+    /* read nlcg parameters */
+    nlcg_input_.read(dict);
 }
 
 void Simulation_parameters::import(json const& dict)
@@ -70,6 +72,8 @@ void Simulation_parameters::import(json const& dict)
     settings_input_.read(dict);
     /* read hubbard parameters */
     hubbard_input_.read(dict);
+    /* read nlcg parameters */
+    nlcg_input_.read(dict);
 }
 
 void Simulation_parameters::import(cmd_args const& args__)
diff --git a/src/simulation_parameters.hpp b/src/simulation_parameters.hpp
index 263483cca..ad65e6a19 100644
--- a/src/simulation_parameters.hpp
+++ b/src/simulation_parameters.hpp
@@ -87,6 +87,9 @@ class Simulation_parameters
     /// LDA+U input parameters.
     Hubbard_input hubbard_input_;
 
+    /// NLCG input parameters
+    NLCG_input nlcg_input_;
+
     /// json dictionary containing all runtime options set up through the interface
     json runtime_options_dictionary_;
 
@@ -540,6 +543,11 @@ class Simulation_parameters
         return hubbard_input_;
     }
 
+    NLCG_input const& nlcg_input() const
+    {
+        return nlcg_input_;
+    }
+
     /// Get the options set at runtime.
     json& get_runtime_options_dictionary()
     {
@@ -568,7 +576,7 @@ class Simulation_parameters
     memory_pool& mem_pool(memory_t M__) const
     {
         if (memory_pool_.count(M__) == 0) {
-            memory_pool_.emplace(M__, std::move(memory_pool(M__)));
+            memory_pool_.emplace(M__, memory_pool(M__));
         }
         return memory_pool_.at(M__);
     }
diff --git a/src/sirius.hpp b/src/sirius.hpp
index b017d1548..56e0bf82e 100644
--- a/src/sirius.hpp
+++ b/src/sirius.hpp
@@ -29,6 +29,13 @@
 #include <apex_api.hpp>
 #endif
 
+#include "SDDK/omp.hpp"
+#if defined(__GPU) && defined(__CUDA)
+#include "gpu/cusolver.hpp"
+#endif
+#if defined(__ELPA)
+#include "linalg/elpa.hpp"
+#endif
 #include "utils/cmd_args.hpp"
 #include "utils/json.hpp"
 #include "utils/profiler.hpp"
@@ -99,10 +106,10 @@ inline void initialize(bool call_mpi_init__ = true)
            number of OMP threads */
         acc::create_streams(omp_get_max_threads() + 100);
 #if defined(__GPU)
-        gpublas::create_stream_handles();
+        accblas::create_stream_handles();
 #endif
 #if defined(__CUDA)
-        cublas::xt::create_handle();
+        accblas::xt::create_handle();
         cusolver::create_handle();
 #endif
     }
@@ -143,11 +150,11 @@ inline void finalize(bool call_mpi_fin__ = true, bool reset_device__ = true, boo
 
     if (acc::num_devices()) {
 #if defined(__GPU)
-        gpublas::destroy_stream_handles();
+        accblas::destroy_stream_handles();
 #endif
 #if defined(__CUDA)
         cusolver::destroy_handle();
-        cublas::xt::destroy_handle();
+        accblas::xt::destroy_handle();
 #endif
         acc::destroy_streams();
         if (reset_device__) {
@@ -178,9 +185,45 @@ inline void finalize(bool call_mpi_fin__ = true, bool reset_device__ = true, boo
 
 /** \mainpage Welcome to SIRIUS
 
-SIRIUS is a domain-specific library for electronic structure calculations. It supports full-potential linearized
-augmented plane wave (FP-LAPW) and pseudopotential plane wave (PP-PW) methods with ultrasoft, norm-conserving and PAW
-flavors of pseudopotential and is designed to work with codes such as Exciting, Elk and Quantum ESPRESSO.
+  SIRIUS is a domain-specific library for electronic structure calculations. It implements pseudopotential plane
+  wave (PP-PW) and full-potential linearized augmented plane wave (FP-LAPW) methods and is designed for
+  GPU acceleration of popular community codes such as Exciting, Elk and Quantum ESPRESSO.
+  SIRIUS is written in C++11 with MPI, OpenMP and CUDA/ROCm programming models. SIRIUS is organised as a
+  collection of classes that abstract away the different building blocks of the DFT self-consistency cycle.
+
+  For a quick start please refer to the main development page at
+  <a href="https://github.com/electronic-structure/SIRIUS">GitHub</a>.
+
+  The generated Fortran API is described here: generated.f90
+
+  The frequent variable names are listed on the page \ref stdvarname.
+
+  We use the following \ref coding.
+
+  The library files and directories are organised in the following way:
+    - \b apps - applications, tests and helper scripts
+     - \b atoms - utility program to generate FP-LAPW atomic species files
+     - \b bands - band plotting
+     - \b cif_input - CIF parser
+     - \b dft_loop - DFT miniapp
+     - \b tests - tests of various functionality
+     - \b timers - scripts to analyze timer outputs
+     - \b unit_tests - unit tests
+     - \b upf - scripts to parse and convert UPF files
+     - \b utils - utilities to work with unit cell
+    - \b ci - directory with Jenkins, Travis CI and GitHub action scripts
+    - \b cmake - directory with CMake scripts
+    - \b doc - this directory contains the configuration file for the Doxygen documentation and PNG images
+    - \b examples - examples of input files for pseudopotential and full-potential calculations
+    - \b python_module - Python interface module
+    - \b reframe - ReFrame regression tests description
+    - \b src - main directory with the source code
+    - \b verification - verification tests
+    - .clang-format - source code formatting rules
+    - CMakeLists.txt - CMake file of the project
+    - check_format.py, check_format.x - scripts to check source code formatting
+    - clang_format.x - script to apply Clang format to a file
+    - prerequisite.py - script to install missing dependencies
 
 */
 
diff --git a/src/symmetry/symmetrize.hpp b/src/symmetry/symmetrize.hpp
index fdada710a..b6938b039 100644
--- a/src/symmetry/symmetrize.hpp
+++ b/src/symmetry/symmetrize.hpp
@@ -25,9 +25,9 @@
 #ifndef __SYMMETRIZE_HPP__
 #define __SYMMETRIZE_HPP__
 
-#include <omp.h>
 #include "unit_cell/unit_cell_symmetry.hpp"
 #include "SDDK/gvec.hpp"
+#include "SDDK/omp.hpp"
 #include "typedefs.hpp"
 #include "sht/sht.hpp"
 #include "utils/profiler.hpp"
diff --git a/src/unit_cell/atom_symmetry_class.hpp b/src/unit_cell/atom_symmetry_class.hpp
index b71a69896..002b125a1 100644
--- a/src/unit_cell/atom_symmetry_class.hpp
+++ b/src/unit_cell/atom_symmetry_class.hpp
@@ -26,7 +26,7 @@
 #define __ATOM_SYMMETRY_CLASS_HPP__
 
 #include "atom_type.hpp"
-#include "linalg/eigenproblem.hpp"
+#include "linalg/eigensolver.hpp"
 
 namespace sirius {
 
@@ -569,12 +569,12 @@ inline std::vector<int> Atom_symmetry_class::check_lo_linear_independence(double
     mdarray<double, 2> ovlp(num_lo_descriptors(), num_lo_descriptors());
     loprod >> ovlp;
 
-    Eigensolver_lapack stdevp;
+    auto stdevp = Eigensolver_factory("lapack", nullptr);
 
     std::vector<double> loprod_eval(num_lo_descriptors());
     dmatrix<double>     loprod_evec(num_lo_descriptors(), num_lo_descriptors());
 
-    stdevp.solve(num_lo_descriptors(), loprod, &loprod_eval[0], loprod_evec);
+    stdevp->solve(num_lo_descriptors(), loprod, &loprod_eval[0], loprod_evec);
 
     if (std::abs(loprod_eval[0]) < tol__) {
         std::printf("\n");
@@ -616,7 +616,7 @@ inline std::vector<int> Atom_symmetry_class::check_lo_linear_independence(double
             }
         }
 
-        stdevp.solve(static_cast<int>(ilo.size()), tmp, &eval[0], evec);
+        stdevp->solve(static_cast<int>(ilo.size()), tmp, &eval[0], evec);
 
         if (eval[0] < tol__) {
             std::printf("local orbital %i can be removed\n", i);
diff --git a/src/unit_cell/atom_type.hpp b/src/unit_cell/atom_type.hpp
index 4da555a19..408e10f39 100644
--- a/src/unit_cell/atom_type.hpp
+++ b/src/unit_cell/atom_type.hpp
@@ -454,7 +454,7 @@ class Atom_type
         local_orbital_descriptor lod;
 
         Spline<double> s(radial_grid_, f__);
-        ps_atomic_wfs_.push_back(std::move(std::make_tuple(n__, l__, occ__, std::move(s))));
+        ps_atomic_wfs_.push_back(std::make_tuple(n__, l__, occ__, std::move(s)));
     }
 
     /// Return a tuple describing a given atomic radial function
@@ -483,7 +483,7 @@ class Atom_type
             TERMINATE("can't add more beta projectors");
         }
         Spline<double> s(radial_grid_, beta__);
-        beta_radial_functions_.push_back(std::move(std::make_pair(l__, std::move(s))));
+        beta_radial_functions_.push_back(std::make_pair(l__, std::move(s)));
 
         local_orbital_descriptor lod;
         lod.l = std::abs(l__);
diff --git a/src/unit_cell/unit_cell.cpp b/src/unit_cell/unit_cell.cpp
index b47d2e4ff..a682114ab 100644
--- a/src/unit_cell/unit_cell.cpp
+++ b/src/unit_cell/unit_cell.cpp
@@ -499,7 +499,7 @@ Atom_type& Unit_cell::add_atom_type(const std::string label__, const std::string
     }
 
     int id = next_atom_type_id(label__);
-    atom_types_.push_back(std::move(Atom_type(parameters_, id, label__, file_name__)));
+    atom_types_.push_back(Atom_type(parameters_, id, label__, file_name__));
     return atom_types_.back();
 }
 
@@ -517,7 +517,7 @@ void Unit_cell::add_atom(const std::string label, vector3d<double> position, vec
         TERMINATE(s);
     }
 
-    atoms_.push_back(std::move(Atom(atom_type(label), position, vector_field)));
+    atoms_.push_back(Atom(atom_type(label), position, vector_field));
     atom_type(label).add_atom_id(static_cast<int>(atoms_.size()) - 1);
 }
 
@@ -573,12 +573,12 @@ void Unit_cell::initialize()
     for (int iat = 0; iat < num_atom_types(); iat++) {
         int nat = atom_type(iat).num_atoms();
         if (nat > 0) {
-            atom_coord_.push_back(std::move(mdarray<double, 2>(nat, 3, memory_t::host)));
+            atom_coord_.push_back(mdarray<double, 2>(nat, 3, memory_t::host));
             if (parameters_.processing_unit() == device_t::GPU) {
                 atom_coord_.back().allocate(memory_t::device);
             }
         } else {
-            atom_coord_.push_back(std::move(mdarray<double, 2>()));
+            atom_coord_.push_back(mdarray<double, 2>());
         }
     }
     update();
@@ -630,7 +630,7 @@ void Unit_cell::get_symmetry()
         if (asc[i] == -1) {
             /* take next id */
             atom_class_id++;
-            atom_symmetry_classes_.push_back(std::move(Atom_symmetry_class(atom_class_id, atoms_[i].type())));
+            atom_symmetry_classes_.push_back(Atom_symmetry_class(atom_class_id, atoms_[i].type()));
 
             /* scan all atoms */
             for (int j = 0; j < num_atoms(); j++) {
diff --git a/src/utils/cmd_args.hpp b/src/utils/cmd_args.hpp
index d4ba1d7eb..5e661cea7 100644
--- a/src/utils/cmd_args.hpp
+++ b/src/utils/cmd_args.hpp
@@ -54,6 +54,16 @@ class cmd_args
     /// Constructor.
     cmd_args();
 
+    /// Constructor with the list of keys.
+    /** The following example shows how to initialize arguments:
+        \code{.cpp}
+         cmd_args args(argn, argv, {
+             {"device=", "(string) CPU or GPU"},
+             {"pw_cutoff=", "(double) plane-wave cutoff for density and potential"},
+             {"N=", "(int) cell multiplicity"}
+        });
+        \endcode
+     */
     cmd_args(int argn__, char** argv__, std::initializer_list<std::pair<std::string, std::string>> keys__);
 
     void register_key(std::string const key__, std::string const description__);
diff --git a/src/utils/env.hpp b/src/utils/env.hpp
index d945e3a69..862349383 100644
--- a/src/utils/env.hpp
+++ b/src/utils/env.hpp
@@ -37,23 +37,24 @@ namespace utils {
 template <typename T>
 inline T const* get_env(std::string const& name__)
 {
-    static std::map<std::string, std::pair<bool, T>> map_name;
+    /* cache of parsed values, one map per type T; a null pointer records that the variable is not set */
+    static std::map<std::string, T*> map_name;
     if (map_name.count(name__) == 0) {
         /* first time the function is called */
         const char* raw_str = std::getenv(name__.c_str());
         if (raw_str == NULL) {
-            map_name[name__] = std::make_pair(false, T());
+            map_name[name__] = nullptr;
         } else {
-            T var;
-            std::istringstream(std::string(raw_str)) >> var;
-            map_name[name__] = std::make_pair(true, var);
+            /* value-initialize the storage: if the variable is set to an empty string the extraction
+               below writes nothing, and a plain `new T` would leave a built-in type indeterminate;
+               the object is intentionally never freed, it stays in the cache until the program exits */
+            T* var = new T();
+            std::istringstream(std::string(raw_str)) >> (*var);
+            map_name[name__] = var;
         }
     }
-    if (map_name[name__].first == false) {
-        return nullptr;
-    } else {
-        return &map_name[name__].second;
-    }
+    /* null pointer if the variable is not set, otherwise a pointer to the cached value */
+    return map_name[name__];
 }
 
 } // namespace utils