diff --git a/.gitignore b/.gitignore
index 769857d7..ff73601b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -122,3 +122,4 @@ performance/spmm/spmm
 *.user
 
 .vscode
+a.out
diff --git a/build.sh b/build.sh
index 4f53b52f..9809c63c 100644
--- a/build.sh
+++ b/build.sh
@@ -1,8 +1,8 @@
 nvcc -DCUSP_PATH=$(realpath .) \
      -I . -I ../KTT/Source \
-     -l cuda -l ktt -L ../KTT/Build/x86_64_Debug/ \
-     --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Debug/) \
-     -std=c++17 -g -O3 -DKTT_LINE_INFO -lineinfo main.cu
+     -l cuda -l ktt -L ../KTT/Build/x86_64_Release/ \
+     --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Release/) \
+     -std=c++17 -g -O3 -lineinfo main.cu
 
 [ "$?" -eq "0" ] || exit 1
 
diff --git a/build/build-env.py b/build/build-env.py
index fbd7f216..3d19bc53 100644
--- a/build/build-env.py
+++ b/build/build-env.py
@@ -277,6 +277,10 @@ def addKTT(env):
     env.Append(RPATH = [ ktt_lib_path ])
     env.Append(LIBS = [ "ktt" ])
 
+    if env['mode'] == 'debug':
+        env.Append(CFLAGS   = ['-DKTT_LINE_INFO'])
+        env.Append(CXXFLAGS = ['-DKTT_LINE_INFO'])
+
 
 def Environment(buildDir):
     # allow the user discretion to choose the MSVC version
@@ -465,7 +469,7 @@ def Environment(buildDir):
             env['ENV']['DYLD_LIBRARY_PATH'] = os.environ['DYLD_LIBRARY_PATH']
         elif 'LD_LIBRARY_PATH' in os.environ:
             env['ENV']['LD_LIBRARY_PATH'] = os.environ['LD_LIBRARY_PATH']
-    
+
     addKTT(env)
 
     # generate help text
diff --git a/cusp/detail/temporary_array.h b/cusp/detail/temporary_array.h
index 08d27908..8dbed6b0 100644
--- a/cusp/detail/temporary_array.h
+++ b/cusp/detail/temporary_array.h
@@ -28,7 +28,7 @@
 #include <thrust/detail/temporary_array.h>
 
 #if THRUST_VERSION >= 100800
-#define TEMP_HOST_DEVICE_DECORATORS __host__ __device__
+#define TEMP_HOST_DEVICE_DECORATORS __host__
 #else
 #define TEMP_HOST_DEVICE_DECORATORS
 #endif
diff --git a/cusp/ktt/detail/ellr_matrix.inl b/cusp/ktt/detail/ellr_matrix.inl
new file mode 100644
index 00000000..8a67949d
--- /dev/null
+++ b/cusp/ktt/detail/ellr_matrix.inl
@@ -0,0 +1,137 @@
+#include <cusp/array2d.h>
+#include <cusp/convert.h>
+#include <cusp/detail/utils.h>
+
+#include <thrust/transform.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/device_ptr.h> // raw_pointer_cast
+
+namespace cusp
+{
+
+namespace ktt
+{
+
+
+template<typename IndexType>
+struct ell_row_length
+{
+    const IndexType* column_indices;
+    size_t num_cols_per_row;
+    size_t pitch;
+
+    __host__ __device__ IndexType operator()(IndexType row_idx)
+    {
+        IndexType len = 0;
+
+        while (len < num_cols_per_row
+                && column_indices[row_idx + len*pitch] >= 0)
+        {
+            len++;
+        }
+
+        return len;
+    }
+};
+
+template<typename IndexType, typename ValueType, typename MemorySpace>
+void
+compute_row_lengths(cusp::ktt::ellr_matrix<IndexType, ValueType, MemorySpace>& A)
+{
+    thrust::counting_iterator<IndexType> row_idx_it(0);
+
+    thrust::transform(
+        row_idx_it,
+        row_idx_it + A.num_rows,
+        A.row_lengths.begin(),
+        ell_row_length<IndexType>{
+            thrust::raw_pointer_cast(&A.column_indices(0, 0)),
+            A.column_indices.num_cols,
+            A.column_indices.pitch }
+    );
+}
+
+
+//////////////////
+// Constructors //
+//////////////////
+
+template <typename IndexType, typename ValueType, class MemorySpace>
+ellr_matrix<IndexType, ValueType, MemorySpace>
+::ellr_matrix(const size_t num_rows, const size_t num_cols, const size_t num_entries,
+             const size_t num_entries_per_row, const size_t alignment)
+    : Parent(num_rows, num_cols, num_entries, num_entries_per_row, alignment)
+{
+    row_lengths.resize(num_rows);
+}
+
+// construct from a different matrix
+template <typename IndexType, typename ValueType, class MemorySpace>
+template <typename MatrixType>
+ellr_matrix<IndexType, ValueType, MemorySpace>
+::ellr_matrix(const MatrixType& matrix)
+{
+    cusp::convert(matrix, *this);
+    update_row_lengths();
+}
+
+//////////////////////
+// Member Functions //
+//////////////////////
+
+template <typename IndexType, typename ValueType, class MemorySpace>
+void
+ellr_matrix<IndexType, ValueType, MemorySpace>
+::swap(ellr_matrix& matrix)
+{
+    Parent::swap(matrix);
+    row_lengths.swap(matrix.row_lengths);
+}
+
+template <typename IndexType, typename ValueType, class MemorySpace>
+void
+ellr_matrix<IndexType, ValueType, MemorySpace>
+::resize(const size_t num_rows, const size_t num_cols, const size_t num_entries,
+         const size_t num_entries_per_row)
+{
+    Parent::resize(num_rows, num_cols, num_entries, num_entries_per_row);
+    row_lengths.resize(num_rows);
+}
+
+template <typename IndexType, typename ValueType, class MemorySpace>
+void
+ellr_matrix<IndexType, ValueType, MemorySpace>
+::resize(const size_t num_rows, const size_t num_cols, const size_t num_entries,
+         const size_t num_entries_per_row, const size_t alignment)
+{
+    Parent::resize(num_rows, num_cols, num_entries,
+                   num_entries_per_row, alignment);
+    row_lengths.resize(num_rows);
+}
+
+// assignment from another matrix
+template <typename IndexType, typename ValueType, class MemorySpace>
+template <typename MatrixType>
+ellr_matrix<IndexType, ValueType, MemorySpace>&
+ellr_matrix<IndexType, ValueType, MemorySpace>
+::operator=(const MatrixType& matrix)
+{
+    cusp::convert(matrix, *this);
+    update_row_lengths();
+
+    return *this;
+}
+
+template <typename IndexType, typename ValueType, class MemorySpace>
+void ellr_matrix<IndexType, ValueType, MemorySpace>::update_row_lengths()
+{
+    if (row_lengths.size() != this->num_rows)
+        row_lengths.resize(this->num_rows);
+
+    compute_row_lengths(*this);
+}
+
+
+} // namespace ktt
+
+} // namespace cusp
diff --git a/cusp/ktt/detail/ktt.inl b/cusp/ktt/detail/ktt.inl
index f2b3c745..63f97368 100644
--- a/cusp/ktt/detail/ktt.inl
+++ b/cusp/ktt/detail/ktt.inl
@@ -15,6 +15,7 @@ namespace ktt {
 
 namespace detail {
 
+
 inline std::unique_ptr<::ktt::Tuner> tuner;
 inline bool is_enabled = true;
 
@@ -41,9 +42,11 @@ inline void lazy_init()
             CUstream stream;
             cuStreamCreate(&stream, CU_STREAM_DEFAULT);
 
-            ::ktt::ComputeApiInitializer initializer(context, std::vector<::ktt::ComputeQueue>{ stream });
+            std::vector<::ktt::ComputeQueue> queues = { stream };
+            ::ktt::ComputeApiInitializer initializer(context, queues);
 
-            tuner = std::make_unique<::ktt::Tuner>(::ktt::ComputeApi::CUDA, initializer);
+            tuner = std::make_unique<::ktt::Tuner>(::ktt::ComputeApi::CUDA,
+                                                   initializer);
         }
 
         std::string compiler_flags = "-std=c++17 ";
@@ -51,7 +54,6 @@ inline void lazy_init()
         compiler_flags += "-lineinfo ";
 #endif
         tuner->SetCompilerOptions(compiler_flags);
-
         tuner->SetValidationMode(::ktt::ValidationMode::OfflineTuning);
 
         std::atexit(cleanup);
@@ -77,36 +79,43 @@ inline ::ktt::Tuner& get_tuner()
 }
 
 
-template <typename Matrix,
+template <template<typename, typename, typename> typename Matrix,
+          typename IndexType,
           typename ValueType1,
-          typename ValueType2>
-::ktt::KernelResult multiply(const Matrix& A,
-              const cusp::array1d<ValueType1, cusp::device_memory>& x,
-              cusp::array1d<ValueType2, cusp::device_memory>& y)
+          typename ValueType2,
+          typename ValueType3>
+::ktt::KernelResult multiply(
+     const Matrix<IndexType, ValueType1, cusp::device_memory>& A,
+     const cusp::array1d<ValueType2, cusp::device_memory>& x,
+     cusp::array1d<ValueType3, cusp::device_memory>& y)
 {
     return cusp::system::cuda::ktt::multiply(get_tuner(), A, x, y);
 }
 
 
-template <typename Matrix,
+template <template<typename, typename, typename> typename Matrix,
+          typename IndexType,
           typename ValueType1,
-          typename ValueType2>
-::ktt::KernelResult multiply(const Matrix& A,
-                             const cusp::array1d<ValueType1, cusp::device_memory>& x,
-                             cusp::array1d<ValueType2, cusp::device_memory>& y,
-                             const ::ktt::KernelConfiguration& configuration,
-                             bool run_with_profiling)
+          typename ValueType2,
+          typename ValueType3>
+::ktt::KernelResult multiply(
+     const Matrix<IndexType, ValueType1, cusp::device_memory>& A,
+     const cusp::array1d<ValueType2, cusp::device_memory>& x,
+     cusp::array1d<ValueType3, cusp::device_memory>& y,
+     const ::ktt::KernelConfiguration& configuration,
+     bool run_with_profiling)
 {
     return cusp::system::cuda::ktt::multiply(get_tuner(), A, x, y, configuration, run_with_profiling);
 }
 
 
-template <typename IndexType,
+template <template<typename, typename, typename> typename Matrix,
+          typename IndexType,
           typename ValueType1,
           typename ValueType2,
           typename ValueType3>
 std::vector<::ktt::KernelResult>
-tune(const cusp::dia_matrix<IndexType, ValueType1, cusp::device_memory>& A,
+tune(const Matrix<IndexType, ValueType1, cusp::device_memory>& A,
      const cusp::array1d<ValueType2, cusp::device_memory>& x,
      cusp::array1d<ValueType3, cusp::device_memory>& y,
      std::optional<::ktt::ReferenceComputation> reference_computation)
@@ -114,33 +123,18 @@ tune(const cusp::dia_matrix<IndexType, ValueType1, cusp::device_memory>& A,
     return cusp::system::cuda::ktt::tune(get_tuner(), A, x, y, reference_computation);
 }
 
-
-template<typename IndexType,
-         typename ValueType1,
-         typename ValueType2,
-         typename ValueType3,
-         typename Format>
-void reset_tuning()
-{
-    using namespace cusp::system::cuda::ktt;
-
-    kernel_context kernel = get_kernel<IndexType, ValueType1, ValueType2, ValueType3>(get_tuner(), Format{});
-    detail::tuner->ClearData(kernel.kernel_id);
-}
-
-
 template <typename MatrixType,
           typename ValueType1,
-          typename ValueType2>
+          typename ValueType2,
+          typename MemorySpace1,
+          typename MemorySpace2>
 void reset_tuning(const MatrixType& A,
-                  const cusp::array1d<ValueType1, cusp::device_memory>& x,
-                  cusp::array1d<ValueType2, cusp::device_memory>& y)
+                  const cusp::array1d<ValueType1, MemorySpace1>& x,
+                  cusp::array1d<ValueType2, MemorySpace2>& y)
 {
-    using IndexType = typename MatrixType::index_type;
-    using ValueType = typename MatrixType::value_type;
-    using Format = typename MatrixType::format;
-
-    return reset_tuning<IndexType, ValueType, ValueType2, ValueType2, Format>();
+    auto& tuner = get_tuner();
+    const auto& kernel = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y);
+    tuner.ClearData(kernel.kernel_id);
 }
 
 
diff --git a/cusp/ktt/ellr_matrix.h b/cusp/ktt/ellr_matrix.h
new file mode 100644
index 00000000..829f5dfc
--- /dev/null
+++ b/cusp/ktt/ellr_matrix.h
@@ -0,0 +1,97 @@
+#pragma once
+
+#include <cusp/detail/config.h>
+
+#include <cusp/ell_matrix.h>
+
+#include <type_traits>
+
+
+namespace cusp
+{
+
+namespace ktt
+{
+
+
+template <typename IndexType, typename ValueType, typename MemorySpace>
+class ellr_matrix : public cusp::ell_matrix<IndexType, ValueType, MemorySpace>
+{
+
+    using Parent = cusp::ell_matrix<IndexType, ValueType, MemorySpace>;
+
+public:
+
+    using row_lengths_array_type = cusp::array1d<IndexType, MemorySpace>;
+
+    row_lengths_array_type row_lengths;
+
+    /*! Construct an empty \p ellr_matrix.
+     */
+    ellr_matrix(void) {}
+
+    /*! Construct an \p ellr_matrix with a specific shape, number of nonzero entries,
+     *  and maximum number of nonzero entries per row.
+     *
+     *  \param num_rows Number of rows.
+     *  \param num_cols Number of columns.
+     *  \param num_entries Number of nonzero matrix entries.
+     *  \param num_entries_per_row Maximum number of nonzeros per row.
+     *  \param alignment Amount of padding used to align the data structure (default 32).
+     */
+    ellr_matrix(const size_t num_rows, const size_t num_cols,
+                const size_t num_entries, const size_t num_entries_per_row,
+                const size_t alignment = 32);
+
+    /*! Construct an \p ellr_matrix from another matrix.
+     *
+     *  \param matrix Another sparse or dense matrix.
+     */
+    template <typename MatrixType>
+    ellr_matrix(const MatrixType& matrix);
+
+    /*! Resize matrix dimensions and underlying storage
+     *
+     *  \param num_rows Number of rows.
+     *  \param num_cols Number of columns.
+     *  \param num_entries Number of nonzero matrix entries.
+     *  \param num_entries_per_row Maximum number of nonzeros per row.
+     */
+    void resize(const size_t num_rows, const size_t num_cols, const size_t num_entries,
+                const size_t num_entries_per_row);
+
+    /*! Resize matrix dimensions and underlying storage
+     *
+     *  \param num_rows Number of rows.
+     *  \param num_cols Number of columns.
+     *  \param num_entries Number of nonzero matrix entries.
+     *  \param num_entries_per_row Maximum number of nonzeros per row.
+     *  \param alignment Amount of padding used to align the data structure (default 32).
+     */
+    void resize(const size_t num_rows, const size_t num_cols, const size_t num_entries,
+                const size_t num_entries_per_row, const size_t alignment);
+
+    /*! Swap the contents of two \p ellr_matrix objects.
+     *
+     *  \param matrix Another \p ellr_matrix with the same IndexType and ValueType.
+     */
+    void swap(ellr_matrix& matrix);
+
+    /*! Assignment from another matrix.
+     *
+     *  \tparam MatrixType Format type of input matrix.
+     *
+     *  \param matrix Another sparse or dense matrix.
+     */
+    template <typename MatrixType>
+    ellr_matrix& operator=(const MatrixType& matrix);
+
+    void update_row_lengths();
+};
+
+
+} // namespace ktt
+
+} // namespace cusp
+
+#include <cusp/ktt/detail/ellr_matrix.inl>
diff --git a/cusp/ktt/ktt.h b/cusp/ktt/ktt.h
index cab2d637..37c89de5 100644
--- a/cusp/ktt/ktt.h
+++ b/cusp/ktt/ktt.h
@@ -1,9 +1,10 @@
 #pragma once
 
-#include <Ktt.h>
-
+#include <cusp/detail/config.h>
 #include <cusp/array1d.h>
 
+#include <Ktt.h>
+
 namespace cusp {
 
 namespace ktt {
@@ -14,93 +15,110 @@ inline void enable();
 
 inline ::ktt::Tuner& get_tuner();
 
+
 /**
  * @brief Perform the multiplication y = A*x using an autotuned kernel.
+ * This function performs a single step of dynamic autotuning.
  *
  * @tparam Matrix      The type of the sparse matrix to use.
- * @tparam ValueType1  The value type of the x vector.
- * @tparam ValueType2  The value type of the y vector.
+ * @tparam IndexType   The type used for indices.
+ * @tparam ValueType1  The value type of the matrix.
+ * @tparam ValueType2  The value type of the x vector.
+ * @tparam ValueType3  The value type of the y vector.
  *
  * @param A The matrix.
  * @param x The input vector.
  * @param y The output vector.
  */
-template <typename Matrix,
+template <template<typename, typename, typename> typename Matrix,
+          typename IndexType,
           typename ValueType1,
-          typename ValueType2>
-::ktt::KernelResult multiply(const Matrix& A,
-                             const cusp::array1d<ValueType1, cusp::device_memory>& x,
-                             cusp::array1d<ValueType2, cusp::device_memory>& y);
+          typename ValueType2,
+          typename ValueType3>
+::ktt::KernelResult multiply(
+     const Matrix<IndexType, ValueType1, cusp::device_memory>& A,
+     const cusp::array1d<ValueType2, cusp::device_memory>& x,
+     cusp::array1d<ValueType3, cusp::device_memory>& y);
+
 
-// NOTE: Should we have this?
 /**
  * @brief Perform the multiplication y = A*x without autotuning,
  * instead just run the kernel in the given configuration.
  *
  * @tparam Matrix      The type of the sparse matrix to use.
- * @tparam ValueType1  The value type of the x vector.
- * @tparam ValueType2  The value type of the y vector.
+ * @tparam IndexType   The type used for indices.
+ * @tparam ValueType1  The value type of the matrix.
+ * @tparam ValueType2  The value type of the x vector.
+ * @tparam ValueType3  The value type of the y vector.
  *
- * @param A The matrix.
- * @param x The input vector.
- * @param y The output vector.
+ * @param A                   The matrix.
+ * @param x                   The input vector.
+ * @param y                   The output vector.
+ * @param configuration       The configuration of the kernel.
+ * @param run_with_profiling  Flag used to toggle profiling.
  */
-template <typename Matrix,
-          typename ValueType1,
-          typename ValueType2>
-::ktt::KernelResult multiply(const Matrix& A,
-                            const cusp::array1d<ValueType1, cusp::device_memory>& x,
-                            cusp::array1d<ValueType2, cusp::device_memory>& y,
-                            const ::ktt::KernelConfiguration& configuration,
-                            bool run_with_profiling = false);
-
-template <typename IndexType,
+template <template<typename, typename, typename> typename Matrix,
+          typename IndexType,
           typename ValueType1,
           typename ValueType2,
           typename ValueType3>
-std::vector<::ktt::KernelResult>
-tune(const cusp::dia_matrix<IndexType, ValueType1, cusp::device_memory>& A,
+::ktt::KernelResult multiply(
+     const Matrix<IndexType, ValueType1, cusp::device_memory>& A,
      const cusp::array1d<ValueType2, cusp::device_memory>& x,
      cusp::array1d<ValueType3, cusp::device_memory>& y,
-     std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt);
+     const ::ktt::KernelConfiguration& configuration,
+     bool run_with_profiling = false);
+
 
 /**
- * TODO: Add some documentation.
+ * @brief Perform offline autotuning on the specified matrix and vectors.
  *
- * @brief
+ * @tparam Matrix      The type of the sparse matrix to use.
+ * @tparam IndexType   The type used for indices.
+ * @tparam ValueType1  The value type of the matrix.
+ * @tparam ValueType2  The value type of the x vector.
+ * @tparam ValueType3  The value type of the y vector.
  *
- * @tparam IndexType
- * @tparam ValueType1
- * @tparam ValueType2
- * @tparam ValueType3
- * @tparam Format
+ * @param A                      The matrix.
+ * @param x                      The input vector.
+ * @param y                      The output vector.
+ * @param reference_computation  The reference computation that computes
+                                 the correct result.
  */
-template <typename IndexType,
+template <template<typename, typename, typename> typename Matrix,
+          typename IndexType,
           typename ValueType1,
           typename ValueType2,
-          typename ValueType3,
-          typename Format>
-void reset_tuning();
+          typename ValueType3>
+std::vector<::ktt::KernelResult>
+tune(const Matrix<IndexType, ValueType1, cusp::device_memory>& A,
+     const cusp::array1d<ValueType2, cusp::device_memory>& x,
+     cusp::array1d<ValueType3, cusp::device_memory>& y,
+     std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt);
+
 
 /**
- * TODO: Add some documentation.
- *
- * @brief
+ * @brief Resets the tuning for the kernel instantiation used for the given arguments.
  *
- * @tparam MatrixType
- * @tparam ValueType1
- * @tparam ValueType2
+ * @tparam MatrixType    Type of the sparse matrix.
+ * @tparam ValueType1    The value type of the x vector
+ * @tparam ValueType2    The value type of the y vector.
+ * @tparam MemorySpace1  The memory space of the x vector.
+ * @tparam MemorySpace2  The memory space of the y vector.
  *
- * @param A
- * @param x
- * @param y
+ * @param A The matrix.
+ * @param x The input vector.
+ * @param y The output vector.
  */
 template <typename MatrixType,
           typename ValueType1,
-          typename ValueType2>
+          typename ValueType2,
+          typename MemorySpace1,
+          typename MemorySpace2>
 void reset_tuning(const MatrixType& A,
-                  const cusp::array1d<ValueType1, cusp::device_memory>& x,
-                  cusp::array1d<ValueType2, cusp::device_memory>& y);
+                  const cusp::array1d<ValueType1, MemorySpace1>& x,
+                  cusp::array1d<ValueType2, MemorySpace2>& y);
+
 
 } // namespace ktt
 
diff --git a/cusp/ktt/matrix_generation.h b/cusp/ktt/matrix_generation.h
index 6c5e0b15..712e0b55 100644
--- a/cusp/ktt/matrix_generation.h
+++ b/cusp/ktt/matrix_generation.h
@@ -3,11 +3,60 @@
 #include <cusp/dia_matrix.h>
 
 #include <algorithm> // min
+#include <stdexcept>
+#include <vector>
 
 namespace cusp {
 
 namespace ktt {
 
+
+cusp::dia_matrix<int, float, cusp::host_memory>
+make_diagonal_matrix(int rows, int cols, const std::vector<int>& diag_offsets)
+{
+    using DiaMatrix = cusp::dia_matrix<int, float, cusp::host_memory>;
+
+    int diagonal_count = diag_offsets.size();
+
+    DiaMatrix::diagonal_offsets_array_type offsets;
+    offsets.reserve(diagonal_count);
+
+    DiaMatrix::values_array_type values(rows, diagonal_count, 0);
+
+    size_t num_entries = 0;
+    for (int i = 0; i < diagonal_count; ++i) {
+        int offset = diag_offsets[i];
+
+        int starting_row = offset < 0 ? -offset : 0;
+        int starting_col = offset < 0 ? 0 : offset;
+
+        if (starting_row >= rows || starting_col >= cols)
+            throw std::runtime_error(
+                    "make_diagonal_matrix: Diagonal out of bounds.");
+
+        offsets.push_back(offset);
+
+        int ending_row = starting_row + std::min(rows - starting_row, cols - starting_col);
+
+        for (int row = starting_row; row < ending_row; ++row) {
+            num_entries++;
+            values(row, i) = 1;
+        }
+    }
+
+    DiaMatrix res;
+
+    res.num_rows = rows;
+    res.num_cols = cols;
+    res.num_entries = num_entries;
+
+    res.diagonal_offsets.swap(offsets);
+    res.values.swap(values);
+
+    return res;
+}
+
+
 /**
  * @brief Create a symetric matrix with given number of diagonals filled with ones.
  *
@@ -15,8 +64,8 @@ namespace ktt {
  * on both sides of it. That means the matrix is actually symetric only for
  * odd numbers of diagonals.
  *
- * If the number of diagonals is greater then would fit into the matrix,
- * the extra diagonals are just discarded.
+ * If the number of diagonals is greater then would fit into the matrix
+ * an exception is thrown.
  *
  * @param rows            The number of rows of the resulting matrix.
  * @param cols            The number of columns of the resulting matrix.
@@ -31,45 +80,28 @@ make_diagonal_symmetric_matrix(int rows,
                                int offset_step,
                                int diagonal_count)
 {
-    using DiaMatrix = cusp::dia_matrix<int, float, cusp::host_memory>;
-
-    DiaMatrix::diagonal_offsets_array_type offsets;
+    std::vector<int> offsets;
     offsets.reserve(diagonal_count);
 
-    DiaMatrix::values_array_type values(rows, diagonal_count, 0);
-
     int starting_offset = -offset_step * diagonal_count/2;
-    size_t num_entries = 0;
 
     for (int i = 0; i < diagonal_count; ++i) {
         int offset = starting_offset + offset_step*i;
-        offsets.push_back(offset);
 
         int starting_row = offset < 0 ? -offset : 0;
         int starting_col = offset < 0 ? 0 : offset;
 
-        int ending_row = starting_row >= rows || starting_col >= cols
-                            ? 0
-                            : starting_row + std::min(rows - starting_row, cols - starting_col);
+        if (starting_row >= rows || starting_col >= cols)
+            throw std::runtime_error(
+                    "make_diagonal_symmetric_matrix: Too many diagonals.");
 
-        for (int row = starting_row; row < ending_row; ++row) {
-            num_entries++;
-            values(row, i) = 1;
-        }
+        offsets.push_back(offset);
     }
 
-    DiaMatrix res;
-
-    res.num_rows = rows;
-    res.num_cols = cols;
-    res.num_entries = num_entries;
-
-    res.diagonal_offsets.swap(offsets);
-    res.values.swap(values);
-
-    return res;
+    return make_diagonal_matrix(rows, cols, offsets);
 }
 
+
 } // namespace ktt
 
 } // namespace cusp
diff --git a/cusp/system/cuda/detail/multiply/csr_vector_spmv.h b/cusp/system/cuda/detail/multiply/csr_vector_spmv.h
index 7bf4b98d..6f9e2c5c 100644
--- a/cusp/system/cuda/detail/multiply/csr_vector_spmv.h
+++ b/cusp/system/cuda/detail/multiply/csr_vector_spmv.h
@@ -24,6 +24,13 @@
 
 #include <algorithm>
 
+#ifdef TIME_CSR
+#include <chrono>
+#include <iostream>
+
+namespace krn = std::chrono;
+#endif
+
 namespace cusp
 {
 namespace system
@@ -190,11 +197,22 @@ void __spmv_csr_vector(cuda::execution_policy<DerivedPolicy>& exec,
 
     cudaStream_t s = stream(thrust::detail::derived_cast(exec));
 
+    #ifdef TIME_CSR
+        auto start = krn::steady_clock::now();
+    #endif
+
     spmv_csr_vector_kernel<RowIterator, ColumnIterator, ValueIterator1, ValueIterator2, ValueIterator3,
                            UnaryFunction, BinaryFunction1, BinaryFunction2,
                            VECTORS_PER_BLOCK, THREADS_PER_VECTOR> <<<NUM_BLOCKS, THREADS_PER_BLOCK, 0, s>>>
                            (A.num_rows, A.row_offsets.begin(), A.column_indices.begin(), A.values.begin(), x.begin(), y.begin(),
                             initialize, combine, reduce);
+
+    #ifdef TIME_CSR
+        cudaStreamSynchronize(s);
+        auto end = krn::steady_clock::now();
+        size_t time = krn::duration_cast<krn::microseconds>(end - start).count();
+        std::cout << "cusp csr kernel: " << time << " us\n";
+    #endif
 }
 
 template <typename DerivedPolicy,
diff --git a/cusp/system/cuda/detail/multiply/dia_spmv.h b/cusp/system/cuda/detail/multiply/dia_spmv.h
index bf381825..81dbe069 100644
--- a/cusp/system/cuda/detail/multiply/dia_spmv.h
+++ b/cusp/system/cuda/detail/multiply/dia_spmv.h
@@ -26,6 +26,13 @@
 
 #include <algorithm>
 
+#ifdef TIME_DIA
+#include <chrono>
+#include <iostream>
+
+namespace krn = std::chrono;
+#endif
+
 namespace cusp
 {
 namespace system
@@ -165,8 +172,19 @@ void multiply(cuda::execution_policy<DerivedPolicy>& exec,
 
     cudaStream_t s = stream(thrust::detail::derived_cast(exec));
 
+#ifdef TIME_DIA
+    auto start = krn::steady_clock::now();
+#endif
+
     spmv_dia_kernel<OffsetsIterator, ValueIterator1, ValueIterator2, ValueIterator3, UnaryFunction, BinaryFunction1, BinaryFunction2, BLOCK_SIZE> <<<NUM_BLOCKS, BLOCK_SIZE, 0, s>>>
     (A.num_rows, A.num_cols, num_diagonals, pitch, A.diagonal_offsets.begin(), A.values.values.begin(), x.begin(), y.begin(), initialize, combine, reduce);
+
+#ifdef TIME_DIA
+    cudaStreamSynchronize(s);
+    auto end = krn::steady_clock::now();
+    size_t time = krn::duration_cast<krn::microseconds>(end - start).count();
+    std::cout << "cusp dia kernel: " << time << " us\n";
+#endif
 }
 
 } // end namespace detail
diff --git a/cusp/system/cuda/detail/multiply/ell_spmv.h b/cusp/system/cuda/detail/multiply/ell_spmv.h
index 2a5eab78..18bdecf5 100644
--- a/cusp/system/cuda/detail/multiply/ell_spmv.h
+++ b/cusp/system/cuda/detail/multiply/ell_spmv.h
@@ -28,6 +28,13 @@
 #include <cassert>
 #include <algorithm>
 
+#ifdef TIME_ELL
+#include <chrono>
+#include <iostream>
+
+namespace krn = std::chrono;
+#endif
+
 namespace cusp
 {
 namespace system
@@ -132,8 +139,19 @@ void multiply(cuda::execution_policy<DerivedPolicy>& exec,
 
     cudaStream_t s = stream(thrust::detail::derived_cast(exec));
 
+#ifdef TIME_ELL
+    auto start = krn::steady_clock::now();
+#endif
+
     spmv_ell_kernel<IndexType,ValueType,UnaryFunction,BinaryFunction1,BinaryFunction2,BLOCK_SIZE> <<<NUM_BLOCKS, BLOCK_SIZE, 0, s>>>
     (A.num_rows, A.num_cols, num_entries_per_row, pitch, J, V, x_ptr, y_ptr, initialize, combine, reduce);
+
+#ifdef TIME_ELL
+    cudaStreamSynchronize(s);
+    auto end = krn::steady_clock::now();
+    size_t time = krn::duration_cast<krn::microseconds>(end - start).count();
+    std::cout << "cusp ell kernel: " << time << " us\n";
+#endif
 }
 
 } // end namespace detail
diff --git a/cusp/system/cuda/detail/synchronize.inl b/cusp/system/cuda/detail/synchronize.inl
index 8256398e..73919e38 100644
--- a/cusp/system/cuda/detail/synchronize.inl
+++ b/cusp/system/cuda/detail/synchronize.inl
@@ -32,7 +32,7 @@ namespace detail
 
 void synchronize(const char *message)
 {
-  cudaError_t error = cudaThreadSynchronize();
+  cudaError_t error = cudaDeviceSynchronize();
   if(error)
   {
     throw thrust::system_error(error, thrust::cuda_category(), std::string("synchronize: ") + message);
diff --git a/cusp/system/cuda/ktt/csr_multiply.h b/cusp/system/cuda/ktt/csr_multiply.h
index 0c0deec8..d35a2fd8 100644
--- a/cusp/system/cuda/ktt/csr_multiply.h
+++ b/cusp/system/cuda/ktt/csr_multiply.h
@@ -1,25 +1,18 @@
-#pragma once 
+#pragma once
 
-#include <iostream>
+#include <cusp/detail/config.h>
 
 #include <cusp/ktt/detail/external/nameof.hpp>
-#include <cusp/ktt/detail/ktt.inl>
-#include <cusp/system/cuda/arch.h> // max_active_blocks
-
-#include <cusp/system/cuda/ktt/kernels/csr_kernel.h>
 #include <cusp/system/cuda/ktt/utils.h>
 
-namespace cusp::system::cuda::ktt::csr {
-    inline ::ktt::KernelId get_kernel_id(::ktt::KernelDefinitionId id,  std::string kernel_name) {
-        static ::ktt::KernelId kernel = cusp::ktt::detail::tuner->CreateSimpleKernel(kernel_name, id);
-        return kernel;
-    }
-}
+#include <cusp/csr_matrix.h>
+#include <cusp/array1d.h>
+#include <cusp/system/cuda/utils.h>
+
+#include <thrust/fill.h>
+
+#include <iostream>
 
-template<>
-inline ::ktt::KernelId cusp::ktt::detail::get_kernel_id<cusp::csr_format>() {
-    return cusp::system::cuda::ktt::csr::get_kernel_id(0, ""); 
-}
 
 namespace cusp {
 
@@ -31,109 +24,148 @@ namespace ktt {
 
 namespace csr {
 
-inline void setup_tuning_parameters(::ktt::Tuner& tuner, ::ktt::KernelId kernel) {
-    tuner.AddParameter(kernel, "TEST_PARAM", std::vector<uint64_t>{0, 1, 2});
+
+inline void setup_tuning_parameters(const kernel_context& kernel)
+{
+    auto& tuner = *kernel.tuner;
+    auto kernel_id = kernel.kernel_id;
+
+    tuner.AddParameter(kernel_id, "BLOCK_SIZE", std::vector<uint64_t>{ 128 });
+    tuner.AddParameter(kernel_id, "THREADS_PER_VECTOR", std::vector<uint64_t>{ 32 });
+
+    tuner.AddThreadModifier(
+        kernel.kernel_id,
+        { kernel.definition_ids[0] },
+        ::ktt::ModifierType::Local,
+        ::ktt::ModifierDimension::X,
+        { std::string("BLOCK_SIZE") },
+        [] (const uint64_t defaultSize, const std::vector<uint64_t>& parameters) {
+            return parameters[0];
+        }
+    );
 }
 
-} // namespace csr
 
-template <typename DerivedPolicy,
-          typename MatrixType,
-          typename VectorType1,
-          typename VectorType2>
-void multiply(cuda::execution_policy<DerivedPolicy>& exec,
-              const MatrixType& A,
-              const VectorType1& x,
-              VectorType2& y,
-              cusp::csr_format)
+template<typename IndexType,
+         typename ValueType1,
+         typename ValueType2,
+         typename ValueType3>
+kernel_context initialize_kernel(::ktt::Tuner& tuner)
 {
-    static_assert( std::is_same_v<cusp::device_memory, typename MatrixType::memory_space>
-                   && std::is_same_v<cusp::device_memory, typename VectorType1::memory_space>
-                   && std::is_same_v<cusp::device_memory, typename VectorType2::memory_space>, 
-                   "All arguments must be in device memory." );
+    std::string kernel_path =
+        STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/csr_kernel.h";
 
-    using ValueType = typename MatrixType::value_type;
+    kernel_context kernel(tuner);
 
-    if (A.num_entries == 0) {
-        thrust::fill(y.begin(), y.end(), ValueType(0));
-        return;
-    }
+    std::vector< std::string > type_names {
+        std::string(NAMEOF_TYPE(IndexType)),
+        std::string(NAMEOF_TYPE(ValueType1)),
+        std::string(NAMEOF_TYPE(ValueType2)),
+        std::string(NAMEOF_TYPE(ValueType3)),
+    };
 
-    std::cout << "Hello from out thing\n";
+    // NOTE: These can be anything since they are awlays set in the launcher.
+    // So use some values that will hopefully cause a crash if not set properly
+    // in the launcher.
+    ::ktt::DimensionVector block_size(0);
+    ::ktt::DimensionVector grid_size(0);
 
-    // copied from csr_vector_spmv.h
-    typedef typename MatrixType::row_offsets_array_type::const_iterator     RowIterator;
-    typedef typename MatrixType::column_indices_array_type::const_iterator  ColumnIterator;
-    typedef typename MatrixType::values_array_type::const_iterator          ValueIterator1;
+    auto definition_id = tuner.AddKernelDefinitionFromFile(
+        "ktt_csr_vector_kernel",
+        kernel_path,
+        grid_size,
+        block_size,
+        type_names
+    );
 
-    typedef typename VectorType1::const_iterator                            ValueIterator2;
-    typedef typename VectorType2::iterator                                  ValueIterator3;
+    kernel.definition_ids.push_back(definition_id);
+    kernel.kernel_id = tuner.CreateSimpleKernel("CsrKernel", definition_id);
 
-    auto& tuner = *cusp::ktt::detail::tuner;
+    setup_tuning_parameters(kernel);
 
-    std::string path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/csr_kernel.h";
+    return kernel;
+}
 
-    const size_t THREADS_PER_BLOCK  = 128;
-    const size_t THREADS_PER_VECTOR = 32;
-    const size_t VECTORS_PER_BLOCK  = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
-    
-    std::vector< std::string > type_names {
-        std::string(NAMEOF_TYPE(typename MatrixType::index_type)),
-        std::string(NAMEOF_TYPE(typename MatrixType::value_type)),
-        std::string(NAMEOF_TYPE(typename VectorType1::value_type)),
-        std::string(NAMEOF_TYPE(typename VectorType2::value_type)),
-        std::to_string(VECTORS_PER_BLOCK),
-        std::to_string(THREADS_PER_VECTOR)
-    };
 
-    // TODO: we probably wanna change this later
-    const size_t MAX_BLOCKS = cusp::system::cuda::detail::max_active_blocks(
-                                  ktt_csr_vector_kernel<typename MatrixType::index_type, typename MatrixType::value_type,
-                                  typename VectorType1::value_type, typename VectorType2::value_type,
-                                  VECTORS_PER_BLOCK, THREADS_PER_VECTOR>, THREADS_PER_BLOCK, (size_t) 0);
-    const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, VECTORS_PER_BLOCK));
+} // namespace csr
+
+
+template<typename IndexType,
+         typename ValueType1,
+         typename ValueType2,
+         typename ValueType3>
+const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::csr_format format)
+{
+    static kernel_context kernel =
+        csr::initialize_kernel<IndexType, ValueType1, ValueType2, ValueType3>(tuner);
+
+    return kernel;
+}
+
 
-    const ::ktt::DimensionVector blockDimensions(THREADS_PER_BLOCK);
-    const ::ktt::DimensionVector gridDimensions(NUM_BLOCKS);
+template <typename IndexType,
+          typename ValueType1,
+          typename ValueType2,
+          typename ValueType3>
+std::vector<::ktt::ArgumentId>
+add_arguments(const kernel_context& kernel,
+              const cusp::csr_matrix<IndexType, ValueType1, cusp::device_memory>& A,
+              const cusp::array1d<ValueType2, cusp::device_memory>& x,
+              cusp::array1d<ValueType3, cusp::device_memory>& y)
+{
+    auto args = add_arguments(*kernel.tuner,
+                              A.num_rows, A.row_offsets, A.column_indices,
+                              A.values, x, y);
 
-    ::ktt::KernelDefinitionId definition = tuner.GetKernelDefinitionId("ktt_csr_vector_kernel", type_names);
+    kernel.tuner->SetArguments(kernel.definition_ids[0], args);
 
-    bool called_first_time = definition == ::ktt::InvalidKernelDefinitionId; 
+    return args;
+}
 
-    if (called_first_time) {
-        definition = tuner.AddKernelDefinitionFromFile(
-            "ktt_csr_vector_kernel", 
-            path, 
-            gridDimensions,
-            blockDimensions,
-            type_names);
-    }
 
-    ::ktt::KernelId kernel = csr::get_kernel_id(definition, "CsrKernel");
+// Returns the argument id of the y vector given a vector of arguments
+// returned by a previous call of `add_arguments`.
+inline ::ktt::ArgumentId
+get_output_argument(const std::vector<::ktt::ArgumentId>& arguments,
+                    cusp::csr_format)
+{
+    return arguments[5];
+}
 
-    if (called_first_time) {
-        csr::setup_tuning_parameters(tuner, kernel);
-    }
 
-    auto num_rows_id = tuner.AddArgumentScalar(A.num_rows);
-    auto Ap_id = add_arg(A.row_offsets); 
-    auto Aj_id = add_arg(A.column_indices);
-    auto Ax_id = add_arg(A.values);
-    auto x_id = add_arg(x);
-    auto y_id = add_arg(y, ::ktt::ArgumentAccessType::ReadWrite);
+template <typename IndexType,
+          typename ValueType1,
+          typename ValueType2,
+          typename ValueType3>
+auto get_launcher(const kernel_context& ctx,
+                  const cusp::csr_matrix<IndexType, ValueType1, cusp::device_memory>& A,
+                  const cusp::array1d<ValueType2, cusp::device_memory>& x,
+                  cusp::array1d<ValueType3, cusp::device_memory>& y,
+                  bool profile = false)
+{
+    return [&, profile] (::ktt::ComputeInterface& interface)
+    {
+        auto conf = interface.GetCurrentConfiguration();
 
-    std::vector<::ktt::ArgumentId> args = { num_rows_id, Ap_id, Aj_id, Ax_id, x_id, y_id };
+        ::ktt::DimensionVector block_size =
+            interface.GetCurrentLocalSize(ctx.definition_ids[0]);
 
-    tuner.SetArguments(definition, args);
+        auto threads_per_vector = get_parameter_uint(conf, "THREADS_PER_VECTOR");
+        auto vectors_per_block = block_size.GetSizeX() / threads_per_vector;
 
-    tuner.TuneIteration(kernel, {});
+        ::ktt::DimensionVector grid_size(
+            DIVIDE_INTO(A.num_rows, vectors_per_block));
 
-    tuner.SetArguments(definition, {});
-    for (auto arg : args) {
-        tuner.RemoveArgument(arg);
-    }
+        if (!profile) {
+            interface.RunKernel(ctx.definition_ids[0], grid_size, block_size);
+        } else {
+            interface.RunKernelWithProfiling(ctx.definition_ids[0],
+                                             grid_size, block_size);
+        }
+    };
 }
 
+
 } // namespace ktt
 
 } // namespace cuda
diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h
index 6a7f31b2..8380b6f2 100644
--- a/cusp/system/cuda/ktt/dia_multiply.h
+++ b/cusp/system/cuda/ktt/dia_multiply.h
@@ -1,14 +1,14 @@
 #pragma once
 
-#include <cusp/ktt/detail/external/nameof.hpp>
-#include <cusp/system/cuda/arch.h> // max_active_blocks
+#include <cusp/detail/config.h>
 
-#include <cusp/system/cuda/ktt/kernels/dia_kernel.h>
 #include <cusp/system/cuda/ktt/kernel.h>
 #include <cusp/system/cuda/ktt/utils.h>
+#include <cusp/ktt/detail/external/nameof.hpp>
+
+#include <cusp/system/cuda/utils.h>
+#include <cusp/dia_matrix.h>
 
-#include <optional>
-#include <iostream>
 
 namespace cusp {
 
@@ -20,36 +20,64 @@ namespace ktt {
 
 namespace dia {
 
-inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel)
+
+inline void setup_tuning_parameters(const kernel_context& kernel)
 {
-    tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector<uint64_t>{ 0, 1, 2 });
+    auto& tuner = *kernel.tuner;
+    auto kernel_id = kernel.kernel_id;
+
+    tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 });
+    tuner.AddParameter(kernel_id, "PREFETCH_FACTOR",
+                       u64_vec{ 0, 2, 3, 4 });
+    tuner.AddParameter(kernel_id, "PREFETCH_TYPE", u64_vec{ 0, 1 });
+    tuner.AddParameter(kernel_id, "SPECIAL_LOADS", u64_vec{ 0, 1 });
+
+
+    tuner.AddThreadModifier(
+        kernel.kernel_id,
+        { kernel.definition_ids[0] },
+        ::ktt::ModifierType::Local,
+        ::ktt::ModifierDimension::X,
+        { std::string("BLOCK_SIZE") },
+        [] (const uint64_t defaultSize, const u64_vec& parameters) {
+            return parameters[0];
+        });
+
+
+    // Different register prefetching implementations are used
+    // only when register prefetching is actually enabled
+    tuner.AddConstraint(
+        kernel.kernel_id,
+        { "PREFETCH_FACTOR", "PREFETCH_TYPE" },
+        [] (const std::vector<uint64_t>& values) {
+            return values[0] > 0 || values[1] == 0;
+        });
 }
 
-} // namespace dia
-
 
-template<typename IndexType, typename ValueType1, typename ValueType2, typename ValueType3>
-kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format)
+template<typename IndexType,
+         typename ValueType1,
+         typename ValueType2,
+         typename ValueType3>
+kernel_context initialize_kernel(::ktt::Tuner& tuner)
 {
     kernel_context kernel(tuner);
 
-    std::string kernel_path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h";
-
-    const size_t BLOCK_SIZE = 256;
-    const size_t MAX_BLOCKS = cusp::system::cuda::detail::max_active_blocks(
-                                ktt_dia_vector_kernel<IndexType, ValueType1, ValueType2, ValueType3, BLOCK_SIZE>,
-                                BLOCK_SIZE, 0);
+    std::string kernel_path =
+        STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h";
 
     std::vector< std::string > type_names {
         std::string(NAMEOF_TYPE(IndexType)),
         std::string(NAMEOF_TYPE(ValueType1)),
         std::string(NAMEOF_TYPE(ValueType2)),
-        std::string(NAMEOF_TYPE(ValueType3)),
-        std::to_string(BLOCK_SIZE)
+        std::string(NAMEOF_TYPE(ValueType3))
     };
 
-    const ::ktt::DimensionVector blockDimensions(BLOCK_SIZE);
-    const ::ktt::DimensionVector gridDimensions(MAX_BLOCKS);
+    // NOTE: These can be anything since they are awlays set in the launcher.
+    // So use some values that will hopefully cause a crash if not set properly
+    // in the launcher.
+    const ::ktt::DimensionVector blockDimensions(0);
+    const ::ktt::DimensionVector gridDimensions(0);
 
     auto definition_id = tuner.AddKernelDefinitionFromFile(
         "ktt_dia_vector_kernel",
@@ -61,18 +89,28 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format)
     kernel.definition_ids.push_back(definition_id);
     kernel.kernel_id = tuner.CreateSimpleKernel("DiaKernel", definition_id);
 
-    dia::setup_tuning_parameters(tuner, kernel);
+    dia::setup_tuning_parameters(kernel);
 
     return kernel;
 }
 
+
+} // namespace dia
+
+
 template<typename IndexType,
          typename ValueType1,
          typename ValueType2,
-         typename ValueType3>
-const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::dia_format format)
+         typename ValueType3,
+         typename MemorySpace>
+const kernel_context& get_kernel(
+        ::ktt::Tuner& tuner,
+        const cusp::dia_matrix<IndexType, ValueType1, MemorySpace>& A)
 {
-    static kernel_context kernel = initialize_kernel<IndexType, ValueType1, ValueType2, ValueType3>(tuner, format);
+    constexpr auto init_kernel =
+        dia::initialize_kernel<IndexType, ValueType1, ValueType2, ValueType3>;
+
+    static kernel_context kernel = init_kernel(tuner);
     return kernel;
 }
 
@@ -81,138 +119,61 @@ template <typename IndexType,
           typename ValueType2,
           typename ValueType3>
 std::vector<::ktt::ArgumentId> add_arguments(
-                                    const kernel_context& kernel,
-                                    const cusp::dia_matrix<IndexType, ValueType1, cusp::device_memory>& A,
-                                    const cusp::array1d<ValueType2, cusp::device_memory>& x,
-                                    cusp::array1d<ValueType3, cusp::device_memory>& y)
+        const kernel_context& kernel,
+        const cusp::dia_matrix<IndexType, ValueType1, cusp::device_memory>& A,
+        const cusp::array1d<ValueType2, cusp::device_memory>& x,
+        cusp::array1d<ValueType3, cusp::device_memory>& y)
 {
     const IndexType num_diagonals = A.values.num_cols;
-    const IndexType pitch = A.values.pitch; // length of one diagonal + possible padding
+    const IndexType pitch = A.values.pitch;
+
+    auto args =
+        add_arguments(*kernel.tuner, A.num_rows, A.num_cols, num_diagonals,
+                      pitch, A.diagonal_offsets, A.values.values, x, y);
 
-    auto args = add_arguments(*kernel.tuner, A.num_rows, A.num_cols, num_diagonals, pitch, A.diagonal_offsets, A.values.values, x, y);
     kernel.tuner->SetArguments(kernel.definition_ids[0], args);
 
     return args;
 }
 
 
-// Returns the argument id of the y vector given a vector of arguments returned by a previous
-// call of `add_arguments`.
-inline ::ktt::ArgumentId get_output_argument(const std::vector<::ktt::ArgumentId>& arguments, cusp::dia_format) {
-    return arguments[7];
-}
-
-auto get_launcher(const kernel_context& ctx,
-                  size_t num_rows,
-                  size_t num_cols,
-                  bool profile = false)
+// Returns the argument id of the y vector given a vector of arguments
+// returned by a previous call of `add_arguments`.
+inline ::ktt::ArgumentId get_output_argument(
+        const std::vector<::ktt::ArgumentId>& arguments,
+        cusp::dia_format)
 {
-    return [=] (::ktt::ComputeInterface& interface) {
-        ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]);
-        ::ktt::DimensionVector grid_size( DIVIDE_INTO(num_rows, block_size.GetSizeX()) );
-
-        if (!profile) {
-            interface.RunKernel(ctx.definition_ids[0], grid_size, block_size);
-        } else {
-            do {
-                interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size);
-            } while(interface.GetRemainingProfilingRuns(ctx.definition_ids[0]) > 0);
-        }
-    };
+    return arguments[7];
 }
 
-template <typename IndexType,
-          typename ValueType1,
-          typename ValueType2,
-          typename ValueType3>
-::ktt::KernelResult multiply(::ktt::Tuner& tuner,
-                             const cusp::dia_matrix<IndexType, ValueType1, cusp::device_memory>& A,
-                             const cusp::array1d<ValueType2, cusp::device_memory>& x,
-                             cusp::array1d<ValueType3, cusp::device_memory>& y)
-{
-    if (A.num_entries == 0) {
-        thrust::fill(y.begin(), y.end(), ValueType3(0));
-        ::ktt::KernelResult result("DiaKernel", {});
-        result.SetStatus(::ktt::ResultStatus::Ok);
-        return result;
-    }
-
-    const kernel_context& kernel = get_kernel<IndexType, ValueType1, ValueType2, ValueType3>(tuner, cusp::dia_format{});
-    auto args = add_arguments(kernel, A, x, y);
-
-    tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols));
-
-    auto res = tuner.TuneIteration(kernel.kernel_id, {});
-    remove_arguments(kernel, args);
-
-    return res;
-}
 
 template <typename IndexType,
           typename ValueType1,
           typename ValueType2,
           typename ValueType3>
-::ktt::KernelResult multiply(::ktt::Tuner& tuner,
-                             const cusp::dia_matrix<IndexType, ValueType1, cusp::device_memory>& A,
-                             const cusp::array1d<ValueType2, cusp::device_memory>& x,
-                             cusp::array1d<ValueType3, cusp::device_memory>& y,
-                             const ::ktt::KernelConfiguration& configuration,
-                             bool run_with_profiling = false)
+auto get_launcher(
+        const kernel_context& ctx,
+        const cusp::dia_matrix<IndexType, ValueType1, cusp::device_memory>& A,
+        const cusp::array1d<ValueType2, cusp::device_memory>& x,
+        cusp::array1d<ValueType3, cusp::device_memory>& y,
+        bool profile = false)
 {
-    if (A.num_entries == 0) {
-        thrust::fill(y.begin(), y.end(), ValueType3(0));
-        ::ktt::KernelResult result("DiaKernel", {});
-        result.SetStatus(::ktt::ResultStatus::Ok);
-        return result;
-    }
-
-    kernel_context kernel = get_kernel<IndexType, ValueType1, ValueType2, ValueType3>(tuner, cusp::dia_format{});
-    auto args = add_arguments(kernel, A, x, y);
-
-    tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols, run_with_profiling));
-
-    auto result = tuner.Run(kernel.kernel_id, configuration, {});
-    remove_arguments(kernel, args);
+    return [&, profile] (::ktt::ComputeInterface& interface)
+    {
+        ::ktt::DimensionVector block_size =
+            interface.GetCurrentLocalSize(ctx.definition_ids[0]);
+        ::ktt::DimensionVector grid_size(
+            DIVIDE_INTO(A.num_rows, block_size.GetSizeX()));
 
-    return result;
+        if (!profile) {
+            interface.RunKernel(ctx.definition_ids[0], grid_size, block_size);
+        } else {
+            interface.RunKernelWithProfiling(ctx.definition_ids[0],
+                                             grid_size, block_size);
+        }
+    };
 }
 
-template <typename IndexType,
-          typename ValueType1,
-          typename ValueType2,
-          typename ValueType3>
-std::vector<::ktt::KernelResult>
-tune(::ktt::Tuner& tuner,
-     const cusp::dia_matrix<IndexType, ValueType1, cusp::device_memory>& A,
-     const cusp::array1d<ValueType2, cusp::device_memory>& x,
-     cusp::array1d<ValueType3, cusp::device_memory>& y,
-     std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt)
-{
-    if (A.num_entries == 0) {
-        return {};
-    }
-
-    kernel_context kernel = get_kernel<IndexType, ValueType1, ValueType2, ValueType3>(tuner, cusp::dia_format{});
-    auto args = add_arguments(kernel, A, x, y);
-
-    if (reference_computation) {
-        tuner.SetReferenceComputation(get_output_argument(args, dia_format{}), reference_computation.value());
-    }
-
-    cusp::array1d<ValueType3, cusp::host_memory> host_y = y;
-    tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface) {
-        // clear y so previous results don't affect validation
-        y = host_y;
-        auto launcher = cusp::system::cuda::ktt::get_launcher(kernel, A.num_rows, A.num_cols);
-        launcher(interface);
-    });
-
-    auto results = tuner.Tune(kernel.kernel_id);
-
-    remove_arguments(kernel, args);
-
-    return results;
-}
 
 } // namespace ktt
 
diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h
new file mode 100644
index 00000000..d4ff4703
--- /dev/null
+++ b/cusp/system/cuda/ktt/ell_multiply.h
@@ -0,0 +1,264 @@
+#pragma once
+
+#include <cusp/ktt/ellr_matrix.h>
+
+#include <cusp/system/cuda/ktt/kernel.h>
+#include <cusp/system/cuda/ktt/utils.h>
+#include <cusp/ell_matrix.h>
+
+namespace cusp {
+
+namespace system {
+
+namespace cuda {
+
+namespace ktt {
+
+namespace ell {
+
+
+inline void setup_common_tuning_parameters(const kernel_context& kernel)
+{
+    auto& tuner = *kernel.tuner;
+    auto kernel_id = kernel.kernel_id;
+
+    tuner.AddParameter(kernel_id, "BLOCK_SIZE",      u64_vec{ 128, 256, 512 });
+    tuner.AddParameter(kernel_id, "SPECIAL_LOADS",  u64_vec{ 0, 1 });
+    tuner.AddParameter(kernel_id, "DISABLE_UNROLL",  u64_vec{ 0, 1 });
+    tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 });
+    tuner.AddParameter(kernel_id, "THREADS_PER_ROW", u64_vec{ 1, 2, 4, 8 });
+
+    tuner.AddThreadModifier(
+        kernel_id,
+        { kernel.definition_ids[0] },
+        ::ktt::ModifierType::Local,
+        ::ktt::ModifierDimension::X,
+        { std::string("BLOCK_SIZE") },
+        [] (const uint64_t defaultSize, const std::vector<uint64_t>& parameters) {
+            return parameters[0];
+        }
+    );
+
+    // All the warps working on one stripe of the matrix fit in a thread block.
+    tuner.AddConstraint(
+            kernel.kernel_id,
+            { "THREADS_PER_ROW", "BLOCK_SIZE" },
+            [] (const std::vector<uint64_t>& values) {
+                return 32*values[0] <= values[1];
+            });
+}
+
+inline void setup_ell_tuning_parameters(const kernel_context& kernel)
+{
+    auto& tuner = *kernel.tuner;
+    auto kernel_id = kernel.kernel_id;
+
+    setup_common_tuning_parameters(kernel);
+
+    tuner.AddParameter(kernel_id, "BREAK", u64_vec{ 0, 1 });
+
+    // BREAK can be used only when there is no prefetching
+    tuner.AddConstraint(
+            kernel.kernel_id,
+            { "BREAK", "PREFETCH_FACTOR" },
+            [] (const std::vector<uint64_t>& values) {
+                return values[0] == 0 || values[1] == 0;
+            });
+}
+
+inline void setup_ellr_tuning_parameters(const kernel_context& kernel)
+{
+    auto& tuner = *kernel.tuner;
+    auto kernel_id = kernel.kernel_id;
+
+    setup_common_tuning_parameters(kernel);
+
+    tuner.AddParameter(kernel_id, "ELLR", u64_vec{ 1 });
+}
+
+
+template<typename IndexType,
+         typename ValueType1,
+         typename ValueType2,
+         typename ValueType3,
+         typename TuningParamsInit>
+kernel_context initialize_kernel(::ktt::Tuner& tuner,
+                                 const std::string& kernel_function,
+                                 const std::string& kernel_name,
+                                 TuningParamsInit init)
+{
+    std::string kernel_path =
+        STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/ell_kernel.h";
+
+    kernel_context kernel(tuner);
+
+    std::vector< std::string > type_names {
+        std::string(NAMEOF_TYPE(IndexType)),
+        std::string(NAMEOF_TYPE(ValueType1))
+    };
+
+    // NOTE: These can be anything since they are awlays set in the launcher.
+    // So use some values that will hopefully cause a crash if not set properly
+    // in the launcher.
+    ::ktt::DimensionVector block_size(0);
+    ::ktt::DimensionVector grid_size(0);
+
+    auto definition_id = tuner.AddKernelDefinitionFromFile(
+        kernel_function,
+        kernel_path,
+        grid_size,
+        block_size,
+        type_names
+    );
+
+    kernel.definition_ids.push_back(definition_id);
+    kernel.kernel_id = tuner.CreateSimpleKernel(kernel_name, definition_id);
+
+    init(kernel);
+
+    return kernel;
+}
+
+
+} // namespace ell
+
+
+template<typename IndexType,
+         typename ValueType1,
+         typename ValueType2,
+         typename ValueType3>
+const kernel_context& get_kernel(
+        ::ktt::Tuner& tuner,
+        const cusp::ell_matrix<IndexType, ValueType1, cusp::device_memory>& A)
+{
+    static kernel_context kernel =
+        ell::initialize_kernel<IndexType, ValueType1, ValueType2, ValueType3>(
+            tuner, "ktt_ell_kernel", "EllKernel",
+            ell::setup_ell_tuning_parameters);
+
+    return kernel;
+}
+
+template<typename IndexType,
+         typename ValueType1,
+         typename ValueType2,
+         typename ValueType3>
+const kernel_context& get_kernel(
+    ::ktt::Tuner& tuner,
+    const cusp::ktt::ellr_matrix<IndexType, ValueType1, cusp::device_memory>& A)
+{
+    static kernel_context kernel =
+        ell::initialize_kernel<IndexType, ValueType1, ValueType2, ValueType3>(
+            tuner, "ktt_ellr_kernel", "EllrKernel",
+            ell::setup_ellr_tuning_parameters);
+
+    return kernel;
+}
+
+
+template <typename IndexType,
+          typename ValueType1,
+          typename ValueType2,
+          typename ValueType3>
+std::vector<::ktt::ArgumentId>
+add_common_arguments(
+        const kernel_context& kernel,
+        const cusp::ell_matrix<IndexType, ValueType1, cusp::device_memory>& A,
+        const cusp::array1d<ValueType2, cusp::device_memory>& x,
+        cusp::array1d<ValueType3, cusp::device_memory>& y)
+{
+    IndexType pitch = A.column_indices.pitch;
+    IndexType num_entries_per_row = A.column_indices.num_cols;
+
+    return add_arguments(*kernel.tuner, A.num_rows, A.num_cols,
+                         num_entries_per_row, pitch, A.column_indices.values,
+                         A.values.values, x, y);
+}
+
+template <typename IndexType,
+          typename ValueType1,
+          typename ValueType2,
+          typename ValueType3>
+std::vector<::ktt::ArgumentId>
+add_arguments(const kernel_context& kernel,
+              const cusp::ell_matrix<IndexType, ValueType1, cusp::device_memory>& A,
+              const cusp::array1d<ValueType2, cusp::device_memory>& x,
+              cusp::array1d<ValueType3, cusp::device_memory>& y)
+{
+    auto args = add_common_arguments(kernel, A, x, y);
+    kernel.tuner->SetArguments(kernel.definition_ids[0], args);
+    return args;
+}
+
+template <typename IndexType,
+          typename ValueType1,
+          typename ValueType2,
+          typename ValueType3>
+std::vector<::ktt::ArgumentId>
+add_arguments(
+    const kernel_context& kernel,
+    const cusp::ktt::ellr_matrix<IndexType, ValueType1, cusp::device_memory>& A,
+    const cusp::array1d<ValueType2, cusp::device_memory>& x,
+    cusp::array1d<ValueType3, cusp::device_memory>& y)
+{
+    auto args = add_common_arguments(kernel, A, x, y);
+    auto aditional_args = add_arguments(*kernel.tuner, A.row_lengths);
+
+    args.push_back(aditional_args[0]);
+
+    kernel.tuner->SetArguments(kernel.definition_ids[0], args);
+
+    return args;
+}
+
+
+// Returns the argument id of the y vector given a vector of arguments
+// returned by a previous call of `add_arguments`.
+inline ::ktt::ArgumentId
+get_output_argument(const std::vector<::ktt::ArgumentId>& arguments,
+                    cusp::ell_format)
+{
+    return arguments[7];
+}
+
+
+template <typename IndexType,
+          typename ValueType1,
+          typename ValueType2,
+          typename ValueType3>
+auto get_launcher(const kernel_context& ctx,
+                  const cusp::ell_matrix<IndexType, ValueType1, cusp::device_memory>& A,
+                  const cusp::array1d<ValueType2, cusp::device_memory>& x,
+                  cusp::array1d<ValueType3, cusp::device_memory>& y,
+                  bool profile = false)
+{
+    return [&, profile] (::ktt::ComputeInterface& interface)
+    {
+        const auto& conf = interface.GetCurrentConfiguration();
+        auto threads_per_row = get_parameter_uint(conf, "THREADS_PER_ROW");
+
+        auto threads_in_block =
+            interface.GetCurrentLocalSize(ctx.definition_ids[0]).GetSizeX();
+
+        ::ktt::DimensionVector block_size(
+            threads_in_block/threads_per_row, threads_per_row);
+        ::ktt::DimensionVector grid_size(
+            DIVIDE_INTO(A.num_rows, threads_in_block/threads_per_row));
+
+        if (!profile) {
+            interface.RunKernel(ctx.definition_ids[0], grid_size, block_size);
+        } else {
+            interface.RunKernelWithProfiling(ctx.definition_ids[0],
+                                             grid_size, block_size);
+        }
+    };
+}
+
+
+} // namespace ktt
+
+} // namespace cuda
+
+} // namespace system
+
+} // namespace cusp
diff --git a/cusp/system/cuda/ktt/kernel.h b/cusp/system/cuda/ktt/kernel.h
index a34c8b16..4061c558 100644
--- a/cusp/system/cuda/ktt/kernel.h
+++ b/cusp/system/cuda/ktt/kernel.h
@@ -16,15 +16,17 @@ namespace ktt {
 struct kernel_context {
     ::ktt::Tuner* tuner = nullptr;
     ::ktt::KernelId kernel_id = ::ktt::InvalidKernelId;
-    
+
     // This is a vector to allow for composite kernels.
     std::vector<::ktt::KernelDefinitionId> definition_ids;
 
     kernel_context(::ktt::Tuner& tuner) : tuner(&tuner) { }
-    
-    kernel_context(::ktt::Tuner& tuner, const std::vector<::ktt::KernelDefinitionId>& definition_ids, ::ktt::KernelId kernel_id)
+
+    kernel_context(::ktt::Tuner& tuner,
+                   const std::vector<::ktt::KernelDefinitionId>& definition_ids,
+                   ::ktt::KernelId kernel_id)
         : tuner(&tuner),
-          definition_ids(definition_ids), 
+          definition_ids(definition_ids),
           kernel_id(kernel_id) { }
 };
 
diff --git a/cusp/system/cuda/ktt/kernels/csr_kernel.h b/cusp/system/cuda/ktt/kernels/csr_kernel.h
index f28773a1..85cd6fa6 100644
--- a/cusp/system/cuda/ktt/kernels/csr_kernel.h
+++ b/cusp/system/cuda/ktt/kernels/csr_kernel.h
@@ -1,17 +1,11 @@
+#pragma once
 
-inline __device__ int fib(int n) {
-    if (n < 2) {
-        return n;
-    }
-    return fib(n-1) + fib(n-2);
-}
+#define VECTORS_PER_BLOCK (BLOCK_SIZE/THREADS_PER_VECTOR)
 
 template <typename IndexType,
           typename ValueType1,
           typename ValueType2,
-          typename ValueType3,
-          unsigned int VECTORS_PER_BLOCK,
-          unsigned int THREADS_PER_VECTOR>
+          typename ValueType3>
 __global__ void
 ktt_csr_vector_kernel(const unsigned int num_rows,
                        const IndexType*   Ap,
@@ -20,17 +14,6 @@ ktt_csr_vector_kernel(const unsigned int num_rows,
                        const ValueType2*  x,
                        ValueType3*        y)
 {
-
-#if TEST_PARAM == 0
-    if (threadIdx.x == 0 && blockIdx.x == 0) {
-        printf("Test param is 0.\n");
-    }
-#else
-    if (threadIdx.x == 0 && blockIdx.x == 0) {
-        printf("Fib of 20 is %d.\n", fib(20));
-    }
-#endif
-
     typedef ValueType1 ValueType;
 
     __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h
index df7ade63..9c64f6e1 100644
--- a/cusp/system/cuda/ktt/kernels/dia_kernel.h
+++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h
@@ -1,217 +1,252 @@
-#pragma once
 
+template<typename T>
+__device__ T min(T a, T b) {
+    return a < b ? a : b;
+}
 
-template <typename IndexType,
-          typename ValueType1,
-          typename ValueType2,
-          typename ValueType3,
-          unsigned int BLOCK_SIZE>
-__device__ void
-ktt_dia_vector_kernel_simple(
-                const int num_rows,
-                const int num_cols,
-                const int num_diagonals,
-                const int pitch,
-                const IndexType* diagonal_offsets,
-                const ValueType1* values,
-                const ValueType2* x,
-                ValueType3* y)
+template<typename T>
+__device__ T max(T a, T b) {
+    return a > b ? a : b;
+}
+
+
+template<typename T>
+__device__ T load_diag_val(const T* val)
 {
-    using ValueType = ValueType3;
+#if SPECIAL_LOADS == 0
+    return *val;
+#elif SPECIAL_LOADS == 1
+    // Cache streaming, likely to be accessed once.
+    // The ld.cs load cached streaming operation allocates global lines with
+    // evict-first policy in L1 and L2
+    // to limit cache pollution by temporary streaming data
+    return __ldcs(val);
+#endif
+}
 
-    const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x;
+template<typename T>
+__device__ T load_x_val(const T* val)
+{
+#if SPECIAL_LOADS == 0
+    return *val;
+#elif SPECIAL_LOADS == 1
+    // Needs compute capability at least 3.5
+    return __ldg(val);
+#endif
+}
 
-    if (thread_id < num_rows) {
-        ValueType sum = ValueType(0);
-        for (IndexType i = 0; i < num_diagonals; ++i) {
-            IndexType col = diagonal_offsets[i] + thread_id;
-            if (col >= 0 && col < num_cols) {
-                auto val = values[i*pitch + thread_id];
-                auto a = x[col];
-                sum +=val*a;
-            }
+
+template<typename IndexType,
+         typename ValueType1,
+         typename ValueType2,
+         int K>
+struct UnrolledLoop : UnrolledLoop<IndexType, ValueType1, ValueType2, K - 1>
+{
+    using parent = UnrolledLoop<IndexType, ValueType1, ValueType2, K - 1>;
+
+    IndexType col;
+    ValueType1 diag_val = 0;
+    ValueType2 x_val = 0;
+
+    __device__ void prefetch_vals(const IndexType* offsets,
+                                  const ValueType1* values,
+                                  const ValueType2* x,
+                                  int pitch,
+                                  int cols,
+                                  IndexType row,
+                                  IndexType offset_base,
+                                  IndexType i)
+    {
+        col = offsets[i] + row;
+
+        if (col >= 0 && col < cols)
+        {
+            diag_val = load_diag_val(&values[(offset_base + i)*pitch + row]);
+            x_val = load_x_val(&x[col]);
         }
-        y[thread_id] = sum;
+
+        parent::prefetch_vals(offsets, values, x, pitch, cols, row,
+                              offset_base, i + 1);
+    }
+
+    template<typename ValueType3>
+    __device__ ValueType3 accumulate()
+    {
+        return static_cast<ValueType3>(diag_val*x_val)
+                + parent::template accumulate<ValueType3>();
+    }
+};
+
+template<typename IndexType,
+         typename ValueType1,
+         typename ValueType2>
+struct UnrolledLoop<IndexType, ValueType1, ValueType2, 0>
+{
+    __device__ void prefetch_vals(const IndexType* offsets,
+                                  const ValueType1* values,
+                                  const ValueType2* x,
+                                  int pitch,
+                                  int cols,
+                                  IndexType row,
+                                  IndexType offset_base,
+                                  IndexType i) {}
+
+    template<typename ValueType3>
+    __device__ ValueType3 accumulate() { return 0; }
+};
+
+
+#define PREFETCH_VALS_ITERATION(iter)                                          \
+    IndexType col##iter       = offsets[i + iter - 1] + row;             \
+    ValueType1 diag_val##iter = 0;                                             \
+    ValueType2 x_val##iter    = 0;                                             \
+    if (col##iter >= 0 && col##iter < num_cols)                                \
+    {                                                                          \
+        IndexType idx = (offset_base + i + iter - 1)*pitch + row    ;        \
+        diag_val##iter = load_diag_val(&values[idx]);                          \
+        x_val##iter    = load_x_val(&x[col##iter]);                            \
     }
-}
+
+#define ACCUMULATE_ITERATION(iter) \
+    sum += diag_val##iter * x_val##iter;
+
+#define REPEAT1(ITER) ITER(1);
+#define REPEAT2(ITER) ITER(2); REPEAT1(ITER)
+#define REPEAT3(ITER) ITER(3); REPEAT2(ITER)
+#define REPEAT4(ITER) ITER(4); REPEAT3(ITER)
+#define REPEAT5(ITER) ITER(5); REPEAT4(ITER)
+#define REPEAT6(ITER) ITER(6); REPEAT5(ITER)
+#define REPEAT7(ITER) ITER(7); REPEAT6(ITER)
+#define REPEAT8(ITER) ITER(8); REPEAT7(ITER)
+
+#define PREFETCH_VALS_IMPL(factor) REPEAT##factor(PREFETCH_VALS_ITERATION)
+#define ACCUMULATE_IMPL(factor) REPEAT##factor(ACCUMULATE_ITERATION)
+
+#define PREFETCH_VALS(factor) PREFETCH_VALS_IMPL(factor)
+#define ACCUMULATE(factor) ACCUMULATE_IMPL(factor)
+
 
 template <typename IndexType,
           typename ValueType1,
           typename ValueType2,
-          typename ValueType3,
-          unsigned int BLOCK_SIZE>
+          typename ValueType3>
 __device__ void
-ktt_dia_vector_kernel_blocked(
-                const int num_rows,
-                const int num_cols,
-                const int num_diagonals,
-                const int pitch,
-                const IndexType* diagonal_offsets,
-                const ValueType1* values,
-                const ValueType2* x,
-                ValueType3* y)
+blocked_offsets_dia_kernel(const int num_rows,
+                           const int num_cols,
+                           const int num_diagonals,
+                           const int pitch,
+                           const IndexType* __restrict__ diagonal_offsets,
+                           const ValueType1* __restrict__ values,
+                           const ValueType2*__restrict__  x,
+                           ValueType3* __restrict__ y)
 {
-    using ValueType = ValueType3;
-
-    const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x;
+    const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x;
 
     __shared__ IndexType offsets[BLOCK_SIZE];
-    ValueType sum = ValueType(0);
+    ValueType3 sum = ValueType3(0);
 
-    for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) {
-        if (offset_base + threadIdx.x < num_diagonals) {
+    for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE)
+    {
+        if (offset_base + threadIdx.x < num_diagonals)
             offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x];
-        }
 
         __syncthreads();
 
-        int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE;
+        int batch_size = BLOCK_SIZE > num_diagonals - offset_base
+                            ? num_diagonals - offset_base
+                            : BLOCK_SIZE;
+
+        if (row < num_rows)
+        {
+            IndexType i = 0;
+
+#if PREFETCH_FACTOR > 0
+            for (; i < batch_size - PREFETCH_FACTOR + 1; i += PREFETCH_FACTOR)
+            {
+#if PREFETCH_TYPE == 0
+                UnrolledLoop<IndexType, ValueType1, ValueType2, PREFETCH_FACTOR> loop;
+                loop.prefetch_vals(offsets, values, x, pitch, num_cols,
+                                   row, offset_base, i);
+                sum += loop.template accumulate<ValueType3>();
+#elif PREFETCH_TYPE == 1
+                PREFETCH_VALS(PREFETCH_FACTOR);
+                ACCUMULATE(PREFETCH_FACTOR);
+#endif
+            }
+#endif
+
+            for (; i < batch_size; ++i)
+            {
+                IndexType col = offsets[i] + row;
+                if (col >= 0 && col < num_cols)
+                {
+                    IndexType idx = (offset_base + i)*pitch + row;
+
+                    auto diag_val = load_diag_val(&values[idx]);
+                    auto x_val = load_x_val(&x[col]);
 
-        if (thread_id < num_rows) {
-            for (IndexType i = 0; i < batch_size; ++i) {
-                IndexType col = offsets[i] + thread_id;
-                if (col >= 0 && col < num_cols) {
-                    sum += values[(offset_base + i)*pitch + thread_id]*x[col];
+                    sum += diag_val*x_val;
                 }
             }
-            y[thread_id] = sum;
         }
 
         __syncthreads();
     }
+
+    if (row < num_rows)
+        y[row] = sum;
 }
 
-#define ITERATION(i, offset) \
-    col = offset + thread_id;                        \
-    if (col >= 0 && col < num_cols) {                              \
-        sum += values[i*pitch + thread_id]*x[col]; \
-    }
 
 template <typename IndexType,
           typename ValueType1,
           typename ValueType2,
-          typename ValueType3,
-          unsigned int BLOCK_SIZE>
+          typename ValueType3>
 __device__ void
-ktt_dia_vector_kernel_crazy(
-                const int num_rows,
-                const int num_cols,
-                const int num_diagonals,
-                const int pitch,
-                const IndexType* diagonal_offsets,
-                const ValueType1* values,
-                const ValueType2* x,
-                ValueType3* y)
+naive_dia_kernel(const int num_rows,
+                 const int num_cols,
+                 const int num_diagonals,
+                 const int pitch,
+                 const IndexType* __restrict__ diagonal_offsets,
+                 const ValueType1* __restrict__ values,
+                 const ValueType2* __restrict__ x,
+                 ValueType3* __restrict__ y)
 {
-    using ValueType = ValueType3;
-
     const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x;
 
-    ValueType sum = ValueType(0);
-
-    if (thread_id < num_rows) {
-        IndexType col;
-        ITERATION(0, -4);
-        ITERATION(1, -3);
-        ITERATION(2, -2);
-        ITERATION(3, -1);
-        ITERATION(4, -0);
-        ITERATION(5, -1);
-        ITERATION(6, -2);
-        ITERATION(7, -3);
-        ITERATION(8, -4);
-        ITERATION(9, -5);
+    if (thread_id < num_rows)
+    {
+        ValueType3 sum = ValueType3(0);
+        for (IndexType i = 0; i < num_diagonals; ++i)
+        {
+            IndexType col = diagonal_offsets[i] + thread_id;
+            if (col >= 0 && col < num_cols)
+            {
+                auto diag_val = load_diag_val(&values[i*pitch + thread_id]);
+                auto x_val = load_x_val(&x[col]);
+
+                sum += diag_val * x_val;
+            }
+        }
         y[thread_id] = sum;
     }
-
 }
 
+
 template <typename IndexType,
           typename ValueType1,
           typename ValueType2,
-          typename ValueType3,
-          unsigned int BLOCK_SIZE>
-__launch_bounds__(BLOCK_SIZE,1) __global__ void
-ktt_dia_vector_kernel(
-                const int num_rows,
-                const int num_cols,
-                const int num_diagonals,
-                const int pitch,
-                const IndexType* diagonal_offsets,
-                const ValueType1* values,
-                const ValueType2* x,
-                ValueType3* y)
+          typename ValueType3>
+__launch_bounds__(BLOCK_SIZE, 1) __global__ void
+ktt_dia_vector_kernel(const int num_rows,
+                      const int num_cols,
+                      const int num_diagonals,
+                      const int pitch,
+                      const IndexType* __restrict__ diagonal_offsets,
+                      const ValueType1* __restrict__ values,
+                      const ValueType2 *__restrict__ x,
+                      ValueType3* __restrict__ y)
 {
-#if KERNEL_TYPE == 0
-    ktt_dia_vector_kernel_simple<IndexType, ValueType1, ValueType2, ValueType3, BLOCK_SIZE>
-        (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y);
-#elif KERNEL_TYPE == 1
-    ktt_dia_vector_kernel_blocked<IndexType, ValueType1, ValueType2, ValueType3, BLOCK_SIZE>
+    blocked_offsets_dia_kernel<IndexType, ValueType1, ValueType2, ValueType3>
         (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y);
-#elif KERNEL_TYPE == 2
-    ktt_dia_vector_kernel_crazy<IndexType, ValueType1, ValueType2, ValueType3, BLOCK_SIZE>
-        (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y);
-#endif
 }
-
-// template <typename IndexType,
-//           typename ValueType1,
-//           typename ValueType2,
-//           typename ValueType3,
-//           unsigned int BLOCK_SIZE>
-// __launch_bounds__(BLOCK_SIZE,1) __global__ void
-// ktt_dia_vector_kernel(
-//                 const int num_rows,
-//                 const int num_cols,
-//                 const int num_diagonals,
-//                 const int pitch,
-//                 const IndexType* diagonal_offsets,
-//                 const ValueType1* values,
-//                 const ValueType2* x,
-//                 ValueType3* y)
-// {
-//     typedef ValueType1 ValueType;
-
-//     __shared__ IndexType offsets[BLOCK_SIZE];
-
-//     const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x;
-//     const IndexType grid_size = BLOCK_SIZE * gridDim.x;
-
-//     for(IndexType base = 0; base < num_diagonals; base += BLOCK_SIZE)
-//     {
-//         // read a chunk of the diagonal offsets into shared memory
-//         const IndexType chunk_size = IndexType(BLOCK_SIZE) < num_diagonals - base ? IndexType(BLOCK_SIZE) : num_diagonals - base;
-
-//         if(threadIdx.x < chunk_size)
-//             offsets[threadIdx.x] = diagonal_offsets[base + threadIdx.x];
-
-//         __syncthreads();
-
-//         // process chunk
-//         for(IndexType row = thread_id; row < num_rows; row += grid_size)
-//         {
-//             ValueType sum = (base == 0) ? ValueType(0) : ValueType(0);
-
-//             // index into values array
-//             IndexType idx = row + pitch * base;
-
-//             for(IndexType n = 0; n < chunk_size; n++)
-//             {
-//                 const IndexType col = row + offsets[n];
-
-//                 if(col >= 0 && col < num_cols)
-//                 {
-//                     const ValueType A_ij = values[idx];
-//                     sum = sum + (A_ij * x[col]);
-//                 }
-
-//                 idx += pitch;
-//             }
-
-//             y[row] = sum;
-//         }
-
-//         // wait until all threads are done reading offsets
-//         __syncthreads();
-//     }
-// }
\ No newline at end of file
diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h
new file mode 100644
index 00000000..f71431a9
--- /dev/null
+++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h
@@ -0,0 +1,213 @@
+
+#if DISABLE_UNROLL == 1
+#define UNROLL 1
+#else
+#define UNROLL
+#endif
+
+
+template<typename T>
+__device__ T load(const T* addr)
+{
+#if SPECIAL_LOADS == 0
+    return *addr;
+#elif SPECIAL_LOADS == 1
+    return __ldcv(addr);
+#endif
+}
+
+
+template<typename IndexType,
+         typename ValueType,
+         int K>
+struct Prefetcher : Prefetcher<IndexType, ValueType,  K - 1>
+{
+    using parent = Prefetcher<IndexType, ValueType, K - 1>;
+
+    IndexType col;
+    ValueType A_val = 0;
+    ValueType x_val = 0;
+
+    __device__
+    void prefetch_cols(const IndexType* __restrict__ Aj,
+                      IndexType offset,
+                      IndexType stride,
+                      IndexType i)
+    {
+        col = load(Aj + offset + i*stride);
+        parent::prefetch_cols(Aj, offset, stride, i + 1);
+    }
+
+    __device__
+    void prefetch_vals(const ValueType* __restrict__ Ax,
+                       const ValueType* __restrict__ x,
+                       IndexType offset,
+                       IndexType stride,
+                       IndexType i)
+    {
+#ifndef ELLR
+        if (col >= 0)
+        {
+#endif
+            A_val = load(Ax + offset + i*stride);
+            x_val = x[col];
+#ifndef ELLR
+        }
+#endif
+        parent::prefetch_vals(Ax, x, offset, stride, i + 1);
+    }
+
+    __device__ ValueType accumulate_results()
+    {
+        return x_val*A_val + parent::accumulate_results();
+    }
+};
+
+template<typename IndexType,
+         typename ValueType>
+struct Prefetcher<IndexType, ValueType, 0>
+{
+    __device__
+    void prefetch_cols(const IndexType* __restrict__ Aj, IndexType offset,
+                       IndexType stride, IndexType i) { }
+
+    __device__
+    void prefetch_vals(const ValueType* __restrict__ Ax,
+                       const ValueType* __restrict__ x, IndexType offset,
+                       IndexType stride, IndexType i) { }
+
+    __device__ ValueType accumulate_results()
+    {
+        return 0;
+    }
+};
+
+
+template <typename IndexType,
+          typename ValueType>
+__device__ void
+ktt_ell_kernel_basic(const IndexType num_rows,
+                     const IndexType num_cols,
+                     const IndexType max_cols_per_row,
+                     const IndexType pitch,
+                     const IndexType* __restrict__ Aj,
+                     const ValueType* __restrict__ Ax,
+                     const ValueType* __restrict__ x,
+                     ValueType* __restrict__ y,
+                     const IndexType* __restrict__ row_lengths)
+{
+    const IndexType row = blockDim.x*blockIdx.x + threadIdx.x;
+    const IndexType stride = THREADS_PER_ROW * pitch;
+
+#if THREADS_PER_ROW > 1
+    __shared__ ValueType sums[BLOCK_SIZE / THREADS_PER_ROW];
+
+    if (threadIdx.y == 0)
+        sums[threadIdx.x] = 0;
+
+    __syncthreads();
+#endif
+
+    if (row < num_rows)
+    {
+        const IndexType num_elems = row_lengths == NULL
+                                    ? max_cols_per_row
+                                    : row_lengths[row];
+        ValueType sum = 0;
+        IndexType n = threadIdx.y;
+        IndexType offset = row + n*pitch;
+
+#if PREFETCH_FACTOR > 0
+        IndexType bound = num_elems - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW;
+        IndexType step = PREFETCH_FACTOR * THREADS_PER_ROW;
+
+        #pragma unroll UNROLL
+        for(; n < bound; n += step)
+        {
+            Prefetcher<IndexType, ValueType, PREFETCH_FACTOR> prefetcher;
+
+            prefetcher.prefetch_cols(Aj, offset, stride, 0);
+            prefetcher.prefetch_vals(Ax, x, offset, stride, 0);
+            sum += prefetcher.accumulate_results();
+
+            offset += PREFETCH_FACTOR * stride;
+        }
+#endif
+
+        #pragma unroll UNROLL
+        for (; n < num_elems; n += THREADS_PER_ROW)
+        {
+            const IndexType col = load(Aj + offset);
+
+            // This assumes that
+            // cusp::ell_matrix<...>::invalid_index is always < 0
+#ifndef ELLR
+            if (col >= 0)
+            {
+#endif
+                const ValueType x_j = x[col];
+                const ValueType A_ij = load(Ax + offset);
+
+                sum += A_ij * x_j;
+#ifndef ELLR
+            }
+            else
+            {
+#if BREAK == 1
+                break;
+#endif
+            }
+#endif
+
+            offset += stride;
+        }
+
+#if THREADS_PER_ROW > 1
+        atomicAdd(&sums[threadIdx.x], sum);
+
+        __syncthreads();
+
+        if (threadIdx.y == 0)
+        {
+            y[row] = sums[threadIdx.x];
+        }
+#else
+        y[row] = sum;
+#endif
+    }
+}
+
+
+template <typename IndexType,
+          typename ValueType>
+__launch_bounds__(BLOCK_SIZE, 1) __global__ void
+ktt_ell_kernel(const IndexType num_rows,
+               const IndexType num_cols,
+               const IndexType num_cols_per_row,
+               const IndexType pitch,
+               const IndexType* __restrict__ Aj,
+               const ValueType* __restrict__ Ax,
+               const ValueType* __restrict__ x,
+               ValueType* __restrict__ y)
+{
+    ktt_ell_kernel_basic(num_rows, num_cols, num_cols_per_row, pitch,
+                         Aj, Ax, x, y, (const IndexType*)NULL);
+}
+
+
+template <typename IndexType,
+          typename ValueType>
+__launch_bounds__(BLOCK_SIZE, 1) __global__ void
+ktt_ellr_kernel(const IndexType num_rows,
+                const IndexType num_cols,
+                const IndexType num_cols_per_row,
+                const IndexType pitch,
+                const IndexType* __restrict__ Aj,
+                const ValueType* __restrict__ Ax,
+                const ValueType* __restrict__ x,
+                ValueType* __restrict__ y,
+                const IndexType* __restrict__ row_lengths)
+{
+    ktt_ell_kernel_basic(num_rows, num_cols, num_cols_per_row, pitch,
+                         Aj, Ax, x, y, row_lengths);
+}
diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h
index 5e6accab..3f79d14b 100644
--- a/cusp/system/cuda/ktt/multiply.h
+++ b/cusp/system/cuda/ktt/multiply.h
@@ -1,8 +1,14 @@
-#pragma once 
+#pragma once
 
 #include <cusp/system/cuda/ktt/kernel.h>
-//#include <cusp/system/cuda/ktt/csr_multiply.h>
+
+#include <cusp/system/cuda/ktt/csr_multiply.h>
 #include <cusp/system/cuda/ktt/dia_multiply.h>
+#include <cusp/system/cuda/ktt/ell_multiply.h>
+
+#include <Ktt.h>
+
+#include <optional>
 
 namespace cusp {
 
@@ -12,24 +18,128 @@ namespace cuda {
 
 namespace ktt {
 
-// TODO: should this be here??
+
 // Helpful function to get a kernel used for a given multiplication
 // instead of having to manually specify the types.
 template <typename MatrixType,
           typename ValueType1,
-          typename ValueType2>
-kernel_context get_kernel(::ktt::Tuner& tuner,
-                          const MatrixType& A,
-                          const cusp::array1d<ValueType1, cusp::device_memory>& x,
-                          cusp::array1d<ValueType2, cusp::device_memory>& y)
+          typename ValueType2,
+          typename MemorySpace1,
+          typename MemorySpace2>
+kernel_context get_kernel(
+    ::ktt::Tuner& tuner,
+    const MatrixType& A,
+    const cusp::array1d<ValueType1, MemorySpace1>& x,
+    cusp::array1d<ValueType2, MemorySpace2>& y)
 {
     using IndexType = typename MatrixType::index_type;
     using ValueType = typename MatrixType::value_type;
-    using FormatType = typename MatrixType::format;
 
-    return get_kernel<IndexType, ValueType, ValueType1, ValueType2>(tuner, FormatType{});
+    if constexpr (std::is_same_v<cusp::host_memory,
+                                 typename MatrixType::memory_space>)
+    {
+        using DeviceMatrix =
+            typename MatrixType::rebind<cusp::device_memory>::type;
+        return get_kernel<IndexType, ValueType, ValueType1, ValueType2>(
+                    tuner, DeviceMatrix{});
+    }
+    else
+    {
+        return get_kernel<IndexType, ValueType, ValueType1, ValueType2>(tuner, A);
+    }
+}
+
+
+template <typename MatrixType,
+          typename VectorType1,
+          typename VectorType2>
+::ktt::KernelResult
+multiply(::ktt::Tuner& tuner,
+         const MatrixType& A,
+         const VectorType1& x,
+         VectorType2& y)
+{
+    const kernel_context& kernel = get_kernel(tuner, A, x, y);
+
+    auto args = add_arguments(kernel, A, x, y);
+
+    auto launcher = get_launcher(kernel, A, x, y, false);
+    tuner.SetLauncher(kernel.kernel_id, launcher);
+
+    auto res = tuner.TuneIteration(kernel.kernel_id, {});
+
+    remove_arguments(kernel, args);
+
+    return res;
+}
+
+
+template <typename MatrixType,
+          typename VectorType1,
+          typename VectorType2>
+::ktt::KernelResult
+multiply(::ktt::Tuner& tuner,
+         const MatrixType& A,
+         const VectorType1& x,
+         VectorType2& y,
+         const ::ktt::KernelConfiguration& configuration,
+         bool run_with_profiling = false)
+{
+    const kernel_context& kernel = get_kernel(tuner, A, x, y);
+
+    auto args = add_arguments(kernel, A, x, y);
+
+    auto launcher = get_launcher(kernel, A, x, y, run_with_profiling);
+    tuner.SetLauncher(kernel.kernel_id, launcher);
+
+    auto result = tuner.Run(kernel.kernel_id, configuration, {});
+
+    remove_arguments(kernel, args);
+
+    return result;
+}
+
+
+template <typename MatrixType,
+          typename VectorType1,
+          typename VectorType2>
+std::vector<::ktt::KernelResult>
+tune(::ktt::Tuner& tuner,
+     const MatrixType& A,
+     const VectorType1& x,
+     VectorType2& y,
+     std::optional<::ktt::ReferenceComputation> ref_computation = std::nullopt)
+{
+    using Format = typename MatrixType::format;
+    using HostVector2 =
+        typename VectorType2::template rebind<cusp::host_memory>::type;
+
+    kernel_context kernel = get_kernel(tuner, A, x, y);
+    auto args = add_arguments(kernel, A, x, y);
+
+    if (ref_computation)
+    {
+        tuner.SetReferenceComputation(get_output_argument(args, Format{}),
+                                      *ref_computation);
+    }
+
+    HostVector2 host_y = y;
+    tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface)
+    {
+        // clear y so previous results don't affect validation
+        y = host_y;
+        auto launcher = get_launcher(kernel, A, x, y, false);
+        launcher(interface);
+    });
+
+    auto results = tuner.Tune(kernel.kernel_id);
+
+    remove_arguments(kernel, args);
+
+    return results;
 }
 
+
 } // namespace ktt
 
 } // namespace cuda
diff --git a/cusp/system/cuda/ktt/utils.h b/cusp/system/cuda/ktt/utils.h
index a2d1c029..e773b461 100644
--- a/cusp/system/cuda/ktt/utils.h
+++ b/cusp/system/cuda/ktt/utils.h
@@ -3,6 +3,10 @@
 #include <Ktt.h>
 #include <cusp/system/cuda/ktt/kernel.h>
 
+#include <cstdint>   // uint64_t
+#include <stdexcept> // std::runtime_error
+
+
 #define STR(str) #str
 #define STRING(str) STR(str)
 
@@ -14,9 +18,12 @@ namespace cuda {
 
 namespace ktt {
 
+using u64_vec = std::vector<uint64_t>;
+
+
 template<typename T>
 void* cast(const T* ptr)
-{ 
+{
     return const_cast<void*>(static_cast<const void*>(ptr));
 }
 
@@ -65,7 +72,7 @@ void add_arg(::ktt::Tuner& tuner, std::vector<::ktt::ArgumentId>& argument_ids,
     argument_ids.push_back(id);
 }
 
-template<typename... Args> 
+template<typename... Args>
 std::vector<::ktt::ArgumentId> add_arguments(::ktt::Tuner& tuner, Args&&... args)
 {
     std::vector<::ktt::ArgumentId> argument_ids;
@@ -85,6 +92,26 @@ inline void remove_arguments(const kernel_context& kernel, const std::vector<::k
 }
 
 
+inline uint64_t get_parameter_uint(const ::ktt::KernelConfiguration& conf,
+                            const std::string& name)
+{
+    for (const auto& pair : conf.GetPairs())
+        if (pair.GetName() == name)
+            return pair.GetValue();
+
+    throw std::runtime_error("No paramater with name: " + name);
+}
+
+inline double get_parameter_double(const ::ktt::KernelConfiguration& conf,
+                            const std::string& name)
+{
+    for (const auto& pair : conf.GetPairs())
+        if (pair.GetName() == name)
+            return pair.GetValueDouble();
+
+    throw std::runtime_error("No paramater with name: " + name);
+}
+
 } // namespace ktt
 
 } // namespace cuda
diff --git a/cusp/system/detail/generic/conversions/coo_to_other.h b/cusp/system/detail/generic/conversions/coo_to_other.h
index 06a88b7f..b25fe8f8 100644
--- a/cusp/system/detail/generic/conversions/coo_to_other.h
+++ b/cusp/system/detail/generic/conversions/coo_to_other.h
@@ -137,7 +137,8 @@ convert(thrust::execution_policy<DerivedPolicy>& exec,
         DestinationType& dst,
         cusp::coo_format&,
         cusp::dia_format&,
-        size_t alignment = 32)
+        size_t alignment = 32,
+        bool dont_throw = false)
 {
     typedef typename DestinationType::index_type   IndexType;
     typedef typename DestinationType::value_type   ValueType;
@@ -156,7 +157,7 @@ convert(thrust::execution_policy<DerivedPolicy>& exec,
     const float size       = float(occupied_diagonals) * float(src.num_rows);
     const float fill_ratio = size / std::max(1.0f, float(src.num_entries));
 
-    if (max_fill < fill_ratio && size > threshold)
+    if (max_fill < fill_ratio && size > threshold && !dont_throw)
         throw cusp::format_conversion_exception("dia_matrix fill-in would exceed maximum tolerance");
 
     // compute number of occupied diagonals and enumerate them
@@ -219,7 +220,8 @@ convert(thrust::execution_policy<DerivedPolicy>& exec,
         cusp::coo_format&,
         cusp::ell_format&,
         size_t num_entries_per_row = 0,
-        size_t alignment = 32)
+        size_t alignment = 32,
+        bool dont_throw = false)
 {
     typedef typename DestinationType::index_type   IndexType;
     typedef typename DestinationType::value_type   ValueType;
@@ -242,7 +244,7 @@ convert(thrust::execution_policy<DerivedPolicy>& exec,
         const float size       = float(max_entries_per_row) * float(src.num_rows);
         const float fill_ratio = size / std::max(1.0f, float(src.num_entries));
 
-        if (max_fill < fill_ratio && size > threshold)
+        if (max_fill < fill_ratio && size > threshold && !dont_throw)
             throw cusp::format_conversion_exception("ell_matrix fill-in would exceed maximum tolerance");
 
         num_entries_per_row = max_entries_per_row;
diff --git a/cusp/system/detail/generic/conversions/dia_to_other.h b/cusp/system/detail/generic/conversions/dia_to_other.h
index 45c26f47..bc30fb5a 100644
--- a/cusp/system/detail/generic/conversions/dia_to_other.h
+++ b/cusp/system/detail/generic/conversions/dia_to_other.h
@@ -41,6 +41,7 @@
 #include <thrust/iterator/zip_iterator.h>
 
 #include <cassert>
+#include <iostream>
 
 namespace cusp
 {
@@ -211,6 +212,62 @@ convert(thrust::execution_policy<DerivedPolicy>& exec,
 
     thrust::copy(exec, src.values.values.begin(), src.values.values.end(), dst.values.values.begin());
 
+    using StridedIndexIterator = thrust::transform_iterator<
+                                    cusp::multiplies_value<IndexType>,
+                                    IndexIterator>;
+    using RowElemIndicesIterator = thrust::transform_iterator<
+                                    cusp::plus_value<IndexType>,
+                                    StridedIndexIterator>;
+    using RowColsIterator = thrust::permutation_iterator<
+                                decltype(dst.column_indices.values.begin()),
+                                RowElemIndicesIterator>;
+    using RowValsIterator = thrust::permutation_iterator<
+                                decltype(dst.values.values.begin()),
+                                RowElemIndicesIterator>;
+
+    for (size_t row_idx = 0; row_idx < src.num_rows; ++row_idx)
+    {
+        StridedIndexIterator strided_cols(
+                                IndexIterator(0),
+                                cusp::multiplies_value<IndexType>(pitch));
+        RowElemIndicesIterator elem_indices(
+                                strided_cols,
+                                cusp::plus_value<IndexType>((IndexType)row_idx));
+
+        RowColsIterator row_cols(dst.column_indices.values.begin(), elem_indices);
+        RowValsIterator row_vals(dst.values.values.begin(), elem_indices);
+
+        thrust::stable_partition(exec,
+                                 row_vals,
+                                 row_vals + dst.column_indices.num_cols,
+                                 row_cols,
+                                 thrust::placeholders::_1 != IndexType(-1));
+
+        thrust::stable_partition(exec,
+                                 row_cols,
+                                 row_cols + dst.column_indices.num_cols,
+                                 thrust::placeholders::_1 != IndexType(-1));
+    }
+
+    // for (int row = 0; row < dst.num_rows; ++row)
+    // {
+    //     int last = 0;
+    //     for (int col = 0; col < dst.column_indices.num_cols; ++col)
+    //     {
+    //         if (dst.column_indices(row, col) >= 0)
+    //         {
+    //             auto tmp1 = dst.column_indices(row, last);
+    //             dst.column_indices(row, last) = dst.column_indices(row, col);
+    //             dst.column_indices(row, col) = tmp1;
+
+    //             auto tmp2 = dst.values(row, last);
+    //             dst.values(row, last) = dst.values(row, col);
+    //             dst.values(row, col) = tmp2;
+
+    //             last++;
+    //         }
+    //     }
+    // }
 }
 
 } // end namespace generic
diff --git a/cusp/system/detail/generic/multiply.inl b/cusp/system/detail/generic/multiply.inl
index 6e620701..75e4c85c 100644
--- a/cusp/system/detail/generic/multiply.inl
+++ b/cusp/system/detail/generic/multiply.inl
@@ -27,10 +27,13 @@
 #include <cusp/system/detail/generic/multiply/spgemm.h>
 #include <cusp/system/detail/generic/multiply/spmv.h>
 
+#include <thrust/functional.h>
+
 #include <cusp/ktt/ktt.h>
 #include <cusp/system/cuda/ktt/multiply.h>
 
-#include <thrust/functional.h>
+#include <type_traits>
+
 
 namespace cusp
 {
@@ -107,44 +110,59 @@ multiply(thrust::execution_policy<DerivedPolicy> &exec,
     cusp::multiply(exec, A, B, C, initialize, combine, reduce);
 }
 
+
+// The has_rebind type traits is used to check whether matrix is a view
 template <typename T, typename /*U*/ = void>
 struct has_rebind : std::false_type {};
- 
+
 template <typename T>
 struct has_rebind<T, std::void_t<typename T::rebind<int>>> : std::true_type {};
 
 template<typename T>
 constexpr bool has_rebind_v = has_rebind<T>::value;
 
+
 template <typename DerivedPolicy,
-         typename LinearOperator,
+         typename Matrix,
          typename ValueType1,
          typename ValueType2>
-typename thrust::detail::disable_if_convertible<typename LinearOperator::format,cusp::unknown_format>::type
+typename thrust::detail::disable_if_convertible<
+    typename Matrix::format,
+    cusp::unknown_format
+>::type
 multiply(cusp::system::cuda::detail::execution_policy<DerivedPolicy> &exec,
-         const LinearOperator&  A,
-         const cusp::array1d<ValueType1, cusp::device_memory>& B,
-         cusp::array1d<ValueType2, cusp::device_memory>& C)
+         const Matrix& A,
+         const cusp::array1d<ValueType1, cusp::device_memory>& x,
+         cusp::array1d<ValueType2, cusp::device_memory>& y)
 {
-    typedef typename LinearOperator::format  MatrixFormat;
-
-    if constexpr ((/*cusp::detail::is_csr<LinearOperator>::value ||*/ cusp::detail::is_dia<LinearOperator>::value) && has_rebind_v<LinearOperator>) {
-        if (cusp::ktt::detail::is_enabled) {
-            cusp::ktt::detail::lazy_init();
-            cusp::system::cuda::ktt::multiply(*cusp::ktt::detail::tuner, A, B, C);
-            return;    
+    using IndexType = typename Matrix::index_type;
+    using ValueType = typename Matrix::value_type;
+
+    if constexpr ((cusp::detail::is_ell<Matrix>::value
+                    || cusp::detail::is_dia<Matrix>::value
+                    && std::is_fundamental_v<IndexType>
+                    && std::is_fundamental_v<ValueType>
+                    && std::is_fundamental_v<ValueType1>
+                    && std::is_fundamental_v<ValueType2>)
+                    && has_rebind_v<Matrix>)
+    {
+        if (cusp::ktt::detail::is_enabled)
+        {
+            cusp::system::cuda::ktt::multiply(cusp::ktt::get_tuner(), A, x, y);
+            return;
         }
     }
 
-    typedef typename LinearOperator::value_type ValueType;
+    // The call is not meant for KTT. Pass it on...
 
     cusp::constant_functor<ValueType> initialize(0);
     thrust::multiplies<ValueType> combine;
     thrust::plus<ValueType> reduce;
 
-    cusp::multiply(exec, A, B, C, initialize, combine, reduce);
+    cusp::multiply(exec, A, x, y, initialize, combine, reduce);
 }
 
+
 template <typename DerivedPolicy,
          typename LinearOperator,
          typename MatrixOrVector1,
diff --git a/main.cu b/main.cu
new file mode 100644
index 00000000..c5f4e4da
--- /dev/null
+++ b/main.cu
@@ -0,0 +1,1039 @@
+#include <cusp/multiply.h>
+#include <cusp/array2d.h>
+#include <cusp/print.h>
+#include <cusp/csr_matrix.h>
+#include <cusp/dia_matrix.h>
+#include <cusp/io/matrix_market.h>
+#include <cusp/gallery/poisson.h>
+
+#include <cusp/ktt/ktt.h>
+#include <cusp/ktt/matrix_generation.h>
+
+#include <chrono>
+#include <cmath>
+#include <type_traits>
+#include <sstream>
+#include <iomanip>
+#include <fstream>
+
+namespace krn = std::chrono;
+
+template<typename T, typename U, typename V>
+void write_csv(const std::vector<std::vector<T>>& data,
+               const std::vector<U>& col_labels,
+               const std::vector<V>& row_labels,
+               const std::string& path)
+{
+    if (row_labels.size() != data.size())
+        throw std::runtime_error("Weird number of rows or row labels.");
+
+    for (int i = 1; i < data.size(); ++i)
+    {
+        if (data[i].size() != data[i-1].size())
+            throw std::runtime_error("Not all rows have same length.");
+        if (i == 1 && data[i].size() != col_labels.size())
+            throw std::runtime_error("Bad number of column lables.");
+    }
+
+    std::ofstream file(path);
+
+    if (!file)
+        throw std::runtime_error("Cannot open: " + path);
+
+    const char* sep = "";
+    for (const auto& label : col_labels)
+    {
+        file << sep << label;
+        sep = ",";
+    }
+    file << "\n";
+
+    for (int i = 0; i < data.size(); ++i)
+    {
+        file << row_labels[i];
+        for (const auto& x : data[i])
+        {
+            file << "," << x;
+        }
+        file << "\n";
+    }
+}
+
+
+struct Timer
+{
+    Timer() : start_(krn::steady_clock::now()) { }
+
+    void restart() { start_ = krn::steady_clock::now(); }
+
+    void print_elapsed(const std::string& label)
+    {
+        auto duration = krn::duration_cast<krn::microseconds>(krn::steady_clock::now() - start_).count();
+        std::cout << label << ": " << duration << "\n";
+    }
+
+    void checkpoint(const std::string& label)
+    {
+        print_elapsed(label);
+        restart();
+    }
+
+private:
+    krn::steady_clock::time_point start_;
+
+};
+
+
+struct ConversionTimer
+{
+    cusp::dia_matrix<int, float, cusp::host_memory> m;
+
+    ConversionTimer() {
+        m = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 512);
+    }
+
+    ConversionTimer(const ConversionTimer& o) {
+        std::cout << "COPY!\n";
+    }
+
+    template<typename Matrix1, typename Matrix2>
+    void operator()()
+    {
+        using Format1 = typename Matrix1::format;
+        using Format2 = typename Matrix2::format;
+
+        Matrix1 m1 = m;
+
+        auto start = krn::steady_clock::now();
+        Matrix2 m2 = m1;
+        auto duration = krn::duration_cast<krn::microseconds>(krn::steady_clock::now() - start).count();
+
+        std::cout << NAMEOF_TYPE(Format1) << " to " << NAMEOF_TYPE(Format2) << ": " << duration << "\n";
+    }
+};
+
+
+template<typename... Types>
+struct type_list
+{
+    static constexpr bool is_empty = true;
+};
+
+template<
+    typename First,
+    typename... Args
+>
+struct type_list<First, Args ...>
+{
+    using head = First;
+    using tail = type_list<Args ...>;
+
+    static constexpr bool is_empty = false;
+};
+
+
+template<
+    typename T,
+    typename List,
+    typename Op
+>
+void for_each_type(Op&& op)
+{
+    if constexpr (!List::is_empty)
+    {
+        if (!std::is_same_v<typename List::head, T>)
+        {
+            op.template operator()<T, typename List::head>();
+        }
+        for_each_type<T, typename List::tail>(std::forward<Op>(op));
+    }
+}
+
+template<typename List1, typename List2, typename Op>
+void for_each_pair(Op&& op)
+{
+    if constexpr (!List1::is_empty && !List2::is_empty)
+    {
+        for_each_type<typename List1::head, List2>(std::forward<Op>(op));
+        for_each_pair<typename List1::tail, List2>(std::forward<Op>(op));
+    }
+}
+
+template<typename List, typename Op>
+void for_each_pair(Op&& op)
+{
+    for_each_pair<List, List>(std::forward<Op>(op));
+}
+
+template<typename IndexType, typename ValueType, typename MemorySpace>
+using format_list = type_list<
+    cusp::dia_matrix<IndexType, ValueType, MemorySpace>,
+    cusp::csr_matrix<IndexType, ValueType, MemorySpace>,
+    cusp::coo_matrix<IndexType, ValueType, MemorySpace>,
+    cusp::ell_matrix<IndexType, ValueType, MemorySpace>
+>;
+
+
+void print_ktt_status(const ::ktt::ResultStatus& status) {
+    switch (status)
+    {
+    case ::ktt::ResultStatus::Ok:
+        std::cout << "OK\n";
+        break;
+    case ::ktt::ResultStatus::CompilationFailed:
+        std::cout << "COMPILATION FAILED\n";
+        break;
+    case ::ktt::ResultStatus::ComputationFailed:
+        std::cout << "COMPUTATION FAILED\n";
+        break;
+    case ::ktt::ResultStatus::DeviceLimitsExceeded:
+        std::cout << "DEVICE LIMIT EXCEEDED\n";
+        break;
+    case ::ktt::ResultStatus::ValidationFailed:
+        std::cout << "VALIDATION FAILED\n";
+        break;
+    }
+}
+
+template<typename IndexType, typename ValueType>
+cusp::dia_matrix<IndexType, ValueType, cusp::host_memory>
+convert(const cusp::coo_matrix<IndexType, ValueType, cusp::host_memory>& matrix) {
+    using DiaMatrix = cusp::dia_matrix<IndexType, ValueType, cusp::host_memory>;
+
+    cusp::array1d<int, cusp::host_memory> diag_map(matrix.num_rows + matrix.num_cols, 0);
+
+    auto start = krn::steady_clock::now();
+    for (int i = 0; i < matrix.num_entries; ++i) {
+        int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1;
+        diag_map[diag_idx] = 1;
+    }
+    auto duration = krn::duration_cast<krn::microseconds>(krn::steady_clock::now() - start).count();
+    std::cout << "First loop:  " << duration << "\n";
+
+    typename DiaMatrix::diagonal_offsets_array_type offsets;
+    offsets.reserve(512); // arbitrary number
+
+    start = krn::steady_clock::now();
+    int diag_count = 0;
+    for (int i = 0; i < diag_map.size(); ++i) {
+        if (diag_map[i] != 0) {
+            diag_map[i] = diag_count;
+            offsets.push_back(i - matrix.num_rows + 1);
+            diag_count++;
+        }
+    }
+    duration = krn::duration_cast<krn::microseconds>(krn::steady_clock::now() - start).count();
+    std::cout << "Third loop:  " << duration << "\n";
+
+
+    //int pitch = matrix.num_rows % 2 == 0 ? matrix.num_rows + 1 : matrix.num_rows;
+    int pitch = matrix.num_rows + 64/sizeof(float);
+    typename DiaMatrix::values_array_type values(matrix.num_rows, diag_count, ValueType(0), pitch);
+    //std::cout << values.pitch << "\n";
+
+    start = krn::steady_clock::now();
+    for (int i = 0; i < matrix.num_entries; ++i) {
+        int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1;
+        int values_array_idx = diag_map[diag_idx];
+        values(matrix.row_indices[i], values_array_idx) = matrix.values[i];
+    }
+    duration = krn::duration_cast<krn::microseconds>(krn::steady_clock::now() - start).count();
+    std::cout << "Second loop: " << duration << "\n";
+
+    DiaMatrix res;
+
+    res.num_rows = matrix.num_rows;
+    res.num_cols = matrix.num_cols;
+    res.num_entries = matrix.num_entries;
+
+    res.diagonal_offsets.swap(offsets);
+    res.values.swap(values);
+
+    return res;
+}
+
+template<typename IndexType, typename ValueType>
+cusp::dia_matrix<IndexType, ValueType, cusp::host_memory>
+convert2(const cusp::coo_matrix<IndexType, ValueType, cusp::host_memory>& matrix) {
+    using DiaMatrix = cusp::dia_matrix<IndexType, ValueType, cusp::host_memory>;
+
+    Timer timer;
+
+    cusp::array1d<int, cusp::host_memory> diag_map(matrix.num_rows + matrix.num_cols, 0);
+
+    timer.checkpoint("init map");
+
+    for (int i = 0; i < matrix.num_entries; ++i) {
+        int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1;
+        diag_map[diag_idx] = 1;
+    }
+
+    timer.checkpoint("find diags");
+
+    typename DiaMatrix::diagonal_offsets_array_type offsets;
+    offsets.reserve(512); // arbitrary number
+
+    int diag_count = 0;
+    for (int i = 0; i < diag_map.size(); ++i) {
+        if (diag_map[i] != 0) {
+            diag_map[i] = diag_count;
+            offsets.push_back(i - matrix.num_rows + 1);
+            diag_count++;
+        }
+    }
+
+    timer.checkpoint("fill map");
+
+
+    //int pitch = matrix.num_rows % 2 == 0 ? matrix.num_rows + 1 : matrix.num_rows;
+    typename DiaMatrix::values_array_type values(diag_count, matrix.num_rows, ValueType(0));
+    //std::cout << values.pitch << "\n";
+
+    timer.checkpoint("init values");
+
+    for (int i = 0; i < matrix.num_entries; ++i) {
+        int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1;
+        int values_array_idx = diag_map[diag_idx];
+        values(values_array_idx, matrix.row_indices[i]) = matrix.values[i];
+    }
+
+    timer.checkpoint("fill values");
+
+    DiaMatrix res;
+
+    res.num_rows = matrix.num_rows;
+    res.num_cols = matrix.num_cols;
+    res.num_entries = matrix.num_entries;
+
+    res.diagonal_offsets.swap(offsets);
+    res.values.swap(values);
+
+    return res;
+}
+
+template<template<typename, typename, typename> typename Matrix,
+         typename IndexType,
+         typename ValueType>
+Matrix<IndexType, ValueType, cusp::device_memory>
+send_to_device(const Matrix<IndexType, ValueType, cusp::host_memory>& host_matrix)
+{
+    return host_matrix;
+}
+
+template<template<typename, typename, typename> typename Matrix,
+         typename IndexType,
+         typename ValueType,
+         typename MemorySpace>
+auto make_test_device_x(const Matrix<IndexType, ValueType, MemorySpace>& matrix)
+{
+    return cusp::array1d<ValueType, cusp::device_memory>(matrix.num_cols, 1);
+}
+
+template<template<typename, typename, typename> typename Matrix,
+         typename IndexType,
+         typename ValueType,
+         typename MemorySpace>
+auto make_test_device_y(const Matrix<IndexType, ValueType, MemorySpace>& matrix)
+{
+    return cusp::array1d<ValueType, cusp::device_memory>(matrix.num_rows, 0);
+}
+
+template<typename IndexType, typename ValueType>
+bool equals(const cusp::dia_matrix<IndexType, ValueType, cusp::host_memory>& A,
+            const cusp::dia_matrix<IndexType, ValueType, cusp::host_memory>& B)
+{
+    return A.values == B.values && A.diagonal_offsets == B.diagonal_offsets;
+}
+
+std::string size_str(uint64_t bytes)
+{
+    float size = bytes;
+
+    auto get_str = [](float x, const char* units)
+    {
+        std::stringstream s;
+        s << std::setprecision(3) << std::fixed << x << units;
+        return s.str();
+    };
+
+    if (size < 1024)
+    {
+        return get_str(size, " B");
+    }
+
+    size /= 1024;
+    if (size < 1024)
+    {
+        return get_str(size, " KB");
+    }
+
+    size /= 1024;
+    if (size < 1024)
+    {
+        return get_str(size, " MB");
+    }
+
+    size /= 1024;
+    return get_str(size, " GB");
+}
+
+std::string counter_str(const ::ktt::KernelProfilingCounter& counter)
+{
+    switch (counter.GetType()) {
+        case ::ktt::ProfilingCounterType::Double:
+            return std::to_string(counter.GetValueDouble());
+        case ::ktt::ProfilingCounterType::Int:
+            return std::to_string(counter.GetValueInt());
+        case ::ktt::ProfilingCounterType::Percent:
+            return std::to_string(counter.GetValueDouble()) + " %";
+        case ::ktt::ProfilingCounterType::Throughput:
+            return std::to_string(counter.GetValueUint()) + " bytes/sec\n";
+        case ::ktt::ProfilingCounterType::UnsignedInt:
+            return std::to_string(counter.GetValueUint());
+        case ::ktt::ProfilingCounterType::UtilizationLevel:
+            return std::to_string(counter.GetValueUint());
+    }
+
+    return "";
+}
+
+size_t dia_matrix_size(size_t num_rows, size_t num_cols, size_t num_diags)
+{
+    return sizeof(int)*num_diags + num_rows*num_diags*sizeof(float);
+}
+
+size_t dia_problem_size(size_t num_rows, size_t num_cols, size_t num_diags)
+{
+    return sizeof(int)*num_diags
+        + num_rows*num_diags*sizeof(float)
+        + num_rows*sizeof(float)
+        + num_cols*sizeof(float);
+}
+
+template<typename IndexT,
+         typename ValueT,
+         typename MemorySpace>
+size_t min_read_bytes(const cusp::dia_matrix<IndexT, ValueT, MemorySpace>& A)
+{
+    // Assume that each column has at least one diagonal passing through it.
+    size_t result = A.num_cols * sizeof(ValueT);
+
+    for (IndexT offset : A.diagonal_offsets)
+    {
+        IndexT start_row = offset >= 0 ? 0 : -offset;
+        IndexT start_col = offset >= 0 ? offset : 0;
+        IndexT end_row = start_row + std::min(A.num_cols - start_col, A.num_rows - start_row);
+
+        // Assume that global reads are done in 32 byte transactions that must be aligned.
+        IndexT elems_per_read = 32 / sizeof(ValueT);
+        IndexT adjusted_start_row = start_row - (start_row % elems_per_read);
+        IndexT adjusted_end_row = end_row + elems_per_read - (end_row % elems_per_read);
+
+        result += (adjusted_end_row - adjusted_start_row)*sizeof(ValueT);
+    }
+
+    return result;
+}
+
+std::vector<::ktt::KernelProfilingCounter>
+profile_multiply(const cusp::dia_matrix<int, float, cusp::device_memory>& A,
+                 const cusp::array1d<float, cusp::device_memory>& x,
+                 cusp::array1d<float, cusp::device_memory>& y,
+                 const ::ktt::KernelConfiguration& conf,
+                 const std::vector<std::string>& counters,
+                ::ktt::KernelResult& result)
+{
+    auto& tuner = cusp::ktt::get_tuner();
+
+    auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y);
+
+    tuner.SetProfilingCounters(counters);
+    tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids);
+    tuner.SetProfiling(true);
+
+    ::ktt::KernelResult kernel_result;
+    do {
+        kernel_result = cusp::ktt::multiply(A, x, y, conf, true);
+    } while(kernel_result.HasRemainingProfilingRuns());
+
+    result = kernel_result;
+
+    for (const auto& computation_result : kernel_result.GetResults())
+    {
+        if (!computation_result.HasProfilingData())
+        {
+            std::cout << "No profiling data!\n";
+            continue;
+        }
+
+        const auto& profiling_data = computation_result.GetProfilingData();
+        if (!profiling_data.IsValid())
+        {
+            std::cout << "profiling data invalid\n";
+        }
+        else
+        {
+            return profiling_data.GetCounters();
+        }
+    }
+
+    return {};
+}
+
+size_t get_actual_read_bytes(::ktt::Tuner& tuner,
+                             const cusp::dia_matrix<int, float, cusp::device_memory>& A,
+                             const cusp::array1d<float, cusp::device_memory>& x,
+                             cusp::array1d<float, cusp::device_memory>& y,
+                             const ::ktt::KernelConfiguration& conf,
+                             ::ktt::KernelResult* res_p = nullptr)
+{
+    ::ktt::KernelResult res;
+    const auto& counters = profile_multiply(A, x, y, conf, {"dram_read_bytes"}, res);
+
+    if (res_p) *res_p = res;
+
+    if (counters.empty()) {
+        throw std::runtime_error("Profiling failed.");
+    }
+
+    return counters[0].GetValueUint();
+}
+
+size_t get_actual_read_bytes(::ktt::Tuner& tuner,
+                             const cusp::dia_matrix<int, float, cusp::host_memory>& host_matrix,
+                             ::ktt::KernelResult* res_p = nullptr)
+{
+    cusp::dia_matrix<int, float, cusp::device_memory> A = host_matrix;
+    cusp::array1d<float, cusp::device_memory> x(A.num_cols, 1);
+    cusp::array1d<float, cusp::device_memory> y(A.num_rows);
+
+    auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y);
+    // auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) },
+    //                                                               { std::string("PREFETCH_FACTOR"), uint64_t(0) } });
+    auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id,
+                                          { { std::string("KERNEL_TYPE"), uint64_t(1) },
+                                            { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) },
+                                            { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(0) } });
+
+    return get_actual_read_bytes(tuner, A, x, y, conf, res_p);
+}
+
+int get_max_size(int num_diags, size_t global_size)
+{
+    // Assume we have a square matrix of size n.
+    //
+    // On the gpu we need:
+    //   - offsets   -> num_diags * sizeof(int) bytes
+    //   - diagonals -> num_diags * n * sizeof(float) bytes
+    //   - x         ->             n * sizeof(float) bytes
+    //   - y         ->             n * sizeof(float) bytes
+    //
+    // It must hold:
+    //
+    //   global_size >= num_diags*sizeof(int) + (num_diags + 2)*n*sizeof(float)
+    //
+    // So:
+    //
+    //   n <= (global_size - num_diags*sizeof(int)) / (num_diags + 2)*sizeof(float)
+    //
+    return (global_size - num_diags*sizeof(int)) / ((num_diags + 2)*sizeof(float));
+}
+
+void test_poisson(::ktt::Tuner& tuner,
+                  const cusp::dia_matrix<int, float, cusp::host_memory>& host_matrix,
+                  const std::string& name)
+{
+    auto expected_bytes = min_read_bytes(host_matrix);
+    auto actual_bytes = get_actual_read_bytes(tuner, host_matrix);
+    std::cout << "  " << name << ": " << size_str(actual_bytes) << " vs " << size_str(expected_bytes) << "\n";
+}
+
+void test_poisson_sizes(::ktt::Tuner& tuner, size_t global_size)
+{
+    std::cout << "Testing poisson\n";
+
+    int n = get_max_size(5, global_size);
+    int grid_size = std::sqrt(n);
+    cusp::dia_matrix<int, float, cusp::host_memory> dia_host_matrix;
+    cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size);
+    test_poisson(tuner, dia_host_matrix, "5pt");
+
+    n = get_max_size(9, global_size);
+    grid_size = std::sqrt(n);
+    cusp::gallery::poisson9pt(dia_host_matrix, grid_size, grid_size);
+    test_poisson(tuner, dia_host_matrix, "9pt");
+
+    n = get_max_size(27, global_size);
+    grid_size = std::cbrt(n);
+    cusp::gallery::poisson27pt(dia_host_matrix, grid_size, grid_size, grid_size);
+    test_poisson(tuner, dia_host_matrix, "27pt");
+
+    n = get_max_size(7, global_size);
+    grid_size = std::cbrt(n);
+    cusp::gallery::poisson7pt(dia_host_matrix, grid_size, grid_size, grid_size);
+    test_poisson(tuner, dia_host_matrix, "7pt");
+
+
+    // std::cout << "  Memory requirements: "
+    //           << size_str(dia_problem_size(dia_host_matrix.num_rows,
+    //                                        dia_host_matrix.num_cols,
+    //                                        dia_host_matrix.diagonal_offsets.size()))
+    //           << "\n";
+}
+
+void test_poisson_7pt(::ktt::Tuner& tuner, size_t global_limit)
+{
+    std::cout << "Testing different sizes of poisson 7pt problem (actual vs min)\n";
+
+    size_t mb = (size_t(1) << 20);
+    cusp::dia_matrix<int, float, cusp::host_memory> host_matrix;
+
+    for (size_t mem = 512*mb; mem <= global_limit; mem += 512*mb)
+    {
+        int n = get_max_size(7, mem);
+        int grid_size = std::cbrt(n);
+
+        cusp::gallery::poisson7pt(host_matrix, grid_size, grid_size, grid_size);
+
+        auto expected_bytes = min_read_bytes(host_matrix);
+        auto actual_bytes = get_actual_read_bytes(tuner, host_matrix);
+        float ratio = float(actual_bytes)/expected_bytes;
+
+        std::cout << "  "
+                  << size_str(mem) << " (" << grid_size << ") -> "
+                  << size_str(actual_bytes) << " vs "
+                  << size_str(expected_bytes) << " ("
+                  << ratio << ")\n";
+    }
+}
+
+void test_x_caching_uniform(::ktt::Tuner& tuner, size_t global_limit)
+{
+    std::vector<int> distances;
+    std::vector<int> diag_counts;
+
+    for (int dist = 64; dist <= 2048; dist *= 2)
+        distances.push_back(dist);
+
+    for (int count = 64; count <= 2048; count *= 2)
+        diag_counts.push_back(count + 1);
+
+    std::vector<std::vector<float>> bytes;
+    std::vector<std::vector<size_t>> times;
+
+    std::cout << "TESTING UNIFORM MATRICES\n"
+              << "Taget device memory: " << size_str(global_limit) << "\n";
+
+    for (auto dist : distances)
+    {
+        std::cout << "Distance: " << dist << "\n";
+        bytes.emplace_back();
+        times.emplace_back();
+
+        ::ktt::KernelResult kernel_result;
+        size_t reference_time = 1;
+
+        for (auto diag_count : diag_counts)
+        {
+            size_t n = get_max_size(diag_count, global_limit);
+            if (n < 1024) // That is too small.
+            {
+                std::cout << "skipping\n";
+                bytes.back().push_back(-1);
+                times.back().push_back(0);
+                continue;
+            }
+
+            auto host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, dist, diag_count);
+
+            if (host_matrix.diagonal_offsets.size() != diag_count)
+            {
+                std::cout << "skipping\n";
+                bytes.back().push_back(-1);
+                times.back().push_back(0);
+                continue;
+            }
+
+            auto expected_bytes = min_read_bytes(host_matrix);
+            auto actual_bytes = get_actual_read_bytes(tuner, host_matrix, &kernel_result);
+            float ratio = float(actual_bytes)/expected_bytes;
+
+            if (reference_time == 1)
+            {
+                reference_time = kernel_result.GetKernelDuration();
+            }
+
+            size_t time = kernel_result.GetKernelDuration();
+            float time_ratio = float(time)/reference_time;
+
+            std::cout << diag_count << "(" << n << ") -> " << ratio << "(" << actual_bytes << " : " << expected_bytes << ")\n";
+                     // << " (" << time_ratio << " : " << time << ")\n";
+
+            bytes.back().push_back(ratio);
+            times.back().push_back(time);
+        }
+    }
+
+    write_csv(bytes, distances, diag_counts, "bytes_transfered.csv");
+}
+
+void test_l2()
+{
+    auto& tuner = cusp::ktt::get_tuner();
+    tuner.SetLoggingLevel(::ktt::LoggingLevel::Off);
+
+    cudaDeviceProp prop;
+    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
+    if (err != cudaSuccess)
+    {
+        std::cout << "Failed to read device properties.\n";
+        return;
+    }
+
+    size_t max_bytes = std::min(2*(size_t(1) << 30), (size_t)prop.totalGlobalMem - 1024*(size_t(1) << 20));
+
+    std::cout << "L2 size: " << size_str(prop.l2CacheSize) << "\n";
+    std::cout << "Global memory available: " << size_str(prop.totalGlobalMem) << "\n";
+    std::cout << "Global limit: " << size_str(max_bytes) << "\n";
+
+    //test_poisson_sizes(tuner, max_bytes);
+    //test_poisson_7pt(tuner, max_bytes);
+    test_x_caching_uniform(tuner, max_bytes);
+
+    // std::cout << "Using size: " << size_str(max_bytes) << "\n";
+
+    // // const size_t n = (1u << 20)*4;
+
+    // for (int dist = 1; dist <= 1024; dist *= 2)
+    // {
+    //     for (int diags = 4; diags <= 1024; diags *= 2)
+    //     {
+    //         int actual_diags = diags + 1; // to make it nicely symmetric
+    //         int n = max_floats/(actual_diags + 1);
+
+    //         if (dist*diags > n)
+    //         {
+    //             std::cout << "skipping\n";
+    //             continue;
+    //         }
+
+    //         auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, dist, actual_diags);
+
+    //         cusp::dia_matrix<int, float, cusp::device_memory> A = dia_host_matrix;
+    //         cusp::array1d<float, cusp::device_memory> x(A.num_cols, 1);
+    //         cusp::array1d<float, cusp::device_memory> y(A.num_rows);
+
+    //         auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y);
+    //         auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) },
+    //                                                                       { std::string("PREFETCH_FACTOR"), uint64_t(0) } });
+
+    //         auto expected_bytes = min_read_bytes(A, x);
+    //         auto actual_bytes = get_actual_read_bytes(A, x, y, conf);
+
+    //         float ration = float(actual_bytes)/expected_bytes;
+
+    //         std::cout << "(" << size_str(n) << ", " << dist << ", " << actual_diags << ") -> " << ration << "\n";
+
+    //         // std::cout << "Matrix size: " << size_str(dia_matrix_size(n, n, actual_diags)) << "\n";
+    //         // std::cout << "Expected: " << size_str(expected_bytes) << "\n";
+    //         // std::cout << "Actual:   " << size_str(actual_bytes) << "\n";
+    //         // std::cout << "------------------------------------------------\n";
+    //     }
+    // }
+
+    // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 11);
+
+    // cusp::dia_matrix<int, float, cusp::device_memory> A = dia_host_matrix;
+    // cusp::array1d<float, cusp::device_memory> x(A.num_cols, 1);
+    // cusp::array1d<float, cusp::device_memory> y(A.num_rows);
+
+    // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y);
+    // auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } });
+
+    // auto expected_bytes = min_read_bytes(A, x);
+    // auto actual_bytes = get_actual_read_bytes(A, x, y, conf);
+
+    // std::cout << "Expected: " << size_str(expected_bytes) << "\n";
+    // std::cout << "Actual:   " << size_str(actual_bytes) << "\n";
+}
+
+void test_matrix()
+{
+    auto& tuner = cusp::ktt::get_tuner();
+
+    int n = 130879;
+
+    //n = get_max_size(5, 2*(size_t(1) << 30));
+    //int grid_size = std::sqrt(n);
+    //cusp::dia_matrix<int, float, cusp::host_memory> dia_host_matrix;
+    //cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size);
+
+    auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 512, 2049);
+
+    std::cout << min_read_bytes(dia_host_matrix) << "\n";
+
+    cusp::dia_matrix<int, float, cusp::device_memory> A = dia_host_matrix;
+    cusp::array1d<float, cusp::device_memory> x(A.num_cols, 1);
+    cusp::array1d<float, cusp::device_memory> y(A.num_rows);
+    cusp::array1d<float, cusp::device_memory> yy(A.num_rows);
+
+    // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx");
+
+    //auto dia_host_matrix2 = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1024, 11);
+    //cusp::dia_matrix<int, float, cusp::device_memory> A2 = dia_host_matrix;
+
+    auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y);
+
+    // tuner.SetProfiling(true);
+    // tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids);
+    // tuner.SetProfilingCounters({
+    //     "dram_read_bytes"
+    //     //"dram_write_bytes",
+    //     //"l2_global_load_bytes"
+    // });
+
+    std::cout << "Device memory needed: " << size_str(dia_problem_size(A.num_rows, A.num_cols, A.diagonal_offsets.size())) << "\n";
+    // cusp::ktt::tune(A, x, y);
+
+    // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } });
+    // cusp::ktt::multiply(A, x, y, conf1);
+
+    auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id,
+                                          { { std::string("KERNEL_TYPE"), uint64_t(1) },
+                                            { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) },
+                                            { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(0) } });
+
+    auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) },
+                                                                   { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) },
+                                                                   { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(4) } });
+    // size_t non_cached = get_actual_read_bytes(tuner, A, x, y, conf2);
+
+    // conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) },
+    //                                                           { std::string("PREFETCH_FACTOR"), uint64_t(0) } });
+    // size_t cached = get_actual_read_bytes(tuner, A, x, y, conf2);
+
+    std::cout << A.diagonal_offsets[0] << "\n";
+
+    cusp::ktt::disable();
+    cusp::multiply(A, x, yy);
+    cusp::ktt::enable();
+
+    cusp::ktt::multiply(A, x, y, conf);
+
+    if (y == yy) {
+        std::cout << "GOOD!\n";
+    } else {
+        std::cout << "WRONG!\n";
+    }
+
+    for (int i = 0; i < y.size(); ++i)
+    {
+        if (y[i] != yy[i])
+        {
+            std::cout << i << " -> " << y[i] - yy[i] << "\n";
+        }
+    }
+
+    // std::cout << "non_cached: " << size_str(non_cached) << "\n";
+    // std::cout << "cached:     " << size_str(cached) << "\n";
+
+    // std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n";
+
+    // cusp::ktt::tune(A, x, y);
+    // cusp::ktt::multiply(A, x, y, conf2);
+    // for (int i = 0; i < 10; ++i)
+    // {
+    //     std::cout << y[i] << "\n";
+    // }
+
+    // std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n";
+
+    //auto expected_bytes = min_read_bytes(A, x);
+    //std::cout << "Expected: " << size_str(expected_bytes) << "\n";
+
+    //for (int i = 0; i < 10; ++i) {
+    //    std::cout << y[i] << "\n";
+    //}
+
+    //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n";
+    //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n";
+
+    // auto conf3 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) } });
+    // cusp::ktt::multiply(A, x, y, conf3);
+}
+
+int main(void)
+{
+    auto& tuner = cusp::ktt::get_tuner();
+    tuner.SetTimeUnit(::ktt::TimeUnit::Microseconds);
+
+    test_l2();
+    // test_matrix();
+
+    // auto orig = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 256);
+    // cusp::coo_matrix<int, float, cusp::device_memory> A = orig;
+    // cusp::coo_matrix<int, float, cusp::host_memory> B = orig;
+
+    // auto start = krn::steady_clock::now();
+    // cusp::dia_matrix<int, float, cusp::host_memory> B1 = convert(B);
+    // auto duration = krn::duration_cast<krn::microseconds>(krn::steady_clock::now() - start).count();
+    // std::cout << "ME host:     " << duration << "\n\n";
+
+    // start = krn::steady_clock::now();
+    // cusp::dia_matrix<int, float, cusp::device_memory> B2 = convert(A);
+    // duration = krn::duration_cast<krn::microseconds>(krn::steady_clock::now() - start).count();
+    // std::cout << "ME device:   " << duration << "\n\n";
+
+    // start = krn::steady_clock::now();
+    // cusp::dia_matrix<int, float, cusp::device_memory> B3 = A;
+    // duration = krn::duration_cast<krn::microseconds>(krn::steady_clock::now() - start).count();
+    // std::cout << "cusp device: " << duration << "\n\n";
+
+    // cusp::dia_matrix<int, float, cusp::host_memory> D = convert(A);
+    // if (!equals(D, orig)) {
+    //     std::cout << "RIIIIIIIP!!!!\n";
+    // }
+
+    // auto start = krn::steady_clock::now();
+    // cusp::dia_matrix<int, float, cusp::host_memory> B1 = convert(A);
+    // auto duration = krn::duration_cast<krn::microseconds>(krn::steady_clock::now() - start).count();
+    // std::cout << "ME:         " << duration << "\n\n";
+
+    // start = krn::steady_clock::now();
+    // cusp::dia_matrix<int, float, cusp::host_memory> B2 = convert2(A);
+    // duration = krn::duration_cast<krn::microseconds>(krn::steady_clock::now() - start).count();
+    // std::cout << "Transposed: " << duration << "\n\n";
+
+    // if (!equals(B1, B2)) {
+    //     std::cout << "RIIIIIIP!!!\n";
+    // }
+
+    // using list = format_list<int, float, cusp::host_memory>;
+    // ConversionTimer timer;
+    // for_each_pair<list>(timer);
+
+    // cudaDeviceProp prop;
+    // cudaError_t err = cudaGetDeviceProperties(&prop, 0);
+
+    // if (err == cudaSuccess) {
+    //     std::cout << "Max warps per sm: " << prop.maxThreadsPerMultiProcessor << "\n";
+    // } else {
+    //     std::cout << "Failed to retrieve device properties\n";
+    // }
+
+    // {
+    //     const int n = (1u << 20);
+    //     //const int n = (1u << 20)/2;
+
+    //     //auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 256, 1024);
+    //     auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 11);
+    //     cusp::dia_matrix<int, float, cusp::device_memory> A = dia_host_matrix;
+
+    //     // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx");
+
+    //     cusp::array1d<float, cusp::device_memory> x(A.num_cols, 1);
+    //     cusp::array1d<float, cusp::device_memory> y(A.num_rows);
+
+    //     //auto dia_host_matrix2 = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1024, 11);
+    //     //cusp::dia_matrix<int, float, cusp::device_memory> A2 = dia_host_matrix;
+
+    //     auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y);
+
+    //     //cusp::ktt::tune(A2, x, y);
+    //     //cusp::ktt::tune(A, x, y);
+
+    //     // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } });
+    //     // cusp::ktt::multiply(A, x, y, conf1);
+
+    //     auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } });
+    //     //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n";
+    //     cusp::ktt::multiply(A, x, y, conf2);
+
+    //     // auto conf3 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) } });
+    //     // cusp::ktt::multiply(A, x, y, conf3);
+    // }
+
+    // {
+    //     const int n = 1u << 15;
+    //     auto coo_host_matrix = make_diagonal_symmetric_matrix(n, n, 512, 16);
+    //     cusp::coo_matrix<int, float, cusp::device_memory> coo_device_matrix = coo_host_matrix;
+    //     cusp::dia_matrix<int, float, cusp::device_memory> A = coo_host_matrix;
+
+    //     cusp::array1d<float, cusp::device_memory> x(coo_host_matrix.num_cols, 1);
+    //     cusp::array1d<float, cusp::device_memory> y_ref(coo_host_matrix.num_rows);
+    //     cusp::array1d<float, cusp::device_memory> y(coo_host_matrix.num_rows);
+
+    //     cusp::multiply(coo_device_matrix, x, y_ref);
+
+    //     auto& tuner = cusp::ktt::get_tuner();
+    //     auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y);
+
+    //     // std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0]) << "\n";
+
+    //     tuner.SetProfiling(true);
+    //     tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids);
+
+    //     tuner.SetProfilingCounters({
+    //         "tex_cache_hit_rate",
+    //         "tex_cache_transactions",
+    //         "l1_cache_global_hit_rate",
+    //         "l2_l1_read_hit_rate",
+    //         "l2_l1_read_transactions",
+    //         "l2_read_transactions",
+    //         "l2_write_transactions",
+    //         "dram_read_transactions",
+    //         "dram_write_transactions",
+    //         "dram_read_bytes",
+    //         "dram_write_bytes"
+    //     });
+
+    //     auto result = cusp::ktt::multiply(A, x, y, {}, true);
+
+    //     for (const auto& res : result.GetResults()) {
+    //         if (!res.HasProfilingData()) {
+    //             std::cout << "No profiling data!\n";
+    //             continue;
+    //         }
+    //         const auto& prof = res.GetProfilingData();
+    //         if (!prof.IsValid()) {
+    //             std::cout << "profiling data invalid\n";
+    //         } else {
+    //             for (const auto& counter : prof.GetCounters()) {
+    //                 std::cout << counter.GetName() << " = ";
+    //                 switch (counter.GetType()) {
+    //                     case ::ktt::ProfilingCounterType::Double:
+    //                         std::cout << counter.GetValueDouble() << "\n";
+    //                         break;
+    //                     case ::ktt::ProfilingCounterType::Int:
+    //                         std::cout << counter.GetValueInt() << "\n";
+    //                         break;
+    //                     case ::ktt::ProfilingCounterType::Percent:
+    //                         std::cout << counter.GetValueDouble() << " %\n";
+    //                         break;
+    //                     case ::ktt::ProfilingCounterType::Throughput:
+    //                         std::cout << counter.GetValueUint() << " bytes/sec\n";
+    //                         break;
+    //                     case ::ktt::ProfilingCounterType::UnsignedInt:
+    //                         std::cout << counter.GetValueUint() << "\n";
+    //                         break;
+    //                     case ::ktt::ProfilingCounterType::UtilizationLevel:
+    //                         std::cout << counter.GetValueUint() << "\n";
+    //                         break;
+    //                 }
+    //             }
+    //         }
+    //     }
+
+    //     if (y_ref != y) {
+    //         std::cout << "SECOND DIFF!\n";
+    //     }
+    // }
+
+    //cusp::ktt::multiply(A, x, y);
+    //std::cout << y[0] << "\n";
+
+    //std::cout << y_ref_host.size() << "\n";
+    //std::cout << y.size() << "\n";
+
+    return 0;
+}
diff --git a/testing/ktt.cu b/testing/ktt.cu
index 613e668a..d3bf2088 100644
--- a/testing/ktt.cu
+++ b/testing/ktt.cu
@@ -15,6 +15,7 @@
 #include <cusp/multiply.h>
 
 #include <cusp/ktt/ktt.h>
+#include <cusp/ktt/ellr_matrix.h>
 #include <cusp/ktt/matrix_generation.h>
 
 #include <iostream>
@@ -29,8 +30,17 @@
     }                                                                    \
     DECLARE_UNITTEST(VTEST##Ktt##Fmt##Matrix);
 
-#define DECLARE_KTT_UNITTEST(VTEST) \
-    DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST,Dia,dia)
+#define DECLARE_KTT_ELLR_UNITTEST(VTEST)                                    \
+    void VTEST##Ktt##Ellr##Matrix(void)                                     \
+    {                                                                       \
+        VTEST< cusp::ktt::ellr_matrix<int, float, cusp::device_memory> >(); \
+    }                                                                       \
+    DECLARE_UNITTEST(VTEST##Ktt##Ellr##Matrix);
+
+#define DECLARE_KTT_UNITTEST(VTEST)                     \
+    DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Dia, dia) \
+    DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Ell, ell) \
+    DECLARE_KTT_ELLR_UNITTEST(VTEST)
 
 
 struct UnitTestStopCondition : ::ktt::StopCondition
@@ -71,11 +81,12 @@ private:
 };
 
 
-void assert_tunning_results_valid(const std::vector<::ktt::KernelResult>& results,
-                         const std::string& ktt_logs,
-                         const std::string& arg_name,
-                         const std::string& filename = "unknown",
-                         int lineno = -1)
+void assert_tunning_results_valid(
+    const std::vector<::ktt::KernelResult>& results,
+    const std::string& ktt_logs,
+    const std::string& arg_name,
+    const std::string& filename = "unknown",
+    int lineno = -1)
 {
     bool failed = false;
     unittest::UnitTestFailure f;
@@ -106,8 +117,10 @@ void assert_tunning_results_valid(const std::vector<::ktt::KernelResult>& result
 
         failed = true;
 
-        f << "[" << filename << ":" << lineno << "] " << result.GetKernelName() << ": ";
-        f << "Encountered an error: " << reason << "\n\n";
+        f << "[" << filename << ":" << lineno << "] ";
+        f << result.GetKernelName() << ": ";
+        f << reason << "\n\n";
+
         f << "On matrix: " << arg_name << "\n\n";
 
         f << "In configuration:\n";