From 05bd82b2d9f8e634048c20a51c8effe6b4b7b8cd Mon Sep 17 00:00:00 2001 From: Mirec Date: Mon, 26 Sep 2022 15:10:47 +0200 Subject: [PATCH 01/49] Add sketchy files --- build.sh | 2 +- cusp/system/cuda/ktt/dia_multiply.h | 8 +- cusp/system/cuda/ktt/kernels/dia_kernel.h | 357 +++++++---- main.cu | 710 ++++++++++++++++++++++ 4 files changed, 960 insertions(+), 117 deletions(-) create mode 100644 main.cu diff --git a/build.sh b/build.sh index 4f53b52f..1c04f512 100644 --- a/build.sh +++ b/build.sh @@ -2,7 +2,7 @@ nvcc -DCUSP_PATH=$(realpath .) \ -I . -I ../KTT/Source \ -l cuda -l ktt -L ../KTT/Build/x86_64_Debug/ \ --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Debug/) \ - -std=c++17 -g -O3 -DKTT_LINE_INFO -lineinfo main.cu + -std=c++17 -g -O3 -lineinfo main.cu [ "$?" -eq "0" ] || exit 1 diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 6a7f31b2..c685f6ba 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -22,7 +22,7 @@ namespace dia { inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) { - tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1, 2 }); + tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1, 2, 3, 4 }); } } // namespace dia @@ -114,9 +114,7 @@ auto get_launcher(const kernel_context& ctx, if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { - do { - interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); - } while(interface.GetRemainingProfilingRuns(ctx.definition_ids[0]) > 0); + interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); } }; } @@ -166,6 +164,8 @@ ::ktt::KernelResult multiply(::ktt::Tuner& tuner, return result; } + std::cout << "HEllo\n"; + kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); auto args = add_arguments(kernel, A, x, y); diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index df7ade63..fda35831 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -1,5 +1,52 @@ #pragma once +template +__device__ T min(T a, T b) { + return a < b ? a : b; +} + +template +__device__ T max(T a, T b) { + return a > b ? 
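// Note on the KERNEL_TYPE tuning parameter registered in dia_multiply.h above:
// KTT exposes each tuning parameter to the kernel source as a preprocessor
// definition, which is what the #if KERNEL_TYPE dispatch further down in this
// file relies on. Conceptually (a sketch, not literal KTT output), running the
// configuration KERNEL_TYPE = 2 compiles this header as if it started with
//
//     #define KERNEL_TYPE 2
//
// so every configuration tried by the tuner is a separate compilation of the
// same source with a different kernel variant selected.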
a : b; +} + +// template +// struct UnrolledLoop : UnrolledLoop +// { +// using parent = UnrolledLoop; + +// IndexType col; +// ValueType1 diag_val = 0; +// ValueType2 x_val = 0; + +// __device__ void prefetch(const IndexType* offsets, +// const ValueType1* values, +// const ValueType2* x, +// IndexType pitch, +// IndexType row, +// IndexType base, +// IndexType cols) +// { +// if (col >= 0 && col < cols) +// { +// diag_val = values[] +// parent::prefetch(offsets, values, x, row, base + 1); +// } +// } + +// template +// __device__ ValueType3 do_iter(IndexType cols) +// { +// if (col < 0 || col >= cols) +// { +// return 0; +// } +// return diag_val*x_val + parent::do_iter(cols); +// } +// }; template __device__ void -ktt_dia_vector_kernel_simple( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) +naive_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) { using ValueType = ValueType3; @@ -26,9 +72,9 @@ ktt_dia_vector_kernel_simple( for (IndexType i = 0; i < num_diagonals; ++i) { IndexType col = diagonal_offsets[i] + thread_id; if (col >= 0 && col < num_cols) { - auto val = values[i*pitch + thread_id]; - auto a = x[col]; - sum +=val*a; + auto diag_val = values[i*pitch + thread_id]; + auto x_val = x[col]; + sum += diag_val * x_val; } } y[thread_id] = sum; @@ -41,15 +87,14 @@ template __device__ void -ktt_dia_vector_kernel_blocked( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) +blocked_offsets_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) { using ValueType = ValueType3; @@ -71,59 +116,203 @@ ktt_dia_vector_kernel_blocked( for (IndexType i = 0; i < batch_size; ++i) { IndexType col = offsets[i] + thread_id; if (col >= 0 && col < num_cols) { - sum += values[(offset_base + i)*pitch + thread_id]*x[col]; + auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; + auto x_val = __ldg(x + col); + sum += diag_val*x_val; } } - y[thread_id] = sum; } __syncthreads(); } + + if (thread_id < num_rows) { + y[thread_id] = sum; + } } -#define ITERATION(i, offset) \ - col = offset + thread_id; \ - if (col >= 0 && col < num_cols) { \ - sum += values[i*pitch + thread_id]*x[col]; \ +template +__device__ void +cached_x_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) +{ + using ValueType = ValueType3; + + const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + + __shared__ IndexType offsets[BLOCK_SIZE]; + __shared__ ValueType x_cache[BLOCK_SIZE]; + IndexType cache_start = -IndexType(BLOCK_SIZE); + IndexType cache_end = 0; + + IndexType first_row = BLOCK_SIZE * blockIdx.x; + IndexType end_row = min(first_row + BLOCK_SIZE, num_rows); + + ValueType sum = ValueType(0); + + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { + if (offset_base + threadIdx.x < num_diagonals) { + offsets[threadIdx.x] = 
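// For reference, this is the computation all of the kernel variants in this
// file implement. A minimal single-threaded host sketch (dia_spmv_host is a
// hypothetical helper, not part of the patch; the values layout matches
// naive_dia_kernel above, i.e. the element of diagonal i in row r sits at
// values[i*pitch + r]):
//
//     template <typename IndexType, typename ValueType>
//     void dia_spmv_host(IndexType num_rows, IndexType num_cols,
//                        IndexType num_diagonals, IndexType pitch,
//                        const IndexType* diagonal_offsets,
//                        const ValueType* values,
//                        const ValueType* x, ValueType* y)
//     {
//         for (IndexType row = 0; row < num_rows; ++row) {
//             ValueType sum = 0;
//             for (IndexType i = 0; i < num_diagonals; ++i) {
//                 IndexType col = diagonal_offsets[i] + row;
//                 if (col >= 0 && col < num_cols)
//                     sum += values[i*pitch + row] * x[col];
//             }
//             y[row] = sum;
//         }
//     }
//
// The GPU variants differ only in how they stage diagonal_offsets, values and
// x (shared memory, __ldg, cached windows of x), not in the arithmetic.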
diagonal_offsets[offset_base + threadIdx.x]; + } + + __syncthreads(); + + int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; + + if (thread_id < num_rows) + { + for (IndexType i = 0; i < batch_size; ++i) + { + IndexType col = offsets[i] + thread_id; + ValueType x_val = -1; + + if (col >= 0 && col < num_cols) + { + x_val = col < cache_end ? x_cache[col - cache_start] : x[col]; + ValueType diag_val = values[(offset_base + i)*pitch + thread_id]; + sum += x_val * diag_val; + } + + __syncthreads(); + + x_cache[threadIdx.x] = x_val; + cache_start = first_row + offsets[i]; + cache_end = end_row + offsets[i]; + + __syncthreads(); + } + } + + __syncthreads(); } + if (thread_id < num_rows) { + y[thread_id] = sum; + } +} + template __device__ void -ktt_dia_vector_kernel_crazy( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) +experimental_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) { using ValueType = ValueType3; const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + __shared__ IndexType offsets[BLOCK_SIZE]; + __shared__ ValueType x_cache[2*BLOCK_SIZE]; + + IndexType first_row = BLOCK_SIZE * blockIdx.x; + IndexType first_col = offsets[0] + first_row; + + if (0 <= first_col + threadIdx.x && first_col + threadIdx.x < num_cols) { + x_cache[threadIdx.x] = x[first_col + threadIdx.x]; + } + + if (0 <= first_col + BLOCK_SIZE + threadIdx.x && first_col + BLOCK_SIZE + threadIdx.x < num_cols) { + x_cache[BLOCK_SIZE + threadIdx.x] = x[first_col + BLOCK_SIZE + threadIdx.x]; + } + ValueType sum = ValueType(0); + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { + if (offset_base + threadIdx.x < num_diagonals) { + offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; + } + + __syncthreads(); + + int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
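// A small worked example of the x-window reuse in cached_x_dia_kernel above
// (numbers are illustrative only). With BLOCK_SIZE = 128 and blockIdx.x = 2 the
// block covers rows [256, 384). A diagonal with offset -3 makes its threads
// read columns 253..380, i.e. exactly the window
// [first_row + offset, end_row + offset) = [253, 381). The first diagonal is
// read from global memory (the cache starts out empty), but afterwards every
// thread stores the x value it just used and cache_start/cache_end are moved to
// that window, so the next diagonal (offset -2, columns 254..381) finds all but
// its last element already in shared memory. The closer consecutive offsets are
// to each other, the larger the reused fraction of the window.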
num_diagonals - offset_base : BLOCK_SIZE; + + if (thread_id < num_rows) + { + for (IndexType i = 0; i < batch_size; ++i) + { + IndexType col = offsets[i] + thread_id; + + if (col >= 0 && col < num_cols) + { + ValueType x_val = x_cache[col - first_col]; + ValueType diag_val = values[(offset_base + i)*pitch + thread_id]; + sum += x_val * diag_val; + } + } + } + + __syncthreads(); + } + if (thread_id < num_rows) { - IndexType col; - ITERATION(0, -4); - ITERATION(1, -3); - ITERATION(2, -2); - ITERATION(3, -1); - ITERATION(4, -0); - ITERATION(5, -1); - ITERATION(6, -2); - ITERATION(7, -3); - ITERATION(8, -4); - ITERATION(9, -5); y[thread_id] = sum; } +} +template +__device__ void +cached_naive_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) +{ + using ValueType = ValueType3; + + const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + + __shared__ ValueType x_cache[2*BLOCK_SIZE]; + + IndexType first_row = BLOCK_SIZE * blockIdx.x; + IndexType first_col = diagonal_offsets[0] + first_row; + + if (0 <= first_col + threadIdx.x && first_col + threadIdx.x < num_cols) { + x_cache[threadIdx.x] = x[first_col + threadIdx.x]; + } + + if (0 <= first_col + BLOCK_SIZE + threadIdx.x && first_col + BLOCK_SIZE + threadIdx.x < num_cols) { + x_cache[BLOCK_SIZE + threadIdx.x] = x[first_col + BLOCK_SIZE + threadIdx.x]; + } + + + if (thread_id < num_rows) { + ValueType sum = ValueType(0); + for (IndexType i = 0; i < num_diagonals; ++i) { + IndexType col = diagonal_offsets[i] + thread_id; + if (col >= 0 && col < num_cols) { + auto diag_val = values[i*pitch + thread_id]; + auto x_val = x_cache[col - first_col]; + sum += diag_val * x_val; + } + } + y[thread_id] = sum; + } } template + naive_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 1 - ktt_dia_vector_kernel_blocked + blocked_offsets_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 2 - ktt_dia_vector_kernel_crazy + cached_x_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); +#elif KERNEL_TYPE == 3 + experimental_dia_kernel + (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); +// #elif KERNEL_TYPE == 4 +// cached_naive_dia_kernel +// (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #endif } - -// template -// __launch_bounds__(BLOCK_SIZE,1) __global__ void -// ktt_dia_vector_kernel( -// const int num_rows, -// const int num_cols, -// const int num_diagonals, -// const int pitch, -// const IndexType* diagonal_offsets, -// const ValueType1* values, -// const ValueType2* x, -// ValueType3* y) -// { -// typedef ValueType1 ValueType; - -// __shared__ IndexType offsets[BLOCK_SIZE]; - -// const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; -// const IndexType grid_size = BLOCK_SIZE * gridDim.x; - -// for(IndexType base = 0; base < num_diagonals; base += BLOCK_SIZE) -// { -// // read a chunk of the diagonal offsets into shared memory -// const IndexType chunk_size = IndexType(BLOCK_SIZE) < num_diagonals - base ? 
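// A back-of-the-envelope check on the two fixed-window variants above
// (experimental_dia_kernel and cached_x_naive_dia_kernel), not a proof: both
// keep x_cache[0 .. 2*BLOCK_SIZE) anchored at
// first_col = diagonal_offsets[0] + first_row and index it with
// col - first_col = (diagonal_offsets[i] - diagonal_offsets[0]) + threadIdx.x.
// Assuming the offsets are stored in ascending order (as the conversion code in
// main.cu produces them), the access stays inside the window only when
// diagonal_offsets[i] - diagonal_offsets[0] <= BLOCK_SIZE, i.e. the whole
// offset range has to fit into roughly one block-sized span. Narrow-band
// matrices satisfy this; a Poisson-style stencil whose outermost offsets are
// plus or minus a grid-plane size does not, which is presumably why one of
// these variants is still commented out in the ktt_dia_vector_kernel
// dispatcher above. (Note also that experimental_dia_kernel computes first_col
// from the shared offsets array before that array has been filled, so it
// appears to read uninitialized shared memory; cached_x_naive_dia_kernel
// avoids this by using diagonal_offsets[0] directly.)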
IndexType(BLOCK_SIZE) : num_diagonals - base; - -// if(threadIdx.x < chunk_size) -// offsets[threadIdx.x] = diagonal_offsets[base + threadIdx.x]; - -// __syncthreads(); - -// // process chunk -// for(IndexType row = thread_id; row < num_rows; row += grid_size) -// { -// ValueType sum = (base == 0) ? ValueType(0) : ValueType(0); - -// // index into values array -// IndexType idx = row + pitch * base; - -// for(IndexType n = 0; n < chunk_size; n++) -// { -// const IndexType col = row + offsets[n]; - -// if(col >= 0 && col < num_cols) -// { -// const ValueType A_ij = values[idx]; -// sum = sum + (A_ij * x[col]); -// } - -// idx += pitch; -// } - -// y[row] = sum; -// } - -// // wait until all threads are done reading offsets -// __syncthreads(); -// } -// } \ No newline at end of file diff --git a/main.cu b/main.cu new file mode 100644 index 00000000..f7edd966 --- /dev/null +++ b/main.cu @@ -0,0 +1,710 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "better_convert.hpp" + +namespace krn = std::chrono; + + +struct Timer +{ + Timer() : start_(krn::steady_clock::now()) { } + + void restart() { start_ = krn::steady_clock::now(); } + + void print_elapsed(const std::string& label) + { + auto duration = krn::duration_cast(krn::steady_clock::now() - start_).count(); + std::cout << label << ": " << duration << "\n"; + } + + void checkpoint(const std::string& label) + { + print_elapsed(label); + restart(); + } + +private: + krn::steady_clock::time_point start_; + +}; + + +struct ConversionTimer +{ + cusp::dia_matrix m; + + ConversionTimer() { + m = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 512); + } + + ConversionTimer(const ConversionTimer& o) { + std::cout << "COPY!\n"; + } + + template + void operator()() + { + using Format1 = typename Matrix1::format; + using Format2 = typename Matrix2::format; + + Matrix1 m1 = m; + + auto start = krn::steady_clock::now(); + Matrix2 m2 = m1; + auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + + std::cout << NAMEOF_TYPE(Format1) << " to " << NAMEOF_TYPE(Format2) << ": " << duration << "\n"; + } +}; + + +template +struct type_list +{ + static constexpr bool is_empty = true; +}; + +template< + typename First, + typename... 
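// Typical use of the Timer helper defined above (a sketch; the step names are
// arbitrary):
//
//     Timer timer;
//     run_first_step();
//     timer.checkpoint("first step");     // prints the elapsed time, then restarts
//     run_second_step();
//     timer.print_elapsed("second step"); // prints without restarting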
Args +> +struct type_list +{ + using head = First; + using tail = type_list; + + static constexpr bool is_empty = false; +}; + + +template< + typename T, + typename List, + typename Op +> +void for_each_type(Op&& op) +{ + if constexpr (!List::is_empty) + { + if (!std::is_same_v) + { + op.template operator()(); + } + for_each_type(std::forward(op)); + } +} + +template +void for_each_pair(Op&& op) +{ + if constexpr (!List1::is_empty && !List2::is_empty) + { + for_each_type(std::forward(op)); + for_each_pair(std::forward(op)); + } +} + +template +void for_each_pair(Op&& op) +{ + for_each_pair(std::forward(op)); +} + +template +using format_list = type_list< + cusp::dia_matrix, + cusp::csr_matrix, + cusp::coo_matrix, + cusp::ell_matrix +>; + + +void print_ktt_status(const ::ktt::ResultStatus& status) { + switch (status) + { + case ::ktt::ResultStatus::Ok: + std::cout << "OK\n"; + break; + case ::ktt::ResultStatus::CompilationFailed: + std::cout << "COMPILATION FAILED\n"; + break; + case ::ktt::ResultStatus::ComputationFailed: + std::cout << "COMPUTATION FAILED\n"; + break; + case ::ktt::ResultStatus::DeviceLimitsExceeded: + std::cout << "DEVICE LIMIT EXCEEDED\n"; + break; + case ::ktt::ResultStatus::ValidationFailed: + std::cout << "VALIDATION FAILED\n"; + break; + } +} + +template +cusp::dia_matrix +convert(const cusp::coo_matrix& matrix) { + using DiaMatrix = cusp::dia_matrix; + + cusp::array1d diag_map(matrix.num_rows + matrix.num_cols, 0); + + auto start = krn::steady_clock::now(); + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + diag_map[diag_idx] = 1; + } + auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + std::cout << "First loop: " << duration << "\n"; + + typename DiaMatrix::diagonal_offsets_array_type offsets; + offsets.reserve(512); // arbitrary number + + start = krn::steady_clock::now(); + int diag_count = 0; + for (int i = 0; i < diag_map.size(); ++i) { + if (diag_map[i] != 0) { + diag_map[i] = diag_count; + offsets.push_back(i - matrix.num_rows + 1); + diag_count++; + } + } + duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + std::cout << "Third loop: " << duration << "\n"; + + + //int pitch = matrix.num_rows % 2 == 0 ? 
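// A quick sanity check of the diagonal indexing used in convert() above
// (illustrative 4x4 case): diag_idx = col - row + num_rows - 1 maps the main
// diagonal (col == row) to 3, the top-right entry (0,3) to 6 and the
// bottom-left entry (3,0) to 0, so the num_rows + num_cols - 1 = 7 possible
// diagonals occupy slots 0..6 of diag_map, and the stored offset
// i - num_rows + 1 recovers the usual -3..+3 range.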
matrix.num_rows + 1 : matrix.num_rows; + int pitch = matrix.num_rows + 64/sizeof(float); + typename DiaMatrix::values_array_type values(matrix.num_rows, diag_count, ValueType(0), pitch); + //std::cout << values.pitch << "\n"; + + start = krn::steady_clock::now(); + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + int values_array_idx = diag_map[diag_idx]; + values(matrix.row_indices[i], values_array_idx) = matrix.values[i]; + } + duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + std::cout << "Second loop: " << duration << "\n"; + + DiaMatrix res; + + res.num_rows = matrix.num_rows; + res.num_cols = matrix.num_cols; + res.num_entries = matrix.num_entries; + + res.diagonal_offsets.swap(offsets); + res.values.swap(values); + + return res; +} + +template +cusp::dia_matrix +convert2(const cusp::coo_matrix& matrix) { + using DiaMatrix = cusp::dia_matrix; + + Timer timer; + + cusp::array1d diag_map(matrix.num_rows + matrix.num_cols, 0); + + timer.checkpoint("init map"); + + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + diag_map[diag_idx] = 1; + } + + timer.checkpoint("find diags"); + + typename DiaMatrix::diagonal_offsets_array_type offsets; + offsets.reserve(512); // arbitrary number + + int diag_count = 0; + for (int i = 0; i < diag_map.size(); ++i) { + if (diag_map[i] != 0) { + diag_map[i] = diag_count; + offsets.push_back(i - matrix.num_rows + 1); + diag_count++; + } + } + + timer.checkpoint("fill map"); + + + //int pitch = matrix.num_rows % 2 == 0 ? matrix.num_rows + 1 : matrix.num_rows; + typename DiaMatrix::values_array_type values(diag_count, matrix.num_rows, ValueType(0)); + //std::cout << values.pitch << "\n"; + + timer.checkpoint("init values"); + + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + int values_array_idx = diag_map[diag_idx]; + values(values_array_idx, matrix.row_indices[i]) = matrix.values[i]; + } + + timer.checkpoint("fill values"); + + DiaMatrix res; + + res.num_rows = matrix.num_rows; + res.num_cols = matrix.num_cols; + res.num_entries = matrix.num_entries; + + res.diagonal_offsets.swap(offsets); + res.values.swap(values); + + return res; +} + +template +bool equals(const cusp::dia_matrix& A, + const cusp::dia_matrix& B) +{ + return A.values == B.values && A.diagonal_offsets == B.diagonal_offsets; +} + +std::string size_str(uint64_t bytes) +{ + float size = bytes; + + auto get_str = [](float x, const char* units) + { + std::stringstream s; + s << std::setprecision(3) << std::fixed << x << units; + return s.str(); + }; + + if (size < 1024) + { + return get_str(size, " B"); + } + + size /= 1024; + if (size < 1024) + { + return get_str(size, " KB"); + } + + size /= 1024; + if (size < 1024) + { + return get_str(size, " MB"); + } + + size /= 1024; + return get_str(size, " GB"); +} + +std::string counter_str(const ::ktt::KernelProfilingCounter& counter) +{ + switch (counter.GetType()) { + case ::ktt::ProfilingCounterType::Double: + return std::to_string(counter.GetValueDouble()); + case ::ktt::ProfilingCounterType::Int: + return std::to_string(counter.GetValueInt()); + case ::ktt::ProfilingCounterType::Percent: + return std::to_string(counter.GetValueDouble()) + " %"; + case ::ktt::ProfilingCounterType::Throughput: + return std::to_string(counter.GetValueUint()) + " bytes/sec\n"; + 
case ::ktt::ProfilingCounterType::UnsignedInt: + return std::to_string(counter.GetValueUint()); + case ::ktt::ProfilingCounterType::UtilizationLevel: + return std::to_string(counter.GetValueUint()); + } + + return ""; +} + +size_t dia_matrix_size(size_t num_rows, size_t num_cols, size_t num_diags) +{ + return sizeof(int)*num_diags + num_rows*num_diags*sizeof(float); +} + +template +size_t min_read_bytes(const cusp::dia_matrix& A, + const cusp::array1d& x) +{ + // Assume that each column has at least one diagonal passing through it. + size_t result = x.size() * sizeof(ValueT2); + + for (IndexT offset : A.diagonal_offsets) + { + IndexT start_row = offset >= 0 ? 0 : -offset; + IndexT start_col = offset >= 0 ? offset : 0; + IndexT end_row = std::min(A.num_cols - start_col, A.num_rows - start_row); + + // Assume that global reads are done in 32 byte transactions that must be aligned. + IndexT elems_per_read = 32 / sizeof(ValueT1); + IndexT adjusted_start_row = start_row - (start_row % elems_per_read); + IndexT adjusted_end_row = end_row + elems_per_read - (end_row % elems_per_read); + + result += (adjusted_end_row - adjusted_start_row)*sizeof(ValueT1); + } + + return result; +} + +::ktt::KernelResult profile_kernel(const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + const ::ktt::KernelConfiguration& conf) +{ + ::ktt::KernelResult result; + std::vector<::ktt::KernelResult> results; + do { + result = cusp::ktt::multiply(A, x, y, conf, true); + results.push_back(result); + } while(result.HasRemainingProfilingRuns()); + + auto& tuner = cusp::ktt::get_tuner(); + tuner.SaveResults(results, "results.json", ::ktt::OutputFormat::JSON); + + return result; +} + + +size_t get_actual_read_bytes(const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + const ::ktt::KernelConfiguration& conf) +{ + auto& tuner = cusp::ktt::get_tuner(); + + auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + tuner.SetProfilingCounters({ + "dram_read_bytes", + "dram_read_transactions", + "dram_read_throughput", + "l2_global_load_bytes" + //"l2_global_load_bytes" + }); + tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids); + tuner.SetProfiling(true); + + auto kernel_result = profile_kernel(A, x, y, conf); + + // auto kernel_result = cusp::ktt::multiply(A, x, y, conf, true); + + // if (kernel_result.HasRemainingProfilingRuns()) + // { + // std::cout << "NEED RUNS!\n"; + // } + + for (const auto& computation_result : kernel_result.GetResults()) + { + if (!computation_result.HasProfilingData()) + { + std::cout << "No profiling data!\n"; + continue; + } + + const auto& profiling_data = computation_result.GetProfilingData(); + if (!profiling_data.IsValid()) + { + std::cout << "profiling data invalid\n"; + } + else + { + std::cout << "Transactions: " << profiling_data.GetCounter("dram_read_transactions").GetValueUint() << "\n"; + std::cout << "Throughput: " << profiling_data.GetCounter("dram_read_throughput").GetValueUint() << "\n"; + std::cout << "Bytes: " << profiling_data.GetCounter("l2_global_load_bytes").GetValueUint() << "\n"; + return profiling_data.GetCounter("dram_read_bytes").GetValueUint(); + } + } + + return size_t(-1); +} + +void test_l2() +{ + auto& tuner = cusp::ktt::get_tuner(); + + const size_t max_bytes = 2u*(1u << 30); // 2 GB + const size_t n = (1u << 20)*4; + + for (int d = 32; d < 4096; d *= 2) + { + if (dia_matrix_size(n, n, d) <= max_bytes) + { + std::cout << d << " diags: \n"; + + auto dia_host_matrix = 
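// Worked example for min_read_bytes above, tracing the code with a 1000x1000
// float matrix (illustrative numbers): the x term contributes
// num_cols * sizeof(float) = 4000 bytes. For a diagonal with offset -3,
// start_row = 3, end_row = min(1000, 997) = 997 and elems_per_read = 8, so the
// row range is widened to [0, 1000) by the 32-byte alignment adjustment and the
// diagonal contributes 1000 * 4 = 4000 bytes instead of the unaligned
// 997 * 4 = 3988. The result is therefore a lower bound on DRAM traffic under
// the assumption that x is read exactly once and every diagonal exactly once.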
cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, d); + + cusp::dia_matrix A = dia_host_matrix; + cusp::array1d x(A.num_cols, 1); + cusp::array1d y(A.num_rows); + + auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + + auto expected_bytes = min_read_bytes(A, x); + auto actual_bytes = get_actual_read_bytes(A, x, y, conf); + + std::cout << "Matrix size: " << size_str(dia_matrix_size(n, n, d)) << "\n"; + std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + std::cout << "Actual: " << size_str(actual_bytes) << "\n"; + } + } + + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 11); + + // cusp::dia_matrix A = dia_host_matrix; + // cusp::array1d x(A.num_cols, 1); + // cusp::array1d y(A.num_rows); + + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + // auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + + // auto expected_bytes = min_read_bytes(A, x); + // auto actual_bytes = get_actual_read_bytes(A, x, y, conf); + + // std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + // std::cout << "Actual: " << size_str(actual_bytes) << "\n"; +} + +void test_matrix() +{ + auto& tuner = cusp::ktt::get_tuner(); + + const int n = 8*(1u << 20); + //const int n = (1u << 20)/2; + + //auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 256, 1024); + auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 32); + cusp::dia_matrix A = dia_host_matrix; + + // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); + + cusp::array1d x(A.num_cols, 1); + cusp::array1d y(A.num_rows); + + //auto dia_host_matrix2 = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1024, 11); + //cusp::dia_matrix A2 = dia_host_matrix; + + auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + // tuner.SetProfiling(true); + // tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids); + // tuner.SetProfilingCounters({ + // "dram_read_bytes" + // //"dram_write_bytes", + // //"l2_global_load_bytes" + // }); + + //cusp::ktt::tune(A2, x, y); + //cusp::ktt::tune(A, x, y); + + // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); + // cusp::ktt::multiply(A, x, y, conf1); + + auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; + + //cusp::ktt::multiply(A, x, y, conf2); + std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + + auto expected_bytes = min_read_bytes(A, x); + std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + + + + //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + + // auto conf3 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) } }); + // cusp::ktt::multiply(A, x, y, conf3); +} + +int main(void) +{ + auto& tuner = cusp::ktt::get_tuner(); + tuner.SetTimeUnit(::ktt::TimeUnit::Microseconds); + + //test_l2(); + test_matrix(); + + // auto orig = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 256); + // cusp::coo_matrix A = orig; + // cusp::coo_matrix B = orig; + + // auto start = 
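// Minimal end-to-end use of the pieces above, condensed from test_matrix() and
// test_l2() (a sketch; the matrix shape and the KERNEL_TYPE value are
// arbitrary, and the template arguments are written out the way the rest of
// this file presumably instantiates them):
//
//     auto& tuner = cusp::ktt::get_tuner();
//     auto host = cusp::ktt::make_diagonal_symmetric_matrix(1 << 20, 1 << 20, 1, 32);
//     cusp::dia_matrix<int, float, cusp::device_memory> A = host;
//     cusp::array1d<float, cusp::device_memory> x(A.num_cols, 1);
//     cusp::array1d<float, cusp::device_memory> y(A.num_rows);
//
//     auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y);
//     auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id,
//                     { { std::string("KERNEL_TYPE"), uint64_t(1) } });
//     cusp::ktt::multiply(A, x, y, conf);  // KERNEL_TYPE 1 = blocked offsets kernel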
krn::steady_clock::now(); + // cusp::dia_matrix B1 = convert(B); + // auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "ME host: " << duration << "\n\n"; + + // start = krn::steady_clock::now(); + // cusp::dia_matrix B2 = convert(A); + // duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "ME device: " << duration << "\n\n"; + + // start = krn::steady_clock::now(); + // cusp::dia_matrix B3 = A; + // duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "cusp device: " << duration << "\n\n"; + + // cusp::dia_matrix D = convert(A); + // if (!equals(D, orig)) { + // std::cout << "RIIIIIIIP!!!!\n"; + // } + + // auto start = krn::steady_clock::now(); + // cusp::dia_matrix B1 = convert(A); + // auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "ME: " << duration << "\n\n"; + + // start = krn::steady_clock::now(); + // cusp::dia_matrix B2 = convert2(A); + // duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "Transposed: " << duration << "\n\n"; + + // if (!equals(B1, B2)) { + // std::cout << "RIIIIIIP!!!\n"; + // } + + // using list = format_list; + // ConversionTimer timer; + // for_each_pair(timer); + + // cudaDeviceProp prop; + // cudaError_t err = cudaGetDeviceProperties(&prop, 0); + + // if (err == cudaSuccess) { + // std::cout << "Max warps per sm: " << prop.maxThreadsPerMultiProcessor << "\n"; + // } else { + // std::cout << "Failed to retrieve device properties\n"; + // } + + // { + // const int n = (1u << 20); + // //const int n = (1u << 20)/2; + + // //auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 256, 1024); + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 11); + // cusp::dia_matrix A = dia_host_matrix; + + // // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); + + // cusp::array1d x(A.num_cols, 1); + // cusp::array1d y(A.num_rows); + + // //auto dia_host_matrix2 = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1024, 11); + // //cusp::dia_matrix A2 = dia_host_matrix; + + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + // //cusp::ktt::tune(A2, x, y); + // //cusp::ktt::tune(A, x, y); + + // // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); + // // cusp::ktt::multiply(A, x, y, conf1); + + // auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + // //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; + // cusp::ktt::multiply(A, x, y, conf2); + + // // auto conf3 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) } }); + // // cusp::ktt::multiply(A, x, y, conf3); + // } + + // { + // const int n = 1u << 15; + // auto coo_host_matrix = make_diagonal_symmetric_matrix(n, n, 512, 16); + // cusp::coo_matrix coo_device_matrix = coo_host_matrix; + // cusp::dia_matrix A = coo_host_matrix; + + // cusp::array1d x(coo_host_matrix.num_cols, 1); + // cusp::array1d y_ref(coo_host_matrix.num_rows); + // cusp::array1d y(coo_host_matrix.num_rows); + + // cusp::multiply(coo_device_matrix, x, y_ref); + + // auto& tuner = cusp::ktt::get_tuner(); + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + // // std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, 
kernel_ctx.definition_ids[0]) << "\n"; + + // tuner.SetProfiling(true); + // tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids); + + // tuner.SetProfilingCounters({ + // "tex_cache_hit_rate", + // "tex_cache_transactions", + // "l1_cache_global_hit_rate", + // "l2_l1_read_hit_rate", + // "l2_l1_read_transactions", + // "l2_read_transactions", + // "l2_write_transactions", + // "dram_read_transactions", + // "dram_write_transactions", + // "dram_read_bytes", + // "dram_write_bytes" + // }); + + // auto result = cusp::ktt::multiply(A, x, y, {}, true); + + // for (const auto& res : result.GetResults()) { + // if (!res.HasProfilingData()) { + // std::cout << "No profiling data!\n"; + // continue; + // } + // const auto& prof = res.GetProfilingData(); + // if (!prof.IsValid()) { + // std::cout << "profiling data invalid\n"; + // } else { + // for (const auto& counter : prof.GetCounters()) { + // std::cout << counter.GetName() << " = "; + // switch (counter.GetType()) { + // case ::ktt::ProfilingCounterType::Double: + // std::cout << counter.GetValueDouble() << "\n"; + // break; + // case ::ktt::ProfilingCounterType::Int: + // std::cout << counter.GetValueInt() << "\n"; + // break; + // case ::ktt::ProfilingCounterType::Percent: + // std::cout << counter.GetValueDouble() << " %\n"; + // break; + // case ::ktt::ProfilingCounterType::Throughput: + // std::cout << counter.GetValueUint() << " bytes/sec\n"; + // break; + // case ::ktt::ProfilingCounterType::UnsignedInt: + // std::cout << counter.GetValueUint() << "\n"; + // break; + // case ::ktt::ProfilingCounterType::UtilizationLevel: + // std::cout << counter.GetValueUint() << "\n"; + // break; + // } + // } + // } + // } + + // if (y_ref != y) { + // std::cout << "SECOND DIFF!\n"; + // } + // } + + //cusp::ktt::multiply(A, x, y); + //std::cout << y[0] << "\n"; + + //std::cout << y_ref_host.size() << "\n"; + //std::cout << y.size() << "\n"; + + return 0; +} From 87be93a6094c6c8f7b26b9f7a661ef1dd7d2c95b Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Fri, 30 Sep 2022 14:53:47 +0200 Subject: [PATCH 02/49] Don't include better_convert.hpp --- main.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/main.cu b/main.cu index f7edd966..28f9d63e 100644 --- a/main.cu +++ b/main.cu @@ -13,8 +13,6 @@ #include #include -#include "better_convert.hpp" - namespace krn = std::chrono; From 7d7178b0ef6b76c7bc80f685865a878945cd365b Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Fri, 30 Sep 2022 14:57:33 +0200 Subject: [PATCH 03/49] Implement prefetching using shared memory --- build.sh | 4 +- cusp/system/cuda/ktt/dia_multiply.h | 3 +- cusp/system/cuda/ktt/kernels/dia_kernel.h | 63 ++++++++++++++++++++--- main.cu | 15 ++++-- 4 files changed, 71 insertions(+), 14 deletions(-) diff --git a/build.sh b/build.sh index 1c04f512..9809c63c 100644 --- a/build.sh +++ b/build.sh @@ -1,7 +1,7 @@ nvcc -DCUSP_PATH=$(realpath .) \ -I . -I ../KTT/Source \ - -l cuda -l ktt -L ../KTT/Build/x86_64_Debug/ \ - --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Debug/) \ + -l cuda -l ktt -L ../KTT/Build/x86_64_Release/ \ + --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Release/) \ -std=c++17 -g -O3 -lineinfo main.cu [ "$?" 
-eq "0" ] || exit 1 diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index c685f6ba..659fbef2 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -22,7 +22,8 @@ namespace dia { inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) { - tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1, 2, 3, 4 }); + tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1 }); + tuner.AddParameter(kernel.kernel_id, "PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); } } // namespace dia diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index fda35831..51d87b06 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -100,11 +100,18 @@ blocked_offsets_dia_kernel(const int num_rows, const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; +#if PREFETCH_FACTOR > 0 + __shared__ ValueType1 values_prefetched[PREFETCH_FACTOR*BLOCK_SIZE]; + __shared__ ValueType2 x_prefetched[PREFETCH_FACTOR*BLOCK_SIZE]; +#endif + __shared__ IndexType offsets[BLOCK_SIZE]; ValueType sum = ValueType(0); - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { - if (offset_base + threadIdx.x < num_diagonals) { + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) + { + if (offset_base + threadIdx.x < num_diagonals) + { offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; } @@ -112,21 +119,65 @@ blocked_offsets_dia_kernel(const int num_rows, int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; - if (thread_id < num_rows) { - for (IndexType i = 0; i < batch_size; ++i) { + if (thread_id < num_rows) + { +#if PREFETCH_FACTOR == 0 + for (IndexType i = 0; i < batch_size; ++i) + { IndexType col = offsets[i] + thread_id; - if (col >= 0 && col < num_cols) { + if (col >= 0 && col < num_cols) + { auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; auto x_val = __ldg(x + col); sum += diag_val*x_val; } } +#else + int rest = batch_size % PREFETCH_FACTOR; + int end = batch_size - rest; + + for (IndexType i = 0; i < end; i += PREFETCH_FACTOR) + { + for (int j = 0; j < PREFETCH_FACTOR; ++j) + { + IndexType col = offsets[i + j] + thread_id; + if (col >= 0 && col < num_cols) + { + auto val = values[ (offset_base + i + j)*pitch + thread_id ]; + values_prefetched[j*BLOCK_SIZE + threadIdx.x] = val; + x_prefetched[j*BLOCK_SIZE + threadIdx.x] = __ldg(x + col); + } + else + { + values_prefetched[j*BLOCK_SIZE + threadIdx.x] = ValueType1(0); + x_prefetched[j*BLOCK_SIZE + threadIdx.x] = ValueType2(0); + } + } + + for (int j = 0; j < PREFETCH_FACTOR; ++j) + { + sum += values_prefetched[j*BLOCK_SIZE + threadIdx.x] * x_prefetched[j*BLOCK_SIZE + threadIdx.x]; + } + } + + for (IndexType i = end; i < batch_size; ++i) + { + IndexType col = offsets[i] + thread_id; + if (col >= 0 && col < num_cols) + { + auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; + auto x_val = __ldg(x + col); + sum += diag_val*x_val; + } + } +#endif } __syncthreads(); } - if (thread_id < num_rows) { + if (thread_id < num_rows) + { y[thread_id] = sum; } } diff --git a/main.cu b/main.cu index f7edd966..cd4ade60 100644 --- a/main.cu +++ b/main.cu @@ -515,16 +515,21 @@ void test_matrix() // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); // 
cusp::ktt::multiply(A, x, y, conf1); - auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, + { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; - //cusp::ktt::multiply(A, x, y, conf2); - std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + // cusp::ktt::tune(A, x, y); - auto expected_bytes = min_read_bytes(A, x); - std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + cusp::ktt::multiply(A, x, y, conf2); + //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + //auto expected_bytes = min_read_bytes(A, x); + //std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + //for (int i = 0; i < 10; ++i) { + // std::cout << y[i] << "\n"; + //} //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; From 455504b925bb4fd75dc45d027832ee9c7d2e88e9 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 28 Nov 2022 15:09:52 +0100 Subject: [PATCH 04/49] Fix formatting --- cusp/system/cuda/ktt/kernels/dia_kernel.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 51d87b06..6c25a200 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -67,11 +67,14 @@ naive_dia_kernel(const int num_rows, const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - if (thread_id < num_rows) { + if (thread_id < num_rows) + { ValueType sum = ValueType(0); - for (IndexType i = 0; i < num_diagonals; ++i) { + for (IndexType i = 0; i < num_diagonals; ++i) + { IndexType col = diagonal_offsets[i] + thread_id; - if (col >= 0 && col < num_cols) { + if (col >= 0 && col < num_cols) + { auto diag_val = values[i*pitch + thread_id]; auto x_val = x[col]; sum += diag_val * x_val; @@ -325,7 +328,7 @@ template __device__ void -cached_naive_dia_kernel(const int num_rows, +cached_x_naive_dia_kernel(const int num_rows, const int num_cols, const int num_diagonals, const int pitch, @@ -395,7 +398,7 @@ ktt_dia_vector_kernel( experimental_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); // #elif KERNEL_TYPE == 4 -// cached_naive_dia_kernel +// cached_x_naive_dia_kernel // (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #endif } From c563dcb014cbd6d6d97cf3727edf0684cac59735 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 28 Nov 2022 15:10:21 +0100 Subject: [PATCH 05/49] Remove debug print --- cusp/system/cuda/ktt/dia_multiply.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 659fbef2..d009bfce 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -165,8 +165,6 @@ ::ktt::KernelResult multiply(::ktt::Tuner& tuner, return result; } - std::cout << "HEllo\n"; - kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); auto args = add_arguments(kernel, A, x, y); From da27bd7845213c9d4026fed41d916a9c179cbced Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 28 Nov 2022 15:10:52 +0100 Subject: [PATCH 06/49] Test cache effectiveness --- main.cu | 301 
+++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 220 insertions(+), 81 deletions(-) diff --git a/main.cu b/main.cu index 386802e3..352b3d50 100644 --- a/main.cu +++ b/main.cu @@ -4,11 +4,13 @@ #include #include #include +#include + #include #include #include - +#include #include #include #include @@ -331,16 +333,21 @@ size_t dia_matrix_size(size_t num_rows, size_t num_cols, size_t num_diags) return sizeof(int)*num_diags + num_rows*num_diags*sizeof(float); } +size_t dia_problem_size(size_t num_rows, size_t num_cols, size_t num_diags) +{ + return sizeof(int)*num_diags + + num_rows*num_diags*sizeof(float) + + num_rows*sizeof(float) + + num_cols*sizeof(float); +} + template -size_t min_read_bytes(const cusp::dia_matrix& A, - const cusp::array1d& x) + typename ValueT, + typename MemorySpace> +size_t min_read_bytes(const cusp::dia_matrix& A) { // Assume that each column has at least one diagonal passing through it. - size_t result = x.size() * sizeof(ValueT2); + size_t result = A.num_cols * sizeof(ValueT); for (IndexT offset : A.diagonal_offsets) { @@ -349,62 +356,35 @@ size_t min_read_bytes(const cusp::dia_matrix& A, IndexT end_row = std::min(A.num_cols - start_col, A.num_rows - start_row); // Assume that global reads are done in 32 byte transactions that must be aligned. - IndexT elems_per_read = 32 / sizeof(ValueT1); + IndexT elems_per_read = 32 / sizeof(ValueT); IndexT adjusted_start_row = start_row - (start_row % elems_per_read); IndexT adjusted_end_row = end_row + elems_per_read - (end_row % elems_per_read); - result += (adjusted_end_row - adjusted_start_row)*sizeof(ValueT1); + result += (adjusted_end_row - adjusted_start_row)*sizeof(ValueT); } return result; } -::ktt::KernelResult profile_kernel(const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& conf) -{ - ::ktt::KernelResult result; - std::vector<::ktt::KernelResult> results; - do { - result = cusp::ktt::multiply(A, x, y, conf, true); - results.push_back(result); - } while(result.HasRemainingProfilingRuns()); - - auto& tuner = cusp::ktt::get_tuner(); - tuner.SaveResults(results, "results.json", ::ktt::OutputFormat::JSON); - - return result; -} - - -size_t get_actual_read_bytes(const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& conf) +std::vector<::ktt::KernelProfilingCounter> +profile_multiply(const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + const ::ktt::KernelConfiguration& conf, + const std::vector& counters) { auto& tuner = cusp::ktt::get_tuner(); auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); - tuner.SetProfilingCounters({ - "dram_read_bytes", - "dram_read_transactions", - "dram_read_throughput", - "l2_global_load_bytes" - //"l2_global_load_bytes" - }); + tuner.SetProfilingCounters(counters); tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids); tuner.SetProfiling(true); - auto kernel_result = profile_kernel(A, x, y, conf); - - // auto kernel_result = cusp::ktt::multiply(A, x, y, conf, true); - - // if (kernel_result.HasRemainingProfilingRuns()) - // { - // std::cout << "NEED RUNS!\n"; - // } + ::ktt::KernelResult kernel_result; + do { + kernel_result = cusp::ktt::multiply(A, x, y, conf, true); + } while(kernel_result.HasRemainingProfilingRuns()); for (const auto& computation_result : kernel_result.GetResults()) { @@ -421,46 +401,195 @@ size_t get_actual_read_bytes(const cusp::dia_matrix& A, + const cusp::array1d& x, + 
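// Note on the do/while in profile_multiply above: with profiling enabled a
// single launch generally does not produce every requested hardware counter,
// so the same configuration has to be replayed until
// HasRemainingProfilingRuns() reports that all counter sets were collected
// (the same pattern the earlier profile_kernel helper used before this patch).
// Only the final KernelResult is then inspected for the counters.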
cusp::array1d& y, + const ::ktt::KernelConfiguration& conf) +{ + const auto& counters = profile_multiply(A, x, y, conf, {"dram_read_bytes"}); + + if (counters.empty()) { + throw std::runtime_error("Profiling failed."); + } + + return counters[0].GetValueUint(); +} + +size_t get_actual_read_bytes(::ktt::Tuner& tuner, + const cusp::dia_matrix& host_matrix) +{ + cusp::dia_matrix A = host_matrix; + cusp::array1d x(A.num_cols, 1); + cusp::array1d y(A.num_rows); + + auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, + { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + + return get_actual_read_bytes(tuner, A, x, y, conf); +} + +int get_max_size(int num_diags, size_t global_size) +{ + // Assume we have a square matrix of size n. + // + // On the gpu we need: + // - offsets -> num_diags * sizeof(int) bytes + // - diagonals -> num_diags * n * sizeof(float) bytes + // - x -> n * sizeof(float) bytes + // - y -> n * sizeof(float) bytes + // + // It must hold: + // + // global_size >= num_diags*sizeof(int) + (num_diags + 2)*n*sizeof(float) + // + // So: + // + // n <= (global_size - num_diags*sizeof(int)) / (num_diags + 2)*sizeof(float) + // + return (global_size - num_diags*sizeof(int)) / ((num_diags + 2)*sizeof(float)); +} + +void test_poisson(::ktt::Tuner& tuner, + const cusp::dia_matrix& host_matrix, + const std::string& name) +{ + auto expected_bytes = min_read_bytes(host_matrix); + auto actual_bytes = get_actual_read_bytes(tuner, host_matrix); + std::cout << " " << name << ": " << size_str(actual_bytes) << " vs " << size_str(expected_bytes) << "\n"; +} + +void test_poisson_sizes(::ktt::Tuner& tuner, size_t global_size) +{ + std::cout << "Testing poisson\n"; + + int n = get_max_size(5, global_size); + int grid_size = std::sqrt(n); + cusp::dia_matrix dia_host_matrix; + cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "5pt"); + + n = get_max_size(9, global_size); + grid_size = std::sqrt(n); + cusp::gallery::poisson9pt(dia_host_matrix, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "9pt"); + + n = get_max_size(27, global_size); + grid_size = std::cbrt(n); + cusp::gallery::poisson27pt(dia_host_matrix, grid_size, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "27pt"); + + n = get_max_size(7, global_size); + grid_size = std::cbrt(n); + cusp::gallery::poisson7pt(dia_host_matrix, grid_size, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "7pt"); + + + // std::cout << " Memory requirements: " + // << size_str(dia_problem_size(dia_host_matrix.num_rows, + // dia_host_matrix.num_cols, + // dia_host_matrix.diagonal_offsets.size())) + // << "\n"; +} + +void test_poisson_7pt(::ktt::Tuner& tuner, size_t global_limit) +{ + std::cout << "Testing different sizes of poisson 7pt problem (actual vs min)\n"; + + size_t mb = (size_t(1) << 20); + cusp::dia_matrix host_matrix; + + for (size_t mem = 512*mb; mem <= global_limit; mem += 512*mb) + { + int n = get_max_size(7, mem); + int grid_size = std::cbrt(n); + + cusp::gallery::poisson7pt(host_matrix, grid_size, grid_size, grid_size); + + auto expected_bytes = min_read_bytes(host_matrix); + auto actual_bytes = get_actual_read_bytes(tuner, host_matrix); + float ratio = float(actual_bytes)/expected_bytes; + + std::cout << " " + << size_str(mem) << " (" << grid_size << ") -> " + << size_str(actual_bytes) << " vs " + << size_str(expected_bytes) << " 
(" + << ratio << ")\n"; + } } void test_l2() { auto& tuner = cusp::ktt::get_tuner(); - const size_t max_bytes = 2u*(1u << 30); // 2 GB - const size_t n = (1u << 20)*4; + tuner.SetLoggingLevel(::ktt::LoggingLevel::Off); - for (int d = 32; d < 4096; d *= 2) + cudaDeviceProp prop; + cudaError_t err = cudaGetDeviceProperties(&prop, 0); + if (err != cudaSuccess) { - if (dia_matrix_size(n, n, d) <= max_bytes) - { - std::cout << d << " diags: \n"; + std::cout << "Failed to read device properties.\n"; + return; + } - auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, d); + size_t max_bytes = std::min(5*(size_t(1) << 30), (size_t)prop.totalGlobalMem); + size_t max_floats = max_bytes/sizeof(float); - cusp::dia_matrix A = dia_host_matrix; - cusp::array1d x(A.num_cols, 1); - cusp::array1d y(A.num_rows); + std::cout << "L2 size: " << size_str(prop.l2CacheSize) << "\n"; + std::cout << "Global memory available: " << size_str(prop.totalGlobalMem) << "\n"; + std::cout << "Global limit: " << size_str(max_bytes) << "\n"; - auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); - auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + test_poisson_sizes(tuner, max_bytes); + test_poisson_7pt(tuner, max_bytes); - auto expected_bytes = min_read_bytes(A, x); - auto actual_bytes = get_actual_read_bytes(A, x, y, conf); + // std::cout << "Using size: " << size_str(max_bytes) << "\n"; - std::cout << "Matrix size: " << size_str(dia_matrix_size(n, n, d)) << "\n"; - std::cout << "Expected: " << size_str(expected_bytes) << "\n"; - std::cout << "Actual: " << size_str(actual_bytes) << "\n"; - } - } + // // const size_t n = (1u << 20)*4; + + // for (int dist = 1; dist <= 1024; dist *= 2) + // { + // for (int diags = 4; diags <= 1024; diags *= 2) + // { + // int actual_diags = diags + 1; // to make it nicely symmetric + // int n = max_floats/(actual_diags + 1); + + // if (dist*diags > n) + // { + // std::cout << "skipping\n"; + // continue; + // } + + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, dist, actual_diags); + + // cusp::dia_matrix A = dia_host_matrix; + // cusp::array1d x(A.num_cols, 1); + // cusp::array1d y(A.num_rows); + + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + // auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, + // { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + + // auto expected_bytes = min_read_bytes(A, x); + // auto actual_bytes = get_actual_read_bytes(A, x, y, conf); + + // float ration = float(actual_bytes)/expected_bytes; + + // std::cout << "(" << size_str(n) << ", " << dist << ", " << actual_diags << ") -> " << ration << "\n"; + + // // std::cout << "Matrix size: " << size_str(dia_matrix_size(n, n, actual_diags)) << "\n"; + // // std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + // // std::cout << "Actual: " << size_str(actual_bytes) << "\n"; + // // std::cout << "------------------------------------------------\n"; + // } + // } // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 11); @@ -485,15 +614,16 @@ void test_matrix() const int n = 8*(1u << 20); //const int n = (1u << 20)/2; - //auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 256, 1024); - auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 32); - cusp::dia_matrix A = dia_host_matrix; - - // 
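// Worked example for get_max_size above (illustrative numbers): a 7-point
// Poisson matrix needs num_diags * sizeof(int) = 28 bytes of offsets plus
// (num_diags + 2) floats per row (seven diagonals plus x and y), i.e. 36 bytes
// per row. With a 2 GiB budget that allows n <= (2^31 - 28) / 36, roughly
// 59.6 million rows, and a cubic grid of about cbrt(59.6e6) ~ 390 per side,
// which is consistent with the poisson7pt(dia_host_matrix, 390, 390, 390)
// problem used in test_matrix above.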
cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); + cusp::dia_matrix dia_host_matrix; + cusp::gallery::poisson7pt(dia_host_matrix, 390, 390, 390); + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 32); + cusp::dia_matrix A = dia_host_matrix; cusp::array1d x(A.num_cols, 1); cusp::array1d y(A.num_rows); + // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); + //auto dia_host_matrix2 = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1024, 11); //cusp::dia_matrix A2 = dia_host_matrix; @@ -515,12 +645,21 @@ void test_matrix() auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + size_t non_cached = get_actual_read_bytes(tuner, A, x, y, conf2); + + conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) }, + { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + size_t cached = get_actual_read_bytes(tuner, A, x, y, conf2); + + std::cout << "non_cached: " << size_str(non_cached) << "\n"; + std::cout << "cached: " << size_str(cached) << "\n"; + //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; // cusp::ktt::tune(A, x, y); - cusp::ktt::multiply(A, x, y, conf2); - //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + // cusp::ktt::multiply(A, x, y, conf2); + // std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; //auto expected_bytes = min_read_bytes(A, x); //std::cout << "Expected: " << size_str(expected_bytes) << "\n"; @@ -541,8 +680,8 @@ int main(void) auto& tuner = cusp::ktt::get_tuner(); tuner.SetTimeUnit(::ktt::TimeUnit::Microseconds); - //test_l2(); - test_matrix(); + test_l2(); + // test_matrix(); // auto orig = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 256); // cusp::coo_matrix A = orig; From 302a4579a6d59f180cb7de76f42b8a92fc080ff8 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 28 Nov 2022 17:32:27 +0100 Subject: [PATCH 07/49] Fix out of memory errors --- main.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cu b/main.cu index 352b3d50..f08791a1 100644 --- a/main.cu +++ b/main.cu @@ -540,7 +540,7 @@ void test_l2() return; } - size_t max_bytes = std::min(5*(size_t(1) << 30), (size_t)prop.totalGlobalMem); + size_t max_bytes = std::min(5*(size_t(1) << 30), (size_t)prop.totalGlobalMem - 1024*(size_t(1) << 20)); size_t max_floats = max_bytes/sizeof(float); std::cout << "L2 size: " << size_str(prop.l2CacheSize) << "\n"; From 5be16e0fccfbd984d0ba5c1611bb329b31ece679 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 28 Nov 2022 20:11:28 +0100 Subject: [PATCH 08/49] Implement prefetching --- cusp/system/cuda/ktt/dia_multiply.h | 17 +++++- cusp/system/cuda/ktt/kernels/dia_kernel.h | 71 ++++++++++++++++++++--- main.cu | 30 +++++----- 3 files changed, 93 insertions(+), 25 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index d009bfce..7ad15283 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -23,7 +23,22 @@ namespace dia { inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) { tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1 }); - tuner.AddParameter(kernel.kernel_id, "PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); + tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4 
}); + tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 1, 2, 3 }); + + // only one type of prefetching can be applied at once + tuner.AddConstraint(kernel.kernel_id, + {"SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR"}, + [] (const std::vector& values) { + return values[0] == 0 || values[1] == 0; + }); + + // only one type of prefetching can be applied at once + tuner.AddConstraint(kernel.kernel_id, + {"KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR"}, + [] (const std::vector& values) { + return values[0] == 1 || (values[1] == 0 && values[2] == 0); + }); } } // namespace dia diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 6c25a200..5c90b44d 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -103,9 +103,9 @@ blocked_offsets_dia_kernel(const int num_rows, const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; -#if PREFETCH_FACTOR > 0 - __shared__ ValueType1 values_prefetched[PREFETCH_FACTOR*BLOCK_SIZE]; - __shared__ ValueType2 x_prefetched[PREFETCH_FACTOR*BLOCK_SIZE]; +#if SHARED_PREFETCH_FACTOR > 0 + __shared__ ValueType1 values_prefetched[SHARED_PREFETCH_FACTOR*BLOCK_SIZE]; + __shared__ ValueType2 x_prefetched[SHARED_PREFETCH_FACTOR*BLOCK_SIZE]; #endif __shared__ IndexType offsets[BLOCK_SIZE]; @@ -124,7 +124,7 @@ blocked_offsets_dia_kernel(const int num_rows, if (thread_id < num_rows) { -#if PREFETCH_FACTOR == 0 +#if SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 for (IndexType i = 0; i < batch_size; ++i) { IndexType col = offsets[i] + thread_id; @@ -135,13 +135,66 @@ blocked_offsets_dia_kernel(const int num_rows, sum += diag_val*x_val; } } -#else - int rest = batch_size % PREFETCH_FACTOR; +#elif REGISTER_PREFETCH_FACTOR > 0 + int end = batch_size - (batch_size % REGISTER_PREFETCH_FACTOR); + + for (IndexType i = 0; i < end; i += REGISTER_PREFETCH_FACTOR) + { + IndexType col1 = offsets[i] + thread_id; + ValueType1 diag_val1 = 0; + ValueType2 x_val1 = 0; + if (col1 >= 0 && col1 < num_cols) + { + diag_val1 = values[ (offset_base + i)*pitch + thread_id ]; + x_val1 = __ldg(x + col1); + } +#if REGISTER_PREFETCH_FACTOR > 1 + IndexType col2 = offsets[i + 1] + thread_id; + ValueType1 diag_val2 = 0; + ValueType2 x_val2 = 0; + if (col2 >= 0 && col2 < num_cols) + { + diag_val2 = values[ (offset_base + i + 1)*pitch + thread_id ]; + x_val2 = __ldg(x + col2); + } +#endif +#if REGISTER_PREFETCH_FACTOR > 2 + IndexType col3 = offsets[i + 2] + thread_id; + ValueType1 diag_val3 = 0; + ValueType2 x_val3 = 0; + if (col3 >= 0 && col3 < num_cols) + { + diag_val3 = values[ (offset_base + i + 2)*pitch + thread_id ]; + x_val3 = __ldg(x + col3); + } +#endif + + sum += diag_val1 * x_val1; +#if REGISTER_PREFETCH_FACTOR > 1 + sum += diag_val2 * x_val2; +#endif +#if REGISTER_PREFETCH_FACTOR > 2 + sum += diag_val3 * x_val3; +#endif + } + + for (IndexType i = end; i < batch_size; ++i) + { + IndexType col = offsets[i] + thread_id; + if (col >= 0 && col < num_cols) + { + auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; + auto x_val = __ldg(x + col); + sum += diag_val*x_val; + } + } +#elif SHARED_PREFETCH_FACTOR > 0 + int rest = batch_size % SHARED_PREFETCH_FACTOR; int end = batch_size - rest; - for (IndexType i = 0; i < end; i += PREFETCH_FACTOR) + for (IndexType i = 0; i < end; i += SHARED_PREFETCH_FACTOR) { - for (int j = 0; j < PREFETCH_FACTOR; ++j) + for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) { IndexType col = 
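// Effect of the constraints added to setup_tuning_parameters in this patch
// (a quick count using the values defined above): without constraints the
// space would be 2 * 3 * 4 = 24 configurations (KERNEL_TYPE x
// SHARED_PREFETCH_FACTOR x REGISTER_PREFETCH_FACTOR). Requiring that at most
// one prefetching scheme is active, and that prefetching is only combined with
// the blocked-offsets kernel (KERNEL_TYPE == 1), leaves 7: the naive kernel,
// the blocked kernel without prefetching, three register-prefetch factors and
// two shared-prefetch factors.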
offsets[i + j] + thread_id; if (col >= 0 && col < num_cols) @@ -157,7 +210,7 @@ blocked_offsets_dia_kernel(const int num_rows, } } - for (int j = 0; j < PREFETCH_FACTOR; ++j) + for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) { sum += values_prefetched[j*BLOCK_SIZE + threadIdx.x] * x_prefetched[j*BLOCK_SIZE + threadIdx.x]; } diff --git a/main.cu b/main.cu index f08791a1..034c65fd 100644 --- a/main.cu +++ b/main.cu @@ -614,9 +614,9 @@ void test_matrix() const int n = 8*(1u << 20); //const int n = (1u << 20)/2; - cusp::dia_matrix dia_host_matrix; - cusp::gallery::poisson7pt(dia_host_matrix, 390, 390, 390); - // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 32); + // cusp::dia_matrix dia_host_matrix; + // cusp::gallery::poisson7pt(dia_host_matrix, 390, 390, 390); + auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 64); cusp::dia_matrix A = dia_host_matrix; cusp::array1d x(A.num_cols, 1); @@ -637,24 +637,24 @@ void test_matrix() // //"l2_global_load_bytes" // }); - //cusp::ktt::tune(A2, x, y); - //cusp::ktt::tune(A, x, y); + std::cout << size_str(dia_problem_size(A.num_rows, A.num_cols, 64)) << "\n"; + // cusp::ktt::tune(A, x, y); // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); // cusp::ktt::multiply(A, x, y, conf1); auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, - { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); - size_t non_cached = get_actual_read_bytes(tuner, A, x, y, conf2); + { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(2) } }); + // size_t non_cached = get_actual_read_bytes(tuner, A, x, y, conf2); - conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) }, - { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); - size_t cached = get_actual_read_bytes(tuner, A, x, y, conf2); + // conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) }, + // { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + // size_t cached = get_actual_read_bytes(tuner, A, x, y, conf2); - std::cout << "non_cached: " << size_str(non_cached) << "\n"; - std::cout << "cached: " << size_str(cached) << "\n"; + // std::cout << "non_cached: " << size_str(non_cached) << "\n"; + // std::cout << "cached: " << size_str(cached) << "\n"; - //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; + std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; // cusp::ktt::tune(A, x, y); @@ -680,8 +680,8 @@ int main(void) auto& tuner = cusp::ktt::get_tuner(); tuner.SetTimeUnit(::ktt::TimeUnit::Microseconds); - test_l2(); - // test_matrix(); + // test_l2(); + test_matrix(); // auto orig = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 256); // cusp::coo_matrix A = orig; From bee9571357d55f21c235d807eb386d3bd57f1d8f Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 29 Nov 2022 13:25:52 +0100 Subject: [PATCH 09/49] Use -lineinfo in debug mode in tests --- build/build-env.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/build/build-env.py b/build/build-env.py index fbd7f216..3d19bc53 100644 --- a/build/build-env.py +++ b/build/build-env.py @@ -277,6 +277,10 @@ def addKTT(env): env.Append(RPATH = [ ktt_lib_path ]) env.Append(LIBS = [ "ktt" ]) + if env['mode'] == 'debug': + env.Append(CFLAGS = ['-DKTT_LINE_INFO']) + env.Append(CXXFLAGS = 
['-DKTT_LINE_INFO']) + def Environment(buildDir): # allow the user discretion to choose the MSVC version @@ -465,7 +469,7 @@ def Environment(buildDir): env['ENV']['DYLD_LIBRARY_PATH'] = os.environ['DYLD_LIBRARY_PATH'] elif 'LD_LIBRARY_PATH' in os.environ: env['ENV']['LD_LIBRARY_PATH'] = os.environ['LD_LIBRARY_PATH'] - + addKTT(env) # generate help text From e036ae72a66788ecf0a1c7017018d273e9c3be88 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 29 Nov 2022 13:26:52 +0100 Subject: [PATCH 10/49] Implement prefetching --- cusp/system/cuda/ktt/dia_multiply.h | 16 +++- cusp/system/cuda/ktt/kernels/dia_kernel.h | 107 ++++++++++++++-------- main.cu | 27 ++++-- 3 files changed, 99 insertions(+), 51 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 7ad15283..dad0f6aa 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -23,22 +23,30 @@ namespace dia { inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) { tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1 }); - tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4 }); + tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 1, 2, 3 }); + tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_TYPE", std::vector{ 0, 1 }); // only one type of prefetching can be applied at once tuner.AddConstraint(kernel.kernel_id, - {"SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR"}, + { "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, [] (const std::vector& values) { return values[0] == 0 || values[1] == 0; }); - // only one type of prefetching can be applied at once + // prefetching is used only for the blocked offsets kernel tuner.AddConstraint(kernel.kernel_id, - {"KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR"}, + { "KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, [] (const std::vector& values) { return values[0] == 1 || (values[1] == 0 && values[2] == 0); }); + + // different register prefetching implementations are used only when register prefetching is used + tuner.AddConstraint(kernel.kernel_id, + { "REGISTER_PREFETCH_FACTOR", "REGISTER_PREFETCH_TYPE" }, + [] (const std::vector& values) { + return values[0] > 0 || values[1] == 0; + }); } } // namespace dia diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 5c90b44d..9d5050a5 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -10,43 +10,66 @@ __device__ T max(T a, T b) { return a > b ? 
a : b; } -// template -// struct UnrolledLoop : UnrolledLoop -// { -// using parent = UnrolledLoop; - -// IndexType col; -// ValueType1 diag_val = 0; -// ValueType2 x_val = 0; - -// __device__ void prefetch(const IndexType* offsets, -// const ValueType1* values, -// const ValueType2* x, -// IndexType pitch, -// IndexType row, -// IndexType base, -// IndexType cols) -// { -// if (col >= 0 && col < cols) -// { -// diag_val = values[] -// parent::prefetch(offsets, values, x, row, base + 1); -// } -// } - -// template -// __device__ ValueType3 do_iter(IndexType cols) -// { -// if (col < 0 || col >= cols) -// { -// return 0; -// } -// return diag_val*x_val + parent::do_iter(cols); -// } -// }; +template +struct UnrolledLoop : UnrolledLoop +{ + using parent = UnrolledLoop; + + IndexType col; + ValueType1 diag_val = 0; + ValueType2 x_val = 0; + + __device__ void prefetch(const IndexType* offsets, + const ValueType1* values, + const ValueType2* x, + int pitch, + int cols, + IndexType row, + IndexType offset_base, + IndexType i) + { + col = offsets[i] + row; + + if (col >= 0 && col < cols) + { + diag_val = values[(offset_base + i)*pitch + row]; + x_val = __ldg(x + col); + } + + parent::prefetch(offsets, values, x, pitch, cols, row, offset_base, i + 1); + } + + template + __device__ ValueType3 do_iter() + { + return static_cast(diag_val*x_val) + parent::template do_iter(); + } +}; + +template +struct UnrolledLoop +{ + __device__ void prefetch(const IndexType* offsets, + const ValueType1* values, + const ValueType2* x, + int pitch, + int cols, + IndexType row, + IndexType offset_base, + IndexType i) {} + + template + __device__ ValueType3 do_iter() + { + return 0; + } +}; + template loop; + loop.prefetch(offsets, values, x, pitch, num_cols, thread_id, offset_base, i); + sum += loop.template do_iter(); + +#elif REGISTER_PREFETCH_TYPE == 1 + IndexType col1 = offsets[i] + thread_id; ValueType1 diag_val1 = 0; ValueType2 x_val1 = 0; @@ -176,6 +207,8 @@ blocked_offsets_dia_kernel(const int num_rows, #if REGISTER_PREFETCH_FACTOR > 2 sum += diag_val3 * x_val3; #endif + +#endif // REGISTER_PREFETCH_TYPE == 1 } for (IndexType i = end; i < batch_size; ++i) diff --git a/main.cu b/main.cu index 034c65fd..51184a4d 100644 --- a/main.cu +++ b/main.cu @@ -611,12 +611,14 @@ void test_matrix() { auto& tuner = cusp::ktt::get_tuner(); - const int n = 8*(1u << 20); - //const int n = (1u << 20)/2; + int n = 8*(1u << 20); - // cusp::dia_matrix dia_host_matrix; - // cusp::gallery::poisson7pt(dia_host_matrix, 390, 390, 390); - auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 64); + n = get_max_size(5, 2*(size_t(1) << 30)); + int grid_size = std::sqrt(n); + cusp::dia_matrix dia_host_matrix; + cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size); + + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 64); cusp::dia_matrix A = dia_host_matrix; cusp::array1d x(A.num_cols, 1); @@ -637,14 +639,15 @@ void test_matrix() // //"l2_global_load_bytes" // }); - std::cout << size_str(dia_problem_size(A.num_rows, A.num_cols, 64)) << "\n"; + std::cout << size_str(dia_problem_size(A.num_rows, A.num_cols, A.diagonal_offsets.size())) << "\n"; // cusp::ktt::tune(A, x, y); // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); // cusp::ktt::multiply(A, x, y, conf1); auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, - { std::string("REGISTER_PREFETCH_FACTOR"), 
uint64_t(2) } }); + { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) }, + { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(4) } }); // size_t non_cached = get_actual_read_bytes(tuner, A, x, y, conf2); // conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) }, @@ -654,11 +657,15 @@ void test_matrix() // std::cout << "non_cached: " << size_str(non_cached) << "\n"; // std::cout << "cached: " << size_str(cached) << "\n"; - std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; - - // cusp::ktt::tune(A, x, y); + // std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; + cusp::ktt::tune(A, x, y); // cusp::ktt::multiply(A, x, y, conf2); + // for (int i = 0; i < 10; ++i) + // { + // std::cout << y[i] << "\n"; + // } + // std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; //auto expected_bytes = min_read_bytes(A, x); From 88123f896dc34695a0ddcc2ebd48e7c04a30bf7e Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 5 Dec 2022 16:24:05 +0100 Subject: [PATCH 11/49] Add tuning parameter for different way to load values from global memory --- cusp/system/cuda/ktt/dia_multiply.h | 5 +- cusp/system/cuda/ktt/kernels/dia_kernel.h | 144 ++++++++++++---------- main.cu | 2 +- 3 files changed, 83 insertions(+), 68 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index dad0f6aa..7a0f4486 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -24,8 +24,9 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k { tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1 }); tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); - tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 1, 2, 3 }); + tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 2, 3, 4 }); tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_TYPE", std::vector{ 0, 1 }); + tuner.AddParameter(kernel.kernel_id, "LOAD_TYPE", std::vector{ 0, 1 }); // only one type of prefetching can be applied at once tuner.AddConstraint(kernel.kernel_id, @@ -41,7 +42,7 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k return values[0] == 1 || (values[1] == 0 && values[2] == 0); }); - // different register prefetching implementations are used only when register prefetching is used + // different register prefetching implementations are used only when register prefetching is enabled tuner.AddConstraint(kernel.kernel_id, { "REGISTER_PREFETCH_FACTOR", "REGISTER_PREFETCH_TYPE" }, [] (const std::vector& values) { diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 9d5050a5..17119b2d 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -1,5 +1,15 @@ #pragma once + +#if REGISTER_PREFETCH_FACTOR > 0 +#define PREFETCH_FACTOR REGISTER_PREFETCH_FACTOR +#elif SHARED_PREFETCH_FACTOR > 0 +#define PREFETCH_FACTOR SHARED_PREFETCH_FACTOR +#else +#define PREFETCH_FACTOR 0 +#endif + + template __device__ T min(T a, T b) { return a < b ? a : b; @@ -10,6 +20,32 @@ __device__ T max(T a, T b) { return a > b ? 
a : b; } + +template +__device__ T load_diag_val(const T* val) +{ +#if LOAD_TYPE == 0 + return *val; +#elif LOAD_TYPE == 1 + // Cache streaming, likely to be accessed once. + // The ld.cs load cached streaming operation allocates global lines with evict-first policy in L1 and L2 + // to limit cache pollution by temporary streaming data + return __ldcs(val); +#endif +} + +template +__device__ T load_x_val(const T* val) +{ +#if LOAD_TYPE == 0 + return *val; +#else + // Needs compute capability at least 3.5 + return __ldg(val); +#endif +} + + template if (col >= 0 && col < cols) { - diag_val = values[(offset_base + i)*pitch + row]; - x_val = __ldg(x + col); + diag_val = load_diag_val(&values[(offset_base + i)*pitch + row]); + x_val = load_x_val(&x[col]); } parent::prefetch(offsets, values, x, pitch, cols, row, offset_base, i + 1); @@ -86,20 +122,19 @@ naive_dia_kernel(const int num_rows, const ValueType2* x, ValueType3* y) { - using ValueType = ValueType3; - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; if (thread_id < num_rows) { - ValueType sum = ValueType(0); + ValueType3 sum = ValueType3(0); for (IndexType i = 0; i < num_diagonals; ++i) { IndexType col = diagonal_offsets[i] + thread_id; if (col >= 0 && col < num_cols) { - auto diag_val = values[i*pitch + thread_id]; - auto x_val = x[col]; + auto diag_val = load_diag_val(&values[i*pitch + thread_id]); + auto x_val = load_x_val(&x[col]); + sum += diag_val * x_val; } } @@ -107,6 +142,16 @@ naive_dia_kernel(const int num_rows, } } +#define REGISTER_PREFETCH_ITERATION(iter) \ + IndexType col##iter = offsets[i + iter] + thread_id; \ + ValueType1 diag_val##iter = 0; \ + ValueType2 x_val##iter = 0; \ + if (col##iter >= 0 && col##iter < num_cols) \ + { \ + diag_val##iter = load_diag_val(&values[ (offset_base + i + iter)*pitch + thread_id ]); \ + x_val##iter = load_x_val(&x[col##iter]); \ + } + template 0 @@ -132,7 +175,7 @@ blocked_offsets_dia_kernel(const int num_rows, #endif __shared__ IndexType offsets[BLOCK_SIZE]; - ValueType sum = ValueType(0); + ValueType3 sum = ValueType3(0); for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { @@ -153,16 +196,18 @@ blocked_offsets_dia_kernel(const int num_rows, IndexType col = offsets[i] + thread_id; if (col >= 0 && col < num_cols) { - auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; - auto x_val = __ldg(x + col); + auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + thread_id ]); + auto x_val = load_x_val(&x[col]); + sum += diag_val*x_val; } } -#elif REGISTER_PREFETCH_FACTOR > 0 - int end = batch_size - (batch_size % REGISTER_PREFETCH_FACTOR); +#else // SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 + int end = batch_size - (batch_size % PREFETCH_FACTOR); - for (IndexType i = 0; i < end; i += REGISTER_PREFETCH_FACTOR) + for (IndexType i = 0; i < end; i += PREFETCH_FACTOR) { +#if REGISTER_PREFETCH_FACTOR > 0 #if REGISTER_PREFETCH_TYPE == 0 UnrolledLoop loop; @@ -171,70 +216,37 @@ blocked_offsets_dia_kernel(const int num_rows, #elif REGISTER_PREFETCH_TYPE == 1 - IndexType col1 = offsets[i] + thread_id; - ValueType1 diag_val1 = 0; - ValueType2 x_val1 = 0; - if (col1 >= 0 && col1 < num_cols) - { - diag_val1 = values[ (offset_base + i)*pitch + thread_id ]; - x_val1 = __ldg(x + col1); - } -#if REGISTER_PREFETCH_FACTOR > 1 - IndexType col2 = offsets[i + 1] + thread_id; - ValueType1 diag_val2 = 0; - ValueType2 x_val2 = 0; - if (col2 >= 0 && col2 < num_cols) - { - diag_val2 = values[ (offset_base + i + 1)*pitch + thread_id 
]; - x_val2 = __ldg(x + col2); - } -#endif + REGISTER_PREFETCH_ITERATION(0); + REGISTER_PREFETCH_ITERATION(1); #if REGISTER_PREFETCH_FACTOR > 2 - IndexType col3 = offsets[i + 2] + thread_id; - ValueType1 diag_val3 = 0; - ValueType2 x_val3 = 0; - if (col3 >= 0 && col3 < num_cols) - { - diag_val3 = values[ (offset_base + i + 2)*pitch + thread_id ]; - x_val3 = __ldg(x + col3); - } + REGISTER_PREFETCH_ITERATION(2); +#endif +#if REGISTER_PREFETCH_FACTOR > 3 + REGISTER_PREFETCH_ITERATION(3); #endif + sum += diag_val0 * x_val0; sum += diag_val1 * x_val1; -#if REGISTER_PREFETCH_FACTOR > 1 +#if REGISTER_PREFETCH_FACTOR > 2 sum += diag_val2 * x_val2; #endif -#if REGISTER_PREFETCH_FACTOR > 2 +#if REGISTER_PREFETCH_FACTOR > 3 sum += diag_val3 * x_val3; #endif #endif // REGISTER_PREFETCH_TYPE == 1 - } - - for (IndexType i = end; i < batch_size; ++i) - { - IndexType col = offsets[i] + thread_id; - if (col >= 0 && col < num_cols) - { - auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; - auto x_val = __ldg(x + col); - sum += diag_val*x_val; - } - } #elif SHARED_PREFETCH_FACTOR > 0 - int rest = batch_size % SHARED_PREFETCH_FACTOR; - int end = batch_size - rest; - for (IndexType i = 0; i < end; i += SHARED_PREFETCH_FACTOR) - { for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) { IndexType col = offsets[i + j] + thread_id; if (col >= 0 && col < num_cols) { - auto val = values[ (offset_base + i + j)*pitch + thread_id ]; - values_prefetched[j*BLOCK_SIZE + threadIdx.x] = val; - x_prefetched[j*BLOCK_SIZE + threadIdx.x] = __ldg(x + col); + auto diag_val = load_diag_val(&values[ (offset_base + i + j)*pitch + thread_id ]); + auto x_val = load_x_val(&x[col]); + + values_prefetched[j*BLOCK_SIZE + threadIdx.x] = diag_val; + x_prefetched[j*BLOCK_SIZE + threadIdx.x] = x_val; } else { @@ -247,6 +259,7 @@ blocked_offsets_dia_kernel(const int num_rows, { sum += values_prefetched[j*BLOCK_SIZE + threadIdx.x] * x_prefetched[j*BLOCK_SIZE + threadIdx.x]; } +#endif // SHARED_PREFETCH_FACTOR > 0 } for (IndexType i = end; i < batch_size; ++i) @@ -254,12 +267,13 @@ blocked_offsets_dia_kernel(const int num_rows, IndexType col = offsets[i] + thread_id; if (col >= 0 && col < num_cols) { - auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; - auto x_val = __ldg(x + col); + auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + thread_id ]); + auto x_val = load_x_val(&x[col]); + sum += diag_val*x_val; } } -#endif +#endif // SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 } __syncthreads(); diff --git a/main.cu b/main.cu index 51184a4d..7943c796 100644 --- a/main.cu +++ b/main.cu @@ -639,7 +639,7 @@ void test_matrix() // //"l2_global_load_bytes" // }); - std::cout << size_str(dia_problem_size(A.num_rows, A.num_cols, A.diagonal_offsets.size())) << "\n"; + std::cout << "Device memory needed: " << size_str(dia_problem_size(A.num_rows, A.num_cols, A.diagonal_offsets.size())) << "\n"; // cusp::ktt::tune(A, x, y); // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); From c839c64d888e6242715167ac46c0b24a629abe2c Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 7 Dec 2022 11:35:40 +0100 Subject: [PATCH 12/49] Experiment with x vector caching and fix some bugs --- cusp/ktt/matrix_generation.h | 10 +- main.cu | 226 +++++++++++++++++++++++++++++++---- 2 files changed, 209 insertions(+), 27 deletions(-) diff --git a/cusp/ktt/matrix_generation.h b/cusp/ktt/matrix_generation.h index 6c5e0b15..914d2225 100644 --- 
a/cusp/ktt/matrix_generation.h +++ b/cusp/ktt/matrix_generation.h @@ -43,14 +43,16 @@ make_diagonal_symmetric_matrix(int rows, for (int i = 0; i < diagonal_count; ++i) { int offset = starting_offset + offset_step*i; - offsets.push_back(offset); int starting_row = offset < 0 ? -offset : 0; int starting_col = offset < 0 ? 0 : offset; - int ending_row = starting_row >= rows || starting_col >= cols - ? 0 - : starting_row + std::min(rows - starting_row, cols - starting_col); + if (starting_row >= rows || starting_col >= cols) + continue; + + offsets.push_back(offset); + + int ending_row = starting_row + std::min(rows - starting_row, cols - starting_col); for (int row = starting_row; row < ending_row; ++row) { num_entries++; diff --git a/main.cu b/main.cu index 7943c796..c5f4e4da 100644 --- a/main.cu +++ b/main.cu @@ -14,9 +14,51 @@ #include #include #include +#include namespace krn = std::chrono; +template +void write_csv(const std::vector>& data, + const std::vector& col_labels, + const std::vector& row_labels, + const std::string& path) +{ + if (row_labels.size() != data.size()) + throw std::runtime_error("Weird number of rows or row labels."); + + for (int i = 1; i < data.size(); ++i) + { + if (data[i].size() != data[i-1].size()) + throw std::runtime_error("Not all rows have same length."); + if (i == 1 && data[i].size() != col_labels.size()) + throw std::runtime_error("Bad number of column lables."); + } + + std::ofstream file(path); + + if (!file) + throw std::runtime_error("Cannot open: " + path); + + const char* sep = ""; + for (const auto& label : col_labels) + { + file << sep << label; + sep = ","; + } + file << "\n"; + + for (int i = 0; i < data.size(); ++i) + { + file << row_labels[i]; + for (const auto& x : data[i]) + { + file << "," << x; + } + file << "\n"; + } +} + struct Timer { @@ -269,6 +311,33 @@ convert2(const cusp::coo_matrix& matrix return res; } +template typename Matrix, + typename IndexType, + typename ValueType> +Matrix +send_to_device(const Matrix& host_matrix) +{ + return host_matrix; +} + +template typename Matrix, + typename IndexType, + typename ValueType, + typename MemorySpace> +auto make_test_device_x(const Matrix& matrix) +{ + return cusp::array1d(matrix.num_cols, 1); +} + +template typename Matrix, + typename IndexType, + typename ValueType, + typename MemorySpace> +auto make_test_device_y(const Matrix& matrix) +{ + return cusp::array1d(matrix.num_rows, 0); +} + template bool equals(const cusp::dia_matrix& A, const cusp::dia_matrix& B) @@ -353,7 +422,7 @@ size_t min_read_bytes(const cusp::dia_matrix& A) { IndexT start_row = offset >= 0 ? 0 : -offset; IndexT start_col = offset >= 0 ? offset : 0; - IndexT end_row = std::min(A.num_cols - start_col, A.num_rows - start_row); + IndexT end_row = start_row + std::min(A.num_cols - start_col, A.num_rows - start_row); // Assume that global reads are done in 32 byte transactions that must be aligned. 
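    // For example, with 4-byte floats a single transaction covers 8 consecutive values
    // (elems_per_read == 8), so a diagonal that spans k rows needs at least
    // ceil(k / elems_per_read) transactions for its values alone, which gives a lower
    // bound on the traffic attributed to that diagonal.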
IndexT elems_per_read = 32 / sizeof(ValueT); @@ -371,7 +440,8 @@ profile_multiply(const cusp::dia_matrix& A, const cusp::array1d& x, cusp::array1d& y, const ::ktt::KernelConfiguration& conf, - const std::vector& counters) + const std::vector& counters, + ::ktt::KernelResult& result) { auto& tuner = cusp::ktt::get_tuner(); @@ -386,6 +456,8 @@ profile_multiply(const cusp::dia_matrix& A, kernel_result = cusp::ktt::multiply(A, x, y, conf, true); } while(kernel_result.HasRemainingProfilingRuns()); + result = kernel_result; + for (const auto& computation_result : kernel_result.GetResults()) { if (!computation_result.HasProfilingData()) @@ -412,9 +484,13 @@ size_t get_actual_read_bytes(::ktt::Tuner& tuner, const cusp::dia_matrix& A, const cusp::array1d& x, cusp::array1d& y, - const ::ktt::KernelConfiguration& conf) + const ::ktt::KernelConfiguration& conf, + ::ktt::KernelResult* res_p = nullptr) { - const auto& counters = profile_multiply(A, x, y, conf, {"dram_read_bytes"}); + ::ktt::KernelResult res; + const auto& counters = profile_multiply(A, x, y, conf, {"dram_read_bytes"}, res); + + if (res_p) *res_p = res; if (counters.empty()) { throw std::runtime_error("Profiling failed."); @@ -424,17 +500,22 @@ size_t get_actual_read_bytes(::ktt::Tuner& tuner, } size_t get_actual_read_bytes(::ktt::Tuner& tuner, - const cusp::dia_matrix& host_matrix) + const cusp::dia_matrix& host_matrix, + ::ktt::KernelResult* res_p = nullptr) { cusp::dia_matrix A = host_matrix; cusp::array1d x(A.num_cols, 1); cusp::array1d y(A.num_rows); auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); - auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, - { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); - - return get_actual_read_bytes(tuner, A, x, y, conf); + // auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, + // { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, + { { std::string("KERNEL_TYPE"), uint64_t(1) }, + { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) }, + { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(0) } }); + + return get_actual_read_bytes(tuner, A, x, y, conf, res_p); } int get_max_size(int num_diags, size_t global_size) @@ -526,10 +607,79 @@ void test_poisson_7pt(::ktt::Tuner& tuner, size_t global_limit) } } +void test_x_caching_uniform(::ktt::Tuner& tuner, size_t global_limit) +{ + std::vector distances; + std::vector diag_counts; + + for (int dist = 64; dist <= 2048; dist *= 2) + distances.push_back(dist); + + for (int count = 64; count <= 2048; count *= 2) + diag_counts.push_back(count + 1); + + std::vector> bytes; + std::vector> times; + + std::cout << "TESTING UNIFORM MATRICES\n" + << "Taget device memory: " << size_str(global_limit) << "\n"; + + for (auto dist : distances) + { + std::cout << "Distance: " << dist << "\n"; + bytes.emplace_back(); + times.emplace_back(); + + ::ktt::KernelResult kernel_result; + size_t reference_time = 1; + + for (auto diag_count : diag_counts) + { + size_t n = get_max_size(diag_count, global_limit); + if (n < 1024) // That is too small. 
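+            // This combination is skipped: a placeholder is recorded below
+            // (-1 for the byte ratio, 0 for the time) before moving on.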
+ { + std::cout << "skipping\n"; + bytes.back().push_back(-1); + times.back().push_back(0); + continue; + } + + auto host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, dist, diag_count); + + if (host_matrix.diagonal_offsets.size() != diag_count) + { + std::cout << "skipping\n"; + bytes.back().push_back(-1); + times.back().push_back(0); + continue; + } + + auto expected_bytes = min_read_bytes(host_matrix); + auto actual_bytes = get_actual_read_bytes(tuner, host_matrix, &kernel_result); + float ratio = float(actual_bytes)/expected_bytes; + + if (reference_time == 1) + { + reference_time = kernel_result.GetKernelDuration(); + } + + size_t time = kernel_result.GetKernelDuration(); + float time_ratio = float(time)/reference_time; + + std::cout << diag_count << "(" << n << ") -> " << ratio << "(" << actual_bytes << " : " << expected_bytes << ")\n"; + // << " (" << time_ratio << " : " << time << ")\n"; + + bytes.back().push_back(ratio); + times.back().push_back(time); + } + } + + write_csv(bytes, distances, diag_counts, "bytes_transfered.csv"); +} + void test_l2() { auto& tuner = cusp::ktt::get_tuner(); - tuner.SetLoggingLevel(::ktt::LoggingLevel::Off); cudaDeviceProp prop; @@ -540,15 +690,15 @@ void test_l2() return; } - size_t max_bytes = std::min(5*(size_t(1) << 30), (size_t)prop.totalGlobalMem - 1024*(size_t(1) << 20)); - size_t max_floats = max_bytes/sizeof(float); + size_t max_bytes = std::min(2*(size_t(1) << 30), (size_t)prop.totalGlobalMem - 1024*(size_t(1) << 20)); std::cout << "L2 size: " << size_str(prop.l2CacheSize) << "\n"; std::cout << "Global memory available: " << size_str(prop.totalGlobalMem) << "\n"; std::cout << "Global limit: " << size_str(max_bytes) << "\n"; - test_poisson_sizes(tuner, max_bytes); - test_poisson_7pt(tuner, max_bytes); + //test_poisson_sizes(tuner, max_bytes); + //test_poisson_7pt(tuner, max_bytes); + test_x_caching_uniform(tuner, max_bytes); // std::cout << "Using size: " << size_str(max_bytes) << "\n"; @@ -611,18 +761,21 @@ void test_matrix() { auto& tuner = cusp::ktt::get_tuner(); - int n = 8*(1u << 20); + int n = 130879; - n = get_max_size(5, 2*(size_t(1) << 30)); - int grid_size = std::sqrt(n); - cusp::dia_matrix dia_host_matrix; - cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size); + //n = get_max_size(5, 2*(size_t(1) << 30)); + //int grid_size = std::sqrt(n); + //cusp::dia_matrix dia_host_matrix; + //cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size); - // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 64); + auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 512, 2049); + + std::cout << min_read_bytes(dia_host_matrix) << "\n"; cusp::dia_matrix A = dia_host_matrix; cusp::array1d x(A.num_cols, 1); cusp::array1d y(A.num_rows); + cusp::array1d yy(A.num_rows); // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); @@ -645,6 +798,11 @@ void test_matrix() // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); // cusp::ktt::multiply(A, x, y, conf1); + auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, + { { std::string("KERNEL_TYPE"), uint64_t(1) }, + { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) }, + { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(0) } }); + auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) }, { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(4) } }); @@ 
-654,12 +812,34 @@ void test_matrix() // { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); // size_t cached = get_actual_read_bytes(tuner, A, x, y, conf2); + std::cout << A.diagonal_offsets[0] << "\n"; + + cusp::ktt::disable(); + cusp::multiply(A, x, yy); + cusp::ktt::enable(); + + cusp::ktt::multiply(A, x, y, conf); + + if (y == yy) { + std::cout << "GOOD!\n"; + } else { + std::cout << "WRONG!\n"; + } + + for (int i = 0; i < y.size(); ++i) + { + if (y[i] != yy[i]) + { + std::cout << i << " -> " << y[i] - yy[i] << "\n"; + } + } + // std::cout << "non_cached: " << size_str(non_cached) << "\n"; // std::cout << "cached: " << size_str(cached) << "\n"; // std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; - cusp::ktt::tune(A, x, y); + // cusp::ktt::tune(A, x, y); // cusp::ktt::multiply(A, x, y, conf2); // for (int i = 0; i < 10; ++i) // { @@ -687,8 +867,8 @@ int main(void) auto& tuner = cusp::ktt::get_tuner(); tuner.SetTimeUnit(::ktt::TimeUnit::Microseconds); - // test_l2(); - test_matrix(); + test_l2(); + // test_matrix(); // auto orig = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 256); // cusp::coo_matrix A = orig; From ae3b003b71c9a0a56e3c49b07f52aaca435ca6e6 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 28 Dec 2022 16:47:31 +0100 Subject: [PATCH 13/49] Add a.out to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 769857d7..ff73601b 100644 --- a/.gitignore +++ b/.gitignore @@ -122,3 +122,4 @@ performance/spmm/spmm *.user .vscode +a.out From 956130deb6657dba04a639f4b6406ee9d70a9469 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 28 Dec 2022 16:48:52 +0100 Subject: [PATCH 14/49] Throw exception when some diagonals don't fit --- cusp/ktt/matrix_generation.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cusp/ktt/matrix_generation.h b/cusp/ktt/matrix_generation.h index 914d2225..b30e616a 100644 --- a/cusp/ktt/matrix_generation.h +++ b/cusp/ktt/matrix_generation.h @@ -3,6 +3,7 @@ #include #include // min +#include namespace cusp { @@ -48,7 +49,7 @@ make_diagonal_symmetric_matrix(int rows, int starting_col = offset < 0 ? 
0 : offset; if (starting_row >= rows || starting_col >= cols) - continue; + throw std::runtime_error("make_diagonal_symmetric_matrix: Too many diagonals."); offsets.push_back(offset); From ccf10295be394043af745434f131c3ba8c673eeb Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 28 Dec 2022 16:53:52 +0100 Subject: [PATCH 15/49] Add striped kernel, kernel for timing acesses and block size parameter --- cusp/system/cuda/ktt/dia_multiply.h | 40 +++- cusp/system/cuda/ktt/kernels/dia_kernel.h | 268 ++++++++++++++++++++-- 2 files changed, 278 insertions(+), 30 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 7a0f4486..847599be 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -22,11 +22,24 @@ namespace dia { inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) { - tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1 }); + tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1, 2 }); tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 2, 3, 4 }); tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_TYPE", std::vector{ 0, 1 }); tuner.AddParameter(kernel.kernel_id, "LOAD_TYPE", std::vector{ 0, 1 }); + tuner.AddParameter(kernel.kernel_id, "STRIPING_FACTOR", std::vector{ 2, 4, 8 }); + tuner.AddParameter(kernel.kernel_id, "BLOCK_SIZE", std::vector{ 256, 512 }); + + tuner.AddThreadModifier( + kernel.kernel_id, + { kernel.definition_ids[0] }, + ::ktt::ModifierType::Local, + ::ktt::ModifierDimension::X, + { std::string("BLOCK_SIZE") }, + [] (const uint64_t defaultSize, const std::vector& parameters) { + return parameters[0]; + } + ); // only one type of prefetching can be applied at once tuner.AddConstraint(kernel.kernel_id, @@ -48,6 +61,13 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k [] (const std::vector& values) { return values[0] > 0 || values[1] == 0; }); + + // don't try different striping factors for non-striped kernels + tuner.AddConstraint(kernel.kernel_id, + { "KERNEL_TYPE", "STRIPING_FACTOR" }, + [] (const std::vector& values) { + return values[0] == 2 || values[1] == 2; + }); } } // namespace dia @@ -60,21 +80,20 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) std::string kernel_path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h"; - const size_t BLOCK_SIZE = 256; - const size_t MAX_BLOCKS = cusp::system::cuda::detail::max_active_blocks( - ktt_dia_vector_kernel, - BLOCK_SIZE, 0); + const size_t block_size = 256; + const size_t max_blocks = cusp::system::cuda::detail::max_active_blocks( + ktt_dia_vector_kernel, + block_size, 0); std::vector< std::string > type_names { std::string(NAMEOF_TYPE(IndexType)), std::string(NAMEOF_TYPE(ValueType1)), std::string(NAMEOF_TYPE(ValueType2)), - std::string(NAMEOF_TYPE(ValueType3)), - std::to_string(BLOCK_SIZE) + std::string(NAMEOF_TYPE(ValueType3)) }; - const ::ktt::DimensionVector blockDimensions(BLOCK_SIZE); - const ::ktt::DimensionVector gridDimensions(MAX_BLOCKS); + const ::ktt::DimensionVector blockDimensions(block_size); + const ::ktt::DimensionVector gridDimensions(max_blocks); auto definition_id = tuner.AddKernelDefinitionFromFile( "ktt_dia_vector_kernel", @@ -132,7 +151,8 @@ auto get_launcher(const kernel_context& ctx, size_t num_cols, bool profile = false) { - return [=] 
(::ktt::ComputeInterface& interface) { + return [=] (::ktt::ComputeInterface& interface) + { ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]); ::ktt::DimensionVector grid_size( DIVIDE_INTO(num_rows, block_size.GetSizeX()) ); diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 17119b2d..ccf6338b 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -2,11 +2,15 @@ #if REGISTER_PREFETCH_FACTOR > 0 -#define PREFETCH_FACTOR REGISTER_PREFETCH_FACTOR + #define PREFETCH_FACTOR REGISTER_PREFETCH_FACTOR #elif SHARED_PREFETCH_FACTOR > 0 -#define PREFETCH_FACTOR SHARED_PREFETCH_FACTOR + #define PREFETCH_FACTOR SHARED_PREFETCH_FACTOR #else -#define PREFETCH_FACTOR 0 + #define PREFETCH_FACTOR 0 +#endif + +#ifndef BLOCK_SIZE + #define BLOCK_SIZE 256 #endif @@ -110,8 +114,7 @@ struct UnrolledLoop template + typename ValueType3> __device__ void naive_dia_kernel(const int num_rows, const int num_cols, @@ -155,8 +158,7 @@ naive_dia_kernel(const int num_rows, template + typename ValueType3> __device__ void blocked_offsets_dia_kernel(const int num_rows, const int num_cols, @@ -288,8 +290,237 @@ blocked_offsets_dia_kernel(const int num_rows, template + typename ValueType3> +__device__ void +striped_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) +{ +#ifdef STRIPING_FACTOR + const IndexType num_groups = STRIPING_FACTOR; +#else + const IndexType num_groups = 1; +#endif + + const IndexType group_size = BLOCK_SIZE/num_groups; + const IndexType stride = num_rows/num_groups; + + const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + + const IndexType group_id = threadIdx.x / group_size; + const IndexType group_lane_id = threadIdx.x % group_size; + + const IndexType row = group_id*stride + blockIdx.x*group_size + + group_lane_id; + + // printf("(%d, %d, %d)-> %d\n", blockIdx.x, threadIdx.x, thread_id, row); + + __shared__ IndexType offsets[BLOCK_SIZE]; + ValueType3 sum = ValueType3(0); + + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) + { + if (offset_base + threadIdx.x < num_diagonals) + { + offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; + } + + __syncthreads(); + + int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; + + if (row < num_rows) + { + for (IndexType i = 0; i < batch_size; ++i) + { + IndexType col = offsets[i] + row; + if (col >= 0 && col < num_cols) + { + auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); + auto x_val = load_x_val(&x[col]); + + sum += diag_val*x_val; + } + } + } + + __syncthreads(); + } + + if (row < num_rows) + y[row] = sum; +} + + +// template +// __device__ void +// double_striped_dia_kernel(const int num_rows, +// const int num_cols, +// const int num_diagonals, +// const int pitch, +// const IndexType* diagonal_offsets, +// const ValueType1* values, +// const ValueType2* x, +// ValueType3* y) +// { +// #ifdef STRIPING_FACTOR +// const IndexType desired_distance = 256; +// const IndexType warps_per_stripe = 1; +// #else +// const IndexType desired_distance = 256; +// const IndexType warps_per_stripe = 1; +// #endif + +// IndexType stripes_needed = +// (num_rows + desired_distance - 1) / desired_distance; + +// IndexType warps_needed = 32 * stripes_needed; +// IndexType warps_available = BLOCK_SIZE / 32; + +// IndexType num_sectors; +// if (warps_needed < warps_available) +// { +// // we dont need multiple sectors +// } +// else +// { +// IndexType num_sectors = warps_needed / warps_available; +// IndexType blocks_per_sector = gridDim.x / num_sectors; + +// IndexType sector_id = blockIdx.x / blocks_per_sector; +// IndexType stripe_id = threadIdx.x / 32; +// IndexType lane_id = threadIdx.x % 32; +// } + +// const IndexType group_size = BLOCK_SIZE/num_groups; +// const IndexType stride = num_rows/num_groups; + +// const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + +// const IndexType group_id = threadIdx.x / group_size; +// const IndexType group_lane_id = threadIdx.x % group_size; + +// const IndexType row = group_id*stride + blockIdx.x*group_size +// + group_lane_id; + +// // printf("(%d, %d, %d)-> %d\n", blockIdx.x, threadIdx.x, thread_id, row); + +// __shared__ IndexType offsets[BLOCK_SIZE]; +// ValueType3 sum = ValueType3(0); + +// for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) +// { +// if (offset_base + threadIdx.x < num_diagonals) +// { +// offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; +// } + +// __syncthreads(); + +// int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; + +// if (row < num_rows) +// { +// for (IndexType i = 0; i < batch_size; ++i) +// { +// IndexType col = offsets[i] + row; +// if (col >= 0 && col < num_cols) +// { +// auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); +// auto x_val = load_x_val(&x[col]); + +// sum += diag_val*x_val; +// } +// } +// } + +// __syncthreads(); +// } + +// if (row < num_rows) +// y[row] = sum; +// } + + +template +__global__ void +timed_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y, + long long int* times, + const int times_pitch) +{ + const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + const IndexType row = thread_id; + + __shared__ IndexType offsets[BLOCK_SIZE]; + ValueType3 sum = ValueType3(0); + + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) + { + if (offset_base + threadIdx.x < num_diagonals) + { + offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; + } + + __syncthreads(); + + int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; + + if (row < num_rows) + { + for (IndexType i = 0; i < batch_size; ++i) + { + IndexType col = offsets[i] + row; + if (col >= 0 && col < num_cols) + { + auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); + auto to_load = &x[col]; + + long long int start = clock64(); + auto x_val = load_x_val(to_load); + long long int end = clock64(); + + if (threadIdx.x % 32 == 0) + { + IndexType k = row / 32; + __stwt(times + (offset_base + i)*times_pitch + k, end - start + 1); + } + + sum += diag_val*x_val; + } + } + } + + __syncthreads(); + } + + if (row < num_rows) + y[row] = sum; +} + + + +template __device__ void cached_x_dia_kernel(const int num_rows, const int num_cols, @@ -358,8 +589,7 @@ cached_x_dia_kernel(const int num_rows, template + typename ValueType3> __device__ void experimental_dia_kernel(const int num_rows, const int num_cols, @@ -425,8 +655,7 @@ experimental_dia_kernel(const int num_rows, template + typename ValueType3> __device__ void cached_x_naive_dia_kernel(const int num_rows, const int num_cols, @@ -472,8 +701,7 @@ cached_x_naive_dia_kernel(const int num_rows, template + typename ValueType3> __launch_bounds__(BLOCK_SIZE,1) __global__ void ktt_dia_vector_kernel( const int num_rows, @@ -486,19 +714,19 @@ ktt_dia_vector_kernel( ValueType3* y) { #if KERNEL_TYPE == 0 - naive_dia_kernel + naive_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 1 - blocked_offsets_dia_kernel + blocked_offsets_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 2 - cached_x_dia_kernel + striped_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 3 - experimental_dia_kernel + experimental_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); // #elif KERNEL_TYPE == 4 -// cached_x_naive_dia_kernel +// cached_x_naive_dia_kernel // (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #endif } From bd0d1c715485ed4f944244a86dddbb9158c3b320 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 3 Jan 2023 14:24:44 +0100 Subject: [PATCH 16/49] Make experimental version of two-level striping --- 
cusp/system/cuda/ktt/dia_multiply.h | 19 +++ cusp/system/cuda/ktt/kernels/dia_kernel.h | 186 +++++++++++----------- 2 files changed, 110 insertions(+), 95 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 847599be..18c09884 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -30,6 +30,20 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k tuner.AddParameter(kernel.kernel_id, "STRIPING_FACTOR", std::vector{ 2, 4, 8 }); tuner.AddParameter(kernel.kernel_id, "BLOCK_SIZE", std::vector{ 256, 512 }); + // START OF HACKS --------------------- + tuner.AddParameter(kernel.kernel_id, "BLOCKS_PER_SECTOR", std::vector{ 0 }); + tuner.AddParameter(kernel.kernel_id, "CHUNK_SIZE", std::vector{ 0 }); + tuner.AddParameter(kernel.kernel_id, "STRIPE_SIZE", std::vector{ 0 }); + tuner.AddParameter(kernel.kernel_id, "GRID_SIZE", std::vector{ 0 }); + tuner.AddParameter(kernel.kernel_id, "SECTOR_MAPPING_TYPE", + std::vector{ 0, 1 }); + + tuner.AddConstraint(kernel.kernel_id, { "SECTOR_MAPPING_TYPE" }, + [] (const std::vector& values) { + return values[0] == 0; + }); + // END OF HACKS ----------------------- + tuner.AddThreadModifier( kernel.kernel_id, { kernel.definition_ids[0] }, @@ -156,6 +170,11 @@ auto get_launcher(const kernel_context& ctx, ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]); ::ktt::DimensionVector grid_size( DIVIDE_INTO(num_rows, block_size.GetSizeX()) ); + auto conf = interface.GetCurrentConfiguration(); + for (const auto& pair : conf.GetPairs()) + if (pair.GetName() == "GRID_SIZE" && pair.GetValue() != 0) + grid_size = ::ktt::DimensionVector(pair.GetValue()); + if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index ccf6338b..7f26658d 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -357,97 +357,96 @@ striped_dia_kernel(const int num_rows, } -// template -// __device__ void -// double_striped_dia_kernel(const int num_rows, -// const int num_cols, -// const int num_diagonals, -// const int pitch, -// const IndexType* diagonal_offsets, -// const ValueType1* values, -// const ValueType2* x, -// ValueType3* y) -// { -// #ifdef STRIPING_FACTOR -// const IndexType desired_distance = 256; -// const IndexType warps_per_stripe = 1; -// #else -// const IndexType desired_distance = 256; -// const IndexType warps_per_stripe = 1; -// #endif - -// IndexType stripes_needed = -// (num_rows + desired_distance - 1) / desired_distance; - -// IndexType warps_needed = 32 * stripes_needed; -// IndexType warps_available = BLOCK_SIZE / 32; - -// IndexType num_sectors; -// if (warps_needed < warps_available) -// { -// // we dont need multiple sectors -// } -// else -// { -// IndexType num_sectors = warps_needed / warps_available; -// IndexType blocks_per_sector = gridDim.x / num_sectors; - -// IndexType sector_id = blockIdx.x / blocks_per_sector; -// IndexType stripe_id = threadIdx.x / 32; -// IndexType lane_id = threadIdx.x % 32; -// } - -// const IndexType group_size = BLOCK_SIZE/num_groups; -// const IndexType stride = num_rows/num_groups; - -// const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - -// const IndexType group_id = threadIdx.x / group_size; -// const IndexType group_lane_id = threadIdx.x % group_size; - -// const IndexType 
row = group_id*stride + blockIdx.x*group_size -// + group_lane_id; - -// // printf("(%d, %d, %d)-> %d\n", blockIdx.x, threadIdx.x, thread_id, row); - -// __shared__ IndexType offsets[BLOCK_SIZE]; -// ValueType3 sum = ValueType3(0); - -// for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) -// { -// if (offset_base + threadIdx.x < num_diagonals) -// { -// offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; -// } - -// __syncthreads(); - -// int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; - -// if (row < num_rows) -// { -// for (IndexType i = 0; i < batch_size; ++i) -// { -// IndexType col = offsets[i] + row; -// if (col >= 0 && col < num_cols) -// { -// auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); -// auto x_val = load_x_val(&x[col]); - -// sum += diag_val*x_val; -// } -// } -// } - -// __syncthreads(); -// } - -// if (row < num_rows) -// y[row] = sum; -// } +template +__device__ void +double_striped_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) +{ +#if KERNEL_TYPE == 3 + IndexType blocks_per_sector = BLOCKS_PER_SECTOR; + IndexType stripe_size = STRIPE_SIZE; + IndexType chunk_size = CHUNK_SIZE; +#else + IndexType blocks_per_sector = 0; + IndexType stripe_size = 0; + IndexType chunk_size = 0; + assert(false); +#endif + +#if SECTOR_MAPPING_TYPE == 0 + IndexType sector_id = blockIdx.x / blocks_per_sector; + IndexType sector_lane_id = blockIdx.x % blocks_per_sector; + + IndexType stripe_id = threadIdx.x / (chunk_size*32); + IndexType stripe_lane_id = threadIdx.x % (chunk_size*32); + + IndexType row = sector_id*blocks_per_sector*BLOCK_SIZE + + stripe_id*stripe_size + + sector_lane_id*chunk_size*32 + + stripe_lane_id; +#elif SECTOR_MAPPING_TYPE == 1 + IndexType num_sectors = gridDim.x / blocks_per_sector; + + IndexType sector_id = blockIdx.x % num_sectors; + IndexType sector_lane_id = blockIdx.x / num_sectors; + + IndexType stripe_id = threadIdx.x / (chunk_size*32); + IndexType stripe_lane_id = threadIdx.x % (chunk_size*32); + + IndexType row = sector_id*blocks_per_sector*BLOCK_SIZE + + stripe_id*stripe_size + + sector_lane_id*chunk_size*32 + + stripe_lane_id; +#endif + + IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + + //printf("(%d, %d, %d) -> %d\n", blockIdx.x, threadIdx.x, thread_id, row); + + __shared__ IndexType offsets[BLOCK_SIZE]; + ValueType3 sum = ValueType3(0); + + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) + { + if (offset_base + threadIdx.x < num_diagonals) + { + offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; + } + + __syncthreads(); + + int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; + + if (row < num_rows) + { + for (IndexType i = 0; i < batch_size; ++i) + { + IndexType col = offsets[i] + row; + if (col >= 0 && col < num_cols) + { + auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); + auto x_val = load_x_val(&x[col]); + + sum += diag_val*x_val; + } + } + } + + __syncthreads(); + } + + if (row < num_rows) + y[row] = sum; +} template (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 3 - experimental_dia_kernel + double_striped_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -// #elif KERNEL_TYPE == 4 -// cached_x_naive_dia_kernel -// (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #endif } From 697bec30dd6e07a856c58a732fa04a8d814f9776 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 4 Jan 2023 10:10:17 +0100 Subject: [PATCH 17/49] Add support for csr format --- cusp/ktt/detail/ktt.inl | 5 +- cusp/ktt/ktt.h | 10 +- cusp/system/cuda/ktt/csr_multiply.h | 218 +++++++++++++--------- cusp/system/cuda/ktt/dia_multiply.h | 18 +- cusp/system/cuda/ktt/kernels/csr_kernel.h | 23 +-- cusp/system/cuda/ktt/multiply.h | 94 +++++++++- cusp/system/cuda/ktt/utils.h | 28 ++- cusp/system/detail/generic/multiply.inl | 9 +- testing/ktt.cu | 22 ++- 9 files changed, 281 insertions(+), 146 deletions(-) diff --git a/cusp/ktt/detail/ktt.inl b/cusp/ktt/detail/ktt.inl index f2b3c745..e02b4529 100644 --- a/cusp/ktt/detail/ktt.inl +++ b/cusp/ktt/detail/ktt.inl @@ -101,12 +101,13 @@ template typename Matrix, + typename IndexType, typename ValueType1, typename ValueType2, typename ValueType3> std::vector<::ktt::KernelResult> -tune(const cusp::dia_matrix& A, +tune(const Matrix& A, const cusp::array1d& x, cusp::array1d& y, std::optional<::ktt::ReferenceComputation> reference_computation) diff --git a/cusp/ktt/ktt.h b/cusp/ktt/ktt.h index cab2d637..88a5d026 100644 --- a/cusp/ktt/ktt.h +++ b/cusp/ktt/ktt.h @@ -1,9 +1,10 @@ #pragma once -#include - +#include #include +#include + namespace cusp { namespace ktt { @@ -54,12 +55,13 @@ ::ktt::KernelResult multiply(const Matrix& A, const ::ktt::KernelConfiguration& configuration, bool run_with_profiling = false); -template typename Matrix, + typename IndexType, typename ValueType1, typename ValueType2, typename ValueType3> std::vector<::ktt::KernelResult> -tune(const cusp::dia_matrix& A, +tune(const Matrix& A, const cusp::array1d& x, cusp::array1d& y, std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt); diff --git a/cusp/system/cuda/ktt/csr_multiply.h b/cusp/system/cuda/ktt/csr_multiply.h index 0c0deec8..3303fe93 100644 --- a/cusp/system/cuda/ktt/csr_multiply.h +++ b/cusp/system/cuda/ktt/csr_multiply.h @@ -1,25 +1,18 @@ -#pragma once +#pragma once -#include +#include #include -#include -#include // max_active_blocks - -#include #include -namespace cusp::system::cuda::ktt::csr { - inline ::ktt::KernelId get_kernel_id(::ktt::KernelDefinitionId id, std::string kernel_name) { - static ::ktt::KernelId kernel = cusp::ktt::detail::tuner->CreateSimpleKernel(kernel_name, id); - return kernel; - } -} +#include +#include +#include + +#include + +#include -template<> -inline ::ktt::KernelId cusp::ktt::detail::get_kernel_id() { - return cusp::system::cuda::ktt::csr::get_kernel_id(0, ""); -} namespace cusp { @@ -31,109 +24,148 @@ namespace ktt { namespace csr { -inline void setup_tuning_parameters(::ktt::Tuner& tuner, ::ktt::KernelId kernel) { - 
tuner.AddParameter(kernel, "TEST_PARAM", std::vector{0, 1, 2}); + +inline void setup_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + tuner.AddParameter(kernel_id, "BLOCK_SIZE", std::vector{ 128 }); + tuner.AddParameter(kernel_id, "THREADS_PER_VECTOR", std::vector{ 32 }); + + tuner.AddThreadModifier( + kernel.kernel_id, + { kernel.definition_ids[0] }, + ::ktt::ModifierType::Local, + ::ktt::ModifierDimension::X, + { std::string("BLOCK_SIZE") }, + [] (const uint64_t defaultSize, const std::vector& parameters) { + return parameters[0]; + } + ); } -} // namespace csr -template -void multiply(cuda::execution_policy& exec, - const MatrixType& A, - const VectorType1& x, - VectorType2& y, - cusp::csr_format) +template +kernel_context initialize_kernel(::ktt::Tuner& tuner) { - static_assert( std::is_same_v - && std::is_same_v - && std::is_same_v, - "All arguments must be in device memory." ); + std::string kernel_path = + STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/csr_kernel.h"; - using ValueType = typename MatrixType::value_type; + kernel_context kernel(tuner); - if (A.num_entries == 0) { - thrust::fill(y.begin(), y.end(), ValueType(0)); - return; - } + std::vector< std::string > type_names { + std::string(NAMEOF_TYPE(IndexType)), + std::string(NAMEOF_TYPE(ValueType1)), + std::string(NAMEOF_TYPE(ValueType2)), + std::string(NAMEOF_TYPE(ValueType3)), + }; - std::cout << "Hello from out thing\n"; + // NOTE: These can be anything since they are awlays set in the launcher. + // So use some values that will hopefully cause a crash if not set properly + // in the launcher. + ::ktt::DimensionVector block_size(0); + ::ktt::DimensionVector grid_size(0); - // copied from csr_vector_spmv.h - typedef typename MatrixType::row_offsets_array_type::const_iterator RowIterator; - typedef typename MatrixType::column_indices_array_type::const_iterator ColumnIterator; - typedef typename MatrixType::values_array_type::const_iterator ValueIterator1; + auto definition_id = tuner.AddKernelDefinitionFromFile( + "ktt_csr_vector_kernel", + kernel_path, + grid_size, + block_size, + type_names + ); - typedef typename VectorType1::const_iterator ValueIterator2; - typedef typename VectorType2::iterator ValueIterator3; + kernel.definition_ids.push_back(definition_id); + kernel.kernel_id = tuner.CreateSimpleKernel("CsrKernel", definition_id); - auto& tuner = *cusp::ktt::detail::tuner; + setup_tuning_parameters(kernel); - std::string path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/csr_kernel.h"; + return kernel; +} - const size_t THREADS_PER_BLOCK = 128; - const size_t THREADS_PER_VECTOR = 32; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - std::vector< std::string > type_names { - std::string(NAMEOF_TYPE(typename MatrixType::index_type)), - std::string(NAMEOF_TYPE(typename MatrixType::value_type)), - std::string(NAMEOF_TYPE(typename VectorType1::value_type)), - std::string(NAMEOF_TYPE(typename VectorType2::value_type)), - std::to_string(VECTORS_PER_BLOCK), - std::to_string(THREADS_PER_VECTOR) - }; - // TODO: we probably wanna change this later - const size_t MAX_BLOCKS = cusp::system::cuda::detail::max_active_blocks( - ktt_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, VECTORS_PER_BLOCK)); +} // namespace csr + + +template +const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::csr_format format) +{ + static kernel_context kernel = + 
csr::initialize_kernel(tuner); + + return kernel; +} + - const ::ktt::DimensionVector blockDimensions(THREADS_PER_BLOCK); - const ::ktt::DimensionVector gridDimensions(NUM_BLOCKS); +template +std::vector<::ktt::ArgumentId> +add_arguments(const kernel_context& kernel, + const cusp::csr_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + auto args = add_arguments(*kernel.tuner, + A.num_rows, A.row_offsets, A.column_indices, + A.values, x, y); - ::ktt::KernelDefinitionId definition = tuner.GetKernelDefinitionId("ktt_csr_vector_kernel", type_names); + kernel.tuner->SetArguments(kernel.definition_ids[0], args); - bool called_first_time = definition == ::ktt::InvalidKernelDefinitionId; + return args; +} - if (called_first_time) { - definition = tuner.AddKernelDefinitionFromFile( - "ktt_csr_vector_kernel", - path, - gridDimensions, - blockDimensions, - type_names); - } - ::ktt::KernelId kernel = csr::get_kernel_id(definition, "CsrKernel"); +// Returns the argument id of the y vector given a vector of arguments +// returned by a previous call of `add_arguments`. +inline ::ktt::ArgumentId +get_output_argument(const std::vector<::ktt::ArgumentId>& arguments, + cusp::csr_format) +{ + return arguments[5]; +} - if (called_first_time) { - csr::setup_tuning_parameters(tuner, kernel); - } - auto num_rows_id = tuner.AddArgumentScalar(A.num_rows); - auto Ap_id = add_arg(A.row_offsets); - auto Aj_id = add_arg(A.column_indices); - auto Ax_id = add_arg(A.values); - auto x_id = add_arg(x); - auto y_id = add_arg(y, ::ktt::ArgumentAccessType::ReadWrite); +template +auto get_launcher(const kernel_context& ctx, + const cusp::csr_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + bool profile = false) +{ + return [&] (::ktt::ComputeInterface& interface) + { + auto conf = interface.GetCurrentConfiguration(); - std::vector<::ktt::ArgumentId> args = { num_rows_id, Ap_id, Aj_id, Ax_id, x_id, y_id }; + ::ktt::DimensionVector block_size = + interface.GetCurrentLocalSize(ctx.definition_ids[0]); - tuner.SetArguments(definition, args); + auto threads_per_vector = get_parameter_uint(conf, "THREADS_PER_VECTOR"); + auto vectors_per_block = block_size.GetSizeX() / threads_per_vector; - tuner.TuneIteration(kernel, {}); + ::ktt::DimensionVector grid_size( + DIVIDE_INTO(A.num_rows, vectors_per_block)); - tuner.SetArguments(definition, {}); - for (auto arg : args) { - tuner.RemoveArgument(arg); - } + if (!profile) { + interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); + } else { + interface.RunKernelWithProfiling(ctx.definition_ids[0], + grid_size, block_size); + } + }; } + } // namespace ktt } // namespace cuda diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 18c09884..5e9a7414 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -1,11 +1,14 @@ #pragma once -#include -#include // max_active_blocks +#include -#include #include #include +#include + +#include // max_active_blocks +#include +#include #include #include @@ -94,11 +97,6 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) std::string kernel_path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h"; - const size_t block_size = 256; - const size_t max_blocks = cusp::system::cuda::detail::max_active_blocks( - ktt_dia_vector_kernel, - block_size, 0); - std::vector< std::string > type_names { std::string(NAMEOF_TYPE(IndexType)), std::string(NAMEOF_TYPE(ValueType1)), @@ -106,8 +104,8 @@ kernel_context 
initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) std::string(NAMEOF_TYPE(ValueType3)) }; - const ::ktt::DimensionVector blockDimensions(block_size); - const ::ktt::DimensionVector gridDimensions(max_blocks); + const ::ktt::DimensionVector blockDimensions(0); + const ::ktt::DimensionVector gridDimensions(0); auto definition_id = tuner.AddKernelDefinitionFromFile( "ktt_dia_vector_kernel", diff --git a/cusp/system/cuda/ktt/kernels/csr_kernel.h b/cusp/system/cuda/ktt/kernels/csr_kernel.h index f28773a1..85cd6fa6 100644 --- a/cusp/system/cuda/ktt/kernels/csr_kernel.h +++ b/cusp/system/cuda/ktt/kernels/csr_kernel.h @@ -1,17 +1,11 @@ +#pragma once -inline __device__ int fib(int n) { - if (n < 2) { - return n; - } - return fib(n-1) + fib(n-2); -} +#define VECTORS_PER_BLOCK (BLOCK_SIZE/THREADS_PER_VECTOR) template + typename ValueType3> __global__ void ktt_csr_vector_kernel(const unsigned int num_rows, const IndexType* Ap, @@ -20,17 +14,6 @@ ktt_csr_vector_kernel(const unsigned int num_rows, const ValueType2* x, ValueType3* y) { - -#if TEST_PARAM == 0 - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("Test param is 0.\n"); - } -#else - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("Fib of 20 is %d.\n", fib(20)); - } -#endif - typedef ValueType1 ValueType; __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h index 5e6accab..4a8e9a34 100644 --- a/cusp/system/cuda/ktt/multiply.h +++ b/cusp/system/cuda/ktt/multiply.h @@ -1,9 +1,14 @@ -#pragma once +#pragma once #include -//#include + +#include #include +#include + +#include + namespace cusp { namespace system { @@ -12,7 +17,7 @@ namespace cuda { namespace ktt { -// TODO: should this be here?? + // Helpful function to get a kernel used for a given multiplication // instead of having to manually specify the types. 
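// For example, the multiply() overloads below obtain it as
//
//     const kernel_context& kernel = get_kernel(tuner, A, x, y);
//
// with A, x and y already resident in device memory.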
template (tuner, FormatType{}); } + +template +::ktt::KernelResult multiply(::ktt::Tuner& tuner, const MatrixType& A, + const VectorType1& x, VectorType2& y) +{ + const kernel_context& kernel = get_kernel(tuner, A, x, y); + + auto args = add_arguments(kernel, A, x, y); + + auto launcher = get_launcher(kernel, A, x, y, false); + tuner.SetLauncher(kernel.kernel_id, launcher); + + auto res = tuner.TuneIteration(kernel.kernel_id, {}); + + remove_arguments(kernel, args); + + return res; +} + + +template +::ktt::KernelResult +multiply(::ktt::Tuner& tuner, const MatrixType& A, const VectorType1& x, + VectorType2& y, const ::ktt::KernelConfiguration& configuration, + bool run_with_profiling = false) +{ + const kernel_context& kernel = get_kernel(tuner, A, x, y); + + auto args = add_arguments(kernel, A, x, y); + + auto launcher = get_launcher(kernel, A, x, y, run_with_profiling); + tuner.SetLauncher(kernel.kernel_id, launcher); + + auto result = tuner.Run(kernel.kernel_id, configuration, {}); + + remove_arguments(kernel, args); + + return result; +} + + +template +std::vector<::ktt::KernelResult> +tune(::ktt::Tuner& tuner, const MatrixType& A, + const VectorType1& x, VectorType2& y, + std::optional<::ktt::ReferenceComputation> ref_computation = std::nullopt) +{ + using Format = typename MatrixType::format; + using HostVector2 = + typename VectorType2::template rebind::type; + + kernel_context kernel = get_kernel(tuner, A, x, y); + auto args = add_arguments(kernel, A, x, y); + + if (ref_computation) + { + tuner.SetReferenceComputation(get_output_argument(args, Format{}), + *ref_computation); + } + + HostVector2 host_y = y; + tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface) + { + // clear y so previous results don't affect validation + y = host_y; + auto launcher = get_launcher(kernel, A, x, y, false); + launcher(interface); + }); + + auto results = tuner.Tune(kernel.kernel_id); + + remove_arguments(kernel, args); + + return results; +} + + } // namespace ktt } // namespace cuda diff --git a/cusp/system/cuda/ktt/utils.h b/cusp/system/cuda/ktt/utils.h index a2d1c029..556f9542 100644 --- a/cusp/system/cuda/ktt/utils.h +++ b/cusp/system/cuda/ktt/utils.h @@ -3,6 +3,10 @@ #include #include +#include // uint64_t +#include // std::runtime_error + + #define STR(str) #str #define STRING(str) STR(str) @@ -16,7 +20,7 @@ namespace ktt { template void* cast(const T* ptr) -{ +{ return const_cast(static_cast(ptr)); } @@ -65,7 +69,7 @@ void add_arg(::ktt::Tuner& tuner, std::vector<::ktt::ArgumentId>& argument_ids, argument_ids.push_back(id); } -template +template std::vector<::ktt::ArgumentId> add_arguments(::ktt::Tuner& tuner, Args&&... 
args) { std::vector<::ktt::ArgumentId> argument_ids; @@ -85,6 +89,26 @@ inline void remove_arguments(const kernel_context& kernel, const std::vector<::k } +uint64_t get_parameter_uint(const ::ktt::KernelConfiguration& conf, + const std::string& name) +{ + for (const auto& pair : conf.GetPairs()) + if (pair.GetName() == name) + return pair.GetValue(); + + throw std::runtime_error("No paramater with name: " + name); +} + +double get_parameter_double(const ::ktt::KernelConfiguration& conf, + const std::string& name) +{ + for (const auto& pair : conf.GetPairs()) + if (pair.GetName() == name) + return pair.GetValueDouble(); + + throw std::runtime_error("No paramater with name: " + name); +} + } // namespace ktt } // namespace cuda diff --git a/cusp/system/detail/generic/multiply.inl b/cusp/system/detail/generic/multiply.inl index 6e620701..501e0b80 100644 --- a/cusp/system/detail/generic/multiply.inl +++ b/cusp/system/detail/generic/multiply.inl @@ -109,7 +109,7 @@ multiply(thrust::execution_policy &exec, template struct has_rebind : std::false_type {}; - + template struct has_rebind>> : std::true_type {}; @@ -128,11 +128,14 @@ multiply(cusp::system::cuda::detail::execution_policy &exec, { typedef typename LinearOperator::format MatrixFormat; - if constexpr ((/*cusp::detail::is_csr::value ||*/ cusp::detail::is_dia::value) && has_rebind_v) { + if constexpr ((cusp::detail::is_csr::value + || cusp::detail::is_dia::value) + && has_rebind_v) + { if (cusp::ktt::detail::is_enabled) { cusp::ktt::detail::lazy_init(); cusp::system::cuda::ktt::multiply(*cusp::ktt::detail::tuner, A, B, C); - return; + return; } } diff --git a/testing/ktt.cu b/testing/ktt.cu index 613e668a..e54f4289 100644 --- a/testing/ktt.cu +++ b/testing/ktt.cu @@ -29,8 +29,9 @@ } \ DECLARE_UNITTEST(VTEST##Ktt##Fmt##Matrix); -#define DECLARE_KTT_UNITTEST(VTEST) \ - DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST,Dia,dia) +#define DECLARE_KTT_UNITTEST(VTEST) \ + DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Dia, dia) \ + DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Csr, csr) struct UnitTestStopCondition : ::ktt::StopCondition @@ -71,11 +72,12 @@ private: }; -void assert_tunning_results_valid(const std::vector<::ktt::KernelResult>& results, - const std::string& ktt_logs, - const std::string& arg_name, - const std::string& filename = "unknown", - int lineno = -1) +void assert_tunning_results_valid( + const std::vector<::ktt::KernelResult>& results, + const std::string& ktt_logs, + const std::string& arg_name, + const std::string& filename = "unknown", + int lineno = -1) { bool failed = false; unittest::UnitTestFailure f; @@ -106,8 +108,10 @@ void assert_tunning_results_valid(const std::vector<::ktt::KernelResult>& result failed = true; - f << "[" << filename << ":" << lineno << "] " << result.GetKernelName() << ": "; - f << "Encountered an error: " << reason << "\n\n"; + f << "[" << filename << ":" << lineno << "] "; + f << result.GetKernelName() << ": "; + f << reason << "\n\n"; + f << "On matrix: " << arg_name << "\n\n"; f << "In configuration:\n"; From b0f277766e12ec516970b47c7fe781ac60b99f5a Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 10 Jan 2023 19:11:22 +0100 Subject: [PATCH 18/49] Fix deprecation warning --- cusp/system/cuda/detail/synchronize.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cusp/system/cuda/detail/synchronize.inl b/cusp/system/cuda/detail/synchronize.inl index 8256398e..73919e38 100644 --- a/cusp/system/cuda/detail/synchronize.inl +++ b/cusp/system/cuda/detail/synchronize.inl @@ -32,7 +32,7 @@ 
namespace detail void synchronize(const char *message) { - cudaError_t error = cudaThreadSynchronize(); + cudaError_t error = cudaDeviceSynchronize(); if(error) { throw thrust::system_error(error, thrust::cuda_category(), std::string("synchronize: ") + message); From 256c3c8c0d78f04803885b6b096c12bb11523912 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 14 Feb 2023 20:45:00 +0100 Subject: [PATCH 19/49] Make it possible to use ell with ktt --- cusp/system/cuda/ktt/ell_multiply.h | 160 ++++++++++++++++++++++ cusp/system/cuda/ktt/kernels/ell_kernel.h | 55 ++++++++ cusp/system/cuda/ktt/multiply.h | 1 + cusp/system/detail/generic/multiply.inl | 4 +- testing/ktt.cu | 2 +- 5 files changed, 219 insertions(+), 3 deletions(-) create mode 100644 cusp/system/cuda/ktt/ell_multiply.h create mode 100644 cusp/system/cuda/ktt/kernels/ell_kernel.h diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h new file mode 100644 index 00000000..f7a8da71 --- /dev/null +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -0,0 +1,160 @@ +#pragma once + +#include +#include + +namespace cusp { + +namespace system { + +namespace cuda { + +namespace ktt { + +namespace ell { + + +inline void setup_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + tuner.AddParameter(kernel_id, "BLOCK_SIZE", std::vector{ 128 }); + tuner.AddParameter(kernel_id, "UNCACHED_LOADS", + std::vector{ 0, 1 }); + + tuner.AddThreadModifier( + kernel_id, + { kernel.definition_ids[0] }, + ::ktt::ModifierType::Local, + ::ktt::ModifierDimension::X, + { std::string("BLOCK_SIZE") }, + [] (const uint64_t defaultSize, const std::vector& parameters) { + return parameters[0]; + } + ); +} + + +template +kernel_context initialize_kernel(::ktt::Tuner& tuner) +{ + std::string kernel_path = + STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/ell_kernel.h"; + + kernel_context kernel(tuner); + + std::vector< std::string > type_names { + std::string(NAMEOF_TYPE(IndexType)), + std::string(NAMEOF_TYPE(ValueType1)) + }; + + // NOTE: These can be anything since they are awlays set in the launcher. + // So use some values that will hopefully cause a crash if not set properly + // in the launcher. + ::ktt::DimensionVector block_size(0); + ::ktt::DimensionVector grid_size(0); + + auto definition_id = tuner.AddKernelDefinitionFromFile( + "ktt_ell_kernel", + kernel_path, + grid_size, + block_size, + type_names + ); + + kernel.definition_ids.push_back(definition_id); + kernel.kernel_id = tuner.CreateSimpleKernel("EllKernel", definition_id); + + setup_tuning_parameters(kernel); + + return kernel; +} + + +} // namespace ell + + +template +const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::ell_format format) +{ + static kernel_context kernel = + ell::initialize_kernel(tuner); + + return kernel; +} + + +template +std::vector<::ktt::ArgumentId> +add_arguments(const kernel_context& kernel, + const cusp::ell_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + IndexType pitch = A.column_indices.pitch; + IndexType num_entries_per_row = A.column_indices.num_cols; + + auto args = add_arguments(*kernel.tuner, + A.num_rows, A.num_cols, num_entries_per_row, pitch, + A.column_indices.values, A.values.values, x, y); + + kernel.tuner->SetArguments(kernel.definition_ids[0], args); + + return args; +} + + +// Returns the argument id of the y vector given a vector of arguments +// returned by a previous call of `add_arguments`. 
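+// For the ELL kernel, `add_arguments` above registers
+// (num_rows, num_cols, num_entries_per_row, pitch, Aj, Ax, x, y),
+// so the y vector sits at index 7.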
+inline ::ktt::ArgumentId +get_output_argument(const std::vector<::ktt::ArgumentId>& arguments, + cusp::ell_format) +{ + return arguments[7]; +} + + +template +auto get_launcher(const kernel_context& ctx, + const cusp::ell_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + bool profile = false) +{ + return [&] (::ktt::ComputeInterface& interface) + { + ::ktt::DimensionVector block_size = + interface.GetCurrentLocalSize(ctx.definition_ids[0]); + ::ktt::DimensionVector grid_size( + DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); + + if (!profile) { + interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); + } else { + interface.RunKernelWithProfiling(ctx.definition_ids[0], + grid_size, block_size); + } + }; +} + + +} // namespace ktt + +} // namespace cuda + +} // namespace system + +} // namespace cusp diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h new file mode 100644 index 00000000..be2d250d --- /dev/null +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -0,0 +1,55 @@ + +template +__device__ T load_uncached(const T* addr) +{ +#if UNCACHED_LOADS == 0 + return *addr; +#else + return __ldcv(addr); +#endif +} + +template +__launch_bounds__(BLOCK_SIZE, 1) __global__ void +ktt_ell_kernel(const IndexType num_rows, + const IndexType num_cols, + const IndexType num_cols_per_row, + const IndexType pitch, + const IndexType * Aj, + const ValueType * Ax, + const ValueType * x, + ValueType * y) +{ + const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x; + + // if (row == 0) + // { + // printf("Hello from ELL.\n"); + // } + + if (row < num_rows) + { + ValueType sum = 0; + IndexType offset = row; + + for(IndexType n = 0; n < num_cols_per_row; n++) + { + const IndexType col = load_uncached(Aj + offset); + + // This assumes that + // cusp::ell_matrix<...>::invalid_index is always < 0 + if (col >= 0) + { + const ValueType x_j = x[col]; + const ValueType A_ij = load_uncached(Ax + offset); + + sum += A_ij * x_j; + } + + offset += pitch; + } + + y[row] = sum; + } +} diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h index 4a8e9a34..4f5a0b84 100644 --- a/cusp/system/cuda/ktt/multiply.h +++ b/cusp/system/cuda/ktt/multiply.h @@ -4,6 +4,7 @@ #include #include +#include #include diff --git a/cusp/system/detail/generic/multiply.inl b/cusp/system/detail/generic/multiply.inl index 501e0b80..99909c51 100644 --- a/cusp/system/detail/generic/multiply.inl +++ b/cusp/system/detail/generic/multiply.inl @@ -128,8 +128,8 @@ multiply(cusp::system::cuda::detail::execution_policy &exec, { typedef typename LinearOperator::format MatrixFormat; - if constexpr ((cusp::detail::is_csr::value - || cusp::detail::is_dia::value) + if constexpr ((cusp::detail::is_ell::value + || cusp::detail::is_dia::value) && has_rebind_v) { if (cusp::ktt::detail::is_enabled) { diff --git a/testing/ktt.cu b/testing/ktt.cu index e54f4289..aac96e34 100644 --- a/testing/ktt.cu +++ b/testing/ktt.cu @@ -31,7 +31,7 @@ #define DECLARE_KTT_UNITTEST(VTEST) \ DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Dia, dia) \ - DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Csr, csr) + DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Ell, ell) struct UnitTestStopCondition : ::ktt::StopCondition From cd09e89b542e6383c4faa21f71ebbf02fdc2dbd5 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 15 Feb 2023 10:29:55 +0100 Subject: [PATCH 20/49] Add different load types --- cusp/system/cuda/ktt/kernels/ell_kernel.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 
deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h index be2d250d..e8b55d2f 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -4,8 +4,12 @@ __device__ T load_uncached(const T* addr) { #if UNCACHED_LOADS == 0 return *addr; -#else +#elif UNCACHED_LOADS == 1 return __ldcv(addr); +#elif UNCACHED_LOADS == 2 + return __ldcs(addr); +#elif UNCACHED_LOADS == 3 + return __ldlu(addr); #endif } @@ -16,18 +20,13 @@ ktt_ell_kernel(const IndexType num_rows, const IndexType num_cols, const IndexType num_cols_per_row, const IndexType pitch, - const IndexType * Aj, - const ValueType * Ax, - const ValueType * x, - ValueType * y) + const IndexType* __restrict__ Aj, + const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + ValueType* __restrict__ y) { const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x; - // if (row == 0) - // { - // printf("Hello from ELL.\n"); - // } - if (row < num_rows) { ValueType sum = 0; From ae7d024f593cb25826159116ba245de118094d1d Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Sun, 5 Mar 2023 20:29:53 +0100 Subject: [PATCH 21/49] Add convenience type for creating tuning configuration --- cusp/system/cuda/ktt/utils.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cusp/system/cuda/ktt/utils.h b/cusp/system/cuda/ktt/utils.h index 556f9542..a3f4af1e 100644 --- a/cusp/system/cuda/ktt/utils.h +++ b/cusp/system/cuda/ktt/utils.h @@ -18,6 +18,9 @@ namespace cuda { namespace ktt { +using u64_vec = std::vector; + + template void* cast(const T* ptr) { From 41a18e1c85f58f005f39c635d218026979200164 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Sun, 5 Mar 2023 20:30:34 +0100 Subject: [PATCH 22/49] Add new tuning parameters --- cusp/system/cuda/ktt/ell_multiply.h | 32 +++- cusp/system/cuda/ktt/kernels/ell_kernel.h | 193 ++++++++++++++++++++-- 2 files changed, 206 insertions(+), 19 deletions(-) diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index f7a8da71..19456107 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -19,9 +19,12 @@ inline void setup_tuning_parameters(const kernel_context& kernel) auto& tuner = *kernel.tuner; auto kernel_id = kernel.kernel_id; - tuner.AddParameter(kernel_id, "BLOCK_SIZE", std::vector{ 128 }); - tuner.AddParameter(kernel_id, "UNCACHED_LOADS", - std::vector{ 0, 1 }); + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128 }); + tuner.AddParameter(kernel_id, "BREAK", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "UNCACHED_LOADS", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "DISABLE_UNROLL", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 }); + tuner.AddParameter(kernel_id, "THREADS_PER_ROW", u64_vec{ 1, 2, 4 }); tuner.AddThreadModifier( kernel_id, @@ -33,6 +36,24 @@ inline void setup_tuning_parameters(const kernel_context& kernel) return parameters[0]; } ); + + // BREAK can be used only when there is no prefetching + // TODO(KTT): Implement prefetching when using early break and remove this + tuner.AddConstraint( + kernel.kernel_id, + { "BREAK", "PREFETCH_FACTOR" }, + [] (const std::vector& values) { + return values[0] == 0 || values[1] == 0; + }); + + + // All the warps working on one stripe of the matrix fit in a thread block. 
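+    // With BLOCK_SIZE fixed at 128 this admits THREADS_PER_ROW values of 1, 2
+    // and 4, i.e. at most four 32-thread warps cooperate on a single stripe.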
+ tuner.AddConstraint( + kernel.kernel_id, + { "THREADS_PER_ROW", "BLOCK_SIZE" }, + [] (const std::vector& values) { + return 32*values[0] <= values[1]; + }); } @@ -136,10 +157,13 @@ auto get_launcher(const kernel_context& ctx, { return [&] (::ktt::ComputeInterface& interface) { + const auto& conf = interface.GetCurrentConfiguration(); + auto threads_per_row = get_parameter_uint(conf, "THREADS_PER_ROW"); + ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]); ::ktt::DimensionVector grid_size( - DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); + threads_per_row * DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h index e8b55d2f..3f42ce92 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -1,6 +1,13 @@ +#if DISABLE_UNROLL == 1 +#define UNROLL 1 +#else +#define UNROLL +#endif + + template -__device__ T load_uncached(const T* addr) +__device__ T load(const T* addr) { #if UNCACHED_LOADS == 0 return *addr; @@ -13,42 +20,198 @@ __device__ T load_uncached(const T* addr) #endif } + +template +struct Prefetcher : Prefetcher +{ + using parent = Prefetcher; + + IndexType col; + ValueType A_val = 0; + ValueType x_val = 0; + + __device__ + void prefetch_cols(const IndexType* __restrict__ Aj, + IndexType offset, + IndexType pitch, + IndexType i) + { + col = load(Aj + offset + i*pitch); + parent::prefetch_cols(Aj, offset, pitch, i + 1); + } + + __device__ + void prefetch_vals(const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + IndexType offset, + IndexType pitch, + IndexType i) + { + if (col >= 0) + { + A_val = load(Ax + offset + i*pitch); + x_val = x[col]; + } + parent::prefetch_vals(Ax, x, offset, pitch, i + 1); + } + + __device__ ValueType accumulate_results() + { + return x_val*A_val + parent::accumulate_results(); + } +}; + +template +struct Prefetcher +{ + __device__ + void prefetch_cols(const IndexType* __restrict__ Aj, IndexType offset, + IndexType pitch, IndexType i) { } + + __device__ + void prefetch_vals(const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, IndexType offset, + IndexType pitch, IndexType i) { } + + __device__ ValueType accumulate_results() + { + return 0; + } +}; + + template -__launch_bounds__(BLOCK_SIZE, 1) __global__ void -ktt_ell_kernel(const IndexType num_rows, - const IndexType num_cols, - const IndexType num_cols_per_row, - const IndexType pitch, - const IndexType* __restrict__ Aj, - const ValueType* __restrict__ Ax, - const ValueType* __restrict__ x, - ValueType* __restrict__ y) +__device__ void +ktt_ell_kernel_basic(const IndexType num_rows, + const IndexType num_cols, + const IndexType num_cols_per_row, + const IndexType pitch, + const IndexType* __restrict__ Aj, + const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + ValueType* __restrict__ y) { + +#if THREADS_PER_ROW > 1 + const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + const IndexType warp_id = thread_id/32; + const IndexType stripe_id = warp_id/THREADS_PER_ROW; + const IndexType warp_lane = thread_id % 32; + const IndexType stripe_lane = warp_id % THREADS_PER_ROW; + const IndexType block_lane = threadIdx.x/(32*THREADS_PER_ROW); + + const IndexType row = stripe_id*32 + warp_lane; + const IndexType start_offset = row + stripe_lane * pitch; + const IndexType stride = 
THREADS_PER_ROW * pitch; + + const IndexType rounded_up_count = + (num_cols_per_row + THREADS_PER_ROW - 1)/THREADS_PER_ROW; + const IndexType remainder = num_cols_per_row % THREADS_PER_ROW; + + const IndexType elem_count = + remainder == 0 || stripe_lane < remainder + ? rounded_up_count + : rounded_up_count - 1; + + //if (row == 1) + // printf("%d -> %d %d %d %d\n", thread_id, rounded_up_count, remainder, stripe_lane, elem_count); + + const int stripes_in_block = BLOCK_SIZE/(32*THREADS_PER_ROW); + __shared__ ValueType sums[stripes_in_block][32]; + + if (stripe_lane == 0) + { + sums[block_lane][warp_lane] = 0; + } + __syncthreads(); +#else const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x; + const IndexType start_offset = row; + const IndexType stride = pitch; + const IndexType elem_count = num_cols_per_row; +#endif if (row < num_rows) { ValueType sum = 0; - IndexType offset = row; + IndexType offset = start_offset; - for(IndexType n = 0; n < num_cols_per_row; n++) +#if PREFETCH_FACTOR > 0 + IndexType num_iters = elem_count/PREFETCH_FACTOR; + + #pragma unroll UNROLL + for(IndexType n = 0; n < num_iters; ++n) { - const IndexType col = load_uncached(Aj + offset); + Prefetcher prefetcher; + + prefetcher.prefetch_cols(Aj, offset, stride, 0); + prefetcher.prefetch_vals(Ax, x, offset, stride, 0); + sum += prefetcher.accumulate_results(); + + offset += PREFETCH_FACTOR * stride; + } + + IndexType remaining_iters = elem_count % PREFETCH_FACTOR; +#else + IndexType remaining_iters = elem_count; +#endif + + #pragma unroll UNROLL + for (int n = 0; n < remaining_iters; ++n) + { + const IndexType col = load(Aj + offset); // This assumes that // cusp::ell_matrix<...>::invalid_index is always < 0 if (col >= 0) { const ValueType x_j = x[col]; - const ValueType A_ij = load_uncached(Ax + offset); + const ValueType A_ij = load(Ax + offset); sum += A_ij * x_j; } + else + { +#if BREAK == 1 + break; +#endif + } - offset += pitch; + offset += stride; } +#if THREADS_PER_ROW > 1 + atomicAdd(&sums[block_lane][warp_lane], sum); + + __syncthreads(); + + if (stripe_lane == 0) + { + y[row] = sums[block_lane][warp_lane]; + } +#else y[row] = sum; +#endif } } + + +template +__launch_bounds__(BLOCK_SIZE, 1) __global__ void +ktt_ell_kernel(const IndexType num_rows, + const IndexType num_cols, + const IndexType num_cols_per_row, + const IndexType pitch, + const IndexType* __restrict__ Aj, + const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + ValueType* __restrict__ y) +{ + ktt_ell_kernel_basic(num_rows, num_cols, num_cols_per_row, pitch, + Aj, Ax, x, y); +} From 4376fd1aff2c671d66d66dfcddfe51a17353b40c Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Sun, 5 Mar 2023 20:31:12 +0100 Subject: [PATCH 23/49] Shift all non-zeros to the start when converting from dia to ell --- .../detail/generic/conversions/dia_to_other.h | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/cusp/system/detail/generic/conversions/dia_to_other.h b/cusp/system/detail/generic/conversions/dia_to_other.h index 45c26f47..bc30fb5a 100644 --- a/cusp/system/detail/generic/conversions/dia_to_other.h +++ b/cusp/system/detail/generic/conversions/dia_to_other.h @@ -41,6 +41,7 @@ #include #include +#include namespace cusp { @@ -211,6 +212,62 @@ convert(thrust::execution_policy& exec, thrust::copy(exec, src.values.values.begin(), src.values.values.end(), dst.values.values.begin()); + using StridedIndexIterator = thrust::transform_iterator< + cusp::multiplies_value, + IndexIterator>; + using RowElemIndicesIterator = 
thrust::transform_iterator< + cusp::plus_value, + StridedIndexIterator>; + using RowColsIterator = thrust::permutation_iterator< + decltype(dst.column_indices.values.begin()), + RowElemIndicesIterator>; + using RowValsIterator = thrust::permutation_iterator< + decltype(dst.values.values.begin()), + RowElemIndicesIterator>; + + for (size_t row_idx = 0; row_idx < src.num_rows; ++row_idx) + { + StridedIndexIterator strided_cols( + IndexIterator(0), + cusp::multiplies_value(pitch)); + RowElemIndicesIterator elem_indices( + strided_cols, + cusp::plus_value((IndexType)row_idx)); + + RowColsIterator row_cols(dst.column_indices.values.begin(), elem_indices); + RowValsIterator row_vals(dst.values.values.begin(), elem_indices); + + thrust::stable_partition(exec, + row_vals, + row_vals + dst.column_indices.num_cols, + row_cols, + thrust::placeholders::_1 != IndexType(-1)); + + thrust::stable_partition(exec, + row_cols, + row_cols + dst.column_indices.num_cols, + thrust::placeholders::_1 != IndexType(-1)); + } + + // for (int row = 0; row < dst.num_rows; ++row) + // { + // int last = 0; + // for (int col = 0; col < dst.column_indices.num_cols; ++col) + // { + // if (dst.column_indices(row, col) >= 0) + // { + // auto tmp1 = dst.column_indices(row, last); + // dst.column_indices(row, last) = dst.column_indices(row, col); + // dst.column_indices(row, col) = tmp1; + + // auto tmp2 = dst.values(row, last); + // dst.values(row, last) = dst.values(row, col); + // dst.values(row, col) = tmp2; + + // last++; + // } + // } + // } } } // end namespace generic From 17342e1188cc21b04af9ccacd3d8f943d8b89345 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 7 Mar 2023 09:02:22 +0100 Subject: [PATCH 24/49] Simplify the ell kernel --- cusp/system/cuda/ktt/ell_multiply.h | 9 ++-- cusp/system/cuda/ktt/kernels/ell_kernel.h | 59 ++++++----------------- 2 files changed, 22 insertions(+), 46 deletions(-) diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 19456107..8b217bb9 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -160,10 +160,13 @@ auto get_launcher(const kernel_context& ctx, const auto& conf = interface.GetCurrentConfiguration(); auto threads_per_row = get_parameter_uint(conf, "THREADS_PER_ROW"); - ::ktt::DimensionVector block_size = - interface.GetCurrentLocalSize(ctx.definition_ids[0]); + auto threads_in_block = + interface.GetCurrentLocalSize(ctx.definition_ids[0]).GetSizeX(); + + ::ktt::DimensionVector block_size( + threads_in_block/threads_per_row, threads_per_row); ::ktt::DimensionVector grid_size( - threads_per_row * DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); + DIVIDE_INTO(A.num_rows, threads_in_block/threads_per_row)); if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h index 3f42ce92..b6a29324 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -95,56 +95,33 @@ ktt_ell_kernel_basic(const IndexType num_rows, const ValueType* __restrict__ x, ValueType* __restrict__ y) { - -#if THREADS_PER_ROW > 1 - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - const IndexType warp_id = thread_id/32; - const IndexType stripe_id = warp_id/THREADS_PER_ROW; - const IndexType warp_lane = thread_id % 32; - const IndexType stripe_lane = warp_id % THREADS_PER_ROW; - const IndexType block_lane = threadIdx.x/(32*THREADS_PER_ROW); - 
- const IndexType row = stripe_id*32 + warp_lane; - const IndexType start_offset = row + stripe_lane * pitch; + const IndexType row = blockDim.x*blockIdx.x + threadIdx.x; const IndexType stride = THREADS_PER_ROW * pitch; - const IndexType rounded_up_count = - (num_cols_per_row + THREADS_PER_ROW - 1)/THREADS_PER_ROW; - const IndexType remainder = num_cols_per_row % THREADS_PER_ROW; - - const IndexType elem_count = - remainder == 0 || stripe_lane < remainder - ? rounded_up_count - : rounded_up_count - 1; - - //if (row == 1) - // printf("%d -> %d %d %d %d\n", thread_id, rounded_up_count, remainder, stripe_lane, elem_count); - - const int stripes_in_block = BLOCK_SIZE/(32*THREADS_PER_ROW); - __shared__ ValueType sums[stripes_in_block][32]; +#if THREADS_PER_ROW > 1 + __shared__ ValueType sums[BLOCK_SIZE/THREADS_PER_ROW]; - if (stripe_lane == 0) + if (threadIdx.y == 0) { - sums[block_lane][warp_lane] = 0; + sums[threadIdx.x] = 0; } + __syncthreads(); -#else - const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x; - const IndexType start_offset = row; - const IndexType stride = pitch; - const IndexType elem_count = num_cols_per_row; #endif if (row < num_rows) { ValueType sum = 0; - IndexType offset = start_offset; + IndexType n = threadIdx.y; + IndexType offset = row + n * pitch; #if PREFETCH_FACTOR > 0 - IndexType num_iters = elem_count/PREFETCH_FACTOR; + IndexType bound = num_cols_per_row + - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW; + IndexType step = PREFETCH_FACTOR * THREADS_PER_ROW; #pragma unroll UNROLL - for(IndexType n = 0; n < num_iters; ++n) + for(; n < bound; n += step) { Prefetcher prefetcher; @@ -154,14 +131,10 @@ ktt_ell_kernel_basic(const IndexType num_rows, offset += PREFETCH_FACTOR * stride; } - - IndexType remaining_iters = elem_count % PREFETCH_FACTOR; -#else - IndexType remaining_iters = elem_count; #endif #pragma unroll UNROLL - for (int n = 0; n < remaining_iters; ++n) + for (; n < num_cols_per_row; n += THREADS_PER_ROW) { const IndexType col = load(Aj + offset); @@ -185,13 +158,13 @@ ktt_ell_kernel_basic(const IndexType num_rows, } #if THREADS_PER_ROW > 1 - atomicAdd(&sums[block_lane][warp_lane], sum); + atomicAdd(&sums[threadIdx.x], sum); __syncthreads(); - if (stripe_lane == 0) + if (threadIdx.y == 0) { - y[row] = sums[block_lane][warp_lane]; + y[row] = sums[threadIdx.x]; } #else y[row] = sum; From 39da0d7cb42dc873a648442f653f33f6572eaf80 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 8 Mar 2023 11:29:45 +0100 Subject: [PATCH 25/49] Instrument csr and ell kernels --- .../cuda/detail/multiply/csr_vector_spmv.h | 18 ++++++++++++++++++ cusp/system/cuda/detail/multiply/ell_spmv.h | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/cusp/system/cuda/detail/multiply/csr_vector_spmv.h b/cusp/system/cuda/detail/multiply/csr_vector_spmv.h index 7bf4b98d..6f9e2c5c 100644 --- a/cusp/system/cuda/detail/multiply/csr_vector_spmv.h +++ b/cusp/system/cuda/detail/multiply/csr_vector_spmv.h @@ -24,6 +24,13 @@ #include +#ifdef TIME_CSR +#include +#include + +namespace krn = std::chrono; +#endif + namespace cusp { namespace system @@ -190,11 +197,22 @@ void __spmv_csr_vector(cuda::execution_policy& exec, cudaStream_t s = stream(thrust::detail::derived_cast(exec)); + #ifdef TIME_CSR + auto start = krn::steady_clock::now(); + #endif + spmv_csr_vector_kernel <<>> (A.num_rows, A.row_offsets.begin(), A.column_indices.begin(), A.values.begin(), x.begin(), y.begin(), initialize, combine, reduce); + + #ifdef TIME_CSR + cudaStreamSynchronize(s); + auto end = 
krn::steady_clock::now(); + size_t time = krn::duration_cast(end - start).count(); + std::cout << "cusp csr kernel: " << time << " us\n"; + #endif } template #include +#ifdef TIME_ELL +#include +#include + +namespace krn = std::chrono; +#endif + namespace cusp { namespace system @@ -132,8 +139,19 @@ void multiply(cuda::execution_policy& exec, cudaStream_t s = stream(thrust::detail::derived_cast(exec)); +#ifdef TIME_ELL + auto start = krn::steady_clock::now(); +#endif + spmv_ell_kernel <<>> (A.num_rows, A.num_cols, num_entries_per_row, pitch, J, V, x_ptr, y_ptr, initialize, combine, reduce); + +#ifdef TIME_ELL + cudaStreamSynchronize(s); + auto end = krn::steady_clock::now(); + size_t time = krn::duration_cast(end - start).count(); + std::cout << "cusp ell kernel: " << time << " us\n"; +#endif } } // end namespace detail From b09f3df268353b54b42301d9df3d17d4d30b48e4 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 13 Mar 2023 15:02:44 +0100 Subject: [PATCH 26/49] Implement ellr_matrix --- cusp/detail/ellr_matrix.inl | 131 ++++++++++++++++++++++++++++++++++++ cusp/ellr_matrix.h | 89 ++++++++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 cusp/detail/ellr_matrix.inl create mode 100644 cusp/ellr_matrix.h diff --git a/cusp/detail/ellr_matrix.inl b/cusp/detail/ellr_matrix.inl new file mode 100644 index 00000000..2bc5ee4b --- /dev/null +++ b/cusp/detail/ellr_matrix.inl @@ -0,0 +1,131 @@ +#include +#include +#include + +#include +#include +#include // raw_pointer_cast + +namespace cusp { + + +template +struct ell_row_length +{ + const IndexType* column_indices; + size_t num_cols_per_row; + size_t pitch; + + __host__ __device__ size_t operator()(IndexType row_idx) + { + size_t len = 0; + + while (len < num_cols_per_row && row_idx + len*pitch < 8 + && column_indices[row_idx + len*pitch] >= 0) + { + len++; + } + + return len; + } +}; + +template +void +compute_row_lengths(cusp::ellr_matrix& A) +{ + thrust::counting_iterator row_idx_it(0); + + thrust::transform( + row_idx_it, + row_idx_it + A.num_rows, + A.row_lengths.begin(), + ell_row_length{ + thrust::raw_pointer_cast(&A.column_indices(0, 0)), + A.column_indices.num_cols, + A.column_indices.pitch } + ); +} + + +////////////////// +// Constructors // +////////////////// + +template +ellr_matrix +::ellr_matrix(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row, const size_t alignment) + : Parent(num_rows, num_cols, num_entries, num_entries_per_row, alignment) +{ + row_lengths.resize(num_rows); +} + +// construct from a different matrix +template +template +ellr_matrix +::ellr_matrix(const MatrixType& matrix) +{ + cusp::convert(matrix, *this); + update_row_lengths(); +} + +////////////////////// +// Member Functions // +////////////////////// + +template +void +ellr_matrix +::swap(ellr_matrix& matrix) +{ + Parent::swap(matrix); + row_lengths.swap(matrix.row_lengths); +} + +template +void +ellr_matrix +::resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row) +{ + Parent::resize(num_rows, num_cols, num_entries, num_entries_per_row); + row_lengths.resize(num_rows); +} + +template +void +ellr_matrix +::resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row, const size_t alignment) +{ + Parent::resize(num_rows, num_cols, num_entries, + num_entries_per_row, alignment); + row_lengths.resize(num_rows); +} + +// assignment from another matrix +template 
+template +ellr_matrix& +ellr_matrix +::operator=(const MatrixType& matrix) +{ + cusp::convert(matrix, *this); + update_row_lengths(); + + return *this; +} + +template +void ellr_matrix::update_row_lengths() +{ + if (row_lengths.size() != this->num_rows) + row_lengths.resize(this->num_rows); + + compute_row_lengths(*this); +} + + +} // namespace cusp diff --git a/cusp/ellr_matrix.h b/cusp/ellr_matrix.h new file mode 100644 index 00000000..92f62cc9 --- /dev/null +++ b/cusp/ellr_matrix.h @@ -0,0 +1,89 @@ +#include + +#include + +#include + +namespace cusp +{ + + +template +class ellr_matrix : public cusp::ell_matrix +{ + + using Parent = cusp::ell_matrix; + +public: + + using row_lengths_array_type = cusp::array1d; + + row_lengths_array_type row_lengths; + + /*! Construct an empty \p ellr_matrix. + */ + ellr_matrix(void) {} + + /*! Construct an \p ell_matrix with a specific shape, number of nonzero entries, + * and maximum number of nonzero entries per row. + * + * \param num_rows Number of rows. + * \param num_cols Number of columns. + * \param num_entries Number of nonzero matrix entries. + * \param num_entries_per_row Maximum number of nonzeros per row. + * \param alignment Amount of padding used to align the data structure (default 32). + */ + ellr_matrix(const size_t num_rows, const size_t num_cols, + const size_t num_entries, const size_t num_entries_per_row, + const size_t alignment = 32); + + /*! Construct an \p ell_matrix from another matrix. + * + * \param matrix Another sparse or dense matrix. + */ + template + ellr_matrix(const MatrixType& matrix); + + /*! Resize matrix dimensions and underlying storage + * + * \param num_rows Number of rows. + * \param num_cols Number of columns. + * \param num_entries Number of nonzero matrix entries. + * \param num_entries_per_row Maximum number of nonzeros per row. + */ + void resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row); + + /*! Resize matrix dimensions and underlying storage + * + * \param num_rows Number of rows. + * \param num_cols Number of columns. + * \param num_entries Number of nonzero matrix entries. + * \param num_entries_per_row Maximum number of nonzeros per row. + * \param alignment Amount of padding used to align the data structure (default 32). + */ + void resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row, const size_t alignment); + + /*! Swap the contents of two \p ell_matrix objects. + * + * \param matrix Another \p ell_matrix with the same IndexType and ValueType. + */ + void swap(ellr_matrix& matrix); + + /*! Assignment from another matrix. + * + * \tparam MatrixType Format type of input matrix. + * + * \param matrix Another sparse or dense matrix. 
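+     *
+     * The row_lengths array is recomputed (via update_row_lengths()) after
+     * the conversion.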
+ */ + template + ellr_matrix& operator=(const MatrixType& matrix); + + void update_row_lengths(); +}; + + +} // namespace cusp + +#include From bdcdcf3b2d3589e3821481159115b3f19a51fb68 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 13 Mar 2023 15:08:36 +0100 Subject: [PATCH 27/49] Move ellr_matrix to ktt namespace --- cusp/{ => ktt}/detail/ellr_matrix.inl | 10 ++++++++-- cusp/{ => ktt}/ellr_matrix.h | 8 +++++++- 2 files changed, 15 insertions(+), 3 deletions(-) rename cusp/{ => ktt}/detail/ellr_matrix.inl (96%) rename cusp/{ => ktt}/ellr_matrix.h (97%) diff --git a/cusp/detail/ellr_matrix.inl b/cusp/ktt/detail/ellr_matrix.inl similarity index 96% rename from cusp/detail/ellr_matrix.inl rename to cusp/ktt/detail/ellr_matrix.inl index 2bc5ee4b..6f5888cc 100644 --- a/cusp/detail/ellr_matrix.inl +++ b/cusp/ktt/detail/ellr_matrix.inl @@ -6,7 +6,11 @@ #include #include // raw_pointer_cast -namespace cusp { +namespace cusp +{ + +namespace ktt +{ template @@ -32,7 +36,7 @@ struct ell_row_length template void -compute_row_lengths(cusp::ellr_matrix& A) +compute_row_lengths(cusp::ktt::ellr_matrix& A) { thrust::counting_iterator row_idx_it(0); @@ -128,4 +132,6 @@ void ellr_matrix::update_row_lengths() } +} // namespace ktt + } // namespace cusp diff --git a/cusp/ellr_matrix.h b/cusp/ktt/ellr_matrix.h similarity index 97% rename from cusp/ellr_matrix.h rename to cusp/ktt/ellr_matrix.h index 92f62cc9..f887dcea 100644 --- a/cusp/ellr_matrix.h +++ b/cusp/ktt/ellr_matrix.h @@ -4,9 +4,13 @@ #include + namespace cusp { +namespace ktt +{ + template class ellr_matrix : public cusp::ell_matrix @@ -84,6 +88,8 @@ class ellr_matrix : public cusp::ell_matrix }; +} // namespace ktt + } // namespace cusp -#include +#include From 0fa834f2d3b881671eb330fac81e40f35cafc5ae Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 13 Mar 2023 17:19:44 +0100 Subject: [PATCH 28/49] Add include guard --- cusp/ktt/ellr_matrix.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cusp/ktt/ellr_matrix.h b/cusp/ktt/ellr_matrix.h index f887dcea..1ba60ff8 100644 --- a/cusp/ktt/ellr_matrix.h +++ b/cusp/ktt/ellr_matrix.h @@ -1,3 +1,5 @@ +#pragma once + #include #include From 2d43d7df6048f9f8ea5a3bcf8d4ec282358119a1 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 14 Mar 2023 14:47:11 +0100 Subject: [PATCH 29/49] Implement kernel for ellr matrix --- cusp/ktt/detail/ellr_matrix.inl | 6 +- cusp/ktt/detail/ktt.inl | 24 +-- cusp/ktt/ellr_matrix.h | 2 +- cusp/system/cuda/detail/multiply/dia_spmv.h | 18 ++ cusp/system/cuda/ktt/dia_multiply.h | 221 ++++++++++---------- cusp/system/cuda/ktt/ell_multiply.h | 114 ++++++++-- cusp/system/cuda/ktt/kernels/ell_kernel.h | 39 +++- cusp/system/cuda/ktt/multiply.h | 2 +- testing/ktt.cu | 11 +- 9 files changed, 282 insertions(+), 155 deletions(-) diff --git a/cusp/ktt/detail/ellr_matrix.inl b/cusp/ktt/detail/ellr_matrix.inl index 6f5888cc..8a67949d 100644 --- a/cusp/ktt/detail/ellr_matrix.inl +++ b/cusp/ktt/detail/ellr_matrix.inl @@ -20,11 +20,11 @@ struct ell_row_length size_t num_cols_per_row; size_t pitch; - __host__ __device__ size_t operator()(IndexType row_idx) + __host__ __device__ IndexType operator()(IndexType row_idx) { - size_t len = 0; + IndexType len = 0; - while (len < num_cols_per_row && row_idx + len*pitch < 8 + while (len < num_cols_per_row && column_indices[row_idx + len*pitch] >= 0) { len++; diff --git a/cusp/ktt/detail/ktt.inl b/cusp/ktt/detail/ktt.inl index e02b4529..9e47d9be 100644 --- a/cusp/ktt/detail/ktt.inl +++ b/cusp/ktt/detail/ktt.inl @@ -116,18 +116,18 @@ 
tune(const Matrix& A, } -template -void reset_tuning() -{ - using namespace cusp::system::cuda::ktt; +// template +// void reset_tuning() +// { +// using namespace cusp::system::cuda::ktt; - kernel_context kernel = get_kernel(get_tuner(), Format{}); - detail::tuner->ClearData(kernel.kernel_id); -} +// kernel_context kernel = get_kernel(get_tuner(), Format{}); +// detail::tuner->ClearData(kernel.kernel_id); +// } template (); + return reset_tuning(A); } diff --git a/cusp/ktt/ellr_matrix.h b/cusp/ktt/ellr_matrix.h index 1ba60ff8..7f8d932c 100644 --- a/cusp/ktt/ellr_matrix.h +++ b/cusp/ktt/ellr_matrix.h @@ -22,7 +22,7 @@ class ellr_matrix : public cusp::ell_matrix public: - using row_lengths_array_type = cusp::array1d; + using row_lengths_array_type = cusp::array1d; row_lengths_array_type row_lengths; diff --git a/cusp/system/cuda/detail/multiply/dia_spmv.h b/cusp/system/cuda/detail/multiply/dia_spmv.h index bf381825..81dbe069 100644 --- a/cusp/system/cuda/detail/multiply/dia_spmv.h +++ b/cusp/system/cuda/detail/multiply/dia_spmv.h @@ -26,6 +26,13 @@ #include +#ifdef TIME_DIA +#include +#include + +namespace krn = std::chrono; +#endif + namespace cusp { namespace system @@ -165,8 +172,19 @@ void multiply(cuda::execution_policy& exec, cudaStream_t s = stream(thrust::detail::derived_cast(exec)); +#ifdef TIME_DIA + auto start = krn::steady_clock::now(); +#endif + spmv_dia_kernel <<>> (A.num_rows, A.num_cols, num_diagonals, pitch, A.diagonal_offsets.begin(), A.values.values.begin(), x.begin(), y.begin(), initialize, combine, reduce); + +#ifdef TIME_DIA + cudaStreamSynchronize(s); + auto end = krn::steady_clock::now(); + size_t time = krn::duration_cast(end - start).count(); + std::cout << "cusp dia kernel: " << time << " us\n"; +#endif } } // end namespace detail diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 5e9a7414..88cd254e 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -87,11 +87,8 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k }); } -} // namespace dia - - template -kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) +kernel_context initialize_kernel(::ktt::Tuner& tuner) { kernel_context kernel(tuner); @@ -122,13 +119,20 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) return kernel; } + +} // namespace dia + + template -const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::dia_format format) + typename ValueType3, + typename MemorySpace> +const kernel_context& get_kernel( + ::ktt::Tuner& tuner, + const cusp::dia_matrix& A) { - static kernel_context kernel = initialize_kernel(tuner, format); + static kernel_context kernel = dia::initialize_kernel(tuner); return kernel; } @@ -158,15 +162,22 @@ inline ::ktt::ArgumentId get_output_argument(const std::vector<::ktt::ArgumentId return arguments[7]; } -auto get_launcher(const kernel_context& ctx, - size_t num_rows, - size_t num_cols, - bool profile = false) + +template +auto get_launcher( + const kernel_context& ctx, + const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + bool profile = false) { - return [=] (::ktt::ComputeInterface& interface) + return [&, profile] (::ktt::ComputeInterface& interface) { ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]); - ::ktt::DimensionVector grid_size( DIVIDE_INTO(num_rows, block_size.GetSizeX()) ); + ::ktt::DimensionVector grid_size( DIVIDE_INTO(A.num_rows, 
block_size.GetSizeX()) ); auto conf = interface.GetCurrentConfiguration(); for (const auto& pair : conf.GetPairs()) @@ -181,98 +192,98 @@ auto get_launcher(const kernel_context& ctx, }; } -template -::ktt::KernelResult multiply(::ktt::Tuner& tuner, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y) -{ - if (A.num_entries == 0) { - thrust::fill(y.begin(), y.end(), ValueType3(0)); - ::ktt::KernelResult result("DiaKernel", {}); - result.SetStatus(::ktt::ResultStatus::Ok); - return result; - } - - const kernel_context& kernel = get_kernel(tuner, cusp::dia_format{}); - auto args = add_arguments(kernel, A, x, y); - - tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols)); - - auto res = tuner.TuneIteration(kernel.kernel_id, {}); - remove_arguments(kernel, args); - - return res; -} - -template -::ktt::KernelResult multiply(::ktt::Tuner& tuner, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& configuration, - bool run_with_profiling = false) -{ - if (A.num_entries == 0) { - thrust::fill(y.begin(), y.end(), ValueType3(0)); - ::ktt::KernelResult result("DiaKernel", {}); - result.SetStatus(::ktt::ResultStatus::Ok); - return result; - } - - kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); - auto args = add_arguments(kernel, A, x, y); - - tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols, run_with_profiling)); - - auto result = tuner.Run(kernel.kernel_id, configuration, {}); - remove_arguments(kernel, args); - - return result; -} - -template -std::vector<::ktt::KernelResult> -tune(::ktt::Tuner& tuner, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt) -{ - if (A.num_entries == 0) { - return {}; - } - - kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); - auto args = add_arguments(kernel, A, x, y); - - if (reference_computation) { - tuner.SetReferenceComputation(get_output_argument(args, dia_format{}), reference_computation.value()); - } - - cusp::array1d host_y = y; - tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface) { - // clear y so previous results don't affect validation - y = host_y; - auto launcher = cusp::system::cuda::ktt::get_launcher(kernel, A.num_rows, A.num_cols); - launcher(interface); - }); - - auto results = tuner.Tune(kernel.kernel_id); - - remove_arguments(kernel, args); - - return results; -} +// template +// ::ktt::KernelResult multiply(::ktt::Tuner& tuner, +// const cusp::dia_matrix& A, +// const cusp::array1d& x, +// cusp::array1d& y) +// { +// if (A.num_entries == 0) { +// thrust::fill(y.begin(), y.end(), ValueType3(0)); +// ::ktt::KernelResult result("DiaKernel", {}); +// result.SetStatus(::ktt::ResultStatus::Ok); +// return result; +// } + +// const kernel_context& kernel = get_kernel(tuner, cusp::dia_format{}); +// auto args = add_arguments(kernel, A, x, y); + +// tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols)); + +// auto res = tuner.TuneIteration(kernel.kernel_id, {}); +// remove_arguments(kernel, args); + +// return res; +// } + +// template +// ::ktt::KernelResult multiply(::ktt::Tuner& tuner, +// const cusp::dia_matrix& A, +// const cusp::array1d& x, +// cusp::array1d& y, +// const ::ktt::KernelConfiguration& configuration, +// bool run_with_profiling = false) +// { +// if (A.num_entries == 0) { +// thrust::fill(y.begin(), y.end(), 
ValueType3(0)); +// ::ktt::KernelResult result("DiaKernel", {}); +// result.SetStatus(::ktt::ResultStatus::Ok); +// return result; +// } + +// kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); +// auto args = add_arguments(kernel, A, x, y); + +// tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols, run_with_profiling)); + +// auto result = tuner.Run(kernel.kernel_id, configuration, {}); +// remove_arguments(kernel, args); + +// return result; +// } + +// template +// std::vector<::ktt::KernelResult> +// tune(::ktt::Tuner& tuner, +// const cusp::dia_matrix& A, +// const cusp::array1d& x, +// cusp::array1d& y, +// std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt) +// { +// if (A.num_entries == 0) { +// return {}; +// } + +// kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); +// auto args = add_arguments(kernel, A, x, y); + +// if (reference_computation) { +// tuner.SetReferenceComputation(get_output_argument(args, dia_format{}), reference_computation.value()); +// } + +// cusp::array1d host_y = y; +// tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface) { +// // clear y so previous results don't affect validation +// y = host_y; +// auto launcher = cusp::system::cuda::ktt::get_launcher(kernel, A.num_rows, A.num_cols); +// launcher(interface); +// }); + +// auto results = tuner.Tune(kernel.kernel_id); + +// remove_arguments(kernel, args); + +// return results; +// } } // namespace ktt diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 8b217bb9..036329aa 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -1,7 +1,10 @@ #pragma once +#include + #include #include +#include namespace cusp { @@ -14,13 +17,12 @@ namespace ktt { namespace ell { -inline void setup_tuning_parameters(const kernel_context& kernel) +inline void setup_common_tuning_parameters(const kernel_context& kernel) { auto& tuner = *kernel.tuner; auto kernel_id = kernel.kernel_id; tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128 }); - tuner.AddParameter(kernel_id, "BREAK", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "UNCACHED_LOADS", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "DISABLE_UNROLL", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 }); @@ -37,22 +39,31 @@ inline void setup_tuning_parameters(const kernel_context& kernel) } ); - // BREAK can be used only when there is no prefetching - // TODO(KTT): Implement prefetching when using early break and remove this + // All the warps working on one stripe of the matrix fit in a thread block. tuner.AddConstraint( kernel.kernel_id, - { "BREAK", "PREFETCH_FACTOR" }, + { "THREADS_PER_ROW", "BLOCK_SIZE" }, [] (const std::vector& values) { - return values[0] == 0 || values[1] == 0; + return 32*values[0] <= values[1]; }); +} +inline void setup_ell_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; - // All the warps working on one stripe of the matrix fit in a thread block. 
+ setup_common_tuning_parameters(kernel); + + tuner.AddParameter(kernel_id, "BREAK", u64_vec{ 0, 1 }); + + // BREAK can be used only when there is no prefetching + // TODO(KTT): Implement prefetching when using early break and remove this tuner.AddConstraint( kernel.kernel_id, - { "THREADS_PER_ROW", "BLOCK_SIZE" }, + { "BREAK", "PREFETCH_FACTOR" }, [] (const std::vector& values) { - return 32*values[0] <= values[1]; + return values[0] == 0 || values[1] == 0; }); } @@ -60,8 +71,12 @@ inline void setup_tuning_parameters(const kernel_context& kernel) template -kernel_context initialize_kernel(::ktt::Tuner& tuner) + typename ValueType3, + typename TuningParamsInit> +kernel_context initialize_kernel(::ktt::Tuner& tuner, + const std::string& kernel_function, + const std::string& kernel_name, + TuningParamsInit init) { std::string kernel_path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/ell_kernel.h"; @@ -80,7 +95,7 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner) ::ktt::DimensionVector grid_size(0); auto definition_id = tuner.AddKernelDefinitionFromFile( - "ktt_ell_kernel", + kernel_function, kernel_path, grid_size, block_size, @@ -88,9 +103,9 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner) ); kernel.definition_ids.push_back(definition_id); - kernel.kernel_id = tuner.CreateSimpleKernel("EllKernel", definition_id); + kernel.kernel_id = tuner.CreateSimpleKernel(kernel_name, definition_id); - setup_tuning_parameters(kernel); + init(kernel); return kernel; } @@ -102,15 +117,56 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner) template -const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::ell_format format) + typename ValueType3, + typename MemorySpace> +const kernel_context& get_kernel( + ::ktt::Tuner& tuner, + const cusp::ell_matrix& A) { static kernel_context kernel = - ell::initialize_kernel(tuner); + ell::initialize_kernel( + tuner, "ktt_ell_kernel", "EllKernel", + ell::setup_ell_tuning_parameters); return kernel; } +template +const kernel_context& get_kernel( + ::ktt::Tuner& tuner, + const cusp::ktt::ellr_matrix& A) +{ + static kernel_context kernel = + ell::initialize_kernel( + tuner, "ktt_ellr_kernel", "EllrKernel", + ell::setup_common_tuning_parameters); + + return kernel; +} + + +template +std::vector<::ktt::ArgumentId> +add_common_arguments( + const kernel_context& kernel, + const cusp::ell_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + IndexType pitch = A.column_indices.pitch; + IndexType num_entries_per_row = A.column_indices.num_cols; + + return add_arguments(*kernel.tuner, A.num_rows, A.num_cols, + num_entries_per_row, pitch, A.column_indices.values, + A.values.values, x, y); +} template & x, cusp::array1d& y) { - IndexType pitch = A.column_indices.pitch; - IndexType num_entries_per_row = A.column_indices.num_cols; + auto args = add_common_arguments(kernel, A, x, y); + kernel.tuner->SetArguments(kernel.definition_ids[0], args); + return args; +} + +template +std::vector<::ktt::ArgumentId> +add_arguments( + const kernel_context& kernel, + const cusp::ktt::ellr_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + auto args = add_common_arguments(kernel, A, x, y); + auto aditional_args = add_arguments(*kernel.tuner, A.row_lengths); - auto args = add_arguments(*kernel.tuner, - A.num_rows, A.num_cols, num_entries_per_row, pitch, - A.column_indices.values, A.values.values, x, y); + args.push_back(aditional_args[0]); kernel.tuner->SetArguments(kernel.definition_ids[0], args); diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h 
b/cusp/system/cuda/ktt/kernels/ell_kernel.h index b6a29324..98f605f5 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -88,36 +88,37 @@ template 1 - __shared__ ValueType sums[BLOCK_SIZE/THREADS_PER_ROW]; + __shared__ ValueType sums[BLOCK_SIZE / THREADS_PER_ROW]; if (threadIdx.y == 0) - { sums[threadIdx.x] = 0; - } __syncthreads(); #endif if (row < num_rows) { + const IndexType num_cols = row_lengths == NULL + ? max_cols_per_row + : row_lengths[row]; ValueType sum = 0; IndexType n = threadIdx.y; - IndexType offset = row + n * pitch; + IndexType offset = row + n*pitch; #if PREFETCH_FACTOR > 0 - IndexType bound = num_cols_per_row - - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW; + IndexType bound = num_cols - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW; IndexType step = PREFETCH_FACTOR * THREADS_PER_ROW; #pragma unroll UNROLL @@ -134,7 +135,7 @@ ktt_ell_kernel_basic(const IndexType num_rows, #endif #pragma unroll UNROLL - for (; n < num_cols_per_row; n += THREADS_PER_ROW) + for (; n < num_cols; n += THREADS_PER_ROW) { const IndexType col = load(Aj + offset); @@ -186,5 +187,23 @@ ktt_ell_kernel(const IndexType num_rows, ValueType* __restrict__ y) { ktt_ell_kernel_basic(num_rows, num_cols, num_cols_per_row, pitch, - Aj, Ax, x, y); + Aj, Ax, x, y, (const IndexType*)NULL); +} + + +template +__launch_bounds__(BLOCK_SIZE, 1) __global__ void +ktt_ellr_kernel(const IndexType num_rows, + const IndexType num_cols, + const IndexType num_cols_per_row, + const IndexType pitch, + const IndexType* __restrict__ Aj, + const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + ValueType* __restrict__ y, + const IndexType* __restrict__ row_lengths) +{ + ktt_ell_kernel_basic(num_rows, num_cols, num_cols_per_row, pitch, + Aj, Ax, x, y, row_lengths); } diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h index 4f5a0b84..18ff8bd0 100644 --- a/cusp/system/cuda/ktt/multiply.h +++ b/cusp/system/cuda/ktt/multiply.h @@ -33,7 +33,7 @@ kernel_context get_kernel(::ktt::Tuner& tuner, using ValueType = typename MatrixType::value_type; using FormatType = typename MatrixType::format; - return get_kernel(tuner, FormatType{}); + return get_kernel(tuner, A); } diff --git a/testing/ktt.cu b/testing/ktt.cu index aac96e34..d3bf2088 100644 --- a/testing/ktt.cu +++ b/testing/ktt.cu @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -29,9 +30,17 @@ } \ DECLARE_UNITTEST(VTEST##Ktt##Fmt##Matrix); +#define DECLARE_KTT_ELLR_UNITTEST(VTEST) \ + void VTEST##Ktt##Ellr##Matrix(void) \ + { \ + VTEST< cusp::ktt::ellr_matrix >(); \ + } \ + DECLARE_UNITTEST(VTEST##Ktt##Ellr##Matrix); + #define DECLARE_KTT_UNITTEST(VTEST) \ DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Dia, dia) \ - DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Ell, ell) + DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Ell, ell) \ + DECLARE_KTT_ELLR_UNITTEST(VTEST) struct UnitTestStopCondition : ::ktt::StopCondition From de428af797f3c2d3b662fef2feb621b9918a1a02 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 20 Mar 2023 09:28:22 +0100 Subject: [PATCH 30/49] Add option to disable conversion exceptions --- cusp/system/detail/generic/conversions/coo_to_other.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cusp/system/detail/generic/conversions/coo_to_other.h b/cusp/system/detail/generic/conversions/coo_to_other.h index 06a88b7f..b25fe8f8 100644 --- a/cusp/system/detail/generic/conversions/coo_to_other.h +++ b/cusp/system/detail/generic/conversions/coo_to_other.h 
@@ -137,7 +137,8 @@ convert(thrust::execution_policy& exec, DestinationType& dst, cusp::coo_format&, cusp::dia_format&, - size_t alignment = 32) + size_t alignment = 32, + bool dont_throw = false) { typedef typename DestinationType::index_type IndexType; typedef typename DestinationType::value_type ValueType; @@ -156,7 +157,7 @@ convert(thrust::execution_policy& exec, const float size = float(occupied_diagonals) * float(src.num_rows); const float fill_ratio = size / std::max(1.0f, float(src.num_entries)); - if (max_fill < fill_ratio && size > threshold) + if (max_fill < fill_ratio && size > threshold && !dont_throw) throw cusp::format_conversion_exception("dia_matrix fill-in would exceed maximum tolerance"); // compute number of occupied diagonals and enumerate them @@ -219,7 +220,8 @@ convert(thrust::execution_policy& exec, cusp::coo_format&, cusp::ell_format&, size_t num_entries_per_row = 0, - size_t alignment = 32) + size_t alignment = 32, + bool dont_throw = false) { typedef typename DestinationType::index_type IndexType; typedef typename DestinationType::value_type ValueType; @@ -242,7 +244,7 @@ convert(thrust::execution_policy& exec, const float size = float(max_entries_per_row) * float(src.num_rows); const float fill_ratio = size / std::max(1.0f, float(src.num_entries)); - if (max_fill < fill_ratio && size > threshold) + if (max_fill < fill_ratio && size > threshold && !dont_throw) throw cusp::format_conversion_exception("ell_matrix fill-in would exceed maximum tolerance"); num_entries_per_row = max_entries_per_row; From 4ad20a90b53e3c0f3681f555c9018e9c8b26053f Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 20 Mar 2023 09:29:51 +0100 Subject: [PATCH 31/49] Remove ifs from ellr kernel --- cusp/system/cuda/ktt/kernels/ell_kernel.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h index 98f605f5..3d89bd60 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -49,11 +49,15 @@ struct Prefetcher : Prefetcher IndexType pitch, IndexType i) { +#ifndef ELLR if (col >= 0) { +#endif A_val = load(Ax + offset + i*pitch); x_val = x[col]; +#ifndef ELLR } +#endif parent::prefetch_vals(Ax, x, offset, pitch, i + 1); } @@ -110,7 +114,7 @@ ktt_ell_kernel_basic(const IndexType num_rows, if (row < num_rows) { - const IndexType num_cols = row_lengths == NULL + const IndexType num_elems = row_lengths == NULL ? 
max_cols_per_row : row_lengths[row]; ValueType sum = 0; @@ -118,7 +122,7 @@ ktt_ell_kernel_basic(const IndexType num_rows, IndexType offset = row + n*pitch; #if PREFETCH_FACTOR > 0 - IndexType bound = num_cols - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW; + IndexType bound = num_elems - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW; IndexType step = PREFETCH_FACTOR * THREADS_PER_ROW; #pragma unroll UNROLL @@ -135,18 +139,21 @@ ktt_ell_kernel_basic(const IndexType num_rows, #endif #pragma unroll UNROLL - for (; n < num_cols; n += THREADS_PER_ROW) + for (; n < num_elems; n += THREADS_PER_ROW) { const IndexType col = load(Aj + offset); // This assumes that // cusp::ell_matrix<...>::invalid_index is always < 0 +#ifndef ELLR if (col >= 0) { +#endif const ValueType x_j = x[col]; const ValueType A_ij = load(Ax + offset); sum += A_ij * x_j; +#ifndef ELLR } else { @@ -154,6 +161,7 @@ ktt_ell_kernel_basic(const IndexType num_rows, break; #endif } +#endif offset += stride; } From fa3e6a27af92a0045dcdfd62c4877cb3766f0f98 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 20 Mar 2023 09:33:21 +0100 Subject: [PATCH 32/49] Define ELLR parameter for the ellr kernel --- cusp/system/cuda/ktt/ell_multiply.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 036329aa..957ba961 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -67,6 +67,16 @@ inline void setup_ell_tuning_parameters(const kernel_context& kernel) }); } +inline void setup_ellr_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + setup_common_tuning_parameters(kernel); + + tuner.AddParameter(kernel_id, "ELLR", u64_vec{ 1 }); +} + template( tuner, "ktt_ellr_kernel", "EllrKernel", - ell::setup_common_tuning_parameters); + ell::setup_ellr_tuning_parameters); return kernel; } From 8927db4be5407209fcbc9b70fd802cad92bdcdb6 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 20 Mar 2023 09:33:44 +0100 Subject: [PATCH 33/49] Add more block sizes to ell and ellr kernels --- cusp/system/cuda/ktt/ell_multiply.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 957ba961..a491a473 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -22,7 +22,7 @@ inline void setup_common_tuning_parameters(const kernel_context& kernel) auto& tuner = *kernel.tuner; auto kernel_id = kernel.kernel_id; - tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128 }); + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); tuner.AddParameter(kernel_id, "UNCACHED_LOADS", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "DISABLE_UNROLL", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 }); From 27529520354cd280f9edc2f967bb6ccfb3e7da00 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 20 Mar 2023 09:34:14 +0100 Subject: [PATCH 34/49] Fix having two kernels per matrix type --- cusp/system/cuda/ktt/ell_multiply.h | 14 ++++++-------- cusp/system/cuda/ktt/multiply.h | 14 ++++++++++++-- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index a491a473..3981156f 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -127,11 +127,10 @@ kernel_context
initialize_kernel(::ktt::Tuner& tuner, template + typename ValueType3> const kernel_context& get_kernel( - ::ktt::Tuner& tuner, - const cusp::ell_matrix& A) + ::ktt::Tuner& tuner, + const cusp::ell_matrix& A) { static kernel_context kernel = ell::initialize_kernel( @@ -144,11 +143,10 @@ const kernel_context& get_kernel( template + typename ValueType3> const kernel_context& get_kernel( - ::ktt::Tuner& tuner, - const cusp::ktt::ellr_matrix& A) + ::ktt::Tuner& tuner, + const cusp::ktt::ellr_matrix& A) { static kernel_context kernel = ell::initialize_kernel( diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h index 18ff8bd0..59729596 100644 --- a/cusp/system/cuda/ktt/multiply.h +++ b/cusp/system/cuda/ktt/multiply.h @@ -31,9 +31,19 @@ kernel_context get_kernel(::ktt::Tuner& tuner, { using IndexType = typename MatrixType::index_type; using ValueType = typename MatrixType::value_type; - using FormatType = typename MatrixType::format; - return get_kernel(tuner, A); + if constexpr (std::is_same_v) + { + using DeviceMatrix = + typename MatrixType::rebind::type; + return get_kernel( + tuner, DeviceMatrix{}); + } + else + { + return get_kernel(tuner, A); + } } From eb9fa2039c5d00cea5bb197a2431a51160600538 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 21 Mar 2023 08:58:01 +0100 Subject: [PATCH 35/49] Remove commented code --- cusp/system/cuda/ktt/dia_multiply.h | 92 ----------------------------- 1 file changed, 92 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 88cd254e..a82137bf 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -192,98 +192,6 @@ auto get_launcher( }; } -// template -// ::ktt::KernelResult multiply(::ktt::Tuner& tuner, -// const cusp::dia_matrix& A, -// const cusp::array1d& x, -// cusp::array1d& y) -// { -// if (A.num_entries == 0) { -// thrust::fill(y.begin(), y.end(), ValueType3(0)); -// ::ktt::KernelResult result("DiaKernel", {}); -// result.SetStatus(::ktt::ResultStatus::Ok); -// return result; -// } - -// const kernel_context& kernel = get_kernel(tuner, cusp::dia_format{}); -// auto args = add_arguments(kernel, A, x, y); - -// tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols)); - -// auto res = tuner.TuneIteration(kernel.kernel_id, {}); -// remove_arguments(kernel, args); - -// return res; -// } - -// template -// ::ktt::KernelResult multiply(::ktt::Tuner& tuner, -// const cusp::dia_matrix& A, -// const cusp::array1d& x, -// cusp::array1d& y, -// const ::ktt::KernelConfiguration& configuration, -// bool run_with_profiling = false) -// { -// if (A.num_entries == 0) { -// thrust::fill(y.begin(), y.end(), ValueType3(0)); -// ::ktt::KernelResult result("DiaKernel", {}); -// result.SetStatus(::ktt::ResultStatus::Ok); -// return result; -// } - -// kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); -// auto args = add_arguments(kernel, A, x, y); - -// tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols, run_with_profiling)); - -// auto result = tuner.Run(kernel.kernel_id, configuration, {}); -// remove_arguments(kernel, args); - -// return result; -// } - -// template -// std::vector<::ktt::KernelResult> -// tune(::ktt::Tuner& tuner, -// const cusp::dia_matrix& A, -// const cusp::array1d& x, -// cusp::array1d& y, -// std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt) -// { -// if (A.num_entries == 0) { -// return {}; -// } - -// kernel_context kernel 
= get_kernel(tuner, cusp::dia_format{}); -// auto args = add_arguments(kernel, A, x, y); - -// if (reference_computation) { -// tuner.SetReferenceComputation(get_output_argument(args, dia_format{}), reference_computation.value()); -// } - -// cusp::array1d host_y = y; -// tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface) { -// // clear y so previous results don't affect validation -// y = host_y; -// auto launcher = cusp::system::cuda::ktt::get_launcher(kernel, A.num_rows, A.num_cols); -// launcher(interface); -// }); - -// auto results = tuner.Tune(kernel.kernel_id); - -// remove_arguments(kernel, args); - -// return results; -// } } // namespace ktt From e48d543159ca998eb3d3b32eb97c172cce496444 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 21 Mar 2023 09:24:05 +0100 Subject: [PATCH 36/49] Fix formatting --- cusp/system/cuda/ktt/dia_multiply.h | 154 ++++++++++++++++------------ 1 file changed, 91 insertions(+), 63 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index a82137bf..02df5bd8 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -6,12 +6,9 @@ #include #include -#include // max_active_blocks #include #include -#include -#include namespace cusp { @@ -23,23 +20,28 @@ namespace ktt { namespace dia { -inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) + +inline void setup_tuning_parameters(const kernel_context& kernel) { - tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1, 2 }); - tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); - tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 2, 3, 4 }); - tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_TYPE", std::vector{ 0, 1 }); - tuner.AddParameter(kernel.kernel_id, "LOAD_TYPE", std::vector{ 0, 1 }); - tuner.AddParameter(kernel.kernel_id, "STRIPING_FACTOR", std::vector{ 2, 4, 8 }); - tuner.AddParameter(kernel.kernel_id, "BLOCK_SIZE", std::vector{ 256, 512 }); - - // START OF HACKS --------------------- - tuner.AddParameter(kernel.kernel_id, "BLOCKS_PER_SECTOR", std::vector{ 0 }); - tuner.AddParameter(kernel.kernel_id, "CHUNK_SIZE", std::vector{ 0 }); - tuner.AddParameter(kernel.kernel_id, "STRIPE_SIZE", std::vector{ 0 }); - tuner.AddParameter(kernel.kernel_id, "GRID_SIZE", std::vector{ 0 }); - tuner.AddParameter(kernel.kernel_id, "SECTOR_MAPPING_TYPE", - std::vector{ 0, 1 }); + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + tuner.AddParameter(kernel_id, "KERNEL_TYPE", u64_vec{ 0, 1, 2 }); + tuner.AddParameter(kernel_id, "SHARED_PREFETCH_FACTOR", + u64_vec{ 0, 2, 4, 8 }); + tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_FACTOR", + u64_vec{ 0, 2, 3, 4 }); + tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_TYPE", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "LOAD_TYPE", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "STRIPING_FACTOR", u64_vec{ 2, 4, 8 }); + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 256, 512 }); + + // START OF HACKS to test striping ---- + tuner.AddParameter(kernel_id, "BLOCKS_PER_SECTOR", u64_vec{ 0 }); + tuner.AddParameter(kernel_id, "CHUNK_SIZE", u64_vec{ 0 }); + tuner.AddParameter(kernel_id, "STRIPE_SIZE", u64_vec{ 0 }); + tuner.AddParameter(kernel_id, "GRID_SIZE", u64_vec{ 0 }); + tuner.AddParameter(kernel_id, "SECTOR_MAPPING_TYPE", u64_vec{ 0, 1 }); tuner.AddConstraint(kernel.kernel_id, { "SECTOR_MAPPING_TYPE" }, [] 
(const std::vector& values) { @@ -53,46 +55,55 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k ::ktt::ModifierType::Local, ::ktt::ModifierDimension::X, { std::string("BLOCK_SIZE") }, - [] (const uint64_t defaultSize, const std::vector& parameters) { + [] (const uint64_t defaultSize, const u64_vec& parameters) { return parameters[0]; - } - ); - - // only one type of prefetching can be applied at once - tuner.AddConstraint(kernel.kernel_id, - { "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 0 || values[1] == 0; - }); + }); - // prefetching is used only for the blocked offsets kernel - tuner.AddConstraint(kernel.kernel_id, - { "KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 1 || (values[1] == 0 && values[2] == 0); - }); + // Only one type of prefetching can be applied at once. + tuner.AddConstraint( + kernel.kernel_id, + { "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, + [] (const std::vector& values) { + return values[0] == 0 || values[1] == 0; + }); - // different register prefetching implementations are used only when register prefetching is enabled - tuner.AddConstraint(kernel.kernel_id, - { "REGISTER_PREFETCH_FACTOR", "REGISTER_PREFETCH_TYPE" }, - [] (const std::vector& values) { - return values[0] > 0 || values[1] == 0; - }); + // Prefetching is used only for the blocked offsets kernel. + tuner.AddConstraint( + kernel.kernel_id, + { "KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, + [] (const std::vector& values) { + return values[0] == 1 || (values[1] == 0 && values[2] == 0); + }); + + // Different register prefetching implementations are used + // only when register prefetching is actually enabled. + tuner.AddConstraint( + kernel.kernel_id, + { "REGISTER_PREFETCH_FACTOR", "REGISTER_PREFETCH_TYPE" }, + [] (const std::vector& values) { + return values[0] > 0 || values[1] == 0; + }); - // don't try different striping factors for non-striped kernels - tuner.AddConstraint(kernel.kernel_id, - { "KERNEL_TYPE", "STRIPING_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 2 || values[1] == 2; - }); + // Don't try different striping factors for non-striped kernels. + tuner.AddConstraint( + kernel.kernel_id, + { "KERNEL_TYPE", "STRIPING_FACTOR" }, + [] (const std::vector& values) { + return values[0] == 2 || values[1] == 2; + }); } -template + +template kernel_context initialize_kernel(::ktt::Tuner& tuner) { kernel_context kernel(tuner); - std::string kernel_path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h"; + std::string kernel_path = + STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h"; std::vector< std::string > type_names { std::string(NAMEOF_TYPE(IndexType)), @@ -101,6 +112,9 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner) std::string(NAMEOF_TYPE(ValueType3)) }; + // NOTE: These can be anything since they are always set in the launcher. + // So use some values that will hopefully cause a crash if not set properly + // in the launcher.
const ::ktt::DimensionVector blockDimensions(0); const ::ktt::DimensionVector gridDimensions(0); @@ -114,7 +128,7 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner) kernel.definition_ids.push_back(definition_id); kernel.kernel_id = tuner.CreateSimpleKernel("DiaKernel", definition_id); - dia::setup_tuning_parameters(tuner, kernel); + dia::setup_tuning_parameters(kernel); return kernel; } @@ -132,7 +146,10 @@ const kernel_context& get_kernel( ::ktt::Tuner& tuner, const cusp::dia_matrix& A) { - static kernel_context kernel = dia::initialize_kernel(tuner); + constexpr auto init_kernel = + dia::initialize_kernel; + + static kernel_context kernel = init_kernel(tuner); return kernel; } @@ -141,24 +158,30 @@ template std::vector<::ktt::ArgumentId> add_arguments( - const kernel_context& kernel, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y) + const kernel_context& kernel, + const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) { const IndexType num_diagonals = A.values.num_cols; - const IndexType pitch = A.values.pitch; // length of one diagonal + possible padding + const IndexType pitch = A.values.pitch; + + auto args = + add_arguments(*kernel.tuner, A.num_rows, A.num_cols, num_diagonals, + pitch, A.diagonal_offsets, A.values.values, x, y); - auto args = add_arguments(*kernel.tuner, A.num_rows, A.num_cols, num_diagonals, pitch, A.diagonal_offsets, A.values.values, x, y); kernel.tuner->SetArguments(kernel.definition_ids[0], args); return args; } -// Returns the argument id of the y vector given a vector of arguments returned by a previous -// call of `add_arguments`. -inline ::ktt::ArgumentId get_output_argument(const std::vector<::ktt::ArgumentId>& arguments, cusp::dia_format) { +// Returns the argument id of the y vector given a vector of arguments +// returned by a previous call of `add_arguments`. 
+inline ::ktt::ArgumentId get_output_argument( + const std::vector<::ktt::ArgumentId>& arguments, + cusp::dia_format) +{ return arguments[7]; } @@ -176,18 +199,23 @@ auto get_launcher( { return [&, profile] (::ktt::ComputeInterface& interface) { - ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]); - ::ktt::DimensionVector grid_size( DIVIDE_INTO(A.num_rows, block_size.GetSizeX()) ); + ::ktt::DimensionVector block_size = + interface.GetCurrentLocalSize(ctx.definition_ids[0]); + ::ktt::DimensionVector grid_size( + DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); + // START OF HACKS to test striping ---- auto conf = interface.GetCurrentConfiguration(); for (const auto& pair : conf.GetPairs()) if (pair.GetName() == "GRID_SIZE" && pair.GetValue() != 0) grid_size = ::ktt::DimensionVector(pair.GetValue()); + // END OF HACKS ----------------------- if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { - interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); + interface.RunKernelWithProfiling(ctx.definition_ids[0], + grid_size, block_size); } }; } From 15d306114d00b2f7ae575e2cd4925fb7f85bd2ca Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 21 Mar 2023 13:23:40 +0100 Subject: [PATCH 37/49] Make prefetching macros more convenient to use --- cusp/system/cuda/ktt/kernels/dia_kernel.h | 67 +++++++++++------------ 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 7f26658d..2160b007 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -9,10 +9,6 @@ #define PREFETCH_FACTOR 0 #endif -#ifndef BLOCK_SIZE - #define BLOCK_SIZE 256 -#endif - template __device__ T min(T a, T b) { @@ -145,16 +141,36 @@ naive_dia_kernel(const int num_rows, } } -#define REGISTER_PREFETCH_ITERATION(iter) \ - IndexType col##iter = offsets[i + iter] + thread_id; \ - ValueType1 diag_val##iter = 0; \ - ValueType2 x_val##iter = 0; \ - if (col##iter >= 0 && col##iter < num_cols) \ - { \ - diag_val##iter = load_diag_val(&values[ (offset_base + i + iter)*pitch + thread_id ]); \ - x_val##iter = load_x_val(&x[col##iter]); \ + +#define PREFETCH_LOAD_ITERATION(iter) \ + IndexType col##iter = offsets[i + iter - 1] + thread_id; \ + ValueType1 diag_val##iter = 0; \ + ValueType2 x_val##iter = 0; \ + if (col##iter >= 0 && col##iter < num_cols) \ + { \ + IndexType idx = (offset_base + i + iter - 1)*pitch + thread_id; \ + diag_val##iter = load_diag_val(&values[idx]); \ + x_val##iter = load_x_val(&x[col##iter]); \ } +#define PREFETCH_ACCUM_ITERATION(iter) \ + sum += diag_val##iter * x_val##iter; + +#define PREFETCH1(ITER) ITER(1); +#define PREFETCH2(ITER) ITER(2); PREFETCH1(ITER) +#define PREFETCH3(ITER) ITER(3); PREFETCH2(ITER) +#define PREFETCH4(ITER) ITER(4); PREFETCH3(ITER) +#define PREFETCH5(ITER) ITER(5); PREFETCH4(ITER) +#define PREFETCH6(ITER) ITER(6); PREFETCH5(ITER) +#define PREFETCH7(ITER) ITER(7); PREFETCH6(ITER) +#define PREFETCH8(ITER) ITER(8); PREFETCH7(ITER) + +#define PREFETCH_IMPL(factor, ITER) PREFETCH##factor(ITER) + +#define PREFETCH_LOAD_VALS(factor) PREFETCH_IMPL(factor, PREFETCH_LOAD_ITERATION) +#define PREFETCH_ACCUMULATE(factor) PREFETCH_IMPL(factor, PREFETCH_ACCUM_ITERATION) + + template num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; + int batch_size = BLOCK_SIZE > num_diagonals - offset_base + ? 
num_diagonals - offset_base + : BLOCK_SIZE; if (thread_id < num_rows) { @@ -218,23 +234,8 @@ blocked_offsets_dia_kernel(const int num_rows, #elif REGISTER_PREFETCH_TYPE == 1 - REGISTER_PREFETCH_ITERATION(0); - REGISTER_PREFETCH_ITERATION(1); -#if REGISTER_PREFETCH_FACTOR > 2 - REGISTER_PREFETCH_ITERATION(2); -#endif -#if REGISTER_PREFETCH_FACTOR > 3 - REGISTER_PREFETCH_ITERATION(3); -#endif - - sum += diag_val0 * x_val0; - sum += diag_val1 * x_val1; -#if REGISTER_PREFETCH_FACTOR > 2 - sum += diag_val2 * x_val2; -#endif -#if REGISTER_PREFETCH_FACTOR > 3 - sum += diag_val3 * x_val3; -#endif + PREFETCH_LOAD_VALS(PREFETCH_FACTOR); + PREFETCH_ACCUMULATE(PREFETCH_FACTOR); #endif // REGISTER_PREFETCH_TYPE == 1 #elif SHARED_PREFETCH_FACTOR > 0 @@ -282,9 +283,7 @@ blocked_offsets_dia_kernel(const int num_rows, } if (thread_id < num_rows) - { y[thread_id] = sum; - } } template Date: Tue, 21 Mar 2023 13:46:23 +0100 Subject: [PATCH 38/49] Use a more readable implementation of prefetching --- cusp/system/cuda/ktt/kernels/dia_kernel.h | 43 +++++++---------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 2160b007..8f2e144a 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -100,10 +100,7 @@ struct UnrolledLoop IndexType i) {} template - __device__ ValueType3 do_iter() - { - return 0; - } + __device__ ValueType3 do_iter() { return 0; } }; @@ -208,48 +205,32 @@ blocked_offsets_dia_kernel(const int num_rows, if (thread_id < num_rows) { -#if SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 - for (IndexType i = 0; i < batch_size; ++i) - { - IndexType col = offsets[i] + thread_id; - if (col >= 0 && col < num_cols) - { - auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + thread_id ]); - auto x_val = load_x_val(&x[col]); - - sum += diag_val*x_val; - } - } -#else // SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 - int end = batch_size - (batch_size % PREFETCH_FACTOR); + IndexType i = 0; - for (IndexType i = 0; i < end; i += PREFETCH_FACTOR) +#if PREFETCH_FACTOR > 0 + for (; i < batch_size - PREFETCH_FACTOR + 1; i += PREFETCH_FACTOR) { #if REGISTER_PREFETCH_FACTOR > 0 -#if REGISTER_PREFETCH_TYPE == 0 +#if REGISTER_PREFETCH_TYPE == 0 UnrolledLoop loop; loop.prefetch(offsets, values, x, pitch, num_cols, thread_id, offset_base, i); sum += loop.template do_iter(); - #elif REGISTER_PREFETCH_TYPE == 1 - PREFETCH_LOAD_VALS(PREFETCH_FACTOR); PREFETCH_ACCUMULATE(PREFETCH_FACTOR); - #endif // REGISTER_PREFETCH_TYPE == 1 -#elif SHARED_PREFETCH_FACTOR > 0 +#elif SHARED_PREFETCH_FACTOR > 0 for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) { IndexType col = offsets[i + j] + thread_id; if (col >= 0 && col < num_cols) { - auto diag_val = load_diag_val(&values[ (offset_base + i + j)*pitch + thread_id ]); - auto x_val = load_x_val(&x[col]); - - values_prefetched[j*BLOCK_SIZE + threadIdx.x] = diag_val; - x_prefetched[j*BLOCK_SIZE + threadIdx.x] = x_val; + values_prefetched[j*BLOCK_SIZE + threadIdx.x] = + load_diag_val(&values[ (offset_base + i + j)*pitch + thread_id ]);; + x_prefetched[j*BLOCK_SIZE + threadIdx.x] = + load_x_val(&x[col]); } else { @@ -264,8 +245,9 @@ blocked_offsets_dia_kernel(const int num_rows, } #endif // SHARED_PREFETCH_FACTOR > 0 } +#endif // PREFETCH_FACTOR > 0 - for (IndexType i = end; i < batch_size; ++i) + for (; i < batch_size; ++i) { IndexType col = offsets[i] + thread_id; if (col >= 0 && col < 
num_cols) @@ -276,7 +258,6 @@ blocked_offsets_dia_kernel(const int num_rows, sum += diag_val*x_val; } } -#endif // SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 } __syncthreads(); From 1dd0488a6038c6535ec4fc12ffdc5369a77c51ed Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 21 Mar 2023 13:49:28 +0100 Subject: [PATCH 39/49] Remove old unused kernels --- cusp/system/cuda/ktt/kernels/dia_kernel.h | 292 ++-------------------- 1 file changed, 22 insertions(+), 270 deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 8f2e144a..0b7bc0a3 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -104,41 +104,6 @@ struct UnrolledLoop }; -template -__device__ void -naive_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) -{ - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - - if (thread_id < num_rows) - { - ValueType3 sum = ValueType3(0); - for (IndexType i = 0; i < num_diagonals; ++i) - { - IndexType col = diagonal_offsets[i] + thread_id; - if (col >= 0 && col < num_cols) - { - auto diag_val = load_diag_val(&values[i*pitch + thread_id]); - auto x_val = load_x_val(&x[col]); - - sum += diag_val * x_val; - } - } - y[thread_id] = sum; - } -} - - #define PREFETCH_LOAD_ITERATION(iter) \ IndexType col##iter = offsets[i + iter - 1] + thread_id; \ ValueType1 diag_val##iter = 0; \ @@ -267,6 +232,7 @@ blocked_offsets_dia_kernel(const int num_rows, y[thread_id] = sum; } + template -__global__ void -timed_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y, - long long int* times, - const int times_pitch) -{ - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - const IndexType row = thread_id; - - __shared__ IndexType offsets[BLOCK_SIZE]; - ValueType3 sum = ValueType3(0); - - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) - { - if (offset_base + threadIdx.x < num_diagonals) - { - offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; - } - - __syncthreads(); - - int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; - - if (row < num_rows) - { - for (IndexType i = 0; i < batch_size; ++i) - { - IndexType col = offsets[i] + row; - if (col >= 0 && col < num_cols) - { - auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); - auto to_load = &x[col]; - - long long int start = clock64(); - auto x_val = load_x_val(to_load); - long long int end = clock64(); - - if (threadIdx.x % 32 == 0) - { - IndexType k = row / 32; - __stwt(times + (offset_base + i)*times_pitch + k, end - start + 1); - } - - sum += diag_val*x_val; - } - } - } - - __syncthreads(); - } - - if (row < num_rows) - y[row] = sum; -} - - - template __device__ void -cached_x_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) -{ - using ValueType = ValueType3; - - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - - __shared__ IndexType offsets[BLOCK_SIZE]; - __shared__ ValueType x_cache[BLOCK_SIZE]; - IndexType cache_start = -IndexType(BLOCK_SIZE); - IndexType cache_end = 0; - - IndexType first_row = BLOCK_SIZE * blockIdx.x; - IndexType end_row = min(first_row + BLOCK_SIZE, num_rows); - - ValueType sum = ValueType(0); - - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { - if (offset_base + threadIdx.x < num_diagonals) { - offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; - } - - __syncthreads(); - - int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; - - if (thread_id < num_rows) - { - for (IndexType i = 0; i < batch_size; ++i) - { - IndexType col = offsets[i] + thread_id; - ValueType x_val = -1; - - if (col >= 0 && col < num_cols) - { - x_val = col < cache_end ? x_cache[col - cache_start] : x[col]; - ValueType diag_val = values[(offset_base + i)*pitch + thread_id]; - sum += x_val * diag_val; - } - - __syncthreads(); - - x_cache[threadIdx.x] = x_val; - cache_start = first_row + offsets[i]; - cache_end = end_row + offsets[i]; - - __syncthreads(); - } - } - - __syncthreads(); - } - - if (thread_id < num_rows) { - y[thread_id] = sum; - } -} - -template -__device__ void -experimental_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) -{ - using ValueType = ValueType3; - - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - - __shared__ IndexType offsets[BLOCK_SIZE]; - __shared__ ValueType x_cache[2*BLOCK_SIZE]; - - IndexType first_row = BLOCK_SIZE * blockIdx.x; - IndexType first_col = offsets[0] + first_row; - - if (0 <= first_col + threadIdx.x && first_col + threadIdx.x < num_cols) { - x_cache[threadIdx.x] = x[first_col + threadIdx.x]; - } - - if (0 <= first_col + BLOCK_SIZE + threadIdx.x && first_col + BLOCK_SIZE + threadIdx.x < num_cols) { - x_cache[BLOCK_SIZE + threadIdx.x] = x[first_col + BLOCK_SIZE + threadIdx.x]; - } - - ValueType sum = ValueType(0); - - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { - if (offset_base + threadIdx.x < num_diagonals) { - offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; - } - - __syncthreads(); - - int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; - - if (thread_id < num_rows) - { - for (IndexType i = 0; i < batch_size; ++i) - { - IndexType col = offsets[i] + thread_id; - - if (col >= 0 && col < num_cols) - { - ValueType x_val = x_cache[col - first_col]; - ValueType diag_val = values[(offset_base + i)*pitch + thread_id]; - sum += x_val * diag_val; - } - } - } - - __syncthreads(); - } - - if (thread_id < num_rows) { - y[thread_id] = sum; - } -} - -template -__device__ void -cached_x_naive_dia_kernel(const int num_rows, +naive_dia_kernel(const int num_rows, const int num_cols, const int num_diagonals, const int pitch, @@ -645,31 +409,19 @@ cached_x_naive_dia_kernel(const int num_rows, const ValueType2* x, ValueType3* y) { - using ValueType = ValueType3; - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - __shared__ ValueType x_cache[2*BLOCK_SIZE]; - - IndexType first_row = BLOCK_SIZE * blockIdx.x; - IndexType first_col = diagonal_offsets[0] + first_row; - - if (0 <= first_col + threadIdx.x && first_col + threadIdx.x < num_cols) { - x_cache[threadIdx.x] = x[first_col + threadIdx.x]; - } - - if (0 <= first_col + BLOCK_SIZE + threadIdx.x && first_col + BLOCK_SIZE + threadIdx.x < num_cols) { - x_cache[BLOCK_SIZE + threadIdx.x] = x[first_col + BLOCK_SIZE + threadIdx.x]; - } - - - if (thread_id < num_rows) { - ValueType sum = ValueType(0); - for (IndexType i = 0; i < num_diagonals; ++i) { + if (thread_id < num_rows) + { + ValueType3 sum = ValueType3(0); + for (IndexType i = 0; i < num_diagonals; ++i) + { IndexType col = diagonal_offsets[i] + thread_id; - if (col >= 0 && col < num_cols) { - auto diag_val = values[i*pitch + thread_id]; - auto x_val = x_cache[col - first_col]; + if (col >= 0 && col < num_cols) + { + auto diag_val = load_diag_val(&values[i*pitch + thread_id]); + auto x_val = load_x_val(&x[col]); + sum += diag_val * x_val; } } @@ -677,20 +429,20 @@ cached_x_naive_dia_kernel(const int num_rows, } } + template -__launch_bounds__(BLOCK_SIZE,1) __global__ void -ktt_dia_vector_kernel( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) +__launch_bounds__(BLOCK_SIZE, 1) __global__ void +ktt_dia_vector_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) { #if KERNEL_TYPE == 0 naive_dia_kernel From 1c6b3cf8d1a04475931a4d2f8868ba7644a52911 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Sun, 7 May 2023 13:19:41 +0200 Subject: [PATCH 40/49] Add more general function for matrix generation --- cusp/ktt/matrix_generation.h | 79 ++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/cusp/ktt/matrix_generation.h b/cusp/ktt/matrix_generation.h index b30e616a..712e0b55 100644 --- a/cusp/ktt/matrix_generation.h +++ b/cusp/ktt/matrix_generation.h @@ -4,52 +4,35 @@ #include // min #include +#include namespace cusp { namespace ktt { -/** - * @brief Create a symetric matrix with given number of diagonals filled with ones. - * - * The main diagonal is always filled and then equal number of diagonals is placed - * on both sides of it. That means the matrix is actually symetric only for - * odd numbers of diagonals. - * - * If the number of diagonals is greater then would fit into the matrix, - * the extra diagonals are just discarded. 
- * - * @param rows The number of rows of the resulting matrix. - * @param cols The number of columns of the resulting matrix. - * @param offset_step Distance between two diagonals. - * @param diagonal_count The number of non-zero diagonals in the matrix. - * - * @return The matrix. - */ + cusp::dia_matrix -make_diagonal_symmetric_matrix(int rows, - int cols, - int offset_step, - int diagonal_count) +make_diagonal_matrix(int rows, int cols, const std::vector& diag_offsets) { using DiaMatrix = cusp::dia_matrix; + int diagonal_count = diag_offsets.size(); + DiaMatrix::diagonal_offsets_array_type offsets; offsets.reserve(diagonal_count); DiaMatrix::values_array_type values(rows, diagonal_count, 0); - int starting_offset = -offset_step * diagonal_count/2; size_t num_entries = 0; - for (int i = 0; i < diagonal_count; ++i) { - int offset = starting_offset + offset_step*i; + int offset = diag_offsets[i]; int starting_row = offset < 0 ? -offset : 0; int starting_col = offset < 0 ? 0 : offset; if (starting_row >= rows || starting_col >= cols) - throw std::runtime_error("make_diagonal_symmetric_matrix: Too many diagonals."); + throw std::runtime_error( + "make_diagonal_matrix: Diagonal out of bounds."); offsets.push_back(offset); @@ -73,6 +56,52 @@ make_diagonal_symmetric_matrix(int rows, return res; } + +/** + * @brief Create a symmetric matrix with a given number of diagonals filled with ones. + * + * The main diagonal is always filled and then an equal number of diagonals is placed + * on both sides of it. That means the matrix is actually symmetric only for + * odd numbers of diagonals. + * + * If the number of diagonals is greater than would fit into the matrix, + * an exception is thrown. + * + * @param rows The number of rows of the resulting matrix. + * @param cols The number of columns of the resulting matrix. + * @param offset_step Distance between two diagonals. + * @param diagonal_count The number of non-zero diagonals in the matrix. + * + * @return The matrix. + */ +cusp::dia_matrix +make_diagonal_symmetric_matrix(int rows, + int cols, + int offset_step, + int diagonal_count) +{ + std::vector offsets; + offsets.reserve(diagonal_count); + + int starting_offset = -offset_step * diagonal_count/2; + + for (int i = 0; i < diagonal_count; ++i) { + int offset = starting_offset + offset_step*i; + + int starting_row = offset < 0 ? -offset : 0; + int starting_col = offset < 0 ?
0 : offset; + + if (starting_row >= rows || starting_col >= cols) + throw std::runtime_error( + "make_diagonal_symmetric_matrix: Too many diagonals."); + + offsets.push_back(offset); + } + + return make_diagonal_matrix(rows, cols, offsets); +} + + } // namespace ktt } // namespace cusp From db83e9508b72e5c1ea1fc99bd86527931ec7d891 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 8 May 2023 19:25:05 +0200 Subject: [PATCH 41/49] Add check for builtin types --- cusp/system/detail/generic/multiply.inl | 47 ++++++++++++++++--------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/cusp/system/detail/generic/multiply.inl b/cusp/system/detail/generic/multiply.inl index 99909c51..75e4c85c 100644 --- a/cusp/system/detail/generic/multiply.inl +++ b/cusp/system/detail/generic/multiply.inl @@ -27,10 +27,13 @@ #include #include +#include + #include #include -#include +#include + namespace cusp { @@ -107,6 +110,8 @@ multiply(thrust::execution_policy &exec, cusp::multiply(exec, A, B, C, initialize, combine, reduce); } + +// The has_rebind type traits is used to check whether matrix is a view template struct has_rebind : std::false_type {}; @@ -116,38 +121,48 @@ struct has_rebind>> : std::true_type {}; template constexpr bool has_rebind_v = has_rebind::value; + template -typename thrust::detail::disable_if_convertible::type +typename thrust::detail::disable_if_convertible< + typename Matrix::format, + cusp::unknown_format +>::type multiply(cusp::system::cuda::detail::execution_policy &exec, - const LinearOperator& A, - const cusp::array1d& B, - cusp::array1d& C) + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y) { - typedef typename LinearOperator::format MatrixFormat; - - if constexpr ((cusp::detail::is_ell::value - || cusp::detail::is_dia::value) - && has_rebind_v) + using IndexType = typename Matrix::index_type; + using ValueType = typename Matrix::value_type; + + if constexpr ((cusp::detail::is_ell::value + || cusp::detail::is_dia::value + && std::is_fundamental_v + && std::is_fundamental_v + && std::is_fundamental_v + && std::is_fundamental_v) + && has_rebind_v) { - if (cusp::ktt::detail::is_enabled) { - cusp::ktt::detail::lazy_init(); - cusp::system::cuda::ktt::multiply(*cusp::ktt::detail::tuner, A, B, C); + if (cusp::ktt::detail::is_enabled) + { + cusp::system::cuda::ktt::multiply(cusp::ktt::get_tuner(), A, x, y); return; } } - typedef typename LinearOperator::value_type ValueType; + // The call is not meant for KTT. Pass it on... 
cusp::constant_functor initialize(0); thrust::multiplies combine; thrust::plus reduce; - cusp::multiply(exec, A, B, C, initialize, combine, reduce); + cusp::multiply(exec, A, x, y, initialize, combine, reduce); } + template Date: Mon, 8 May 2023 19:25:17 +0200 Subject: [PATCH 42/49] Modify the tuning space --- cusp/system/cuda/ktt/dia_multiply.h | 45 +++++++++++++++++------------ 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 02df5bd8..ff1ab7f6 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -26,27 +26,27 @@ inline void setup_tuning_parameters(const kernel_context& kernel) auto& tuner = *kernel.tuner; auto kernel_id = kernel.kernel_id; - tuner.AddParameter(kernel_id, "KERNEL_TYPE", u64_vec{ 0, 1, 2 }); - tuner.AddParameter(kernel_id, "SHARED_PREFETCH_FACTOR", - u64_vec{ 0, 2, 4, 8 }); + tuner.AddParameter(kernel_id, "KERNEL_TYPE", u64_vec{ 1, 2 }); + // tuner.AddParameter(kernel_id, "SHARED_PREFETCH_FACTOR", + // u64_vec{ 0, 2, 4, 8 }); tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_FACTOR", u64_vec{ 0, 2, 3, 4 }); tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_TYPE", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "LOAD_TYPE", u64_vec{ 0, 1 }); - tuner.AddParameter(kernel_id, "STRIPING_FACTOR", u64_vec{ 2, 4, 8 }); - tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 256, 512 }); + tuner.AddParameter(kernel_id, "STRIPING_FACTOR", u64_vec{ 2, 4, 8, 16 }); + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); // START OF HACKS to test striping ---- - tuner.AddParameter(kernel_id, "BLOCKS_PER_SECTOR", u64_vec{ 0 }); - tuner.AddParameter(kernel_id, "CHUNK_SIZE", u64_vec{ 0 }); - tuner.AddParameter(kernel_id, "STRIPE_SIZE", u64_vec{ 0 }); - tuner.AddParameter(kernel_id, "GRID_SIZE", u64_vec{ 0 }); - tuner.AddParameter(kernel_id, "SECTOR_MAPPING_TYPE", u64_vec{ 0, 1 }); - - tuner.AddConstraint(kernel.kernel_id, { "SECTOR_MAPPING_TYPE" }, - [] (const std::vector& values) { - return values[0] == 0; - }); + // tuner.AddParameter(kernel_id, "BLOCKS_PER_SECTOR", u64_vec{ 0 }); + // tuner.AddParameter(kernel_id, "CHUNK_SIZE", u64_vec{ 0 }); + // tuner.AddParameter(kernel_id, "STRIPE_SIZE", u64_vec{ 0 }); + // tuner.AddParameter(kernel_id, "GRID_SIZE", u64_vec{ 0 }); + // tuner.AddParameter(kernel_id, "SECTOR_MAPPING_TYPE", u64_vec{ 0, 1 }); + + // tuner.AddConstraint(kernel.kernel_id, { "SECTOR_MAPPING_TYPE" }, + // [] (const std::vector& values) { + // return values[0] == 0; + // }); // END OF HACKS ----------------------- tuner.AddThreadModifier( @@ -91,6 +91,13 @@ inline void setup_tuning_parameters(const kernel_context& kernel) [] (const std::vector& values) { return values[0] == 2 || values[1] == 2; }); + + tuner.AddConstraint( + kernel.kernel_id, + { "BLOCK_SIZE", "STRIPING_FACTOR" }, + [] (const std::vector& values) { + return values[0]/values[1] >= 32; + }); } @@ -205,10 +212,10 @@ auto get_launcher( DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); // START OF HACKS to test striping ---- - auto conf = interface.GetCurrentConfiguration(); - for (const auto& pair : conf.GetPairs()) - if (pair.GetName() == "GRID_SIZE" && pair.GetValue() != 0) - grid_size = ::ktt::DimensionVector(pair.GetValue()); + // auto conf = interface.GetCurrentConfiguration(); + // for (const auto& pair : conf.GetPairs()) + // if (pair.GetName() == "GRID_SIZE" && pair.GetValue() != 0) + // grid_size = ::ktt::DimensionVector(pair.GetValue()); // END OF HACKS 
----------------------- if (!profile) { From 2e529a3222f0689b480adbb8edbf0d06f256499d Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 10 May 2023 10:49:57 +0200 Subject: [PATCH 43/49] Fix tuning reseting --- cusp/ktt/detail/ktt.inl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cusp/ktt/detail/ktt.inl b/cusp/ktt/detail/ktt.inl index 9e47d9be..cfe25886 100644 --- a/cusp/ktt/detail/ktt.inl +++ b/cusp/ktt/detail/ktt.inl @@ -137,11 +137,9 @@ void reset_tuning(const MatrixType& A, const cusp::array1d& x, cusp::array1d& y) { - using IndexType = typename MatrixType::index_type; - using ValueType = typename MatrixType::value_type; - using Format = typename MatrixType::format; - - return reset_tuning(A); + auto& tuner = get_tuner(); + const auto& kernel = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + tuner.ClearData(kernel.kernel_id); } From f3157a3bd016cc23e3f6b4d05913bf9895d19a9d Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Fri, 12 May 2023 09:14:53 +0200 Subject: [PATCH 44/49] Set the final values of tuning parameters --- cusp/system/cuda/ktt/dia_multiply.h | 18 +----------------- cusp/system/cuda/ktt/ell_multiply.h | 2 +- cusp/system/cuda/ktt/kernels/dia_kernel.h | 11 ----------- 3 files changed, 2 insertions(+), 29 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index ff1ab7f6..60cee249 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -26,28 +26,12 @@ inline void setup_tuning_parameters(const kernel_context& kernel) auto& tuner = *kernel.tuner; auto kernel_id = kernel.kernel_id; - tuner.AddParameter(kernel_id, "KERNEL_TYPE", u64_vec{ 1, 2 }); - // tuner.AddParameter(kernel_id, "SHARED_PREFETCH_FACTOR", - // u64_vec{ 0, 2, 4, 8 }); + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_FACTOR", u64_vec{ 0, 2, 3, 4 }); tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_TYPE", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "LOAD_TYPE", u64_vec{ 0, 1 }); - tuner.AddParameter(kernel_id, "STRIPING_FACTOR", u64_vec{ 2, 4, 8, 16 }); - tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); - // START OF HACKS to test striping ---- - // tuner.AddParameter(kernel_id, "BLOCKS_PER_SECTOR", u64_vec{ 0 }); - // tuner.AddParameter(kernel_id, "CHUNK_SIZE", u64_vec{ 0 }); - // tuner.AddParameter(kernel_id, "STRIPE_SIZE", u64_vec{ 0 }); - // tuner.AddParameter(kernel_id, "GRID_SIZE", u64_vec{ 0 }); - // tuner.AddParameter(kernel_id, "SECTOR_MAPPING_TYPE", u64_vec{ 0, 1 }); - - // tuner.AddConstraint(kernel.kernel_id, { "SECTOR_MAPPING_TYPE" }, - // [] (const std::vector& values) { - // return values[0] == 0; - // }); - // END OF HACKS ----------------------- tuner.AddThreadModifier( kernel.kernel_id, diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 3981156f..4ea01dc5 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -26,7 +26,7 @@ inline void setup_common_tuning_parameters(const kernel_context& kernel) tuner.AddParameter(kernel_id, "UNCACHED_LOADS", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "DISABLE_UNROLL", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 }); - tuner.AddParameter(kernel_id, "THREADS_PER_ROW", u64_vec{ 1, 2, 4 }); + tuner.AddParameter(kernel_id, "THREADS_PER_ROW", u64_vec{ 1, 2, 4, 8 }); tuner.AddThreadModifier( kernel_id, diff --git 
a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 0b7bc0a3..4a07ac69 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -444,17 +444,6 @@ ktt_dia_vector_kernel(const int num_rows, const ValueType2* x, ValueType3* y) { -#if KERNEL_TYPE == 0 - naive_dia_kernel - (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -#elif KERNEL_TYPE == 1 blocked_offsets_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -#elif KERNEL_TYPE == 2 - striped_dia_kernel - (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -#elif KERNEL_TYPE == 3 - double_striped_dia_kernel - (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -#endif } From c4039d862741605326093e218f5a3967c54b23c4 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Fri, 12 May 2023 13:26:01 +0200 Subject: [PATCH 45/49] Use profiling only conditionally --- cusp/system/cuda/ktt/csr_multiply.h | 2 ++ cusp/system/cuda/ktt/dia_multiply.h | 2 ++ cusp/system/cuda/ktt/ell_multiply.h | 2 ++ 3 files changed, 6 insertions(+) diff --git a/cusp/system/cuda/ktt/csr_multiply.h b/cusp/system/cuda/ktt/csr_multiply.h index 3303fe93..3e117fe6 100644 --- a/cusp/system/cuda/ktt/csr_multiply.h +++ b/cusp/system/cuda/ktt/csr_multiply.h @@ -159,8 +159,10 @@ auto get_launcher(const kernel_context& ctx, if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { +#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); +#endif } }; } diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 60cee249..3c02a8e8 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -205,8 +205,10 @@ auto get_launcher( if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { +#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); +#endif } }; } diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 4ea01dc5..88d5cd3e 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -249,8 +249,10 @@ auto get_launcher(const kernel_context& ctx, if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { +#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); +#endif } }; } From cf135992b3d624144899ebaa77576975ea901210 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Fri, 12 May 2023 13:56:39 +0200 Subject: [PATCH 46/49] Fix dangling reference --- cusp/system/cuda/ktt/csr_multiply.h | 4 +--- cusp/system/cuda/ktt/dia_multiply.h | 2 -- cusp/system/cuda/ktt/ell_multiply.h | 4 +--- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/cusp/system/cuda/ktt/csr_multiply.h b/cusp/system/cuda/ktt/csr_multiply.h index 3e117fe6..d35a2fd8 100644 --- a/cusp/system/cuda/ktt/csr_multiply.h +++ b/cusp/system/cuda/ktt/csr_multiply.h @@ -143,7 +143,7 @@ auto get_launcher(const kernel_context& ctx, cusp::array1d& y, bool profile = false) { - return [&] (::ktt::ComputeInterface& interface) + return [&, profile] (::ktt::ComputeInterface& interface) { auto conf = interface.GetCurrentConfiguration(); @@ -159,10 +159,8 @@ auto get_launcher(const kernel_context& ctx, if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } 
else { -#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); -#endif } }; } diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 3c02a8e8..60cee249 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -205,10 +205,8 @@ auto get_launcher( if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { -#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); -#endif } }; } diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 88d5cd3e..92e31627 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -233,7 +233,7 @@ auto get_launcher(const kernel_context& ctx, cusp::array1d& y, bool profile = false) { - return [&] (::ktt::ComputeInterface& interface) + return [&, profile] (::ktt::ComputeInterface& interface) { const auto& conf = interface.GetCurrentConfiguration(); auto threads_per_row = get_parameter_uint(conf, "THREADS_PER_ROW"); @@ -249,10 +249,8 @@ auto get_launcher(const kernel_context& ctx, if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { -#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); -#endif } }; } From d6561fbfc005640c1308fcdb3bc5a757c07061ac Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 15 May 2023 19:04:10 +0200 Subject: [PATCH 47/49] Add documentation --- cusp/ktt/ellr_matrix.h | 8 +-- cusp/ktt/ktt.h | 112 +++++++++++++++++++++++------------------ 2 files changed, 68 insertions(+), 52 deletions(-) diff --git a/cusp/ktt/ellr_matrix.h b/cusp/ktt/ellr_matrix.h index 7f8d932c..829f5dfc 100644 --- a/cusp/ktt/ellr_matrix.h +++ b/cusp/ktt/ellr_matrix.h @@ -30,7 +30,7 @@ class ellr_matrix : public cusp::ell_matrix */ ellr_matrix(void) {} - /*! Construct an \p ell_matrix with a specific shape, number of nonzero entries, + /*! Construct an \p ellr_matrix with a specific shape, number of nonzero entries, * and maximum number of nonzero entries per row. * * \param num_rows Number of rows. @@ -43,7 +43,7 @@ class ellr_matrix : public cusp::ell_matrix const size_t num_entries, const size_t num_entries_per_row, const size_t alignment = 32); - /*! Construct an \p ell_matrix from another matrix. + /*! Construct an \p ellr_matrix from another matrix. * * \param matrix Another sparse or dense matrix. */ @@ -71,9 +71,9 @@ class ellr_matrix : public cusp::ell_matrix void resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, const size_t num_entries_per_row, const size_t alignment); - /*! Swap the contents of two \p ell_matrix objects. + /*! Swap the contents of two \p ellr_matrix objects. * - * \param matrix Another \p ell_matrix with the same IndexType and ValueType. + * \param matrix Another \p ellr_matrix with the same IndexType and ValueType. */ void swap(ellr_matrix& matrix); diff --git a/cusp/ktt/ktt.h b/cusp/ktt/ktt.h index 88a5d026..37c89de5 100644 --- a/cusp/ktt/ktt.h +++ b/cusp/ktt/ktt.h @@ -15,94 +15,110 @@ inline void enable(); inline ::ktt::Tuner& get_tuner(); + /** * @brief Perform the multiplication y = A*x using an autotuned kernel. + * This function performs a single step of dynamic autotuning. * * @tparam Matrix The type of the sparse matrix to use. - * @tparam ValueType1 The value type of the x vector. - * @tparam ValueType2 The value type of the y vector. 
+ * @tparam IndexType The type used for indices. + * @tparam ValueType1 The value type of the matrix. + * @tparam ValueType2 The value type of the x vector. + * @tparam ValueType3 The value type of the y vector. * * @param A The matrix. * @param x The input vector. * @param y The output vector. */ -template typename Matrix, + typename IndexType, typename ValueType1, - typename ValueType2> -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y); + typename ValueType2, + typename ValueType3> +::ktt::KernelResult multiply( + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y); + -// NOTE: Should we have this? /** * @brief Perform the multiplication y = A*x without autotuning, * instead just run the kernel in the given configuration. * * @tparam Matrix The type of the sparse matrix to use. - * @tparam ValueType1 The value type of the x vector. - * @tparam ValueType2 The value type of the y vector. + * @tparam IndexType The type used for indices. + * @tparam ValueType1 The value type of the matrix. + * @tparam ValueType2 The value type of the x vector. + * @tparam ValueType3 The value type of the y vector. * - * @param A The matrix. - * @param x The input vector. - * @param y The output vector. + * @param A The matrix. + * @param x The input vector. + * @param y The output vector. + * @param configuration The configuration of the kernel. + * @param run_with_profiling Flag used to toggle profiling. */ -template -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& configuration, - bool run_with_profiling = false); - template typename Matrix, typename IndexType, typename ValueType1, typename ValueType2, typename ValueType3> -std::vector<::ktt::KernelResult> -tune(const Matrix& A, +::ktt::KernelResult multiply( + const Matrix& A, const cusp::array1d& x, cusp::array1d& y, - std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt); + const ::ktt::KernelConfiguration& configuration, + bool run_with_profiling = false); + /** - * TODO: Add some documentation. + * @brief Perform offline autotuning on the specified matrix and vectors. * - * @brief + * @tparam Matrix The type of the sparse matrix to use. + * @tparam IndexType The type used for indices. + * @tparam ValueType1 The value type of the matrix. + * @tparam ValueType2 The value type of the x vector. + * @tparam ValueType3 The value type of the y vector. * - * @tparam IndexType - * @tparam ValueType1 - * @tparam ValueType2 - * @tparam ValueType3 - * @tparam Format + * @param A The matrix. + * @param x The input vector. + * @param y The output vector. + * @param reference_computation The reference computation that computes + the correct result. */ -template typename Matrix, + typename IndexType, typename ValueType1, typename ValueType2, - typename ValueType3, - typename Format> -void reset_tuning(); + typename ValueType3> +std::vector<::ktt::KernelResult> +tune(const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt); + /** - * TODO: Add some documentation. + * @brief Resets the tuning for the kernel instantiation used for the given arguments. * - * @brief + * @tparam MatrixType Type of the sparse matrix. + * @tparam ValueType1 The value type of the x vector + * @tparam ValueType2 The value type of the y vector. + * @tparam MemorySpace1 The memory space of the x vector. 
+ * @tparam MemorySpace2 The memory space of the y vector. * - * @tparam MatrixType - * @tparam ValueType1 - * @tparam ValueType2 - * - * @param A - * @param x - * @param y + * @param A The matrix. + * @param x The input vector. + * @param y The output vector. */ template + typename ValueType2, + typename MemorySpace1, + typename MemorySpace2> void reset_tuning(const MatrixType& A, - const cusp::array1d& x, - cusp::array1d& y); + const cusp::array1d& x, + cusp::array1d& y); + } // namespace ktt From 9cc1988612c8dfca8ed8aacab778ea2b9724fed3 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 15 May 2023 19:05:25 +0200 Subject: [PATCH 48/49] Make last minute changes --- cusp/ktt/detail/ktt.inl | 61 ++-- cusp/system/cuda/ktt/dia_multiply.h | 45 +-- cusp/system/cuda/ktt/ell_multiply.h | 3 +- cusp/system/cuda/ktt/kernel.h | 10 +- cusp/system/cuda/ktt/kernels/dia_kernel.h | 337 +++++----------------- cusp/system/cuda/ktt/kernels/ell_kernel.h | 24 +- cusp/system/cuda/ktt/multiply.h | 33 ++- cusp/system/cuda/ktt/utils.h | 4 +- 8 files changed, 143 insertions(+), 374 deletions(-) diff --git a/cusp/ktt/detail/ktt.inl b/cusp/ktt/detail/ktt.inl index cfe25886..63f97368 100644 --- a/cusp/ktt/detail/ktt.inl +++ b/cusp/ktt/detail/ktt.inl @@ -15,6 +15,7 @@ namespace ktt { namespace detail { + inline std::unique_ptr<::ktt::Tuner> tuner; inline bool is_enabled = true; @@ -41,9 +42,11 @@ inline void lazy_init() CUstream stream; cuStreamCreate(&stream, CU_STREAM_DEFAULT); - ::ktt::ComputeApiInitializer initializer(context, std::vector<::ktt::ComputeQueue>{ stream }); + std::vector<::ktt::ComputeQueue> queues = { stream }; + ::ktt::ComputeApiInitializer initializer(context, queues); - tuner = std::make_unique<::ktt::Tuner>(::ktt::ComputeApi::CUDA, initializer); + tuner = std::make_unique<::ktt::Tuner>(::ktt::ComputeApi::CUDA, + initializer); } std::string compiler_flags = "-std=c++17 "; @@ -51,7 +54,6 @@ inline void lazy_init() compiler_flags += "-lineinfo "; #endif tuner->SetCompilerOptions(compiler_flags); - tuner->SetValidationMode(::ktt::ValidationMode::OfflineTuning); std::atexit(cleanup); @@ -77,25 +79,31 @@ inline ::ktt::Tuner& get_tuner() } -template typename Matrix, + typename IndexType, typename ValueType1, - typename ValueType2> -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y) + typename ValueType2, + typename ValueType3> +::ktt::KernelResult multiply( + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y) { return cusp::system::cuda::ktt::multiply(get_tuner(), A, x, y); } -template typename Matrix, + typename IndexType, typename ValueType1, - typename ValueType2> -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& configuration, - bool run_with_profiling) + typename ValueType2, + typename ValueType3> +::ktt::KernelResult multiply( + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + const ::ktt::KernelConfiguration& configuration, + bool run_with_profiling) { return cusp::system::cuda::ktt::multiply(get_tuner(), A, x, y, configuration, run_with_profiling); } @@ -115,27 +123,14 @@ tune(const Matrix& A, return cusp::system::cuda::ktt::tune(get_tuner(), A, x, y, reference_computation); } - -// template -// void reset_tuning() -// { -// using namespace cusp::system::cuda::ktt; - -// kernel_context kernel = get_kernel(get_tuner(), Format{}); -// detail::tuner->ClearData(kernel.kernel_id); -// } - - template + typename ValueType2, + typename MemorySpace1, + 
typename MemorySpace2> void reset_tuning(const MatrixType& A, - const cusp::array1d& x, - cusp::array1d& y) + const cusp::array1d& x, + cusp::array1d& y) { auto& tuner = get_tuner(); const auto& kernel = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 60cee249..8380b6f2 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -27,10 +27,10 @@ inline void setup_tuning_parameters(const kernel_context& kernel) auto kernel_id = kernel.kernel_id; tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); - tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_FACTOR", + tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 3, 4 }); - tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_TYPE", u64_vec{ 0, 1 }); - tuner.AddParameter(kernel_id, "LOAD_TYPE", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "PREFETCH_TYPE", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "SPECIAL_LOADS", u64_vec{ 0, 1 }); tuner.AddThreadModifier( @@ -43,45 +43,15 @@ inline void setup_tuning_parameters(const kernel_context& kernel) return parameters[0]; }); - // Only one type of prefetching can be applied at once. - tuner.AddConstraint( - kernel.kernel_id, - { "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 0 || values[1] == 0; - }); - - // Prefetching is used only for the blocked offsets kernel. - tuner.AddConstraint( - kernel.kernel_id, - { "KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 1 || (values[1] == 0 && values[2] == 0); - }); // Different register prefetching implementations are used // only when register prefetching is actually enabled tuner.AddConstraint( kernel.kernel_id, - { "REGISTER_PREFETCH_FACTOR", "REGISTER_PREFETCH_TYPE" }, + { "PREFETCH_FACTOR", "PREFETCH_TYPE" }, [] (const std::vector& values) { return values[0] > 0 || values[1] == 0; }); - - // Don't try different striping factors for non-striped kernels. 
- tuner.AddConstraint( - kernel.kernel_id, - { "KERNEL_TYPE", "STRIPING_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 2 || values[1] == 2; - }); - - tuner.AddConstraint( - kernel.kernel_id, - { "BLOCK_SIZE", "STRIPING_FACTOR" }, - [] (const std::vector& values) { - return values[0]/values[1] >= 32; - }); } @@ -195,13 +165,6 @@ auto get_launcher( ::ktt::DimensionVector grid_size( DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); - // START OF HACKS to test striping ---- - // auto conf = interface.GetCurrentConfiguration(); - // for (const auto& pair : conf.GetPairs()) - // if (pair.GetName() == "GRID_SIZE" && pair.GetValue() != 0) - // grid_size = ::ktt::DimensionVector(pair.GetValue()); - // END OF HACKS ----------------------- - if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 92e31627..d4ff4703 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -23,7 +23,7 @@ inline void setup_common_tuning_parameters(const kernel_context& kernel) auto kernel_id = kernel.kernel_id; tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); - tuner.AddParameter(kernel_id, "UNCACHED_LOADS", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "SPECIAL_LOADS", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "DISABLE_UNROLL", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 }); tuner.AddParameter(kernel_id, "THREADS_PER_ROW", u64_vec{ 1, 2, 4, 8 }); @@ -58,7 +58,6 @@ inline void setup_ell_tuning_parameters(const kernel_context& kernel) tuner.AddParameter(kernel_id, "BREAK", u64_vec{ 0, 1 }); // BREAK can be used only when there is no prefetching - // TODO(KTT): Implement prefetching when using early break and remove this tuner.AddConstraint( kernel.kernel_id, { "BREAK", "PREFETCH_FACTOR" }, diff --git a/cusp/system/cuda/ktt/kernel.h b/cusp/system/cuda/ktt/kernel.h index a34c8b16..4061c558 100644 --- a/cusp/system/cuda/ktt/kernel.h +++ b/cusp/system/cuda/ktt/kernel.h @@ -16,15 +16,17 @@ namespace ktt { struct kernel_context { ::ktt::Tuner* tuner = nullptr; ::ktt::KernelId kernel_id = ::ktt::InvalidKernelId; - + // This is a vector to allow for composite kernels. 
std::vector<::ktt::KernelDefinitionId> definition_ids; kernel_context(::ktt::Tuner& tuner) : tuner(&tuner) { } - - kernel_context(::ktt::Tuner& tuner, const std::vector<::ktt::KernelDefinitionId>& definition_ids, ::ktt::KernelId kernel_id) + + kernel_context(::ktt::Tuner& tuner, + const std::vector<::ktt::KernelDefinitionId>& definition_ids, + ::ktt::KernelId kernel_id) : tuner(&tuner), - definition_ids(definition_ids), + definition_ids(definition_ids), kernel_id(kernel_id) { } }; diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 4a07ac69..9c64f6e1 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -1,14 +1,3 @@ -#pragma once - - -#if REGISTER_PREFETCH_FACTOR > 0 - #define PREFETCH_FACTOR REGISTER_PREFETCH_FACTOR -#elif SHARED_PREFETCH_FACTOR > 0 - #define PREFETCH_FACTOR SHARED_PREFETCH_FACTOR -#else - #define PREFETCH_FACTOR 0 -#endif - template __device__ T min(T a, T b) { @@ -24,11 +13,12 @@ __device__ T max(T a, T b) { template __device__ T load_diag_val(const T* val) { -#if LOAD_TYPE == 0 +#if SPECIAL_LOADS == 0 return *val; -#elif LOAD_TYPE == 1 +#elif SPECIAL_LOADS == 1 // Cache streaming, likely to be accessed once. - // The ld.cs load cached streaming operation allocates global lines with evict-first policy in L1 and L2 + // The ld.cs load cached streaming operation allocates global lines with + // evict-first policy in L1 and L2 // to limit cache pollution by temporary streaming data return __ldcs(val); #endif @@ -37,9 +27,9 @@ __device__ T load_diag_val(const T* val) template __device__ T load_x_val(const T* val) { -#if LOAD_TYPE == 0 +#if SPECIAL_LOADS == 0 return *val; -#else +#elif SPECIAL_LOADS == 1 // Needs compute capability at least 3.5 return __ldg(val); #endif @@ -58,14 +48,14 @@ struct UnrolledLoop : UnrolledLoop ValueType1 diag_val = 0; ValueType2 x_val = 0; - __device__ void prefetch(const IndexType* offsets, - const ValueType1* values, - const ValueType2* x, - int pitch, - int cols, - IndexType row, - IndexType offset_base, - IndexType i) + __device__ void prefetch_vals(const IndexType* offsets, + const ValueType1* values, + const ValueType2* x, + int pitch, + int cols, + IndexType row, + IndexType offset_base, + IndexType i) { col = offsets[i] + row; @@ -75,13 +65,15 @@ struct UnrolledLoop : UnrolledLoop x_val = load_x_val(&x[col]); } - parent::prefetch(offsets, values, x, pitch, cols, row, offset_base, i + 1); + parent::prefetch_vals(offsets, values, x, pitch, cols, row, + offset_base, i + 1); } template - __device__ ValueType3 do_iter() + __device__ ValueType3 accumulate() { - return static_cast(diag_val*x_val) + parent::template do_iter(); + return static_cast(diag_val*x_val) + + parent::template accumulate(); } }; @@ -90,47 +82,48 @@ template struct UnrolledLoop { - __device__ void prefetch(const IndexType* offsets, - const ValueType1* values, - const ValueType2* x, - int pitch, - int cols, - IndexType row, - IndexType offset_base, - IndexType i) {} + __device__ void prefetch_vals(const IndexType* offsets, + const ValueType1* values, + const ValueType2* x, + int pitch, + int cols, + IndexType row, + IndexType offset_base, + IndexType i) {} template - __device__ ValueType3 do_iter() { return 0; } + __device__ ValueType3 accumulate() { return 0; } }; -#define PREFETCH_LOAD_ITERATION(iter) \ - IndexType col##iter = offsets[i + iter - 1] + thread_id; \ +#define PREFETCH_VALS_ITERATION(iter) \ + IndexType col##iter = offsets[i + iter - 1] + row; \ 
ValueType1 diag_val##iter = 0; \ ValueType2 x_val##iter = 0; \ if (col##iter >= 0 && col##iter < num_cols) \ { \ - IndexType idx = (offset_base + i + iter - 1)*pitch + thread_id; \ + IndexType idx = (offset_base + i + iter - 1)*pitch + row ; \ diag_val##iter = load_diag_val(&values[idx]); \ x_val##iter = load_x_val(&x[col##iter]); \ } -#define PREFETCH_ACCUM_ITERATION(iter) \ +#define ACCUMULATE_ITERATION(iter) \ sum += diag_val##iter * x_val##iter; -#define PREFETCH1(ITER) ITER(1); -#define PREFETCH2(ITER) ITER(2); PREFETCH1(ITER) -#define PREFETCH3(ITER) ITER(3); PREFETCH2(ITER) -#define PREFETCH4(ITER) ITER(4); PREFETCH3(ITER) -#define PREFETCH5(ITER) ITER(5); PREFETCH4(ITER) -#define PREFETCH6(ITER) ITER(6); PREFETCH5(ITER) -#define PREFETCH7(ITER) ITER(7); PREFETCH6(ITER) -#define PREFETCH8(ITER) ITER(8); PREFETCH7(ITER) +#define REPEAT1(ITER) ITER(1); +#define REPEAT2(ITER) ITER(2); REPEAT1(ITER) +#define REPEAT3(ITER) ITER(3); REPEAT2(ITER) +#define REPEAT4(ITER) ITER(4); REPEAT3(ITER) +#define REPEAT5(ITER) ITER(5); REPEAT4(ITER) +#define REPEAT6(ITER) ITER(6); REPEAT5(ITER) +#define REPEAT7(ITER) ITER(7); REPEAT6(ITER) +#define REPEAT8(ITER) ITER(8); REPEAT7(ITER) -#define PREFETCH_IMPL(factor, ITER) PREFETCH##factor(ITER) +#define PREFETCH_VALS_IMPL(factor) REPEAT##factor(PREFETCH_VALS_ITERATION) +#define ACCUMULATE_IMPL(factor) REPEAT##factor(ACCUMULATE_ITERATION) -#define PREFETCH_LOAD_VALS(factor) PREFETCH_IMPL(factor, PREFETCH_LOAD_ITERATION) -#define PREFETCH_ACCUMULATE(factor) PREFETCH_IMPL(factor, PREFETCH_ACCUM_ITERATION) +#define PREFETCH_VALS(factor) PREFETCH_VALS_IMPL(factor) +#define ACCUMULATE(factor) ACCUMULATE_IMPL(factor) template 0 - __shared__ ValueType1 values_prefetched[SHARED_PREFETCH_FACTOR*BLOCK_SIZE]; - __shared__ ValueType2 x_prefetched[SHARED_PREFETCH_FACTOR*BLOCK_SIZE]; -#endif + const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x; __shared__ IndexType offsets[BLOCK_SIZE]; ValueType3 sum = ValueType3(0); @@ -168,218 +156,33 @@ blocked_offsets_dia_kernel(const int num_rows, ? 
num_diagonals - offset_base : BLOCK_SIZE; - if (thread_id < num_rows) + if (row < num_rows) { IndexType i = 0; #if PREFETCH_FACTOR > 0 for (; i < batch_size - PREFETCH_FACTOR + 1; i += PREFETCH_FACTOR) { -#if REGISTER_PREFETCH_FACTOR > 0 - -#if REGISTER_PREFETCH_TYPE == 0 - UnrolledLoop loop; - loop.prefetch(offsets, values, x, pitch, num_cols, thread_id, offset_base, i); - sum += loop.template do_iter(); -#elif REGISTER_PREFETCH_TYPE == 1 - PREFETCH_LOAD_VALS(PREFETCH_FACTOR); - PREFETCH_ACCUMULATE(PREFETCH_FACTOR); -#endif // REGISTER_PREFETCH_TYPE == 1 - -#elif SHARED_PREFETCH_FACTOR > 0 - for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) - { - IndexType col = offsets[i + j] + thread_id; - if (col >= 0 && col < num_cols) - { - values_prefetched[j*BLOCK_SIZE + threadIdx.x] = - load_diag_val(&values[ (offset_base + i + j)*pitch + thread_id ]);; - x_prefetched[j*BLOCK_SIZE + threadIdx.x] = - load_x_val(&x[col]); - } - else - { - values_prefetched[j*BLOCK_SIZE + threadIdx.x] = ValueType1(0); - x_prefetched[j*BLOCK_SIZE + threadIdx.x] = ValueType2(0); - } - } - - for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) - { - sum += values_prefetched[j*BLOCK_SIZE + threadIdx.x] * x_prefetched[j*BLOCK_SIZE + threadIdx.x]; - } -#endif // SHARED_PREFETCH_FACTOR > 0 - } -#endif // PREFETCH_FACTOR > 0 - - for (; i < batch_size; ++i) - { - IndexType col = offsets[i] + thread_id; - if (col >= 0 && col < num_cols) - { - auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + thread_id ]); - auto x_val = load_x_val(&x[col]); - - sum += diag_val*x_val; - } - } - } - - __syncthreads(); - } - - if (thread_id < num_rows) - y[thread_id] = sum; -} - - -template -__device__ void -striped_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) -{ -#ifdef STRIPING_FACTOR - const IndexType num_groups = STRIPING_FACTOR; -#else - const IndexType num_groups = 1; +#if PREFETCH_TYPE == 0 + UnrolledLoop loop; + loop.prefetch_vals(offsets, values, x, pitch, num_cols, + row, offset_base, i); + sum += loop.template accumulate(); +#elif PREFETCH_TYPE == 1 + PREFETCH_VALS(PREFETCH_FACTOR); + ACCUMULATE(PREFETCH_FACTOR); #endif - - const IndexType group_size = BLOCK_SIZE/num_groups; - const IndexType stride = num_rows/num_groups; - - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - - const IndexType group_id = threadIdx.x / group_size; - const IndexType group_lane_id = threadIdx.x % group_size; - - const IndexType row = group_id*stride + blockIdx.x*group_size - + group_lane_id; - - // printf("(%d, %d, %d)-> %d\n", blockIdx.x, threadIdx.x, thread_id, row); - - __shared__ IndexType offsets[BLOCK_SIZE]; - ValueType3 sum = ValueType3(0); - - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) - { - if (offset_base + threadIdx.x < num_diagonals) - { - offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; - } - - __syncthreads(); - - int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; - - if (row < num_rows) - { - for (IndexType i = 0; i < batch_size; ++i) - { - IndexType col = offsets[i] + row; - if (col >= 0 && col < num_cols) - { - auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); - auto x_val = load_x_val(&x[col]); - - sum += diag_val*x_val; - } } - } - - __syncthreads(); - } - - if (row < num_rows) - y[row] = sum; -} - - -template -__device__ void -double_striped_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) -{ -#if KERNEL_TYPE == 3 - IndexType blocks_per_sector = BLOCKS_PER_SECTOR; - IndexType stripe_size = STRIPE_SIZE; - IndexType chunk_size = CHUNK_SIZE; -#else - IndexType blocks_per_sector = 0; - IndexType stripe_size = 0; - IndexType chunk_size = 0; - assert(false); #endif -#if SECTOR_MAPPING_TYPE == 0 - IndexType sector_id = blockIdx.x / blocks_per_sector; - IndexType sector_lane_id = blockIdx.x % blocks_per_sector; - - IndexType stripe_id = threadIdx.x / (chunk_size*32); - IndexType stripe_lane_id = threadIdx.x % (chunk_size*32); - - IndexType row = sector_id*blocks_per_sector*BLOCK_SIZE - + stripe_id*stripe_size - + sector_lane_id*chunk_size*32 - + stripe_lane_id; -#elif SECTOR_MAPPING_TYPE == 1 - IndexType num_sectors = gridDim.x / blocks_per_sector; - - IndexType sector_id = blockIdx.x % num_sectors; - IndexType sector_lane_id = blockIdx.x / num_sectors; - - IndexType stripe_id = threadIdx.x / (chunk_size*32); - IndexType stripe_lane_id = threadIdx.x % (chunk_size*32); - - IndexType row = sector_id*blocks_per_sector*BLOCK_SIZE - + stripe_id*stripe_size - + sector_lane_id*chunk_size*32 - + stripe_lane_id; -#endif - - IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - - //printf("(%d, %d, %d) -> %d\n", blockIdx.x, threadIdx.x, thread_id, row); - - __shared__ IndexType offsets[BLOCK_SIZE]; - ValueType3 sum = ValueType3(0); - - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) - { - if (offset_base + threadIdx.x < num_diagonals) - { - offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; - } - - __syncthreads(); - - int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; - - if (row < num_rows) - { - for (IndexType i = 0; i < batch_size; ++i) + for (; i < batch_size; ++i) { IndexType col = offsets[i] + row; if (col >= 0 && col < num_cols) { - auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); + IndexType idx = (offset_base + i)*pitch + row; + + auto diag_val = load_diag_val(&values[idx]); auto x_val = load_x_val(&x[col]); sum += diag_val*x_val; @@ -404,10 +207,10 @@ naive_dia_kernel(const int num_rows, const int num_cols, const int num_diagonals, const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) + const IndexType* __restrict__ diagonal_offsets, + const ValueType1* __restrict__ values, + const ValueType2* __restrict__ x, + ValueType3* __restrict__ y) { const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; @@ -439,10 +242,10 @@ ktt_dia_vector_kernel(const int num_rows, const int num_cols, const int num_diagonals, const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) + const IndexType* __restrict__ diagonal_offsets, + const ValueType1* __restrict__ values, + const ValueType2 *__restrict__ x, + ValueType3* __restrict__ y) { blocked_offsets_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h index 3d89bd60..f71431a9 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -9,14 +9,10 @@ template __device__ T load(const T* addr) { -#if UNCACHED_LOADS == 0 +#if SPECIAL_LOADS == 0 return *addr; -#elif UNCACHED_LOADS == 1 +#elif SPECIAL_LOADS == 1 return __ldcv(addr); -#elif UNCACHED_LOADS == 2 - return __ldcs(addr); -#elif UNCACHED_LOADS == 3 - return __ldlu(addr); #endif } @@ -35,30 +31,30 @@ struct Prefetcher : Prefetcher __device__ void prefetch_cols(const IndexType* __restrict__ Aj, IndexType offset, - IndexType pitch, + IndexType stride, IndexType i) { - col = load(Aj + offset + i*pitch); - parent::prefetch_cols(Aj, offset, pitch, i + 1); + col = load(Aj + offset + i*stride); + parent::prefetch_cols(Aj, offset, stride, i + 1); } __device__ void prefetch_vals(const ValueType* __restrict__ Ax, const ValueType* __restrict__ x, IndexType offset, - IndexType pitch, + IndexType stride, IndexType i) { #ifndef ELLR if (col >= 0) { #endif - A_val = load(Ax + offset + i*pitch); + A_val = load(Ax + offset + i*stride); x_val = x[col]; #ifndef ELLR } #endif - parent::prefetch_vals(Ax, x, offset, pitch, i + 1); + parent::prefetch_vals(Ax, x, offset, stride, i + 1); } __device__ ValueType accumulate_results() @@ -73,12 +69,12 @@ struct Prefetcher { __device__ void prefetch_cols(const IndexType* __restrict__ Aj, IndexType offset, - IndexType pitch, IndexType i) { } + IndexType stride, IndexType i) { } __device__ void prefetch_vals(const ValueType* __restrict__ Ax, const ValueType* __restrict__ x, IndexType offset, - IndexType pitch, IndexType i) { } + IndexType stride, IndexType i) { } __device__ ValueType accumulate_results() { diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h index 59729596..3f79d14b 100644 --- a/cusp/system/cuda/ktt/multiply.h +++ b/cusp/system/cuda/ktt/multiply.h @@ -23,11 +23,14 @@ namespace ktt { // instead of having to manually specify the types. 
template -kernel_context get_kernel(::ktt::Tuner& tuner, - const MatrixType& A, - const cusp::array1d& x, - cusp::array1d& y) + typename ValueType2, + typename MemorySpace1, + typename MemorySpace2> +kernel_context get_kernel( + ::ktt::Tuner& tuner, + const MatrixType& A, + const cusp::array1d& x, + cusp::array1d& y) { using IndexType = typename MatrixType::index_type; using ValueType = typename MatrixType::value_type; @@ -50,8 +53,11 @@ kernel_context get_kernel(::ktt::Tuner& tuner, template -::ktt::KernelResult multiply(::ktt::Tuner& tuner, const MatrixType& A, - const VectorType1& x, VectorType2& y) +::ktt::KernelResult +multiply(::ktt::Tuner& tuner, + const MatrixType& A, + const VectorType1& x, + VectorType2& y) { const kernel_context& kernel = get_kernel(tuner, A, x, y); @@ -72,8 +78,11 @@ template ::ktt::KernelResult -multiply(::ktt::Tuner& tuner, const MatrixType& A, const VectorType1& x, - VectorType2& y, const ::ktt::KernelConfiguration& configuration, +multiply(::ktt::Tuner& tuner, + const MatrixType& A, + const VectorType1& x, + VectorType2& y, + const ::ktt::KernelConfiguration& configuration, bool run_with_profiling = false) { const kernel_context& kernel = get_kernel(tuner, A, x, y); @@ -95,8 +104,10 @@ template std::vector<::ktt::KernelResult> -tune(::ktt::Tuner& tuner, const MatrixType& A, - const VectorType1& x, VectorType2& y, +tune(::ktt::Tuner& tuner, + const MatrixType& A, + const VectorType1& x, + VectorType2& y, std::optional<::ktt::ReferenceComputation> ref_computation = std::nullopt) { using Format = typename MatrixType::format; diff --git a/cusp/system/cuda/ktt/utils.h b/cusp/system/cuda/ktt/utils.h index a3f4af1e..e773b461 100644 --- a/cusp/system/cuda/ktt/utils.h +++ b/cusp/system/cuda/ktt/utils.h @@ -92,7 +92,7 @@ inline void remove_arguments(const kernel_context& kernel, const std::vector<::k } -uint64_t get_parameter_uint(const ::ktt::KernelConfiguration& conf, +inline uint64_t get_parameter_uint(const ::ktt::KernelConfiguration& conf, const std::string& name) { for (const auto& pair : conf.GetPairs()) @@ -102,7 +102,7 @@ uint64_t get_parameter_uint(const ::ktt::KernelConfiguration& conf, throw std::runtime_error("No paramater with name: " + name); } -double get_parameter_double(const ::ktt::KernelConfiguration& conf, +inline double get_parameter_double(const ::ktt::KernelConfiguration& conf, const std::string& name) { for (const auto& pair : conf.GetPairs()) From b65e416f66d1715067f0facbf53b23ef881d53b9 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 15 May 2023 20:42:19 +0200 Subject: [PATCH 49/49] Remove warnings --- cusp/detail/temporary_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cusp/detail/temporary_array.h b/cusp/detail/temporary_array.h index 08d27908..8dbed6b0 100644 --- a/cusp/detail/temporary_array.h +++ b/cusp/detail/temporary_array.h @@ -28,7 +28,7 @@ #include #if THRUST_VERSION >= 100800 -#define TEMP_HOST_DEVICE_DECORATORS __host__ __device__ +#define TEMP_HOST_DEVICE_DECORATORS __host__ #else #define TEMP_HOST_DEVICE_DECORATORS #endif
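The capture-list fix in patch 46/49 deserves a word of explanation: get_launcher takes the profile flag by value and returns a lambda that the tuner invokes only later, so the original [&] default capture left the closure holding a reference to a parameter that had already been destroyed. A minimal sketch of the bug and of the [&, profile] fix, using hypothetical names rather than the real launcher signature:

#include <functional>
#include <iostream>

// Hypothetical stand-in for get_launcher: the returned callable outlives the
// call that created it, just as the KTT launcher lambda does.
std::function<void()> make_launcher(bool profile)
{
    // Buggy variant: [&] captures the parameter 'profile' by reference, and
    // that reference dangles as soon as make_launcher returns.
    //
    //   return [&] { std::cout << (profile ? "profiling run\n" : "plain run\n"); };

    // Fixed variant, mirroring the [&, profile] capture from patch 46: the
    // long-lived arguments may still be captured by reference, but the flag
    // is copied into the closure.
    return [&, profile] {
        std::cout << (profile ? "profiling run\n" : "plain run\n");
    };
}

int main()
{
    auto launcher = make_launcher(true);
    launcher();  // safe: reads the copied flag, not a dangling reference
    return 0;
}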
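Patch 47/49 documents the public entry points in cusp/ktt/ktt.h; a short usage sketch may still help. The snippet below assumes the build setup from build.sh (CUSP and KTT on the include and link paths) and uses cusp::gallery::poisson5pt only as a convenient way to obtain a test matrix; it is an illustration of the documented API, not code taken from the repository:

#include <cusp/array1d.h>
#include <cusp/dia_matrix.h>
#include <cusp/gallery/poisson.h>
#include <cusp/ktt/ktt.h>

#include <iostream>

int main()
{
    // A small 5-point Poisson matrix in DIA format, stored on the device.
    cusp::dia_matrix<int, float, cusp::device_memory> A;
    cusp::gallery::poisson5pt(A, 256, 256);

    cusp::array1d<float, cusp::device_memory> x(A.num_cols, 1.0f);
    cusp::array1d<float, cusp::device_memory> y(A.num_rows, 0.0f);

    // One step of dynamic autotuning: the tuner picks the next configuration
    // and runs the SpMV kernel with it (the returned ::ktt::KernelResult is
    // ignored here).
    cusp::ktt::multiply(A, x, y);

    // Offline autotuning over the whole configuration space.
    auto results = cusp::ktt::tune(A, x, y);
    std::cout << results.size() << " configurations tried\n";

    return 0;
}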
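The REPEAT / PREFETCH_VALS / ACCUMULATE macros introduced for the DIA kernel in patch 48/49 are easier to read once expanded. Inside the prefetch loop of blocked_offsets_dia_kernel, with PREFETCH_TYPE == 1 and PREFETCH_FACTOR == 2, the pair of calls "PREFETCH_VALS(2); ACCUMULATE(2);" expands roughly to the fragment below (the REPEAT chain emits iteration 2 before iteration 1; stray semicolons and the exact template arguments of the load helpers are omitted). This is only the preprocessed form of code already present in the kernel, shown to make the macro machinery concrete:

// iteration 2
IndexType col2 = offsets[i + 2 - 1] + row;            // i.e. offsets[i + 1] + row
ValueType1 diag_val2 = 0;
ValueType2 x_val2 = 0;
if (col2 >= 0 && col2 < num_cols)
{
    IndexType idx = (offset_base + i + 2 - 1)*pitch + row;
    diag_val2 = load_diag_val(&values[idx]);
    x_val2 = load_x_val(&x[col2]);
}

// iteration 1
IndexType col1 = offsets[i + 1 - 1] + row;            // i.e. offsets[i] + row
ValueType1 diag_val1 = 0;
ValueType2 x_val1 = 0;
if (col1 >= 0 && col1 < num_cols)
{
    IndexType idx = (offset_base + i + 1 - 1)*pitch + row;
    diag_val1 = load_diag_val(&values[idx]);
    x_val1 = load_x_val(&x[col1]);
}

// accumulation
sum += diag_val2 * x_val2;
sum += diag_val1 * x_val1;

The two-phase shape issues all loads for a batch of PREFETCH_FACTOR diagonals before any of the multiply-adds that consume them, which gives the compiler room to overlap the global-memory latency with independent work.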
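Patch 48/49 also marks get_parameter_uint and get_parameter_double in cusp/system/cuda/ktt/utils.h as inline. Because these are non-template functions defined in a header of a header-only library, every translation unit that includes utils.h would otherwise emit its own external definition, and linking two such units fails with multiple-definition errors; inline permits the duplicate definitions and lets the linker merge them. A stripped-down illustration with a hypothetical header and function:

// util.h (hypothetical)
#pragma once

// Without 'inline' this is an ordinary external definition: every source file
// that includes util.h emits its own copy of the symbol and the link fails
// with "multiple definition of answer()".
// With 'inline' the duplicate definitions are permitted and merged.
inline unsigned answer() { return 42; }

With two translation units, say a.cpp and b.cpp, both including util.h and calling answer(), the program links only in the inline form; the same situation arises as soon as more than one .cu file includes utils.h.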