From 05bd82b2d9f8e634048c20a51c8effe6b4b7b8cd Mon Sep 17 00:00:00 2001 From: Mirec Date: Mon, 26 Sep 2022 15:10:47 +0200 Subject: [PATCH 01/49] Add sketchy files --- build.sh | 2 +- cusp/system/cuda/ktt/dia_multiply.h | 8 +- cusp/system/cuda/ktt/kernels/dia_kernel.h | 357 +++++++---- main.cu | 710 ++++++++++++++++++++++ 4 files changed, 960 insertions(+), 117 deletions(-) create mode 100644 main.cu diff --git a/build.sh b/build.sh index 4f53b52f..1c04f512 100644 --- a/build.sh +++ b/build.sh @@ -2,7 +2,7 @@ nvcc -DCUSP_PATH=$(realpath .) \ -I . -I ../KTT/Source \ -l cuda -l ktt -L ../KTT/Build/x86_64_Debug/ \ --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Debug/) \ - -std=c++17 -g -O3 -DKTT_LINE_INFO -lineinfo main.cu + -std=c++17 -g -O3 -lineinfo main.cu [ "$?" -eq "0" ] || exit 1 diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 6a7f31b2..c685f6ba 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -22,7 +22,7 @@ namespace dia { inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) { - tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1, 2 }); + tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1, 2, 3, 4 }); } } // namespace dia @@ -114,9 +114,7 @@ auto get_launcher(const kernel_context& ctx, if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { - do { - interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); - } while(interface.GetRemainingProfilingRuns(ctx.definition_ids[0]) > 0); + interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); } }; } @@ -166,6 +164,8 @@ ::ktt::KernelResult multiply(::ktt::Tuner& tuner, return result; } + std::cout << "HEllo\n"; + kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); auto args = add_arguments(kernel, A, x, y); diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index df7ade63..fda35831 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -1,5 +1,52 @@ #pragma once +template +__device__ T min(T a, T b) { + return a < b ? a : b; +} + +template +__device__ T max(T a, T b) { + return a > b ? 
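// Note on the KERNEL_TYPE tuning parameter registered in dia_multiply.h above:
// KTT exposes each tuning parameter to the kernel source as a preprocessor
// definition, which is what the #if KERNEL_TYPE dispatch further down in this
// file relies on. Conceptually (a sketch, not literal KTT output), running the
// configuration KERNEL_TYPE = 2 compiles this header as if it started with
//
//     #define KERNEL_TYPE 2
//
// so every configuration tried by the tuner is a separate compilation of the
// same source with a different kernel variant selected.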
a : b; +} + +// template +// struct UnrolledLoop : UnrolledLoop +// { +// using parent = UnrolledLoop; + +// IndexType col; +// ValueType1 diag_val = 0; +// ValueType2 x_val = 0; + +// __device__ void prefetch(const IndexType* offsets, +// const ValueType1* values, +// const ValueType2* x, +// IndexType pitch, +// IndexType row, +// IndexType base, +// IndexType cols) +// { +// if (col >= 0 && col < cols) +// { +// diag_val = values[] +// parent::prefetch(offsets, values, x, row, base + 1); +// } +// } + +// template +// __device__ ValueType3 do_iter(IndexType cols) +// { +// if (col < 0 || col >= cols) +// { +// return 0; +// } +// return diag_val*x_val + parent::do_iter(cols); +// } +// }; template __device__ void -ktt_dia_vector_kernel_simple( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) +naive_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) { using ValueType = ValueType3; @@ -26,9 +72,9 @@ ktt_dia_vector_kernel_simple( for (IndexType i = 0; i < num_diagonals; ++i) { IndexType col = diagonal_offsets[i] + thread_id; if (col >= 0 && col < num_cols) { - auto val = values[i*pitch + thread_id]; - auto a = x[col]; - sum +=val*a; + auto diag_val = values[i*pitch + thread_id]; + auto x_val = x[col]; + sum += diag_val * x_val; } } y[thread_id] = sum; @@ -41,15 +87,14 @@ template __device__ void -ktt_dia_vector_kernel_blocked( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) +blocked_offsets_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) { using ValueType = ValueType3; @@ -71,59 +116,203 @@ ktt_dia_vector_kernel_blocked( for (IndexType i = 0; i < batch_size; ++i) { IndexType col = offsets[i] + thread_id; if (col >= 0 && col < num_cols) { - sum += values[(offset_base + i)*pitch + thread_id]*x[col]; + auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; + auto x_val = __ldg(x + col); + sum += diag_val*x_val; } } - y[thread_id] = sum; } __syncthreads(); } + + if (thread_id < num_rows) { + y[thread_id] = sum; + } } -#define ITERATION(i, offset) \ - col = offset + thread_id; \ - if (col >= 0 && col < num_cols) { \ - sum += values[i*pitch + thread_id]*x[col]; \ +template +__device__ void +cached_x_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) +{ + using ValueType = ValueType3; + + const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + + __shared__ IndexType offsets[BLOCK_SIZE]; + __shared__ ValueType x_cache[BLOCK_SIZE]; + IndexType cache_start = -IndexType(BLOCK_SIZE); + IndexType cache_end = 0; + + IndexType first_row = BLOCK_SIZE * blockIdx.x; + IndexType end_row = min(first_row + BLOCK_SIZE, num_rows); + + ValueType sum = ValueType(0); + + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { + if (offset_base + threadIdx.x < num_diagonals) { + offsets[threadIdx.x] = 
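// For reference, this is the computation all of the kernel variants in this
// file implement. A minimal single-threaded host sketch (dia_spmv_host is a
// hypothetical helper, not part of the patch; the values layout matches
// naive_dia_kernel above, i.e. the element of diagonal i in row r sits at
// values[i*pitch + r]):
//
//     template <typename IndexType, typename ValueType>
//     void dia_spmv_host(IndexType num_rows, IndexType num_cols,
//                        IndexType num_diagonals, IndexType pitch,
//                        const IndexType* diagonal_offsets,
//                        const ValueType* values,
//                        const ValueType* x, ValueType* y)
//     {
//         for (IndexType row = 0; row < num_rows; ++row) {
//             ValueType sum = 0;
//             for (IndexType i = 0; i < num_diagonals; ++i) {
//                 IndexType col = diagonal_offsets[i] + row;
//                 if (col >= 0 && col < num_cols)
//                     sum += values[i*pitch + row] * x[col];
//             }
//             y[row] = sum;
//         }
//     }
//
// The GPU variants differ only in how they stage diagonal_offsets, values and
// x (shared memory, __ldg, cached windows of x), not in the arithmetic.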
diagonal_offsets[offset_base + threadIdx.x]; + } + + __syncthreads(); + + int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; + + if (thread_id < num_rows) + { + for (IndexType i = 0; i < batch_size; ++i) + { + IndexType col = offsets[i] + thread_id; + ValueType x_val = -1; + + if (col >= 0 && col < num_cols) + { + x_val = col < cache_end ? x_cache[col - cache_start] : x[col]; + ValueType diag_val = values[(offset_base + i)*pitch + thread_id]; + sum += x_val * diag_val; + } + + __syncthreads(); + + x_cache[threadIdx.x] = x_val; + cache_start = first_row + offsets[i]; + cache_end = end_row + offsets[i]; + + __syncthreads(); + } + } + + __syncthreads(); } + if (thread_id < num_rows) { + y[thread_id] = sum; + } +} + template __device__ void -ktt_dia_vector_kernel_crazy( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) +experimental_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) { using ValueType = ValueType3; const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + __shared__ IndexType offsets[BLOCK_SIZE]; + __shared__ ValueType x_cache[2*BLOCK_SIZE]; + + IndexType first_row = BLOCK_SIZE * blockIdx.x; + IndexType first_col = offsets[0] + first_row; + + if (0 <= first_col + threadIdx.x && first_col + threadIdx.x < num_cols) { + x_cache[threadIdx.x] = x[first_col + threadIdx.x]; + } + + if (0 <= first_col + BLOCK_SIZE + threadIdx.x && first_col + BLOCK_SIZE + threadIdx.x < num_cols) { + x_cache[BLOCK_SIZE + threadIdx.x] = x[first_col + BLOCK_SIZE + threadIdx.x]; + } + ValueType sum = ValueType(0); + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { + if (offset_base + threadIdx.x < num_diagonals) { + offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; + } + + __syncthreads(); + + int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
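// A small worked example of the x-window reuse in cached_x_dia_kernel above
// (numbers are illustrative only). With BLOCK_SIZE = 128 and blockIdx.x = 2 the
// block covers rows [256, 384). A diagonal with offset -3 makes its threads
// read columns 253..380, i.e. exactly the window
// [first_row + offset, end_row + offset) = [253, 381). The first diagonal is
// read from global memory (the cache starts out empty), but afterwards every
// thread stores the x value it just used and cache_start/cache_end are moved to
// that window, so the next diagonal (offset -2, columns 254..381) finds all but
// its last element already in shared memory. The closer consecutive offsets are
// to each other, the larger the reused fraction of the window.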
num_diagonals - offset_base : BLOCK_SIZE; + + if (thread_id < num_rows) + { + for (IndexType i = 0; i < batch_size; ++i) + { + IndexType col = offsets[i] + thread_id; + + if (col >= 0 && col < num_cols) + { + ValueType x_val = x_cache[col - first_col]; + ValueType diag_val = values[(offset_base + i)*pitch + thread_id]; + sum += x_val * diag_val; + } + } + } + + __syncthreads(); + } + if (thread_id < num_rows) { - IndexType col; - ITERATION(0, -4); - ITERATION(1, -3); - ITERATION(2, -2); - ITERATION(3, -1); - ITERATION(4, -0); - ITERATION(5, -1); - ITERATION(6, -2); - ITERATION(7, -3); - ITERATION(8, -4); - ITERATION(9, -5); y[thread_id] = sum; } +} +template +__device__ void +cached_naive_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) +{ + using ValueType = ValueType3; + + const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + + __shared__ ValueType x_cache[2*BLOCK_SIZE]; + + IndexType first_row = BLOCK_SIZE * blockIdx.x; + IndexType first_col = diagonal_offsets[0] + first_row; + + if (0 <= first_col + threadIdx.x && first_col + threadIdx.x < num_cols) { + x_cache[threadIdx.x] = x[first_col + threadIdx.x]; + } + + if (0 <= first_col + BLOCK_SIZE + threadIdx.x && first_col + BLOCK_SIZE + threadIdx.x < num_cols) { + x_cache[BLOCK_SIZE + threadIdx.x] = x[first_col + BLOCK_SIZE + threadIdx.x]; + } + + + if (thread_id < num_rows) { + ValueType sum = ValueType(0); + for (IndexType i = 0; i < num_diagonals; ++i) { + IndexType col = diagonal_offsets[i] + thread_id; + if (col >= 0 && col < num_cols) { + auto diag_val = values[i*pitch + thread_id]; + auto x_val = x_cache[col - first_col]; + sum += diag_val * x_val; + } + } + y[thread_id] = sum; + } } template + naive_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 1 - ktt_dia_vector_kernel_blocked + blocked_offsets_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 2 - ktt_dia_vector_kernel_crazy + cached_x_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); +#elif KERNEL_TYPE == 3 + experimental_dia_kernel + (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); +// #elif KERNEL_TYPE == 4 +// cached_naive_dia_kernel +// (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #endif } - -// template -// __launch_bounds__(BLOCK_SIZE,1) __global__ void -// ktt_dia_vector_kernel( -// const int num_rows, -// const int num_cols, -// const int num_diagonals, -// const int pitch, -// const IndexType* diagonal_offsets, -// const ValueType1* values, -// const ValueType2* x, -// ValueType3* y) -// { -// typedef ValueType1 ValueType; - -// __shared__ IndexType offsets[BLOCK_SIZE]; - -// const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; -// const IndexType grid_size = BLOCK_SIZE * gridDim.x; - -// for(IndexType base = 0; base < num_diagonals; base += BLOCK_SIZE) -// { -// // read a chunk of the diagonal offsets into shared memory -// const IndexType chunk_size = IndexType(BLOCK_SIZE) < num_diagonals - base ? 
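// A back-of-the-envelope check on the two fixed-window variants above
// (experimental_dia_kernel and cached_x_naive_dia_kernel), not a proof: both
// keep x_cache[0 .. 2*BLOCK_SIZE) anchored at
// first_col = diagonal_offsets[0] + first_row and index it with
// col - first_col = (diagonal_offsets[i] - diagonal_offsets[0]) + threadIdx.x.
// Assuming the offsets are stored in ascending order (as the conversion code in
// main.cu produces them), the access stays inside the window only when
// diagonal_offsets[i] - diagonal_offsets[0] <= BLOCK_SIZE, i.e. the whole
// offset range has to fit into roughly one block-sized span. Narrow-band
// matrices satisfy this; a Poisson-style stencil whose outermost offsets are
// plus or minus a grid-plane size does not, which is presumably why one of
// these variants is still commented out in the ktt_dia_vector_kernel
// dispatcher above. (Note also that experimental_dia_kernel computes first_col
// from the shared offsets array before that array has been filled, so it
// appears to read uninitialized shared memory; cached_x_naive_dia_kernel
// avoids this by using diagonal_offsets[0] directly.)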
IndexType(BLOCK_SIZE) : num_diagonals - base; - -// if(threadIdx.x < chunk_size) -// offsets[threadIdx.x] = diagonal_offsets[base + threadIdx.x]; - -// __syncthreads(); - -// // process chunk -// for(IndexType row = thread_id; row < num_rows; row += grid_size) -// { -// ValueType sum = (base == 0) ? ValueType(0) : ValueType(0); - -// // index into values array -// IndexType idx = row + pitch * base; - -// for(IndexType n = 0; n < chunk_size; n++) -// { -// const IndexType col = row + offsets[n]; - -// if(col >= 0 && col < num_cols) -// { -// const ValueType A_ij = values[idx]; -// sum = sum + (A_ij * x[col]); -// } - -// idx += pitch; -// } - -// y[row] = sum; -// } - -// // wait until all threads are done reading offsets -// __syncthreads(); -// } -// } \ No newline at end of file diff --git a/main.cu b/main.cu new file mode 100644 index 00000000..f7edd966 --- /dev/null +++ b/main.cu @@ -0,0 +1,710 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "better_convert.hpp" + +namespace krn = std::chrono; + + +struct Timer +{ + Timer() : start_(krn::steady_clock::now()) { } + + void restart() { start_ = krn::steady_clock::now(); } + + void print_elapsed(const std::string& label) + { + auto duration = krn::duration_cast(krn::steady_clock::now() - start_).count(); + std::cout << label << ": " << duration << "\n"; + } + + void checkpoint(const std::string& label) + { + print_elapsed(label); + restart(); + } + +private: + krn::steady_clock::time_point start_; + +}; + + +struct ConversionTimer +{ + cusp::dia_matrix m; + + ConversionTimer() { + m = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 512); + } + + ConversionTimer(const ConversionTimer& o) { + std::cout << "COPY!\n"; + } + + template + void operator()() + { + using Format1 = typename Matrix1::format; + using Format2 = typename Matrix2::format; + + Matrix1 m1 = m; + + auto start = krn::steady_clock::now(); + Matrix2 m2 = m1; + auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + + std::cout << NAMEOF_TYPE(Format1) << " to " << NAMEOF_TYPE(Format2) << ": " << duration << "\n"; + } +}; + + +template +struct type_list +{ + static constexpr bool is_empty = true; +}; + +template< + typename First, + typename... 
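// Typical use of the Timer helper defined above (a sketch; the step names are
// arbitrary):
//
//     Timer timer;
//     run_first_step();
//     timer.checkpoint("first step");     // prints the elapsed time, then restarts
//     run_second_step();
//     timer.print_elapsed("second step"); // prints without restarting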
Args +> +struct type_list +{ + using head = First; + using tail = type_list; + + static constexpr bool is_empty = false; +}; + + +template< + typename T, + typename List, + typename Op +> +void for_each_type(Op&& op) +{ + if constexpr (!List::is_empty) + { + if (!std::is_same_v) + { + op.template operator()(); + } + for_each_type(std::forward(op)); + } +} + +template +void for_each_pair(Op&& op) +{ + if constexpr (!List1::is_empty && !List2::is_empty) + { + for_each_type(std::forward(op)); + for_each_pair(std::forward(op)); + } +} + +template +void for_each_pair(Op&& op) +{ + for_each_pair(std::forward(op)); +} + +template +using format_list = type_list< + cusp::dia_matrix, + cusp::csr_matrix, + cusp::coo_matrix, + cusp::ell_matrix +>; + + +void print_ktt_status(const ::ktt::ResultStatus& status) { + switch (status) + { + case ::ktt::ResultStatus::Ok: + std::cout << "OK\n"; + break; + case ::ktt::ResultStatus::CompilationFailed: + std::cout << "COMPILATION FAILED\n"; + break; + case ::ktt::ResultStatus::ComputationFailed: + std::cout << "COMPUTATION FAILED\n"; + break; + case ::ktt::ResultStatus::DeviceLimitsExceeded: + std::cout << "DEVICE LIMIT EXCEEDED\n"; + break; + case ::ktt::ResultStatus::ValidationFailed: + std::cout << "VALIDATION FAILED\n"; + break; + } +} + +template +cusp::dia_matrix +convert(const cusp::coo_matrix& matrix) { + using DiaMatrix = cusp::dia_matrix; + + cusp::array1d diag_map(matrix.num_rows + matrix.num_cols, 0); + + auto start = krn::steady_clock::now(); + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + diag_map[diag_idx] = 1; + } + auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + std::cout << "First loop: " << duration << "\n"; + + typename DiaMatrix::diagonal_offsets_array_type offsets; + offsets.reserve(512); // arbitrary number + + start = krn::steady_clock::now(); + int diag_count = 0; + for (int i = 0; i < diag_map.size(); ++i) { + if (diag_map[i] != 0) { + diag_map[i] = diag_count; + offsets.push_back(i - matrix.num_rows + 1); + diag_count++; + } + } + duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + std::cout << "Third loop: " << duration << "\n"; + + + //int pitch = matrix.num_rows % 2 == 0 ? 
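// A quick sanity check of the diagonal indexing used in convert() above
// (illustrative 4x4 case): diag_idx = col - row + num_rows - 1 maps the main
// diagonal (col == row) to 3, the top-right entry (0,3) to 6 and the
// bottom-left entry (3,0) to 0, so the num_rows + num_cols - 1 = 7 possible
// diagonals occupy slots 0..6 of diag_map, and the stored offset
// i - num_rows + 1 recovers the usual -3..+3 range.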
matrix.num_rows + 1 : matrix.num_rows; + int pitch = matrix.num_rows + 64/sizeof(float); + typename DiaMatrix::values_array_type values(matrix.num_rows, diag_count, ValueType(0), pitch); + //std::cout << values.pitch << "\n"; + + start = krn::steady_clock::now(); + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + int values_array_idx = diag_map[diag_idx]; + values(matrix.row_indices[i], values_array_idx) = matrix.values[i]; + } + duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + std::cout << "Second loop: " << duration << "\n"; + + DiaMatrix res; + + res.num_rows = matrix.num_rows; + res.num_cols = matrix.num_cols; + res.num_entries = matrix.num_entries; + + res.diagonal_offsets.swap(offsets); + res.values.swap(values); + + return res; +} + +template +cusp::dia_matrix +convert2(const cusp::coo_matrix& matrix) { + using DiaMatrix = cusp::dia_matrix; + + Timer timer; + + cusp::array1d diag_map(matrix.num_rows + matrix.num_cols, 0); + + timer.checkpoint("init map"); + + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + diag_map[diag_idx] = 1; + } + + timer.checkpoint("find diags"); + + typename DiaMatrix::diagonal_offsets_array_type offsets; + offsets.reserve(512); // arbitrary number + + int diag_count = 0; + for (int i = 0; i < diag_map.size(); ++i) { + if (diag_map[i] != 0) { + diag_map[i] = diag_count; + offsets.push_back(i - matrix.num_rows + 1); + diag_count++; + } + } + + timer.checkpoint("fill map"); + + + //int pitch = matrix.num_rows % 2 == 0 ? matrix.num_rows + 1 : matrix.num_rows; + typename DiaMatrix::values_array_type values(diag_count, matrix.num_rows, ValueType(0)); + //std::cout << values.pitch << "\n"; + + timer.checkpoint("init values"); + + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + int values_array_idx = diag_map[diag_idx]; + values(values_array_idx, matrix.row_indices[i]) = matrix.values[i]; + } + + timer.checkpoint("fill values"); + + DiaMatrix res; + + res.num_rows = matrix.num_rows; + res.num_cols = matrix.num_cols; + res.num_entries = matrix.num_entries; + + res.diagonal_offsets.swap(offsets); + res.values.swap(values); + + return res; +} + +template +bool equals(const cusp::dia_matrix& A, + const cusp::dia_matrix& B) +{ + return A.values == B.values && A.diagonal_offsets == B.diagonal_offsets; +} + +std::string size_str(uint64_t bytes) +{ + float size = bytes; + + auto get_str = [](float x, const char* units) + { + std::stringstream s; + s << std::setprecision(3) << std::fixed << x << units; + return s.str(); + }; + + if (size < 1024) + { + return get_str(size, " B"); + } + + size /= 1024; + if (size < 1024) + { + return get_str(size, " KB"); + } + + size /= 1024; + if (size < 1024) + { + return get_str(size, " MB"); + } + + size /= 1024; + return get_str(size, " GB"); +} + +std::string counter_str(const ::ktt::KernelProfilingCounter& counter) +{ + switch (counter.GetType()) { + case ::ktt::ProfilingCounterType::Double: + return std::to_string(counter.GetValueDouble()); + case ::ktt::ProfilingCounterType::Int: + return std::to_string(counter.GetValueInt()); + case ::ktt::ProfilingCounterType::Percent: + return std::to_string(counter.GetValueDouble()) + " %"; + case ::ktt::ProfilingCounterType::Throughput: + return std::to_string(counter.GetValueUint()) + " bytes/sec\n"; + 
case ::ktt::ProfilingCounterType::UnsignedInt: + return std::to_string(counter.GetValueUint()); + case ::ktt::ProfilingCounterType::UtilizationLevel: + return std::to_string(counter.GetValueUint()); + } + + return ""; +} + +size_t dia_matrix_size(size_t num_rows, size_t num_cols, size_t num_diags) +{ + return sizeof(int)*num_diags + num_rows*num_diags*sizeof(float); +} + +template +size_t min_read_bytes(const cusp::dia_matrix& A, + const cusp::array1d& x) +{ + // Assume that each column has at least one diagonal passing through it. + size_t result = x.size() * sizeof(ValueT2); + + for (IndexT offset : A.diagonal_offsets) + { + IndexT start_row = offset >= 0 ? 0 : -offset; + IndexT start_col = offset >= 0 ? offset : 0; + IndexT end_row = std::min(A.num_cols - start_col, A.num_rows - start_row); + + // Assume that global reads are done in 32 byte transactions that must be aligned. + IndexT elems_per_read = 32 / sizeof(ValueT1); + IndexT adjusted_start_row = start_row - (start_row % elems_per_read); + IndexT adjusted_end_row = end_row + elems_per_read - (end_row % elems_per_read); + + result += (adjusted_end_row - adjusted_start_row)*sizeof(ValueT1); + } + + return result; +} + +::ktt::KernelResult profile_kernel(const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + const ::ktt::KernelConfiguration& conf) +{ + ::ktt::KernelResult result; + std::vector<::ktt::KernelResult> results; + do { + result = cusp::ktt::multiply(A, x, y, conf, true); + results.push_back(result); + } while(result.HasRemainingProfilingRuns()); + + auto& tuner = cusp::ktt::get_tuner(); + tuner.SaveResults(results, "results.json", ::ktt::OutputFormat::JSON); + + return result; +} + + +size_t get_actual_read_bytes(const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + const ::ktt::KernelConfiguration& conf) +{ + auto& tuner = cusp::ktt::get_tuner(); + + auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + tuner.SetProfilingCounters({ + "dram_read_bytes", + "dram_read_transactions", + "dram_read_throughput", + "l2_global_load_bytes" + //"l2_global_load_bytes" + }); + tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids); + tuner.SetProfiling(true); + + auto kernel_result = profile_kernel(A, x, y, conf); + + // auto kernel_result = cusp::ktt::multiply(A, x, y, conf, true); + + // if (kernel_result.HasRemainingProfilingRuns()) + // { + // std::cout << "NEED RUNS!\n"; + // } + + for (const auto& computation_result : kernel_result.GetResults()) + { + if (!computation_result.HasProfilingData()) + { + std::cout << "No profiling data!\n"; + continue; + } + + const auto& profiling_data = computation_result.GetProfilingData(); + if (!profiling_data.IsValid()) + { + std::cout << "profiling data invalid\n"; + } + else + { + std::cout << "Transactions: " << profiling_data.GetCounter("dram_read_transactions").GetValueUint() << "\n"; + std::cout << "Throughput: " << profiling_data.GetCounter("dram_read_throughput").GetValueUint() << "\n"; + std::cout << "Bytes: " << profiling_data.GetCounter("l2_global_load_bytes").GetValueUint() << "\n"; + return profiling_data.GetCounter("dram_read_bytes").GetValueUint(); + } + } + + return size_t(-1); +} + +void test_l2() +{ + auto& tuner = cusp::ktt::get_tuner(); + + const size_t max_bytes = 2u*(1u << 30); // 2 GB + const size_t n = (1u << 20)*4; + + for (int d = 32; d < 4096; d *= 2) + { + if (dia_matrix_size(n, n, d) <= max_bytes) + { + std::cout << d << " diags: \n"; + + auto dia_host_matrix = 
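// Worked example for min_read_bytes above, tracing the code with a 1000x1000
// float matrix (illustrative numbers): the x term contributes
// num_cols * sizeof(float) = 4000 bytes. For a diagonal with offset -3,
// start_row = 3, end_row = min(1000, 997) = 997 and elems_per_read = 8, so the
// row range is widened to [0, 1000) by the 32-byte alignment adjustment and the
// diagonal contributes 1000 * 4 = 4000 bytes instead of the unaligned
// 997 * 4 = 3988. The result is therefore a lower bound on DRAM traffic under
// the assumption that x is read exactly once and every diagonal exactly once.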
cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, d); + + cusp::dia_matrix A = dia_host_matrix; + cusp::array1d x(A.num_cols, 1); + cusp::array1d y(A.num_rows); + + auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + + auto expected_bytes = min_read_bytes(A, x); + auto actual_bytes = get_actual_read_bytes(A, x, y, conf); + + std::cout << "Matrix size: " << size_str(dia_matrix_size(n, n, d)) << "\n"; + std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + std::cout << "Actual: " << size_str(actual_bytes) << "\n"; + } + } + + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 11); + + // cusp::dia_matrix A = dia_host_matrix; + // cusp::array1d x(A.num_cols, 1); + // cusp::array1d y(A.num_rows); + + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + // auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + + // auto expected_bytes = min_read_bytes(A, x); + // auto actual_bytes = get_actual_read_bytes(A, x, y, conf); + + // std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + // std::cout << "Actual: " << size_str(actual_bytes) << "\n"; +} + +void test_matrix() +{ + auto& tuner = cusp::ktt::get_tuner(); + + const int n = 8*(1u << 20); + //const int n = (1u << 20)/2; + + //auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 256, 1024); + auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 32); + cusp::dia_matrix A = dia_host_matrix; + + // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); + + cusp::array1d x(A.num_cols, 1); + cusp::array1d y(A.num_rows); + + //auto dia_host_matrix2 = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1024, 11); + //cusp::dia_matrix A2 = dia_host_matrix; + + auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + // tuner.SetProfiling(true); + // tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids); + // tuner.SetProfilingCounters({ + // "dram_read_bytes" + // //"dram_write_bytes", + // //"l2_global_load_bytes" + // }); + + //cusp::ktt::tune(A2, x, y); + //cusp::ktt::tune(A, x, y); + + // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); + // cusp::ktt::multiply(A, x, y, conf1); + + auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; + + //cusp::ktt::multiply(A, x, y, conf2); + std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + + auto expected_bytes = min_read_bytes(A, x); + std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + + + + //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + + // auto conf3 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) } }); + // cusp::ktt::multiply(A, x, y, conf3); +} + +int main(void) +{ + auto& tuner = cusp::ktt::get_tuner(); + tuner.SetTimeUnit(::ktt::TimeUnit::Microseconds); + + //test_l2(); + test_matrix(); + + // auto orig = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 256); + // cusp::coo_matrix A = orig; + // cusp::coo_matrix B = orig; + + // auto start = 
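// Minimal end-to-end use of the pieces above, condensed from test_matrix() and
// test_l2() (a sketch; the matrix shape and the KERNEL_TYPE value are
// arbitrary, and the template arguments are written out the way the rest of
// this file presumably instantiates them):
//
//     auto& tuner = cusp::ktt::get_tuner();
//     auto host = cusp::ktt::make_diagonal_symmetric_matrix(1 << 20, 1 << 20, 1, 32);
//     cusp::dia_matrix<int, float, cusp::device_memory> A = host;
//     cusp::array1d<float, cusp::device_memory> x(A.num_cols, 1);
//     cusp::array1d<float, cusp::device_memory> y(A.num_rows);
//
//     auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y);
//     auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id,
//                     { { std::string("KERNEL_TYPE"), uint64_t(1) } });
//     cusp::ktt::multiply(A, x, y, conf);  // KERNEL_TYPE 1 = blocked offsets kernel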
krn::steady_clock::now(); + // cusp::dia_matrix B1 = convert(B); + // auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "ME host: " << duration << "\n\n"; + + // start = krn::steady_clock::now(); + // cusp::dia_matrix B2 = convert(A); + // duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "ME device: " << duration << "\n\n"; + + // start = krn::steady_clock::now(); + // cusp::dia_matrix B3 = A; + // duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "cusp device: " << duration << "\n\n"; + + // cusp::dia_matrix D = convert(A); + // if (!equals(D, orig)) { + // std::cout << "RIIIIIIIP!!!!\n"; + // } + + // auto start = krn::steady_clock::now(); + // cusp::dia_matrix B1 = convert(A); + // auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "ME: " << duration << "\n\n"; + + // start = krn::steady_clock::now(); + // cusp::dia_matrix B2 = convert2(A); + // duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "Transposed: " << duration << "\n\n"; + + // if (!equals(B1, B2)) { + // std::cout << "RIIIIIIP!!!\n"; + // } + + // using list = format_list; + // ConversionTimer timer; + // for_each_pair(timer); + + // cudaDeviceProp prop; + // cudaError_t err = cudaGetDeviceProperties(&prop, 0); + + // if (err == cudaSuccess) { + // std::cout << "Max warps per sm: " << prop.maxThreadsPerMultiProcessor << "\n"; + // } else { + // std::cout << "Failed to retrieve device properties\n"; + // } + + // { + // const int n = (1u << 20); + // //const int n = (1u << 20)/2; + + // //auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 256, 1024); + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 11); + // cusp::dia_matrix A = dia_host_matrix; + + // // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); + + // cusp::array1d x(A.num_cols, 1); + // cusp::array1d y(A.num_rows); + + // //auto dia_host_matrix2 = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1024, 11); + // //cusp::dia_matrix A2 = dia_host_matrix; + + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + // //cusp::ktt::tune(A2, x, y); + // //cusp::ktt::tune(A, x, y); + + // // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); + // // cusp::ktt::multiply(A, x, y, conf1); + + // auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + // //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; + // cusp::ktt::multiply(A, x, y, conf2); + + // // auto conf3 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) } }); + // // cusp::ktt::multiply(A, x, y, conf3); + // } + + // { + // const int n = 1u << 15; + // auto coo_host_matrix = make_diagonal_symmetric_matrix(n, n, 512, 16); + // cusp::coo_matrix coo_device_matrix = coo_host_matrix; + // cusp::dia_matrix A = coo_host_matrix; + + // cusp::array1d x(coo_host_matrix.num_cols, 1); + // cusp::array1d y_ref(coo_host_matrix.num_rows); + // cusp::array1d y(coo_host_matrix.num_rows); + + // cusp::multiply(coo_device_matrix, x, y_ref); + + // auto& tuner = cusp::ktt::get_tuner(); + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + // // std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, 
kernel_ctx.definition_ids[0]) << "\n"; + + // tuner.SetProfiling(true); + // tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids); + + // tuner.SetProfilingCounters({ + // "tex_cache_hit_rate", + // "tex_cache_transactions", + // "l1_cache_global_hit_rate", + // "l2_l1_read_hit_rate", + // "l2_l1_read_transactions", + // "l2_read_transactions", + // "l2_write_transactions", + // "dram_read_transactions", + // "dram_write_transactions", + // "dram_read_bytes", + // "dram_write_bytes" + // }); + + // auto result = cusp::ktt::multiply(A, x, y, {}, true); + + // for (const auto& res : result.GetResults()) { + // if (!res.HasProfilingData()) { + // std::cout << "No profiling data!\n"; + // continue; + // } + // const auto& prof = res.GetProfilingData(); + // if (!prof.IsValid()) { + // std::cout << "profiling data invalid\n"; + // } else { + // for (const auto& counter : prof.GetCounters()) { + // std::cout << counter.GetName() << " = "; + // switch (counter.GetType()) { + // case ::ktt::ProfilingCounterType::Double: + // std::cout << counter.GetValueDouble() << "\n"; + // break; + // case ::ktt::ProfilingCounterType::Int: + // std::cout << counter.GetValueInt() << "\n"; + // break; + // case ::ktt::ProfilingCounterType::Percent: + // std::cout << counter.GetValueDouble() << " %\n"; + // break; + // case ::ktt::ProfilingCounterType::Throughput: + // std::cout << counter.GetValueUint() << " bytes/sec\n"; + // break; + // case ::ktt::ProfilingCounterType::UnsignedInt: + // std::cout << counter.GetValueUint() << "\n"; + // break; + // case ::ktt::ProfilingCounterType::UtilizationLevel: + // std::cout << counter.GetValueUint() << "\n"; + // break; + // } + // } + // } + // } + + // if (y_ref != y) { + // std::cout << "SECOND DIFF!\n"; + // } + // } + + //cusp::ktt::multiply(A, x, y); + //std::cout << y[0] << "\n"; + + //std::cout << y_ref_host.size() << "\n"; + //std::cout << y.size() << "\n"; + + return 0; +} From 87be93a6094c6c8f7b26b9f7a661ef1dd7d2c95b Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Fri, 30 Sep 2022 14:53:47 +0200 Subject: [PATCH 02/49] Don't include better_convert.hpp --- main.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/main.cu b/main.cu index f7edd966..28f9d63e 100644 --- a/main.cu +++ b/main.cu @@ -13,8 +13,6 @@ #include #include -#include "better_convert.hpp" - namespace krn = std::chrono; From 7d7178b0ef6b76c7bc80f685865a878945cd365b Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Fri, 30 Sep 2022 14:57:33 +0200 Subject: [PATCH 03/49] Implement prefetching using shared memory --- build.sh | 4 +- cusp/system/cuda/ktt/dia_multiply.h | 3 +- cusp/system/cuda/ktt/kernels/dia_kernel.h | 63 ++++++++++++++++++++--- main.cu | 15 ++++-- 4 files changed, 71 insertions(+), 14 deletions(-) diff --git a/build.sh b/build.sh index 1c04f512..9809c63c 100644 --- a/build.sh +++ b/build.sh @@ -1,7 +1,7 @@ nvcc -DCUSP_PATH=$(realpath .) \ -I . -I ../KTT/Source \ - -l cuda -l ktt -L ../KTT/Build/x86_64_Debug/ \ - --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Debug/) \ + -l cuda -l ktt -L ../KTT/Build/x86_64_Release/ \ + --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Release/) \ -std=c++17 -g -O3 -lineinfo main.cu [ "$?" 
-eq "0" ] || exit 1 diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index c685f6ba..659fbef2 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -22,7 +22,8 @@ namespace dia { inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) { - tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1, 2, 3, 4 }); + tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1 }); + tuner.AddParameter(kernel.kernel_id, "PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); } } // namespace dia diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index fda35831..51d87b06 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -100,11 +100,18 @@ blocked_offsets_dia_kernel(const int num_rows, const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; +#if PREFETCH_FACTOR > 0 + __shared__ ValueType1 values_prefetched[PREFETCH_FACTOR*BLOCK_SIZE]; + __shared__ ValueType2 x_prefetched[PREFETCH_FACTOR*BLOCK_SIZE]; +#endif + __shared__ IndexType offsets[BLOCK_SIZE]; ValueType sum = ValueType(0); - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { - if (offset_base + threadIdx.x < num_diagonals) { + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) + { + if (offset_base + threadIdx.x < num_diagonals) + { offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; } @@ -112,21 +119,65 @@ blocked_offsets_dia_kernel(const int num_rows, int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; - if (thread_id < num_rows) { - for (IndexType i = 0; i < batch_size; ++i) { + if (thread_id < num_rows) + { +#if PREFETCH_FACTOR == 0 + for (IndexType i = 0; i < batch_size; ++i) + { IndexType col = offsets[i] + thread_id; - if (col >= 0 && col < num_cols) { + if (col >= 0 && col < num_cols) + { auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; auto x_val = __ldg(x + col); sum += diag_val*x_val; } } +#else + int rest = batch_size % PREFETCH_FACTOR; + int end = batch_size - rest; + + for (IndexType i = 0; i < end; i += PREFETCH_FACTOR) + { + for (int j = 0; j < PREFETCH_FACTOR; ++j) + { + IndexType col = offsets[i + j] + thread_id; + if (col >= 0 && col < num_cols) + { + auto val = values[ (offset_base + i + j)*pitch + thread_id ]; + values_prefetched[j*BLOCK_SIZE + threadIdx.x] = val; + x_prefetched[j*BLOCK_SIZE + threadIdx.x] = __ldg(x + col); + } + else + { + values_prefetched[j*BLOCK_SIZE + threadIdx.x] = ValueType1(0); + x_prefetched[j*BLOCK_SIZE + threadIdx.x] = ValueType2(0); + } + } + + for (int j = 0; j < PREFETCH_FACTOR; ++j) + { + sum += values_prefetched[j*BLOCK_SIZE + threadIdx.x] * x_prefetched[j*BLOCK_SIZE + threadIdx.x]; + } + } + + for (IndexType i = end; i < batch_size; ++i) + { + IndexType col = offsets[i] + thread_id; + if (col >= 0 && col < num_cols) + { + auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; + auto x_val = __ldg(x + col); + sum += diag_val*x_val; + } + } +#endif } __syncthreads(); } - if (thread_id < num_rows) { + if (thread_id < num_rows) + { y[thread_id] = sum; } } diff --git a/main.cu b/main.cu index f7edd966..cd4ade60 100644 --- a/main.cu +++ b/main.cu @@ -515,16 +515,21 @@ void test_matrix() // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); // 
cusp::ktt::multiply(A, x, y, conf1); - auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, + { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; - //cusp::ktt::multiply(A, x, y, conf2); - std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + // cusp::ktt::tune(A, x, y); - auto expected_bytes = min_read_bytes(A, x); - std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + cusp::ktt::multiply(A, x, y, conf2); + //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + //auto expected_bytes = min_read_bytes(A, x); + //std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + //for (int i = 0; i < 10; ++i) { + // std::cout << y[i] << "\n"; + //} //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; From 455504b925bb4fd75dc45d027832ee9c7d2e88e9 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 28 Nov 2022 15:09:52 +0100 Subject: [PATCH 04/49] Fix formatting --- cusp/system/cuda/ktt/kernels/dia_kernel.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 51d87b06..6c25a200 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -67,11 +67,14 @@ naive_dia_kernel(const int num_rows, const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - if (thread_id < num_rows) { + if (thread_id < num_rows) + { ValueType sum = ValueType(0); - for (IndexType i = 0; i < num_diagonals; ++i) { + for (IndexType i = 0; i < num_diagonals; ++i) + { IndexType col = diagonal_offsets[i] + thread_id; - if (col >= 0 && col < num_cols) { + if (col >= 0 && col < num_cols) + { auto diag_val = values[i*pitch + thread_id]; auto x_val = x[col]; sum += diag_val * x_val; @@ -325,7 +328,7 @@ template __device__ void -cached_naive_dia_kernel(const int num_rows, +cached_x_naive_dia_kernel(const int num_rows, const int num_cols, const int num_diagonals, const int pitch, @@ -395,7 +398,7 @@ ktt_dia_vector_kernel( experimental_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); // #elif KERNEL_TYPE == 4 -// cached_naive_dia_kernel +// cached_x_naive_dia_kernel // (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #endif } From c563dcb014cbd6d6d97cf3727edf0684cac59735 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 28 Nov 2022 15:10:21 +0100 Subject: [PATCH 05/49] Remove debug print --- cusp/system/cuda/ktt/dia_multiply.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 659fbef2..d009bfce 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -165,8 +165,6 @@ ::ktt::KernelResult multiply(::ktt::Tuner& tuner, return result; } - std::cout << "HEllo\n"; - kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); auto args = add_arguments(kernel, A, x, y); From da27bd7845213c9d4026fed41d916a9c179cbced Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 28 Nov 2022 15:10:52 +0100 Subject: [PATCH 06/49] Test cache effectiveness --- main.cu | 301 
+++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 220 insertions(+), 81 deletions(-) diff --git a/main.cu b/main.cu index 386802e3..352b3d50 100644 --- a/main.cu +++ b/main.cu @@ -4,11 +4,13 @@ #include #include #include +#include + #include #include #include - +#include #include #include #include @@ -331,16 +333,21 @@ size_t dia_matrix_size(size_t num_rows, size_t num_cols, size_t num_diags) return sizeof(int)*num_diags + num_rows*num_diags*sizeof(float); } +size_t dia_problem_size(size_t num_rows, size_t num_cols, size_t num_diags) +{ + return sizeof(int)*num_diags + + num_rows*num_diags*sizeof(float) + + num_rows*sizeof(float) + + num_cols*sizeof(float); +} + template -size_t min_read_bytes(const cusp::dia_matrix& A, - const cusp::array1d& x) + typename ValueT, + typename MemorySpace> +size_t min_read_bytes(const cusp::dia_matrix& A) { // Assume that each column has at least one diagonal passing through it. - size_t result = x.size() * sizeof(ValueT2); + size_t result = A.num_cols * sizeof(ValueT); for (IndexT offset : A.diagonal_offsets) { @@ -349,62 +356,35 @@ size_t min_read_bytes(const cusp::dia_matrix& A, IndexT end_row = std::min(A.num_cols - start_col, A.num_rows - start_row); // Assume that global reads are done in 32 byte transactions that must be aligned. - IndexT elems_per_read = 32 / sizeof(ValueT1); + IndexT elems_per_read = 32 / sizeof(ValueT); IndexT adjusted_start_row = start_row - (start_row % elems_per_read); IndexT adjusted_end_row = end_row + elems_per_read - (end_row % elems_per_read); - result += (adjusted_end_row - adjusted_start_row)*sizeof(ValueT1); + result += (adjusted_end_row - adjusted_start_row)*sizeof(ValueT); } return result; } -::ktt::KernelResult profile_kernel(const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& conf) -{ - ::ktt::KernelResult result; - std::vector<::ktt::KernelResult> results; - do { - result = cusp::ktt::multiply(A, x, y, conf, true); - results.push_back(result); - } while(result.HasRemainingProfilingRuns()); - - auto& tuner = cusp::ktt::get_tuner(); - tuner.SaveResults(results, "results.json", ::ktt::OutputFormat::JSON); - - return result; -} - - -size_t get_actual_read_bytes(const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& conf) +std::vector<::ktt::KernelProfilingCounter> +profile_multiply(const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + const ::ktt::KernelConfiguration& conf, + const std::vector& counters) { auto& tuner = cusp::ktt::get_tuner(); auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); - tuner.SetProfilingCounters({ - "dram_read_bytes", - "dram_read_transactions", - "dram_read_throughput", - "l2_global_load_bytes" - //"l2_global_load_bytes" - }); + tuner.SetProfilingCounters(counters); tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids); tuner.SetProfiling(true); - auto kernel_result = profile_kernel(A, x, y, conf); - - // auto kernel_result = cusp::ktt::multiply(A, x, y, conf, true); - - // if (kernel_result.HasRemainingProfilingRuns()) - // { - // std::cout << "NEED RUNS!\n"; - // } + ::ktt::KernelResult kernel_result; + do { + kernel_result = cusp::ktt::multiply(A, x, y, conf, true); + } while(kernel_result.HasRemainingProfilingRuns()); for (const auto& computation_result : kernel_result.GetResults()) { @@ -421,46 +401,195 @@ size_t get_actual_read_bytes(const cusp::dia_matrix& A, + const cusp::array1d& x, + 
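// Note on the do/while in profile_multiply above: with profiling enabled a
// single launch generally does not produce every requested hardware counter,
// so the same configuration has to be replayed until
// HasRemainingProfilingRuns() reports that all counter sets were collected
// (the same pattern the earlier profile_kernel helper used before this patch).
// Only the final KernelResult is then inspected for the counters.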
cusp::array1d& y, + const ::ktt::KernelConfiguration& conf) +{ + const auto& counters = profile_multiply(A, x, y, conf, {"dram_read_bytes"}); + + if (counters.empty()) { + throw std::runtime_error("Profiling failed."); + } + + return counters[0].GetValueUint(); +} + +size_t get_actual_read_bytes(::ktt::Tuner& tuner, + const cusp::dia_matrix& host_matrix) +{ + cusp::dia_matrix A = host_matrix; + cusp::array1d x(A.num_cols, 1); + cusp::array1d y(A.num_rows); + + auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, + { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + + return get_actual_read_bytes(tuner, A, x, y, conf); +} + +int get_max_size(int num_diags, size_t global_size) +{ + // Assume we have a square matrix of size n. + // + // On the gpu we need: + // - offsets -> num_diags * sizeof(int) bytes + // - diagonals -> num_diags * n * sizeof(float) bytes + // - x -> n * sizeof(float) bytes + // - y -> n * sizeof(float) bytes + // + // It must hold: + // + // global_size >= num_diags*sizeof(int) + (num_diags + 2)*n*sizeof(float) + // + // So: + // + // n <= (global_size - num_diags*sizeof(int)) / (num_diags + 2)*sizeof(float) + // + return (global_size - num_diags*sizeof(int)) / ((num_diags + 2)*sizeof(float)); +} + +void test_poisson(::ktt::Tuner& tuner, + const cusp::dia_matrix& host_matrix, + const std::string& name) +{ + auto expected_bytes = min_read_bytes(host_matrix); + auto actual_bytes = get_actual_read_bytes(tuner, host_matrix); + std::cout << " " << name << ": " << size_str(actual_bytes) << " vs " << size_str(expected_bytes) << "\n"; +} + +void test_poisson_sizes(::ktt::Tuner& tuner, size_t global_size) +{ + std::cout << "Testing poisson\n"; + + int n = get_max_size(5, global_size); + int grid_size = std::sqrt(n); + cusp::dia_matrix dia_host_matrix; + cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "5pt"); + + n = get_max_size(9, global_size); + grid_size = std::sqrt(n); + cusp::gallery::poisson9pt(dia_host_matrix, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "9pt"); + + n = get_max_size(27, global_size); + grid_size = std::cbrt(n); + cusp::gallery::poisson27pt(dia_host_matrix, grid_size, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "27pt"); + + n = get_max_size(7, global_size); + grid_size = std::cbrt(n); + cusp::gallery::poisson7pt(dia_host_matrix, grid_size, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "7pt"); + + + // std::cout << " Memory requirements: " + // << size_str(dia_problem_size(dia_host_matrix.num_rows, + // dia_host_matrix.num_cols, + // dia_host_matrix.diagonal_offsets.size())) + // << "\n"; +} + +void test_poisson_7pt(::ktt::Tuner& tuner, size_t global_limit) +{ + std::cout << "Testing different sizes of poisson 7pt problem (actual vs min)\n"; + + size_t mb = (size_t(1) << 20); + cusp::dia_matrix host_matrix; + + for (size_t mem = 512*mb; mem <= global_limit; mem += 512*mb) + { + int n = get_max_size(7, mem); + int grid_size = std::cbrt(n); + + cusp::gallery::poisson7pt(host_matrix, grid_size, grid_size, grid_size); + + auto expected_bytes = min_read_bytes(host_matrix); + auto actual_bytes = get_actual_read_bytes(tuner, host_matrix); + float ratio = float(actual_bytes)/expected_bytes; + + std::cout << " " + << size_str(mem) << " (" << grid_size << ") -> " + << size_str(actual_bytes) << " vs " + << size_str(expected_bytes) << " 
(" + << ratio << ")\n"; + } } void test_l2() { auto& tuner = cusp::ktt::get_tuner(); - const size_t max_bytes = 2u*(1u << 30); // 2 GB - const size_t n = (1u << 20)*4; + tuner.SetLoggingLevel(::ktt::LoggingLevel::Off); - for (int d = 32; d < 4096; d *= 2) + cudaDeviceProp prop; + cudaError_t err = cudaGetDeviceProperties(&prop, 0); + if (err != cudaSuccess) { - if (dia_matrix_size(n, n, d) <= max_bytes) - { - std::cout << d << " diags: \n"; + std::cout << "Failed to read device properties.\n"; + return; + } - auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, d); + size_t max_bytes = std::min(5*(size_t(1) << 30), (size_t)prop.totalGlobalMem); + size_t max_floats = max_bytes/sizeof(float); - cusp::dia_matrix A = dia_host_matrix; - cusp::array1d x(A.num_cols, 1); - cusp::array1d y(A.num_rows); + std::cout << "L2 size: " << size_str(prop.l2CacheSize) << "\n"; + std::cout << "Global memory available: " << size_str(prop.totalGlobalMem) << "\n"; + std::cout << "Global limit: " << size_str(max_bytes) << "\n"; - auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); - auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + test_poisson_sizes(tuner, max_bytes); + test_poisson_7pt(tuner, max_bytes); - auto expected_bytes = min_read_bytes(A, x); - auto actual_bytes = get_actual_read_bytes(A, x, y, conf); + // std::cout << "Using size: " << size_str(max_bytes) << "\n"; - std::cout << "Matrix size: " << size_str(dia_matrix_size(n, n, d)) << "\n"; - std::cout << "Expected: " << size_str(expected_bytes) << "\n"; - std::cout << "Actual: " << size_str(actual_bytes) << "\n"; - } - } + // // const size_t n = (1u << 20)*4; + + // for (int dist = 1; dist <= 1024; dist *= 2) + // { + // for (int diags = 4; diags <= 1024; diags *= 2) + // { + // int actual_diags = diags + 1; // to make it nicely symmetric + // int n = max_floats/(actual_diags + 1); + + // if (dist*diags > n) + // { + // std::cout << "skipping\n"; + // continue; + // } + + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, dist, actual_diags); + + // cusp::dia_matrix A = dia_host_matrix; + // cusp::array1d x(A.num_cols, 1); + // cusp::array1d y(A.num_rows); + + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + // auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, + // { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + + // auto expected_bytes = min_read_bytes(A, x); + // auto actual_bytes = get_actual_read_bytes(A, x, y, conf); + + // float ration = float(actual_bytes)/expected_bytes; + + // std::cout << "(" << size_str(n) << ", " << dist << ", " << actual_diags << ") -> " << ration << "\n"; + + // // std::cout << "Matrix size: " << size_str(dia_matrix_size(n, n, actual_diags)) << "\n"; + // // std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + // // std::cout << "Actual: " << size_str(actual_bytes) << "\n"; + // // std::cout << "------------------------------------------------\n"; + // } + // } // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 11); @@ -485,15 +614,16 @@ void test_matrix() const int n = 8*(1u << 20); //const int n = (1u << 20)/2; - //auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 256, 1024); - auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 32); - cusp::dia_matrix A = dia_host_matrix; - - // 
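// Worked example for get_max_size above (illustrative numbers): a 7-point
// Poisson matrix needs num_diags * sizeof(int) = 28 bytes of offsets plus
// (num_diags + 2) floats per row (seven diagonals plus x and y), i.e. 36 bytes
// per row. With a 2 GiB budget that allows n <= (2^31 - 28) / 36, roughly
// 59.6 million rows, and a cubic grid of about cbrt(59.6e6) ~ 390 per side,
// which is consistent with the poisson7pt(dia_host_matrix, 390, 390, 390)
// problem used in test_matrix above.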
cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); + cusp::dia_matrix dia_host_matrix; + cusp::gallery::poisson7pt(dia_host_matrix, 390, 390, 390); + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 32); + cusp::dia_matrix A = dia_host_matrix; cusp::array1d x(A.num_cols, 1); cusp::array1d y(A.num_rows); + // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); + //auto dia_host_matrix2 = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1024, 11); //cusp::dia_matrix A2 = dia_host_matrix; @@ -515,12 +645,21 @@ void test_matrix() auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + size_t non_cached = get_actual_read_bytes(tuner, A, x, y, conf2); + + conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) }, + { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + size_t cached = get_actual_read_bytes(tuner, A, x, y, conf2); + + std::cout << "non_cached: " << size_str(non_cached) << "\n"; + std::cout << "cached: " << size_str(cached) << "\n"; + //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; // cusp::ktt::tune(A, x, y); - cusp::ktt::multiply(A, x, y, conf2); - //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + // cusp::ktt::multiply(A, x, y, conf2); + // std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; //auto expected_bytes = min_read_bytes(A, x); //std::cout << "Expected: " << size_str(expected_bytes) << "\n"; @@ -541,8 +680,8 @@ int main(void) auto& tuner = cusp::ktt::get_tuner(); tuner.SetTimeUnit(::ktt::TimeUnit::Microseconds); - //test_l2(); - test_matrix(); + test_l2(); + // test_matrix(); // auto orig = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 256); // cusp::coo_matrix A = orig; From 302a4579a6d59f180cb7de76f42b8a92fc080ff8 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 28 Nov 2022 17:32:27 +0100 Subject: [PATCH 07/49] Fix out of memory errors --- main.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cu b/main.cu index 352b3d50..f08791a1 100644 --- a/main.cu +++ b/main.cu @@ -540,7 +540,7 @@ void test_l2() return; } - size_t max_bytes = std::min(5*(size_t(1) << 30), (size_t)prop.totalGlobalMem); + size_t max_bytes = std::min(5*(size_t(1) << 30), (size_t)prop.totalGlobalMem - 1024*(size_t(1) << 20)); size_t max_floats = max_bytes/sizeof(float); std::cout << "L2 size: " << size_str(prop.l2CacheSize) << "\n"; From 5be16e0fccfbd984d0ba5c1611bb329b31ece679 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 28 Nov 2022 20:11:28 +0100 Subject: [PATCH 08/49] Implement prefetching --- cusp/system/cuda/ktt/dia_multiply.h | 17 +++++- cusp/system/cuda/ktt/kernels/dia_kernel.h | 71 ++++++++++++++++++++--- main.cu | 30 +++++----- 3 files changed, 93 insertions(+), 25 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index d009bfce..7ad15283 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -23,7 +23,22 @@ namespace dia { inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) { tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1 }); - tuner.AddParameter(kernel.kernel_id, "PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); + tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4 
}); + tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 1, 2, 3 }); + + // only one type of prefetching can be applied at once + tuner.AddConstraint(kernel.kernel_id, + {"SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR"}, + [] (const std::vector& values) { + return values[0] == 0 || values[1] == 0; + }); + + // only one type of prefetching can be applied at once + tuner.AddConstraint(kernel.kernel_id, + {"KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR"}, + [] (const std::vector& values) { + return values[0] == 1 || (values[1] == 0 && values[2] == 0); + }); } } // namespace dia diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 6c25a200..5c90b44d 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -103,9 +103,9 @@ blocked_offsets_dia_kernel(const int num_rows, const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; -#if PREFETCH_FACTOR > 0 - __shared__ ValueType1 values_prefetched[PREFETCH_FACTOR*BLOCK_SIZE]; - __shared__ ValueType2 x_prefetched[PREFETCH_FACTOR*BLOCK_SIZE]; +#if SHARED_PREFETCH_FACTOR > 0 + __shared__ ValueType1 values_prefetched[SHARED_PREFETCH_FACTOR*BLOCK_SIZE]; + __shared__ ValueType2 x_prefetched[SHARED_PREFETCH_FACTOR*BLOCK_SIZE]; #endif __shared__ IndexType offsets[BLOCK_SIZE]; @@ -124,7 +124,7 @@ blocked_offsets_dia_kernel(const int num_rows, if (thread_id < num_rows) { -#if PREFETCH_FACTOR == 0 +#if SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 for (IndexType i = 0; i < batch_size; ++i) { IndexType col = offsets[i] + thread_id; @@ -135,13 +135,66 @@ blocked_offsets_dia_kernel(const int num_rows, sum += diag_val*x_val; } } -#else - int rest = batch_size % PREFETCH_FACTOR; +#elif REGISTER_PREFETCH_FACTOR > 0 + int end = batch_size - (batch_size % REGISTER_PREFETCH_FACTOR); + + for (IndexType i = 0; i < end; i += REGISTER_PREFETCH_FACTOR) + { + IndexType col1 = offsets[i] + thread_id; + ValueType1 diag_val1 = 0; + ValueType2 x_val1 = 0; + if (col1 >= 0 && col1 < num_cols) + { + diag_val1 = values[ (offset_base + i)*pitch + thread_id ]; + x_val1 = __ldg(x + col1); + } +#if REGISTER_PREFETCH_FACTOR > 1 + IndexType col2 = offsets[i + 1] + thread_id; + ValueType1 diag_val2 = 0; + ValueType2 x_val2 = 0; + if (col2 >= 0 && col2 < num_cols) + { + diag_val2 = values[ (offset_base + i + 1)*pitch + thread_id ]; + x_val2 = __ldg(x + col2); + } +#endif +#if REGISTER_PREFETCH_FACTOR > 2 + IndexType col3 = offsets[i + 2] + thread_id; + ValueType1 diag_val3 = 0; + ValueType2 x_val3 = 0; + if (col3 >= 0 && col3 < num_cols) + { + diag_val3 = values[ (offset_base + i + 2)*pitch + thread_id ]; + x_val3 = __ldg(x + col3); + } +#endif + + sum += diag_val1 * x_val1; +#if REGISTER_PREFETCH_FACTOR > 1 + sum += diag_val2 * x_val2; +#endif +#if REGISTER_PREFETCH_FACTOR > 2 + sum += diag_val3 * x_val3; +#endif + } + + for (IndexType i = end; i < batch_size; ++i) + { + IndexType col = offsets[i] + thread_id; + if (col >= 0 && col < num_cols) + { + auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; + auto x_val = __ldg(x + col); + sum += diag_val*x_val; + } + } +#elif SHARED_PREFETCH_FACTOR > 0 + int rest = batch_size % SHARED_PREFETCH_FACTOR; int end = batch_size - rest; - for (IndexType i = 0; i < end; i += PREFETCH_FACTOR) + for (IndexType i = 0; i < end; i += SHARED_PREFETCH_FACTOR) { - for (int j = 0; j < PREFETCH_FACTOR; ++j) + for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) { IndexType col = 
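// Effect of the constraints added to setup_tuning_parameters in this patch
// (a quick count using the values defined above): without constraints the
// space would be 2 * 3 * 4 = 24 configurations (KERNEL_TYPE x
// SHARED_PREFETCH_FACTOR x REGISTER_PREFETCH_FACTOR). Requiring that at most
// one prefetching scheme is active, and that prefetching is only combined with
// the blocked-offsets kernel (KERNEL_TYPE == 1), leaves 7: the naive kernel,
// the blocked kernel without prefetching, three register-prefetch factors and
// two shared-prefetch factors.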
offsets[i + j] + thread_id; if (col >= 0 && col < num_cols) @@ -157,7 +210,7 @@ blocked_offsets_dia_kernel(const int num_rows, } } - for (int j = 0; j < PREFETCH_FACTOR; ++j) + for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) { sum += values_prefetched[j*BLOCK_SIZE + threadIdx.x] * x_prefetched[j*BLOCK_SIZE + threadIdx.x]; } diff --git a/main.cu b/main.cu index f08791a1..034c65fd 100644 --- a/main.cu +++ b/main.cu @@ -614,9 +614,9 @@ void test_matrix() const int n = 8*(1u << 20); //const int n = (1u << 20)/2; - cusp::dia_matrix dia_host_matrix; - cusp::gallery::poisson7pt(dia_host_matrix, 390, 390, 390); - // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 32); + // cusp::dia_matrix dia_host_matrix; + // cusp::gallery::poisson7pt(dia_host_matrix, 390, 390, 390); + auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 64); cusp::dia_matrix A = dia_host_matrix; cusp::array1d x(A.num_cols, 1); @@ -637,24 +637,24 @@ void test_matrix() // //"l2_global_load_bytes" // }); - //cusp::ktt::tune(A2, x, y); - //cusp::ktt::tune(A, x, y); + std::cout << size_str(dia_problem_size(A.num_rows, A.num_cols, 64)) << "\n"; + // cusp::ktt::tune(A, x, y); // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); // cusp::ktt::multiply(A, x, y, conf1); auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, - { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); - size_t non_cached = get_actual_read_bytes(tuner, A, x, y, conf2); + { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(2) } }); + // size_t non_cached = get_actual_read_bytes(tuner, A, x, y, conf2); - conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) }, - { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); - size_t cached = get_actual_read_bytes(tuner, A, x, y, conf2); + // conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) }, + // { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + // size_t cached = get_actual_read_bytes(tuner, A, x, y, conf2); - std::cout << "non_cached: " << size_str(non_cached) << "\n"; - std::cout << "cached: " << size_str(cached) << "\n"; + // std::cout << "non_cached: " << size_str(non_cached) << "\n"; + // std::cout << "cached: " << size_str(cached) << "\n"; - //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; + std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; // cusp::ktt::tune(A, x, y); @@ -680,8 +680,8 @@ int main(void) auto& tuner = cusp::ktt::get_tuner(); tuner.SetTimeUnit(::ktt::TimeUnit::Microseconds); - test_l2(); - // test_matrix(); + // test_l2(); + test_matrix(); // auto orig = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 256); // cusp::coo_matrix A = orig; From bee9571357d55f21c235d807eb386d3bd57f1d8f Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 29 Nov 2022 13:25:52 +0100 Subject: [PATCH 09/49] Use -lineinfo in debug mode in tests --- build/build-env.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/build/build-env.py b/build/build-env.py index fbd7f216..3d19bc53 100644 --- a/build/build-env.py +++ b/build/build-env.py @@ -277,6 +277,10 @@ def addKTT(env): env.Append(RPATH = [ ktt_lib_path ]) env.Append(LIBS = [ "ktt" ]) + if env['mode'] == 'debug': + env.Append(CFLAGS = ['-DKTT_LINE_INFO']) + env.Append(CXXFLAGS = 
['-DKTT_LINE_INFO']) + def Environment(buildDir): # allow the user discretion to choose the MSVC version @@ -465,7 +469,7 @@ def Environment(buildDir): env['ENV']['DYLD_LIBRARY_PATH'] = os.environ['DYLD_LIBRARY_PATH'] elif 'LD_LIBRARY_PATH' in os.environ: env['ENV']['LD_LIBRARY_PATH'] = os.environ['LD_LIBRARY_PATH'] - + addKTT(env) # generate help text From e036ae72a66788ecf0a1c7017018d273e9c3be88 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 29 Nov 2022 13:26:52 +0100 Subject: [PATCH 10/49] Implement prefetching --- cusp/system/cuda/ktt/dia_multiply.h | 16 +++- cusp/system/cuda/ktt/kernels/dia_kernel.h | 107 ++++++++++++++-------- main.cu | 27 ++++-- 3 files changed, 99 insertions(+), 51 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 7ad15283..dad0f6aa 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -23,22 +23,30 @@ namespace dia { inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) { tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1 }); - tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4 }); + tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 1, 2, 3 }); + tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_TYPE", std::vector{ 0, 1 }); // only one type of prefetching can be applied at once tuner.AddConstraint(kernel.kernel_id, - {"SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR"}, + { "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, [] (const std::vector& values) { return values[0] == 0 || values[1] == 0; }); - // only one type of prefetching can be applied at once + // prefetching is used only for the blocked offsets kernel tuner.AddConstraint(kernel.kernel_id, - {"KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR"}, + { "KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, [] (const std::vector& values) { return values[0] == 1 || (values[1] == 0 && values[2] == 0); }); + + // different register prefetching implementations are used only when register prefetching is used + tuner.AddConstraint(kernel.kernel_id, + { "REGISTER_PREFETCH_FACTOR", "REGISTER_PREFETCH_TYPE" }, + [] (const std::vector& values) { + return values[0] > 0 || values[1] == 0; + }); } } // namespace dia diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 5c90b44d..9d5050a5 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -10,43 +10,66 @@ __device__ T max(T a, T b) { return a > b ? 
a : b; } -// template -// struct UnrolledLoop : UnrolledLoop -// { -// using parent = UnrolledLoop; - -// IndexType col; -// ValueType1 diag_val = 0; -// ValueType2 x_val = 0; - -// __device__ void prefetch(const IndexType* offsets, -// const ValueType1* values, -// const ValueType2* x, -// IndexType pitch, -// IndexType row, -// IndexType base, -// IndexType cols) -// { -// if (col >= 0 && col < cols) -// { -// diag_val = values[] -// parent::prefetch(offsets, values, x, row, base + 1); -// } -// } - -// template -// __device__ ValueType3 do_iter(IndexType cols) -// { -// if (col < 0 || col >= cols) -// { -// return 0; -// } -// return diag_val*x_val + parent::do_iter(cols); -// } -// }; +template +struct UnrolledLoop : UnrolledLoop +{ + using parent = UnrolledLoop; + + IndexType col; + ValueType1 diag_val = 0; + ValueType2 x_val = 0; + + __device__ void prefetch(const IndexType* offsets, + const ValueType1* values, + const ValueType2* x, + int pitch, + int cols, + IndexType row, + IndexType offset_base, + IndexType i) + { + col = offsets[i] + row; + + if (col >= 0 && col < cols) + { + diag_val = values[(offset_base + i)*pitch + row]; + x_val = __ldg(x + col); + } + + parent::prefetch(offsets, values, x, pitch, cols, row, offset_base, i + 1); + } + + template + __device__ ValueType3 do_iter() + { + return static_cast(diag_val*x_val) + parent::template do_iter(); + } +}; + +template +struct UnrolledLoop +{ + __device__ void prefetch(const IndexType* offsets, + const ValueType1* values, + const ValueType2* x, + int pitch, + int cols, + IndexType row, + IndexType offset_base, + IndexType i) {} + + template + __device__ ValueType3 do_iter() + { + return 0; + } +}; + template loop; + loop.prefetch(offsets, values, x, pitch, num_cols, thread_id, offset_base, i); + sum += loop.template do_iter(); + +#elif REGISTER_PREFETCH_TYPE == 1 + IndexType col1 = offsets[i] + thread_id; ValueType1 diag_val1 = 0; ValueType2 x_val1 = 0; @@ -176,6 +207,8 @@ blocked_offsets_dia_kernel(const int num_rows, #if REGISTER_PREFETCH_FACTOR > 2 sum += diag_val3 * x_val3; #endif + +#endif // REGISTER_PREFETCH_TYPE == 1 } for (IndexType i = end; i < batch_size; ++i) diff --git a/main.cu b/main.cu index 034c65fd..51184a4d 100644 --- a/main.cu +++ b/main.cu @@ -611,12 +611,14 @@ void test_matrix() { auto& tuner = cusp::ktt::get_tuner(); - const int n = 8*(1u << 20); - //const int n = (1u << 20)/2; + int n = 8*(1u << 20); - // cusp::dia_matrix dia_host_matrix; - // cusp::gallery::poisson7pt(dia_host_matrix, 390, 390, 390); - auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 64); + n = get_max_size(5, 2*(size_t(1) << 30)); + int grid_size = std::sqrt(n); + cusp::dia_matrix dia_host_matrix; + cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size); + + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 64); cusp::dia_matrix A = dia_host_matrix; cusp::array1d x(A.num_cols, 1); @@ -637,14 +639,15 @@ void test_matrix() // //"l2_global_load_bytes" // }); - std::cout << size_str(dia_problem_size(A.num_rows, A.num_cols, 64)) << "\n"; + std::cout << size_str(dia_problem_size(A.num_rows, A.num_cols, A.diagonal_offsets.size())) << "\n"; // cusp::ktt::tune(A, x, y); // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); // cusp::ktt::multiply(A, x, y, conf1); auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, - { std::string("REGISTER_PREFETCH_FACTOR"), 
uint64_t(2) } }); + { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) }, + { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(4) } }); // size_t non_cached = get_actual_read_bytes(tuner, A, x, y, conf2); // conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) }, @@ -654,11 +657,15 @@ void test_matrix() // std::cout << "non_cached: " << size_str(non_cached) << "\n"; // std::cout << "cached: " << size_str(cached) << "\n"; - std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; - - // cusp::ktt::tune(A, x, y); + // std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; + cusp::ktt::tune(A, x, y); // cusp::ktt::multiply(A, x, y, conf2); + // for (int i = 0; i < 10; ++i) + // { + // std::cout << y[i] << "\n"; + // } + // std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; //auto expected_bytes = min_read_bytes(A, x); From 88123f896dc34695a0ddcc2ebd48e7c04a30bf7e Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 5 Dec 2022 16:24:05 +0100 Subject: [PATCH 11/49] Add tuning parameter for different way to load values from global memory --- cusp/system/cuda/ktt/dia_multiply.h | 5 +- cusp/system/cuda/ktt/kernels/dia_kernel.h | 144 ++++++++++++---------- main.cu | 2 +- 3 files changed, 83 insertions(+), 68 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index dad0f6aa..7a0f4486 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -24,8 +24,9 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k { tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1 }); tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); - tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 1, 2, 3 }); + tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 2, 3, 4 }); tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_TYPE", std::vector{ 0, 1 }); + tuner.AddParameter(kernel.kernel_id, "LOAD_TYPE", std::vector{ 0, 1 }); // only one type of prefetching can be applied at once tuner.AddConstraint(kernel.kernel_id, @@ -41,7 +42,7 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k return values[0] == 1 || (values[1] == 0 && values[2] == 0); }); - // different register prefetching implementations are used only when register prefetching is used + // different register prefetching implementations are used only when register prefetching is enabled tuner.AddConstraint(kernel.kernel_id, { "REGISTER_PREFETCH_FACTOR", "REGISTER_PREFETCH_TYPE" }, [] (const std::vector& values) { diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 9d5050a5..17119b2d 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -1,5 +1,15 @@ #pragma once + +#if REGISTER_PREFETCH_FACTOR > 0 +#define PREFETCH_FACTOR REGISTER_PREFETCH_FACTOR +#elif SHARED_PREFETCH_FACTOR > 0 +#define PREFETCH_FACTOR SHARED_PREFETCH_FACTOR +#else +#define PREFETCH_FACTOR 0 +#endif + + template __device__ T min(T a, T b) { return a < b ? a : b; @@ -10,6 +20,32 @@ __device__ T max(T a, T b) { return a > b ? 
a : b; } + +template +__device__ T load_diag_val(const T* val) +{ +#if LOAD_TYPE == 0 + return *val; +#elif LOAD_TYPE == 1 + // Cache streaming, likely to be accessed once. + // The ld.cs load cached streaming operation allocates global lines with evict-first policy in L1 and L2 + // to limit cache pollution by temporary streaming data + return __ldcs(val); +#endif +} + +template +__device__ T load_x_val(const T* val) +{ +#if LOAD_TYPE == 0 + return *val; +#else + // Needs compute capability at least 3.5 + return __ldg(val); +#endif +} + + template if (col >= 0 && col < cols) { - diag_val = values[(offset_base + i)*pitch + row]; - x_val = __ldg(x + col); + diag_val = load_diag_val(&values[(offset_base + i)*pitch + row]); + x_val = load_x_val(&x[col]); } parent::prefetch(offsets, values, x, pitch, cols, row, offset_base, i + 1); @@ -86,20 +122,19 @@ naive_dia_kernel(const int num_rows, const ValueType2* x, ValueType3* y) { - using ValueType = ValueType3; - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; if (thread_id < num_rows) { - ValueType sum = ValueType(0); + ValueType3 sum = ValueType3(0); for (IndexType i = 0; i < num_diagonals; ++i) { IndexType col = diagonal_offsets[i] + thread_id; if (col >= 0 && col < num_cols) { - auto diag_val = values[i*pitch + thread_id]; - auto x_val = x[col]; + auto diag_val = load_diag_val(&values[i*pitch + thread_id]); + auto x_val = load_x_val(&x[col]); + sum += diag_val * x_val; } } @@ -107,6 +142,16 @@ naive_dia_kernel(const int num_rows, } } +#define REGISTER_PREFETCH_ITERATION(iter) \ + IndexType col##iter = offsets[i + iter] + thread_id; \ + ValueType1 diag_val##iter = 0; \ + ValueType2 x_val##iter = 0; \ + if (col##iter >= 0 && col##iter < num_cols) \ + { \ + diag_val##iter = load_diag_val(&values[ (offset_base + i + iter)*pitch + thread_id ]); \ + x_val##iter = load_x_val(&x[col##iter]); \ + } + template 0 @@ -132,7 +175,7 @@ blocked_offsets_dia_kernel(const int num_rows, #endif __shared__ IndexType offsets[BLOCK_SIZE]; - ValueType sum = ValueType(0); + ValueType3 sum = ValueType3(0); for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { @@ -153,16 +196,18 @@ blocked_offsets_dia_kernel(const int num_rows, IndexType col = offsets[i] + thread_id; if (col >= 0 && col < num_cols) { - auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; - auto x_val = __ldg(x + col); + auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + thread_id ]); + auto x_val = load_x_val(&x[col]); + sum += diag_val*x_val; } } -#elif REGISTER_PREFETCH_FACTOR > 0 - int end = batch_size - (batch_size % REGISTER_PREFETCH_FACTOR); +#else // SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 + int end = batch_size - (batch_size % PREFETCH_FACTOR); - for (IndexType i = 0; i < end; i += REGISTER_PREFETCH_FACTOR) + for (IndexType i = 0; i < end; i += PREFETCH_FACTOR) { +#if REGISTER_PREFETCH_FACTOR > 0 #if REGISTER_PREFETCH_TYPE == 0 UnrolledLoop loop; @@ -171,70 +216,37 @@ blocked_offsets_dia_kernel(const int num_rows, #elif REGISTER_PREFETCH_TYPE == 1 - IndexType col1 = offsets[i] + thread_id; - ValueType1 diag_val1 = 0; - ValueType2 x_val1 = 0; - if (col1 >= 0 && col1 < num_cols) - { - diag_val1 = values[ (offset_base + i)*pitch + thread_id ]; - x_val1 = __ldg(x + col1); - } -#if REGISTER_PREFETCH_FACTOR > 1 - IndexType col2 = offsets[i + 1] + thread_id; - ValueType1 diag_val2 = 0; - ValueType2 x_val2 = 0; - if (col2 >= 0 && col2 < num_cols) - { - diag_val2 = values[ (offset_base + i + 1)*pitch + thread_id 
]; - x_val2 = __ldg(x + col2); - } -#endif + REGISTER_PREFETCH_ITERATION(0); + REGISTER_PREFETCH_ITERATION(1); #if REGISTER_PREFETCH_FACTOR > 2 - IndexType col3 = offsets[i + 2] + thread_id; - ValueType1 diag_val3 = 0; - ValueType2 x_val3 = 0; - if (col3 >= 0 && col3 < num_cols) - { - diag_val3 = values[ (offset_base + i + 2)*pitch + thread_id ]; - x_val3 = __ldg(x + col3); - } + REGISTER_PREFETCH_ITERATION(2); +#endif +#if REGISTER_PREFETCH_FACTOR > 3 + REGISTER_PREFETCH_ITERATION(3); #endif + sum += diag_val0 * x_val0; sum += diag_val1 * x_val1; -#if REGISTER_PREFETCH_FACTOR > 1 +#if REGISTER_PREFETCH_FACTOR > 2 sum += diag_val2 * x_val2; #endif -#if REGISTER_PREFETCH_FACTOR > 2 +#if REGISTER_PREFETCH_FACTOR > 3 sum += diag_val3 * x_val3; #endif #endif // REGISTER_PREFETCH_TYPE == 1 - } - - for (IndexType i = end; i < batch_size; ++i) - { - IndexType col = offsets[i] + thread_id; - if (col >= 0 && col < num_cols) - { - auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; - auto x_val = __ldg(x + col); - sum += diag_val*x_val; - } - } #elif SHARED_PREFETCH_FACTOR > 0 - int rest = batch_size % SHARED_PREFETCH_FACTOR; - int end = batch_size - rest; - for (IndexType i = 0; i < end; i += SHARED_PREFETCH_FACTOR) - { for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) { IndexType col = offsets[i + j] + thread_id; if (col >= 0 && col < num_cols) { - auto val = values[ (offset_base + i + j)*pitch + thread_id ]; - values_prefetched[j*BLOCK_SIZE + threadIdx.x] = val; - x_prefetched[j*BLOCK_SIZE + threadIdx.x] = __ldg(x + col); + auto diag_val = load_diag_val(&values[ (offset_base + i + j)*pitch + thread_id ]); + auto x_val = load_x_val(&x[col]); + + values_prefetched[j*BLOCK_SIZE + threadIdx.x] = diag_val; + x_prefetched[j*BLOCK_SIZE + threadIdx.x] = x_val; } else { @@ -247,6 +259,7 @@ blocked_offsets_dia_kernel(const int num_rows, { sum += values_prefetched[j*BLOCK_SIZE + threadIdx.x] * x_prefetched[j*BLOCK_SIZE + threadIdx.x]; } +#endif // SHARED_PREFETCH_FACTOR > 0 } for (IndexType i = end; i < batch_size; ++i) @@ -254,12 +267,13 @@ blocked_offsets_dia_kernel(const int num_rows, IndexType col = offsets[i] + thread_id; if (col >= 0 && col < num_cols) { - auto diag_val = values[ (offset_base + i)*pitch + thread_id ]; - auto x_val = __ldg(x + col); + auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + thread_id ]); + auto x_val = load_x_val(&x[col]); + sum += diag_val*x_val; } } -#endif +#endif // SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 } __syncthreads(); diff --git a/main.cu b/main.cu index 51184a4d..7943c796 100644 --- a/main.cu +++ b/main.cu @@ -639,7 +639,7 @@ void test_matrix() // //"l2_global_load_bytes" // }); - std::cout << size_str(dia_problem_size(A.num_rows, A.num_cols, A.diagonal_offsets.size())) << "\n"; + std::cout << "Device memory needed: " << size_str(dia_problem_size(A.num_rows, A.num_cols, A.diagonal_offsets.size())) << "\n"; // cusp::ktt::tune(A, x, y); // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); From c839c64d888e6242715167ac46c0b24a629abe2c Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 7 Dec 2022 11:35:40 +0100 Subject: [PATCH 12/49] Experiment with x vector caching and fix some bugs --- cusp/ktt/matrix_generation.h | 10 +- main.cu | 226 +++++++++++++++++++++++++++++++---- 2 files changed, 209 insertions(+), 27 deletions(-) diff --git a/cusp/ktt/matrix_generation.h b/cusp/ktt/matrix_generation.h index 6c5e0b15..914d2225 100644 --- 
a/cusp/ktt/matrix_generation.h +++ b/cusp/ktt/matrix_generation.h @@ -43,14 +43,16 @@ make_diagonal_symmetric_matrix(int rows, for (int i = 0; i < diagonal_count; ++i) { int offset = starting_offset + offset_step*i; - offsets.push_back(offset); int starting_row = offset < 0 ? -offset : 0; int starting_col = offset < 0 ? 0 : offset; - int ending_row = starting_row >= rows || starting_col >= cols - ? 0 - : starting_row + std::min(rows - starting_row, cols - starting_col); + if (starting_row >= rows || starting_col >= cols) + continue; + + offsets.push_back(offset); + + int ending_row = starting_row + std::min(rows - starting_row, cols - starting_col); for (int row = starting_row; row < ending_row; ++row) { num_entries++; diff --git a/main.cu b/main.cu index 7943c796..c5f4e4da 100644 --- a/main.cu +++ b/main.cu @@ -14,9 +14,51 @@ #include #include #include +#include namespace krn = std::chrono; +template +void write_csv(const std::vector>& data, + const std::vector& col_labels, + const std::vector& row_labels, + const std::string& path) +{ + if (row_labels.size() != data.size()) + throw std::runtime_error("Weird number of rows or row labels."); + + for (int i = 1; i < data.size(); ++i) + { + if (data[i].size() != data[i-1].size()) + throw std::runtime_error("Not all rows have same length."); + if (i == 1 && data[i].size() != col_labels.size()) + throw std::runtime_error("Bad number of column lables."); + } + + std::ofstream file(path); + + if (!file) + throw std::runtime_error("Cannot open: " + path); + + const char* sep = ""; + for (const auto& label : col_labels) + { + file << sep << label; + sep = ","; + } + file << "\n"; + + for (int i = 0; i < data.size(); ++i) + { + file << row_labels[i]; + for (const auto& x : data[i]) + { + file << "," << x; + } + file << "\n"; + } +} + struct Timer { @@ -269,6 +311,33 @@ convert2(const cusp::coo_matrix& matrix return res; } +template typename Matrix, + typename IndexType, + typename ValueType> +Matrix +send_to_device(const Matrix& host_matrix) +{ + return host_matrix; +} + +template typename Matrix, + typename IndexType, + typename ValueType, + typename MemorySpace> +auto make_test_device_x(const Matrix& matrix) +{ + return cusp::array1d(matrix.num_cols, 1); +} + +template typename Matrix, + typename IndexType, + typename ValueType, + typename MemorySpace> +auto make_test_device_y(const Matrix& matrix) +{ + return cusp::array1d(matrix.num_rows, 0); +} + template bool equals(const cusp::dia_matrix& A, const cusp::dia_matrix& B) @@ -353,7 +422,7 @@ size_t min_read_bytes(const cusp::dia_matrix& A) { IndexT start_row = offset >= 0 ? 0 : -offset; IndexT start_col = offset >= 0 ? offset : 0; - IndexT end_row = std::min(A.num_cols - start_col, A.num_rows - start_row); + IndexT end_row = start_row + std::min(A.num_cols - start_col, A.num_rows - start_row); // Assume that global reads are done in 32 byte transactions that must be aligned. 
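    // For example, with 4-byte floats a single transaction covers 8 consecutive values
    // (elems_per_read == 8), so a diagonal that spans k rows needs at least
    // ceil(k / elems_per_read) transactions for its values alone, which gives a lower
    // bound on the traffic attributed to that diagonal.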
IndexT elems_per_read = 32 / sizeof(ValueT); @@ -371,7 +440,8 @@ profile_multiply(const cusp::dia_matrix& A, const cusp::array1d& x, cusp::array1d& y, const ::ktt::KernelConfiguration& conf, - const std::vector& counters) + const std::vector& counters, + ::ktt::KernelResult& result) { auto& tuner = cusp::ktt::get_tuner(); @@ -386,6 +456,8 @@ profile_multiply(const cusp::dia_matrix& A, kernel_result = cusp::ktt::multiply(A, x, y, conf, true); } while(kernel_result.HasRemainingProfilingRuns()); + result = kernel_result; + for (const auto& computation_result : kernel_result.GetResults()) { if (!computation_result.HasProfilingData()) @@ -412,9 +484,13 @@ size_t get_actual_read_bytes(::ktt::Tuner& tuner, const cusp::dia_matrix& A, const cusp::array1d& x, cusp::array1d& y, - const ::ktt::KernelConfiguration& conf) + const ::ktt::KernelConfiguration& conf, + ::ktt::KernelResult* res_p = nullptr) { - const auto& counters = profile_multiply(A, x, y, conf, {"dram_read_bytes"}); + ::ktt::KernelResult res; + const auto& counters = profile_multiply(A, x, y, conf, {"dram_read_bytes"}, res); + + if (res_p) *res_p = res; if (counters.empty()) { throw std::runtime_error("Profiling failed."); @@ -424,17 +500,22 @@ size_t get_actual_read_bytes(::ktt::Tuner& tuner, } size_t get_actual_read_bytes(::ktt::Tuner& tuner, - const cusp::dia_matrix& host_matrix) + const cusp::dia_matrix& host_matrix, + ::ktt::KernelResult* res_p = nullptr) { cusp::dia_matrix A = host_matrix; cusp::array1d x(A.num_cols, 1); cusp::array1d y(A.num_rows); auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); - auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, - { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); - - return get_actual_read_bytes(tuner, A, x, y, conf); + // auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, + // { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, + { { std::string("KERNEL_TYPE"), uint64_t(1) }, + { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) }, + { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(0) } }); + + return get_actual_read_bytes(tuner, A, x, y, conf, res_p); } int get_max_size(int num_diags, size_t global_size) @@ -526,10 +607,79 @@ void test_poisson_7pt(::ktt::Tuner& tuner, size_t global_limit) } } +void test_x_caching_uniform(::ktt::Tuner& tuner, size_t global_limit) +{ + std::vector distances; + std::vector diag_counts; + + for (int dist = 64; dist <= 2048; dist *= 2) + distances.push_back(dist); + + for (int count = 64; count <= 2048; count *= 2) + diag_counts.push_back(count + 1); + + std::vector> bytes; + std::vector> times; + + std::cout << "TESTING UNIFORM MATRICES\n" + << "Taget device memory: " << size_str(global_limit) << "\n"; + + for (auto dist : distances) + { + std::cout << "Distance: " << dist << "\n"; + bytes.emplace_back(); + times.emplace_back(); + + ::ktt::KernelResult kernel_result; + size_t reference_time = 1; + + for (auto diag_count : diag_counts) + { + size_t n = get_max_size(diag_count, global_limit); + if (n < 1024) // That is too small. 
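+            // This combination is skipped: a placeholder is recorded below
+            // (-1 for the byte ratio, 0 for the time) before moving on.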
+ { + std::cout << "skipping\n"; + bytes.back().push_back(-1); + times.back().push_back(0); + continue; + } + + auto host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, dist, diag_count); + + if (host_matrix.diagonal_offsets.size() != diag_count) + { + std::cout << "skipping\n"; + bytes.back().push_back(-1); + times.back().push_back(0); + continue; + } + + auto expected_bytes = min_read_bytes(host_matrix); + auto actual_bytes = get_actual_read_bytes(tuner, host_matrix, &kernel_result); + float ratio = float(actual_bytes)/expected_bytes; + + if (reference_time == 1) + { + reference_time = kernel_result.GetKernelDuration(); + } + + size_t time = kernel_result.GetKernelDuration(); + float time_ratio = float(time)/reference_time; + + std::cout << diag_count << "(" << n << ") -> " << ratio << "(" << actual_bytes << " : " << expected_bytes << ")\n"; + // << " (" << time_ratio << " : " << time << ")\n"; + + bytes.back().push_back(ratio); + times.back().push_back(time); + } + } + + write_csv(bytes, distances, diag_counts, "bytes_transfered.csv"); +} + void test_l2() { auto& tuner = cusp::ktt::get_tuner(); - tuner.SetLoggingLevel(::ktt::LoggingLevel::Off); cudaDeviceProp prop; @@ -540,15 +690,15 @@ void test_l2() return; } - size_t max_bytes = std::min(5*(size_t(1) << 30), (size_t)prop.totalGlobalMem - 1024*(size_t(1) << 20)); - size_t max_floats = max_bytes/sizeof(float); + size_t max_bytes = std::min(2*(size_t(1) << 30), (size_t)prop.totalGlobalMem - 1024*(size_t(1) << 20)); std::cout << "L2 size: " << size_str(prop.l2CacheSize) << "\n"; std::cout << "Global memory available: " << size_str(prop.totalGlobalMem) << "\n"; std::cout << "Global limit: " << size_str(max_bytes) << "\n"; - test_poisson_sizes(tuner, max_bytes); - test_poisson_7pt(tuner, max_bytes); + //test_poisson_sizes(tuner, max_bytes); + //test_poisson_7pt(tuner, max_bytes); + test_x_caching_uniform(tuner, max_bytes); // std::cout << "Using size: " << size_str(max_bytes) << "\n"; @@ -611,18 +761,21 @@ void test_matrix() { auto& tuner = cusp::ktt::get_tuner(); - int n = 8*(1u << 20); + int n = 130879; - n = get_max_size(5, 2*(size_t(1) << 30)); - int grid_size = std::sqrt(n); - cusp::dia_matrix dia_host_matrix; - cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size); + //n = get_max_size(5, 2*(size_t(1) << 30)); + //int grid_size = std::sqrt(n); + //cusp::dia_matrix dia_host_matrix; + //cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size); - // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 64); + auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 512, 2049); + + std::cout << min_read_bytes(dia_host_matrix) << "\n"; cusp::dia_matrix A = dia_host_matrix; cusp::array1d x(A.num_cols, 1); cusp::array1d y(A.num_rows); + cusp::array1d yy(A.num_rows); // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); @@ -645,6 +798,11 @@ void test_matrix() // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); // cusp::ktt::multiply(A, x, y, conf1); + auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, + { { std::string("KERNEL_TYPE"), uint64_t(1) }, + { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) }, + { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(0) } }); + auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) }, { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(4) } }); @@ 
-654,12 +812,34 @@ void test_matrix() // { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); // size_t cached = get_actual_read_bytes(tuner, A, x, y, conf2); + std::cout << A.diagonal_offsets[0] << "\n"; + + cusp::ktt::disable(); + cusp::multiply(A, x, yy); + cusp::ktt::enable(); + + cusp::ktt::multiply(A, x, y, conf); + + if (y == yy) { + std::cout << "GOOD!\n"; + } else { + std::cout << "WRONG!\n"; + } + + for (int i = 0; i < y.size(); ++i) + { + if (y[i] != yy[i]) + { + std::cout << i << " -> " << y[i] - yy[i] << "\n"; + } + } + // std::cout << "non_cached: " << size_str(non_cached) << "\n"; // std::cout << "cached: " << size_str(cached) << "\n"; // std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; - cusp::ktt::tune(A, x, y); + // cusp::ktt::tune(A, x, y); // cusp::ktt::multiply(A, x, y, conf2); // for (int i = 0; i < 10; ++i) // { @@ -687,8 +867,8 @@ int main(void) auto& tuner = cusp::ktt::get_tuner(); tuner.SetTimeUnit(::ktt::TimeUnit::Microseconds); - // test_l2(); - test_matrix(); + test_l2(); + // test_matrix(); // auto orig = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 256); // cusp::coo_matrix A = orig; From ae3b003b71c9a0a56e3c49b07f52aaca435ca6e6 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 28 Dec 2022 16:47:31 +0100 Subject: [PATCH 13/49] Add a.out to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 769857d7..ff73601b 100644 --- a/.gitignore +++ b/.gitignore @@ -122,3 +122,4 @@ performance/spmm/spmm *.user .vscode +a.out From 956130deb6657dba04a639f4b6406ee9d70a9469 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 28 Dec 2022 16:48:52 +0100 Subject: [PATCH 14/49] Throw exception when some diagonals don't fit --- cusp/ktt/matrix_generation.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cusp/ktt/matrix_generation.h b/cusp/ktt/matrix_generation.h index 914d2225..b30e616a 100644 --- a/cusp/ktt/matrix_generation.h +++ b/cusp/ktt/matrix_generation.h @@ -3,6 +3,7 @@ #include #include // min +#include namespace cusp { @@ -48,7 +49,7 @@ make_diagonal_symmetric_matrix(int rows, int starting_col = offset < 0 ? 
0 : offset; if (starting_row >= rows || starting_col >= cols) - continue; + throw std::runtime_error("make_diagonal_symmetric_matrix: Too many diagonals."); offsets.push_back(offset); From ccf10295be394043af745434f131c3ba8c673eeb Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 28 Dec 2022 16:53:52 +0100 Subject: [PATCH 15/49] Add striped kernel, kernel for timing acesses and block size parameter --- cusp/system/cuda/ktt/dia_multiply.h | 40 +++- cusp/system/cuda/ktt/kernels/dia_kernel.h | 268 ++++++++++++++++++++-- 2 files changed, 278 insertions(+), 30 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 7a0f4486..847599be 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -22,11 +22,24 @@ namespace dia { inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) { - tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1 }); + tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1, 2 }); tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 2, 3, 4 }); tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_TYPE", std::vector{ 0, 1 }); tuner.AddParameter(kernel.kernel_id, "LOAD_TYPE", std::vector{ 0, 1 }); + tuner.AddParameter(kernel.kernel_id, "STRIPING_FACTOR", std::vector{ 2, 4, 8 }); + tuner.AddParameter(kernel.kernel_id, "BLOCK_SIZE", std::vector{ 256, 512 }); + + tuner.AddThreadModifier( + kernel.kernel_id, + { kernel.definition_ids[0] }, + ::ktt::ModifierType::Local, + ::ktt::ModifierDimension::X, + { std::string("BLOCK_SIZE") }, + [] (const uint64_t defaultSize, const std::vector& parameters) { + return parameters[0]; + } + ); // only one type of prefetching can be applied at once tuner.AddConstraint(kernel.kernel_id, @@ -48,6 +61,13 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k [] (const std::vector& values) { return values[0] > 0 || values[1] == 0; }); + + // don't try different striping factors for non-striped kernels + tuner.AddConstraint(kernel.kernel_id, + { "KERNEL_TYPE", "STRIPING_FACTOR" }, + [] (const std::vector& values) { + return values[0] == 2 || values[1] == 2; + }); } } // namespace dia @@ -60,21 +80,20 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) std::string kernel_path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h"; - const size_t BLOCK_SIZE = 256; - const size_t MAX_BLOCKS = cusp::system::cuda::detail::max_active_blocks( - ktt_dia_vector_kernel, - BLOCK_SIZE, 0); + const size_t block_size = 256; + const size_t max_blocks = cusp::system::cuda::detail::max_active_blocks( + ktt_dia_vector_kernel, + block_size, 0); std::vector< std::string > type_names { std::string(NAMEOF_TYPE(IndexType)), std::string(NAMEOF_TYPE(ValueType1)), std::string(NAMEOF_TYPE(ValueType2)), - std::string(NAMEOF_TYPE(ValueType3)), - std::to_string(BLOCK_SIZE) + std::string(NAMEOF_TYPE(ValueType3)) }; - const ::ktt::DimensionVector blockDimensions(BLOCK_SIZE); - const ::ktt::DimensionVector gridDimensions(MAX_BLOCKS); + const ::ktt::DimensionVector blockDimensions(block_size); + const ::ktt::DimensionVector gridDimensions(max_blocks); auto definition_id = tuner.AddKernelDefinitionFromFile( "ktt_dia_vector_kernel", @@ -132,7 +151,8 @@ auto get_launcher(const kernel_context& ctx, size_t num_cols, bool profile = false) { - return [=] 
(::ktt::ComputeInterface& interface) { + return [=] (::ktt::ComputeInterface& interface) + { ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]); ::ktt::DimensionVector grid_size( DIVIDE_INTO(num_rows, block_size.GetSizeX()) ); diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 17119b2d..ccf6338b 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -2,11 +2,15 @@ #if REGISTER_PREFETCH_FACTOR > 0 -#define PREFETCH_FACTOR REGISTER_PREFETCH_FACTOR + #define PREFETCH_FACTOR REGISTER_PREFETCH_FACTOR #elif SHARED_PREFETCH_FACTOR > 0 -#define PREFETCH_FACTOR SHARED_PREFETCH_FACTOR + #define PREFETCH_FACTOR SHARED_PREFETCH_FACTOR #else -#define PREFETCH_FACTOR 0 + #define PREFETCH_FACTOR 0 +#endif + +#ifndef BLOCK_SIZE + #define BLOCK_SIZE 256 #endif @@ -110,8 +114,7 @@ struct UnrolledLoop template + typename ValueType3> __device__ void naive_dia_kernel(const int num_rows, const int num_cols, @@ -155,8 +158,7 @@ naive_dia_kernel(const int num_rows, template + typename ValueType3> __device__ void blocked_offsets_dia_kernel(const int num_rows, const int num_cols, @@ -288,8 +290,237 @@ blocked_offsets_dia_kernel(const int num_rows, template + typename ValueType3> +__device__ void +striped_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) +{ +#ifdef STRIPING_FACTOR + const IndexType num_groups = STRIPING_FACTOR; +#else + const IndexType num_groups = 1; +#endif + + const IndexType group_size = BLOCK_SIZE/num_groups; + const IndexType stride = num_rows/num_groups; + + const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + + const IndexType group_id = threadIdx.x / group_size; + const IndexType group_lane_id = threadIdx.x % group_size; + + const IndexType row = group_id*stride + blockIdx.x*group_size + + group_lane_id; + + // printf("(%d, %d, %d)-> %d\n", blockIdx.x, threadIdx.x, thread_id, row); + + __shared__ IndexType offsets[BLOCK_SIZE]; + ValueType3 sum = ValueType3(0); + + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) + { + if (offset_base + threadIdx.x < num_diagonals) + { + offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; + } + + __syncthreads(); + + int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; + + if (row < num_rows) + { + for (IndexType i = 0; i < batch_size; ++i) + { + IndexType col = offsets[i] + row; + if (col >= 0 && col < num_cols) + { + auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); + auto x_val = load_x_val(&x[col]); + + sum += diag_val*x_val; + } + } + } + + __syncthreads(); + } + + if (row < num_rows) + y[row] = sum; +} + + +// template +// __device__ void +// double_striped_dia_kernel(const int num_rows, +// const int num_cols, +// const int num_diagonals, +// const int pitch, +// const IndexType* diagonal_offsets, +// const ValueType1* values, +// const ValueType2* x, +// ValueType3* y) +// { +// #ifdef STRIPING_FACTOR +// const IndexType desired_distance = 256; +// const IndexType warps_per_stripe = 1; +// #else +// const IndexType desired_distance = 256; +// const IndexType warps_per_stripe = 1; +// #endif + +// IndexType stripes_needed = +// (num_rows + desired_distance - 1) / desired_distance; + +// IndexType warps_needed = 32 * stripes_needed; +// IndexType warps_available = BLOCK_SIZE / 32; + +// IndexType num_sectors; +// if (warps_needed < warps_available) +// { +// // we dont need multiple sectors +// } +// else +// { +// IndexType num_sectors = warps_needed / warps_available; +// IndexType blocks_per_sector = gridDim.x / num_sectors; + +// IndexType sector_id = blockIdx.x / blocks_per_sector; +// IndexType stripe_id = threadIdx.x / 32; +// IndexType lane_id = threadIdx.x % 32; +// } + +// const IndexType group_size = BLOCK_SIZE/num_groups; +// const IndexType stride = num_rows/num_groups; + +// const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + +// const IndexType group_id = threadIdx.x / group_size; +// const IndexType group_lane_id = threadIdx.x % group_size; + +// const IndexType row = group_id*stride + blockIdx.x*group_size +// + group_lane_id; + +// // printf("(%d, %d, %d)-> %d\n", blockIdx.x, threadIdx.x, thread_id, row); + +// __shared__ IndexType offsets[BLOCK_SIZE]; +// ValueType3 sum = ValueType3(0); + +// for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) +// { +// if (offset_base + threadIdx.x < num_diagonals) +// { +// offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; +// } + +// __syncthreads(); + +// int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; + +// if (row < num_rows) +// { +// for (IndexType i = 0; i < batch_size; ++i) +// { +// IndexType col = offsets[i] + row; +// if (col >= 0 && col < num_cols) +// { +// auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); +// auto x_val = load_x_val(&x[col]); + +// sum += diag_val*x_val; +// } +// } +// } + +// __syncthreads(); +// } + +// if (row < num_rows) +// y[row] = sum; +// } + + +template +__global__ void +timed_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y, + long long int* times, + const int times_pitch) +{ + const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + const IndexType row = thread_id; + + __shared__ IndexType offsets[BLOCK_SIZE]; + ValueType3 sum = ValueType3(0); + + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) + { + if (offset_base + threadIdx.x < num_diagonals) + { + offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; + } + + __syncthreads(); + + int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; + + if (row < num_rows) + { + for (IndexType i = 0; i < batch_size; ++i) + { + IndexType col = offsets[i] + row; + if (col >= 0 && col < num_cols) + { + auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); + auto to_load = &x[col]; + + long long int start = clock64(); + auto x_val = load_x_val(to_load); + long long int end = clock64(); + + if (threadIdx.x % 32 == 0) + { + IndexType k = row / 32; + __stwt(times + (offset_base + i)*times_pitch + k, end - start + 1); + } + + sum += diag_val*x_val; + } + } + } + + __syncthreads(); + } + + if (row < num_rows) + y[row] = sum; +} + + + +template __device__ void cached_x_dia_kernel(const int num_rows, const int num_cols, @@ -358,8 +589,7 @@ cached_x_dia_kernel(const int num_rows, template + typename ValueType3> __device__ void experimental_dia_kernel(const int num_rows, const int num_cols, @@ -425,8 +655,7 @@ experimental_dia_kernel(const int num_rows, template + typename ValueType3> __device__ void cached_x_naive_dia_kernel(const int num_rows, const int num_cols, @@ -472,8 +701,7 @@ cached_x_naive_dia_kernel(const int num_rows, template + typename ValueType3> __launch_bounds__(BLOCK_SIZE,1) __global__ void ktt_dia_vector_kernel( const int num_rows, @@ -486,19 +714,19 @@ ktt_dia_vector_kernel( ValueType3* y) { #if KERNEL_TYPE == 0 - naive_dia_kernel + naive_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 1 - blocked_offsets_dia_kernel + blocked_offsets_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 2 - cached_x_dia_kernel + striped_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 3 - experimental_dia_kernel + experimental_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); // #elif KERNEL_TYPE == 4 -// cached_x_naive_dia_kernel +// cached_x_naive_dia_kernel // (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #endif } From bd0d1c715485ed4f944244a86dddbb9158c3b320 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 3 Jan 2023 14:24:44 +0100 Subject: [PATCH 16/49] Make experimental version of two-level striping --- 
cusp/system/cuda/ktt/dia_multiply.h | 19 +++ cusp/system/cuda/ktt/kernels/dia_kernel.h | 186 +++++++++++----------- 2 files changed, 110 insertions(+), 95 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 847599be..18c09884 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -30,6 +30,20 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k tuner.AddParameter(kernel.kernel_id, "STRIPING_FACTOR", std::vector{ 2, 4, 8 }); tuner.AddParameter(kernel.kernel_id, "BLOCK_SIZE", std::vector{ 256, 512 }); + // START OF HACKS --------------------- + tuner.AddParameter(kernel.kernel_id, "BLOCKS_PER_SECTOR", std::vector{ 0 }); + tuner.AddParameter(kernel.kernel_id, "CHUNK_SIZE", std::vector{ 0 }); + tuner.AddParameter(kernel.kernel_id, "STRIPE_SIZE", std::vector{ 0 }); + tuner.AddParameter(kernel.kernel_id, "GRID_SIZE", std::vector{ 0 }); + tuner.AddParameter(kernel.kernel_id, "SECTOR_MAPPING_TYPE", + std::vector{ 0, 1 }); + + tuner.AddConstraint(kernel.kernel_id, { "SECTOR_MAPPING_TYPE" }, + [] (const std::vector& values) { + return values[0] == 0; + }); + // END OF HACKS ----------------------- + tuner.AddThreadModifier( kernel.kernel_id, { kernel.definition_ids[0] }, @@ -156,6 +170,11 @@ auto get_launcher(const kernel_context& ctx, ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]); ::ktt::DimensionVector grid_size( DIVIDE_INTO(num_rows, block_size.GetSizeX()) ); + auto conf = interface.GetCurrentConfiguration(); + for (const auto& pair : conf.GetPairs()) + if (pair.GetName() == "GRID_SIZE" && pair.GetValue() != 0) + grid_size = ::ktt::DimensionVector(pair.GetValue()); + if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index ccf6338b..7f26658d 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -357,97 +357,96 @@ striped_dia_kernel(const int num_rows, } -// template -// __device__ void -// double_striped_dia_kernel(const int num_rows, -// const int num_cols, -// const int num_diagonals, -// const int pitch, -// const IndexType* diagonal_offsets, -// const ValueType1* values, -// const ValueType2* x, -// ValueType3* y) -// { -// #ifdef STRIPING_FACTOR -// const IndexType desired_distance = 256; -// const IndexType warps_per_stripe = 1; -// #else -// const IndexType desired_distance = 256; -// const IndexType warps_per_stripe = 1; -// #endif - -// IndexType stripes_needed = -// (num_rows + desired_distance - 1) / desired_distance; - -// IndexType warps_needed = 32 * stripes_needed; -// IndexType warps_available = BLOCK_SIZE / 32; - -// IndexType num_sectors; -// if (warps_needed < warps_available) -// { -// // we dont need multiple sectors -// } -// else -// { -// IndexType num_sectors = warps_needed / warps_available; -// IndexType blocks_per_sector = gridDim.x / num_sectors; - -// IndexType sector_id = blockIdx.x / blocks_per_sector; -// IndexType stripe_id = threadIdx.x / 32; -// IndexType lane_id = threadIdx.x % 32; -// } - -// const IndexType group_size = BLOCK_SIZE/num_groups; -// const IndexType stride = num_rows/num_groups; - -// const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - -// const IndexType group_id = threadIdx.x / group_size; -// const IndexType group_lane_id = threadIdx.x % group_size; - -// const IndexType 
row = group_id*stride + blockIdx.x*group_size -// + group_lane_id; - -// // printf("(%d, %d, %d)-> %d\n", blockIdx.x, threadIdx.x, thread_id, row); - -// __shared__ IndexType offsets[BLOCK_SIZE]; -// ValueType3 sum = ValueType3(0); - -// for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) -// { -// if (offset_base + threadIdx.x < num_diagonals) -// { -// offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; -// } - -// __syncthreads(); - -// int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; - -// if (row < num_rows) -// { -// for (IndexType i = 0; i < batch_size; ++i) -// { -// IndexType col = offsets[i] + row; -// if (col >= 0 && col < num_cols) -// { -// auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); -// auto x_val = load_x_val(&x[col]); - -// sum += diag_val*x_val; -// } -// } -// } - -// __syncthreads(); -// } - -// if (row < num_rows) -// y[row] = sum; -// } +template +__device__ void +double_striped_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) +{ +#if KERNEL_TYPE == 3 + IndexType blocks_per_sector = BLOCKS_PER_SECTOR; + IndexType stripe_size = STRIPE_SIZE; + IndexType chunk_size = CHUNK_SIZE; +#else + IndexType blocks_per_sector = 0; + IndexType stripe_size = 0; + IndexType chunk_size = 0; + assert(false); +#endif + +#if SECTOR_MAPPING_TYPE == 0 + IndexType sector_id = blockIdx.x / blocks_per_sector; + IndexType sector_lane_id = blockIdx.x % blocks_per_sector; + + IndexType stripe_id = threadIdx.x / (chunk_size*32); + IndexType stripe_lane_id = threadIdx.x % (chunk_size*32); + + IndexType row = sector_id*blocks_per_sector*BLOCK_SIZE + + stripe_id*stripe_size + + sector_lane_id*chunk_size*32 + + stripe_lane_id; +#elif SECTOR_MAPPING_TYPE == 1 + IndexType num_sectors = gridDim.x / blocks_per_sector; + + IndexType sector_id = blockIdx.x % num_sectors; + IndexType sector_lane_id = blockIdx.x / num_sectors; + + IndexType stripe_id = threadIdx.x / (chunk_size*32); + IndexType stripe_lane_id = threadIdx.x % (chunk_size*32); + + IndexType row = sector_id*blocks_per_sector*BLOCK_SIZE + + stripe_id*stripe_size + + sector_lane_id*chunk_size*32 + + stripe_lane_id; +#endif + + IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + + //printf("(%d, %d, %d) -> %d\n", blockIdx.x, threadIdx.x, thread_id, row); + + __shared__ IndexType offsets[BLOCK_SIZE]; + ValueType3 sum = ValueType3(0); + + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) + { + if (offset_base + threadIdx.x < num_diagonals) + { + offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; + } + + __syncthreads(); + + int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; + + if (row < num_rows) + { + for (IndexType i = 0; i < batch_size; ++i) + { + IndexType col = offsets[i] + row; + if (col >= 0 && col < num_cols) + { + auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); + auto x_val = load_x_val(&x[col]); + + sum += diag_val*x_val; + } + } + } + + __syncthreads(); + } + + if (row < num_rows) + y[row] = sum; +} template (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #elif KERNEL_TYPE == 3 - experimental_dia_kernel + double_striped_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -// #elif KERNEL_TYPE == 4 -// cached_x_naive_dia_kernel -// (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); #endif } From 697bec30dd6e07a856c58a732fa04a8d814f9776 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 4 Jan 2023 10:10:17 +0100 Subject: [PATCH 17/49] Add support for csr format --- cusp/ktt/detail/ktt.inl | 5 +- cusp/ktt/ktt.h | 10 +- cusp/system/cuda/ktt/csr_multiply.h | 218 +++++++++++++--------- cusp/system/cuda/ktt/dia_multiply.h | 18 +- cusp/system/cuda/ktt/kernels/csr_kernel.h | 23 +-- cusp/system/cuda/ktt/multiply.h | 94 +++++++++- cusp/system/cuda/ktt/utils.h | 28 ++- cusp/system/detail/generic/multiply.inl | 9 +- testing/ktt.cu | 22 ++- 9 files changed, 281 insertions(+), 146 deletions(-) diff --git a/cusp/ktt/detail/ktt.inl b/cusp/ktt/detail/ktt.inl index f2b3c745..e02b4529 100644 --- a/cusp/ktt/detail/ktt.inl +++ b/cusp/ktt/detail/ktt.inl @@ -101,12 +101,13 @@ template typename Matrix, + typename IndexType, typename ValueType1, typename ValueType2, typename ValueType3> std::vector<::ktt::KernelResult> -tune(const cusp::dia_matrix& A, +tune(const Matrix& A, const cusp::array1d& x, cusp::array1d& y, std::optional<::ktt::ReferenceComputation> reference_computation) diff --git a/cusp/ktt/ktt.h b/cusp/ktt/ktt.h index cab2d637..88a5d026 100644 --- a/cusp/ktt/ktt.h +++ b/cusp/ktt/ktt.h @@ -1,9 +1,10 @@ #pragma once -#include - +#include #include +#include + namespace cusp { namespace ktt { @@ -54,12 +55,13 @@ ::ktt::KernelResult multiply(const Matrix& A, const ::ktt::KernelConfiguration& configuration, bool run_with_profiling = false); -template typename Matrix, + typename IndexType, typename ValueType1, typename ValueType2, typename ValueType3> std::vector<::ktt::KernelResult> -tune(const cusp::dia_matrix& A, +tune(const Matrix& A, const cusp::array1d& x, cusp::array1d& y, std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt); diff --git a/cusp/system/cuda/ktt/csr_multiply.h b/cusp/system/cuda/ktt/csr_multiply.h index 0c0deec8..3303fe93 100644 --- a/cusp/system/cuda/ktt/csr_multiply.h +++ b/cusp/system/cuda/ktt/csr_multiply.h @@ -1,25 +1,18 @@ -#pragma once +#pragma once -#include +#include #include -#include -#include // max_active_blocks - -#include #include -namespace cusp::system::cuda::ktt::csr { - inline ::ktt::KernelId get_kernel_id(::ktt::KernelDefinitionId id, std::string kernel_name) { - static ::ktt::KernelId kernel = cusp::ktt::detail::tuner->CreateSimpleKernel(kernel_name, id); - return kernel; - } -} +#include +#include +#include + +#include + +#include -template<> -inline ::ktt::KernelId cusp::ktt::detail::get_kernel_id() { - return cusp::system::cuda::ktt::csr::get_kernel_id(0, ""); -} namespace cusp { @@ -31,109 +24,148 @@ namespace ktt { namespace csr { -inline void setup_tuning_parameters(::ktt::Tuner& tuner, ::ktt::KernelId kernel) { - 
tuner.AddParameter(kernel, "TEST_PARAM", std::vector{0, 1, 2}); + +inline void setup_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + tuner.AddParameter(kernel_id, "BLOCK_SIZE", std::vector{ 128 }); + tuner.AddParameter(kernel_id, "THREADS_PER_VECTOR", std::vector{ 32 }); + + tuner.AddThreadModifier( + kernel.kernel_id, + { kernel.definition_ids[0] }, + ::ktt::ModifierType::Local, + ::ktt::ModifierDimension::X, + { std::string("BLOCK_SIZE") }, + [] (const uint64_t defaultSize, const std::vector& parameters) { + return parameters[0]; + } + ); } -} // namespace csr -template -void multiply(cuda::execution_policy& exec, - const MatrixType& A, - const VectorType1& x, - VectorType2& y, - cusp::csr_format) +template +kernel_context initialize_kernel(::ktt::Tuner& tuner) { - static_assert( std::is_same_v - && std::is_same_v - && std::is_same_v, - "All arguments must be in device memory." ); + std::string kernel_path = + STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/csr_kernel.h"; - using ValueType = typename MatrixType::value_type; + kernel_context kernel(tuner); - if (A.num_entries == 0) { - thrust::fill(y.begin(), y.end(), ValueType(0)); - return; - } + std::vector< std::string > type_names { + std::string(NAMEOF_TYPE(IndexType)), + std::string(NAMEOF_TYPE(ValueType1)), + std::string(NAMEOF_TYPE(ValueType2)), + std::string(NAMEOF_TYPE(ValueType3)), + }; - std::cout << "Hello from out thing\n"; + // NOTE: These can be anything since they are awlays set in the launcher. + // So use some values that will hopefully cause a crash if not set properly + // in the launcher. + ::ktt::DimensionVector block_size(0); + ::ktt::DimensionVector grid_size(0); - // copied from csr_vector_spmv.h - typedef typename MatrixType::row_offsets_array_type::const_iterator RowIterator; - typedef typename MatrixType::column_indices_array_type::const_iterator ColumnIterator; - typedef typename MatrixType::values_array_type::const_iterator ValueIterator1; + auto definition_id = tuner.AddKernelDefinitionFromFile( + "ktt_csr_vector_kernel", + kernel_path, + grid_size, + block_size, + type_names + ); - typedef typename VectorType1::const_iterator ValueIterator2; - typedef typename VectorType2::iterator ValueIterator3; + kernel.definition_ids.push_back(definition_id); + kernel.kernel_id = tuner.CreateSimpleKernel("CsrKernel", definition_id); - auto& tuner = *cusp::ktt::detail::tuner; + setup_tuning_parameters(kernel); - std::string path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/csr_kernel.h"; + return kernel; +} - const size_t THREADS_PER_BLOCK = 128; - const size_t THREADS_PER_VECTOR = 32; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - std::vector< std::string > type_names { - std::string(NAMEOF_TYPE(typename MatrixType::index_type)), - std::string(NAMEOF_TYPE(typename MatrixType::value_type)), - std::string(NAMEOF_TYPE(typename VectorType1::value_type)), - std::string(NAMEOF_TYPE(typename VectorType2::value_type)), - std::to_string(VECTORS_PER_BLOCK), - std::to_string(THREADS_PER_VECTOR) - }; - // TODO: we probably wanna change this later - const size_t MAX_BLOCKS = cusp::system::cuda::detail::max_active_blocks( - ktt_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, VECTORS_PER_BLOCK)); +} // namespace csr + + +template +const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::csr_format format) +{ + static kernel_context kernel = + 
csr::initialize_kernel(tuner); + + return kernel; +} + - const ::ktt::DimensionVector blockDimensions(THREADS_PER_BLOCK); - const ::ktt::DimensionVector gridDimensions(NUM_BLOCKS); +template +std::vector<::ktt::ArgumentId> +add_arguments(const kernel_context& kernel, + const cusp::csr_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + auto args = add_arguments(*kernel.tuner, + A.num_rows, A.row_offsets, A.column_indices, + A.values, x, y); - ::ktt::KernelDefinitionId definition = tuner.GetKernelDefinitionId("ktt_csr_vector_kernel", type_names); + kernel.tuner->SetArguments(kernel.definition_ids[0], args); - bool called_first_time = definition == ::ktt::InvalidKernelDefinitionId; + return args; +} - if (called_first_time) { - definition = tuner.AddKernelDefinitionFromFile( - "ktt_csr_vector_kernel", - path, - gridDimensions, - blockDimensions, - type_names); - } - ::ktt::KernelId kernel = csr::get_kernel_id(definition, "CsrKernel"); +// Returns the argument id of the y vector given a vector of arguments +// returned by a previous call of `add_arguments`. +inline ::ktt::ArgumentId +get_output_argument(const std::vector<::ktt::ArgumentId>& arguments, + cusp::csr_format) +{ + return arguments[5]; +} - if (called_first_time) { - csr::setup_tuning_parameters(tuner, kernel); - } - auto num_rows_id = tuner.AddArgumentScalar(A.num_rows); - auto Ap_id = add_arg(A.row_offsets); - auto Aj_id = add_arg(A.column_indices); - auto Ax_id = add_arg(A.values); - auto x_id = add_arg(x); - auto y_id = add_arg(y, ::ktt::ArgumentAccessType::ReadWrite); +template +auto get_launcher(const kernel_context& ctx, + const cusp::csr_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + bool profile = false) +{ + return [&] (::ktt::ComputeInterface& interface) + { + auto conf = interface.GetCurrentConfiguration(); - std::vector<::ktt::ArgumentId> args = { num_rows_id, Ap_id, Aj_id, Ax_id, x_id, y_id }; + ::ktt::DimensionVector block_size = + interface.GetCurrentLocalSize(ctx.definition_ids[0]); - tuner.SetArguments(definition, args); + auto threads_per_vector = get_parameter_uint(conf, "THREADS_PER_VECTOR"); + auto vectors_per_block = block_size.GetSizeX() / threads_per_vector; - tuner.TuneIteration(kernel, {}); + ::ktt::DimensionVector grid_size( + DIVIDE_INTO(A.num_rows, vectors_per_block)); - tuner.SetArguments(definition, {}); - for (auto arg : args) { - tuner.RemoveArgument(arg); - } + if (!profile) { + interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); + } else { + interface.RunKernelWithProfiling(ctx.definition_ids[0], + grid_size, block_size); + } + }; } + } // namespace ktt } // namespace cuda diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 18c09884..5e9a7414 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -1,11 +1,14 @@ #pragma once -#include -#include // max_active_blocks +#include -#include #include #include +#include + +#include // max_active_blocks +#include +#include #include #include @@ -94,11 +97,6 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) std::string kernel_path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h"; - const size_t block_size = 256; - const size_t max_blocks = cusp::system::cuda::detail::max_active_blocks( - ktt_dia_vector_kernel, - block_size, 0); - std::vector< std::string > type_names { std::string(NAMEOF_TYPE(IndexType)), std::string(NAMEOF_TYPE(ValueType1)), @@ -106,8 +104,8 @@ kernel_context 
initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) std::string(NAMEOF_TYPE(ValueType3)) }; - const ::ktt::DimensionVector blockDimensions(block_size); - const ::ktt::DimensionVector gridDimensions(max_blocks); + const ::ktt::DimensionVector blockDimensions(0); + const ::ktt::DimensionVector gridDimensions(0); auto definition_id = tuner.AddKernelDefinitionFromFile( "ktt_dia_vector_kernel", diff --git a/cusp/system/cuda/ktt/kernels/csr_kernel.h b/cusp/system/cuda/ktt/kernels/csr_kernel.h index f28773a1..85cd6fa6 100644 --- a/cusp/system/cuda/ktt/kernels/csr_kernel.h +++ b/cusp/system/cuda/ktt/kernels/csr_kernel.h @@ -1,17 +1,11 @@ +#pragma once -inline __device__ int fib(int n) { - if (n < 2) { - return n; - } - return fib(n-1) + fib(n-2); -} +#define VECTORS_PER_BLOCK (BLOCK_SIZE/THREADS_PER_VECTOR) template + typename ValueType3> __global__ void ktt_csr_vector_kernel(const unsigned int num_rows, const IndexType* Ap, @@ -20,17 +14,6 @@ ktt_csr_vector_kernel(const unsigned int num_rows, const ValueType2* x, ValueType3* y) { - -#if TEST_PARAM == 0 - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("Test param is 0.\n"); - } -#else - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("Fib of 20 is %d.\n", fib(20)); - } -#endif - typedef ValueType1 ValueType; __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h index 5e6accab..4a8e9a34 100644 --- a/cusp/system/cuda/ktt/multiply.h +++ b/cusp/system/cuda/ktt/multiply.h @@ -1,9 +1,14 @@ -#pragma once +#pragma once #include -//#include + +#include #include +#include + +#include + namespace cusp { namespace system { @@ -12,7 +17,7 @@ namespace cuda { namespace ktt { -// TODO: should this be here?? + // Helpful function to get a kernel used for a given multiplication // instead of having to manually specify the types. 
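// For example, the multiply() overloads below obtain it as
//
//     const kernel_context& kernel = get_kernel(tuner, A, x, y);
//
// with A, x and y already resident in device memory.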
template (tuner, FormatType{}); } + +template +::ktt::KernelResult multiply(::ktt::Tuner& tuner, const MatrixType& A, + const VectorType1& x, VectorType2& y) +{ + const kernel_context& kernel = get_kernel(tuner, A, x, y); + + auto args = add_arguments(kernel, A, x, y); + + auto launcher = get_launcher(kernel, A, x, y, false); + tuner.SetLauncher(kernel.kernel_id, launcher); + + auto res = tuner.TuneIteration(kernel.kernel_id, {}); + + remove_arguments(kernel, args); + + return res; +} + + +template +::ktt::KernelResult +multiply(::ktt::Tuner& tuner, const MatrixType& A, const VectorType1& x, + VectorType2& y, const ::ktt::KernelConfiguration& configuration, + bool run_with_profiling = false) +{ + const kernel_context& kernel = get_kernel(tuner, A, x, y); + + auto args = add_arguments(kernel, A, x, y); + + auto launcher = get_launcher(kernel, A, x, y, run_with_profiling); + tuner.SetLauncher(kernel.kernel_id, launcher); + + auto result = tuner.Run(kernel.kernel_id, configuration, {}); + + remove_arguments(kernel, args); + + return result; +} + + +template +std::vector<::ktt::KernelResult> +tune(::ktt::Tuner& tuner, const MatrixType& A, + const VectorType1& x, VectorType2& y, + std::optional<::ktt::ReferenceComputation> ref_computation = std::nullopt) +{ + using Format = typename MatrixType::format; + using HostVector2 = + typename VectorType2::template rebind::type; + + kernel_context kernel = get_kernel(tuner, A, x, y); + auto args = add_arguments(kernel, A, x, y); + + if (ref_computation) + { + tuner.SetReferenceComputation(get_output_argument(args, Format{}), + *ref_computation); + } + + HostVector2 host_y = y; + tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface) + { + // clear y so previous results don't affect validation + y = host_y; + auto launcher = get_launcher(kernel, A, x, y, false); + launcher(interface); + }); + + auto results = tuner.Tune(kernel.kernel_id); + + remove_arguments(kernel, args); + + return results; +} + + } // namespace ktt } // namespace cuda diff --git a/cusp/system/cuda/ktt/utils.h b/cusp/system/cuda/ktt/utils.h index a2d1c029..556f9542 100644 --- a/cusp/system/cuda/ktt/utils.h +++ b/cusp/system/cuda/ktt/utils.h @@ -3,6 +3,10 @@ #include #include +#include // uint64_t +#include // std::runtime_error + + #define STR(str) #str #define STRING(str) STR(str) @@ -16,7 +20,7 @@ namespace ktt { template void* cast(const T* ptr) -{ +{ return const_cast(static_cast(ptr)); } @@ -65,7 +69,7 @@ void add_arg(::ktt::Tuner& tuner, std::vector<::ktt::ArgumentId>& argument_ids, argument_ids.push_back(id); } -template +template std::vector<::ktt::ArgumentId> add_arguments(::ktt::Tuner& tuner, Args&&... 
args) { std::vector<::ktt::ArgumentId> argument_ids; @@ -85,6 +89,26 @@ inline void remove_arguments(const kernel_context& kernel, const std::vector<::k } +uint64_t get_parameter_uint(const ::ktt::KernelConfiguration& conf, + const std::string& name) +{ + for (const auto& pair : conf.GetPairs()) + if (pair.GetName() == name) + return pair.GetValue(); + + throw std::runtime_error("No paramater with name: " + name); +} + +double get_parameter_double(const ::ktt::KernelConfiguration& conf, + const std::string& name) +{ + for (const auto& pair : conf.GetPairs()) + if (pair.GetName() == name) + return pair.GetValueDouble(); + + throw std::runtime_error("No paramater with name: " + name); +} + } // namespace ktt } // namespace cuda diff --git a/cusp/system/detail/generic/multiply.inl b/cusp/system/detail/generic/multiply.inl index 6e620701..501e0b80 100644 --- a/cusp/system/detail/generic/multiply.inl +++ b/cusp/system/detail/generic/multiply.inl @@ -109,7 +109,7 @@ multiply(thrust::execution_policy &exec, template struct has_rebind : std::false_type {}; - + template struct has_rebind>> : std::true_type {}; @@ -128,11 +128,14 @@ multiply(cusp::system::cuda::detail::execution_policy &exec, { typedef typename LinearOperator::format MatrixFormat; - if constexpr ((/*cusp::detail::is_csr::value ||*/ cusp::detail::is_dia::value) && has_rebind_v) { + if constexpr ((cusp::detail::is_csr::value + || cusp::detail::is_dia::value) + && has_rebind_v) + { if (cusp::ktt::detail::is_enabled) { cusp::ktt::detail::lazy_init(); cusp::system::cuda::ktt::multiply(*cusp::ktt::detail::tuner, A, B, C); - return; + return; } } diff --git a/testing/ktt.cu b/testing/ktt.cu index 613e668a..e54f4289 100644 --- a/testing/ktt.cu +++ b/testing/ktt.cu @@ -29,8 +29,9 @@ } \ DECLARE_UNITTEST(VTEST##Ktt##Fmt##Matrix); -#define DECLARE_KTT_UNITTEST(VTEST) \ - DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST,Dia,dia) +#define DECLARE_KTT_UNITTEST(VTEST) \ + DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Dia, dia) \ + DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Csr, csr) struct UnitTestStopCondition : ::ktt::StopCondition @@ -71,11 +72,12 @@ private: }; -void assert_tunning_results_valid(const std::vector<::ktt::KernelResult>& results, - const std::string& ktt_logs, - const std::string& arg_name, - const std::string& filename = "unknown", - int lineno = -1) +void assert_tunning_results_valid( + const std::vector<::ktt::KernelResult>& results, + const std::string& ktt_logs, + const std::string& arg_name, + const std::string& filename = "unknown", + int lineno = -1) { bool failed = false; unittest::UnitTestFailure f; @@ -106,8 +108,10 @@ void assert_tunning_results_valid(const std::vector<::ktt::KernelResult>& result failed = true; - f << "[" << filename << ":" << lineno << "] " << result.GetKernelName() << ": "; - f << "Encountered an error: " << reason << "\n\n"; + f << "[" << filename << ":" << lineno << "] "; + f << result.GetKernelName() << ": "; + f << reason << "\n\n"; + f << "On matrix: " << arg_name << "\n\n"; f << "In configuration:\n"; From b0f277766e12ec516970b47c7fe781ac60b99f5a Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 10 Jan 2023 19:11:22 +0100 Subject: [PATCH 18/49] Fix deprecation warning --- cusp/system/cuda/detail/synchronize.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cusp/system/cuda/detail/synchronize.inl b/cusp/system/cuda/detail/synchronize.inl index 8256398e..73919e38 100644 --- a/cusp/system/cuda/detail/synchronize.inl +++ b/cusp/system/cuda/detail/synchronize.inl @@ -32,7 +32,7 @@ 
namespace detail void synchronize(const char *message) { - cudaError_t error = cudaThreadSynchronize(); + cudaError_t error = cudaDeviceSynchronize(); if(error) { throw thrust::system_error(error, thrust::cuda_category(), std::string("synchronize: ") + message); From 256c3c8c0d78f04803885b6b096c12bb11523912 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 14 Feb 2023 20:45:00 +0100 Subject: [PATCH 19/49] Make it possible to use ell with ktt --- cusp/system/cuda/ktt/ell_multiply.h | 160 ++++++++++++++++++++++ cusp/system/cuda/ktt/kernels/ell_kernel.h | 55 ++++++++ cusp/system/cuda/ktt/multiply.h | 1 + cusp/system/detail/generic/multiply.inl | 4 +- testing/ktt.cu | 2 +- 5 files changed, 219 insertions(+), 3 deletions(-) create mode 100644 cusp/system/cuda/ktt/ell_multiply.h create mode 100644 cusp/system/cuda/ktt/kernels/ell_kernel.h diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h new file mode 100644 index 00000000..f7a8da71 --- /dev/null +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -0,0 +1,160 @@ +#pragma once + +#include +#include + +namespace cusp { + +namespace system { + +namespace cuda { + +namespace ktt { + +namespace ell { + + +inline void setup_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + tuner.AddParameter(kernel_id, "BLOCK_SIZE", std::vector{ 128 }); + tuner.AddParameter(kernel_id, "UNCACHED_LOADS", + std::vector{ 0, 1 }); + + tuner.AddThreadModifier( + kernel_id, + { kernel.definition_ids[0] }, + ::ktt::ModifierType::Local, + ::ktt::ModifierDimension::X, + { std::string("BLOCK_SIZE") }, + [] (const uint64_t defaultSize, const std::vector& parameters) { + return parameters[0]; + } + ); +} + + +template +kernel_context initialize_kernel(::ktt::Tuner& tuner) +{ + std::string kernel_path = + STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/ell_kernel.h"; + + kernel_context kernel(tuner); + + std::vector< std::string > type_names { + std::string(NAMEOF_TYPE(IndexType)), + std::string(NAMEOF_TYPE(ValueType1)) + }; + + // NOTE: These can be anything since they are awlays set in the launcher. + // So use some values that will hopefully cause a crash if not set properly + // in the launcher. + ::ktt::DimensionVector block_size(0); + ::ktt::DimensionVector grid_size(0); + + auto definition_id = tuner.AddKernelDefinitionFromFile( + "ktt_ell_kernel", + kernel_path, + grid_size, + block_size, + type_names + ); + + kernel.definition_ids.push_back(definition_id); + kernel.kernel_id = tuner.CreateSimpleKernel("EllKernel", definition_id); + + setup_tuning_parameters(kernel); + + return kernel; +} + + +} // namespace ell + + +template +const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::ell_format format) +{ + static kernel_context kernel = + ell::initialize_kernel(tuner); + + return kernel; +} + + +template +std::vector<::ktt::ArgumentId> +add_arguments(const kernel_context& kernel, + const cusp::ell_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + IndexType pitch = A.column_indices.pitch; + IndexType num_entries_per_row = A.column_indices.num_cols; + + auto args = add_arguments(*kernel.tuner, + A.num_rows, A.num_cols, num_entries_per_row, pitch, + A.column_indices.values, A.values.values, x, y); + + kernel.tuner->SetArguments(kernel.definition_ids[0], args); + + return args; +} + + +// Returns the argument id of the y vector given a vector of arguments +// returned by a previous call of `add_arguments`. 
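+// For the ELL kernel, `add_arguments` above registers
+// (num_rows, num_cols, num_entries_per_row, pitch, Aj, Ax, x, y),
+// so the y vector sits at index 7.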
+inline ::ktt::ArgumentId +get_output_argument(const std::vector<::ktt::ArgumentId>& arguments, + cusp::ell_format) +{ + return arguments[7]; +} + + +template +auto get_launcher(const kernel_context& ctx, + const cusp::ell_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + bool profile = false) +{ + return [&] (::ktt::ComputeInterface& interface) + { + ::ktt::DimensionVector block_size = + interface.GetCurrentLocalSize(ctx.definition_ids[0]); + ::ktt::DimensionVector grid_size( + DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); + + if (!profile) { + interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); + } else { + interface.RunKernelWithProfiling(ctx.definition_ids[0], + grid_size, block_size); + } + }; +} + + +} // namespace ktt + +} // namespace cuda + +} // namespace system + +} // namespace cusp diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h new file mode 100644 index 00000000..be2d250d --- /dev/null +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -0,0 +1,55 @@ + +template +__device__ T load_uncached(const T* addr) +{ +#if UNCACHED_LOADS == 0 + return *addr; +#else + return __ldcv(addr); +#endif +} + +template +__launch_bounds__(BLOCK_SIZE, 1) __global__ void +ktt_ell_kernel(const IndexType num_rows, + const IndexType num_cols, + const IndexType num_cols_per_row, + const IndexType pitch, + const IndexType * Aj, + const ValueType * Ax, + const ValueType * x, + ValueType * y) +{ + const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x; + + // if (row == 0) + // { + // printf("Hello from ELL.\n"); + // } + + if (row < num_rows) + { + ValueType sum = 0; + IndexType offset = row; + + for(IndexType n = 0; n < num_cols_per_row; n++) + { + const IndexType col = load_uncached(Aj + offset); + + // This assumes that + // cusp::ell_matrix<...>::invalid_index is always < 0 + if (col >= 0) + { + const ValueType x_j = x[col]; + const ValueType A_ij = load_uncached(Ax + offset); + + sum += A_ij * x_j; + } + + offset += pitch; + } + + y[row] = sum; + } +} diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h index 4a8e9a34..4f5a0b84 100644 --- a/cusp/system/cuda/ktt/multiply.h +++ b/cusp/system/cuda/ktt/multiply.h @@ -4,6 +4,7 @@ #include #include +#include #include diff --git a/cusp/system/detail/generic/multiply.inl b/cusp/system/detail/generic/multiply.inl index 501e0b80..99909c51 100644 --- a/cusp/system/detail/generic/multiply.inl +++ b/cusp/system/detail/generic/multiply.inl @@ -128,8 +128,8 @@ multiply(cusp::system::cuda::detail::execution_policy &exec, { typedef typename LinearOperator::format MatrixFormat; - if constexpr ((cusp::detail::is_csr::value - || cusp::detail::is_dia::value) + if constexpr ((cusp::detail::is_ell::value + || cusp::detail::is_dia::value) && has_rebind_v) { if (cusp::ktt::detail::is_enabled) { diff --git a/testing/ktt.cu b/testing/ktt.cu index e54f4289..aac96e34 100644 --- a/testing/ktt.cu +++ b/testing/ktt.cu @@ -31,7 +31,7 @@ #define DECLARE_KTT_UNITTEST(VTEST) \ DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Dia, dia) \ - DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Csr, csr) + DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Ell, ell) struct UnitTestStopCondition : ::ktt::StopCondition From cd09e89b542e6383c4faa21f71ebbf02fdc2dbd5 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 15 Feb 2023 10:29:55 +0100 Subject: [PATCH 20/49] Add different load types --- cusp/system/cuda/ktt/kernels/ell_kernel.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 
deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h index be2d250d..e8b55d2f 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -4,8 +4,12 @@ __device__ T load_uncached(const T* addr) { #if UNCACHED_LOADS == 0 return *addr; -#else +#elif UNCACHED_LOADS == 1 return __ldcv(addr); +#elif UNCACHED_LOADS == 2 + return __ldcs(addr); +#elif UNCACHED_LOADS == 3 + return __ldlu(addr); #endif } @@ -16,18 +20,13 @@ ktt_ell_kernel(const IndexType num_rows, const IndexType num_cols, const IndexType num_cols_per_row, const IndexType pitch, - const IndexType * Aj, - const ValueType * Ax, - const ValueType * x, - ValueType * y) + const IndexType* __restrict__ Aj, + const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + ValueType* __restrict__ y) { const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x; - // if (row == 0) - // { - // printf("Hello from ELL.\n"); - // } - if (row < num_rows) { ValueType sum = 0; From ae7d024f593cb25826159116ba245de118094d1d Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Sun, 5 Mar 2023 20:29:53 +0100 Subject: [PATCH 21/49] Add convenience type for creating tuning configuration --- cusp/system/cuda/ktt/utils.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cusp/system/cuda/ktt/utils.h b/cusp/system/cuda/ktt/utils.h index 556f9542..a3f4af1e 100644 --- a/cusp/system/cuda/ktt/utils.h +++ b/cusp/system/cuda/ktt/utils.h @@ -18,6 +18,9 @@ namespace cuda { namespace ktt { +using u64_vec = std::vector; + + template void* cast(const T* ptr) { From 41a18e1c85f58f005f39c635d218026979200164 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Sun, 5 Mar 2023 20:30:34 +0100 Subject: [PATCH 22/49] Add new tuning parameters --- cusp/system/cuda/ktt/ell_multiply.h | 32 +++- cusp/system/cuda/ktt/kernels/ell_kernel.h | 193 ++++++++++++++++++++-- 2 files changed, 206 insertions(+), 19 deletions(-) diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index f7a8da71..19456107 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -19,9 +19,12 @@ inline void setup_tuning_parameters(const kernel_context& kernel) auto& tuner = *kernel.tuner; auto kernel_id = kernel.kernel_id; - tuner.AddParameter(kernel_id, "BLOCK_SIZE", std::vector{ 128 }); - tuner.AddParameter(kernel_id, "UNCACHED_LOADS", - std::vector{ 0, 1 }); + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128 }); + tuner.AddParameter(kernel_id, "BREAK", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "UNCACHED_LOADS", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "DISABLE_UNROLL", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 }); + tuner.AddParameter(kernel_id, "THREADS_PER_ROW", u64_vec{ 1, 2, 4 }); tuner.AddThreadModifier( kernel_id, @@ -33,6 +36,24 @@ inline void setup_tuning_parameters(const kernel_context& kernel) return parameters[0]; } ); + + // BREAK can be used only when there is no prefetching + // TODO(KTT): Implement prefetching when using early break and remove this + tuner.AddConstraint( + kernel.kernel_id, + { "BREAK", "PREFETCH_FACTOR" }, + [] (const std::vector& values) { + return values[0] == 0 || values[1] == 0; + }); + + + // All the warps working on one stripe of the matrix fit in a thread block. 
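+    // With BLOCK_SIZE fixed at 128 this admits THREADS_PER_ROW values of 1, 2
+    // and 4, i.e. at most four 32-thread warps cooperate on a single stripe.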
+ tuner.AddConstraint( + kernel.kernel_id, + { "THREADS_PER_ROW", "BLOCK_SIZE" }, + [] (const std::vector& values) { + return 32*values[0] <= values[1]; + }); } @@ -136,10 +157,13 @@ auto get_launcher(const kernel_context& ctx, { return [&] (::ktt::ComputeInterface& interface) { + const auto& conf = interface.GetCurrentConfiguration(); + auto threads_per_row = get_parameter_uint(conf, "THREADS_PER_ROW"); + ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]); ::ktt::DimensionVector grid_size( - DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); + threads_per_row * DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h index e8b55d2f..3f42ce92 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -1,6 +1,13 @@ +#if DISABLE_UNROLL == 1 +#define UNROLL 1 +#else +#define UNROLL +#endif + + template -__device__ T load_uncached(const T* addr) +__device__ T load(const T* addr) { #if UNCACHED_LOADS == 0 return *addr; @@ -13,42 +20,198 @@ __device__ T load_uncached(const T* addr) #endif } + +template +struct Prefetcher : Prefetcher +{ + using parent = Prefetcher; + + IndexType col; + ValueType A_val = 0; + ValueType x_val = 0; + + __device__ + void prefetch_cols(const IndexType* __restrict__ Aj, + IndexType offset, + IndexType pitch, + IndexType i) + { + col = load(Aj + offset + i*pitch); + parent::prefetch_cols(Aj, offset, pitch, i + 1); + } + + __device__ + void prefetch_vals(const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + IndexType offset, + IndexType pitch, + IndexType i) + { + if (col >= 0) + { + A_val = load(Ax + offset + i*pitch); + x_val = x[col]; + } + parent::prefetch_vals(Ax, x, offset, pitch, i + 1); + } + + __device__ ValueType accumulate_results() + { + return x_val*A_val + parent::accumulate_results(); + } +}; + +template +struct Prefetcher +{ + __device__ + void prefetch_cols(const IndexType* __restrict__ Aj, IndexType offset, + IndexType pitch, IndexType i) { } + + __device__ + void prefetch_vals(const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, IndexType offset, + IndexType pitch, IndexType i) { } + + __device__ ValueType accumulate_results() + { + return 0; + } +}; + + template -__launch_bounds__(BLOCK_SIZE, 1) __global__ void -ktt_ell_kernel(const IndexType num_rows, - const IndexType num_cols, - const IndexType num_cols_per_row, - const IndexType pitch, - const IndexType* __restrict__ Aj, - const ValueType* __restrict__ Ax, - const ValueType* __restrict__ x, - ValueType* __restrict__ y) +__device__ void +ktt_ell_kernel_basic(const IndexType num_rows, + const IndexType num_cols, + const IndexType num_cols_per_row, + const IndexType pitch, + const IndexType* __restrict__ Aj, + const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + ValueType* __restrict__ y) { + +#if THREADS_PER_ROW > 1 + const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + const IndexType warp_id = thread_id/32; + const IndexType stripe_id = warp_id/THREADS_PER_ROW; + const IndexType warp_lane = thread_id % 32; + const IndexType stripe_lane = warp_id % THREADS_PER_ROW; + const IndexType block_lane = threadIdx.x/(32*THREADS_PER_ROW); + + const IndexType row = stripe_id*32 + warp_lane; + const IndexType start_offset = row + stripe_lane * pitch; + const IndexType stride = 
THREADS_PER_ROW * pitch; + + const IndexType rounded_up_count = + (num_cols_per_row + THREADS_PER_ROW - 1)/THREADS_PER_ROW; + const IndexType remainder = num_cols_per_row % THREADS_PER_ROW; + + const IndexType elem_count = + remainder == 0 || stripe_lane < remainder + ? rounded_up_count + : rounded_up_count - 1; + + //if (row == 1) + // printf("%d -> %d %d %d %d\n", thread_id, rounded_up_count, remainder, stripe_lane, elem_count); + + const int stripes_in_block = BLOCK_SIZE/(32*THREADS_PER_ROW); + __shared__ ValueType sums[stripes_in_block][32]; + + if (stripe_lane == 0) + { + sums[block_lane][warp_lane] = 0; + } + __syncthreads(); +#else const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x; + const IndexType start_offset = row; + const IndexType stride = pitch; + const IndexType elem_count = num_cols_per_row; +#endif if (row < num_rows) { ValueType sum = 0; - IndexType offset = row; + IndexType offset = start_offset; - for(IndexType n = 0; n < num_cols_per_row; n++) +#if PREFETCH_FACTOR > 0 + IndexType num_iters = elem_count/PREFETCH_FACTOR; + + #pragma unroll UNROLL + for(IndexType n = 0; n < num_iters; ++n) { - const IndexType col = load_uncached(Aj + offset); + Prefetcher prefetcher; + + prefetcher.prefetch_cols(Aj, offset, stride, 0); + prefetcher.prefetch_vals(Ax, x, offset, stride, 0); + sum += prefetcher.accumulate_results(); + + offset += PREFETCH_FACTOR * stride; + } + + IndexType remaining_iters = elem_count % PREFETCH_FACTOR; +#else + IndexType remaining_iters = elem_count; +#endif + + #pragma unroll UNROLL + for (int n = 0; n < remaining_iters; ++n) + { + const IndexType col = load(Aj + offset); // This assumes that // cusp::ell_matrix<...>::invalid_index is always < 0 if (col >= 0) { const ValueType x_j = x[col]; - const ValueType A_ij = load_uncached(Ax + offset); + const ValueType A_ij = load(Ax + offset); sum += A_ij * x_j; } + else + { +#if BREAK == 1 + break; +#endif + } - offset += pitch; + offset += stride; } +#if THREADS_PER_ROW > 1 + atomicAdd(&sums[block_lane][warp_lane], sum); + + __syncthreads(); + + if (stripe_lane == 0) + { + y[row] = sums[block_lane][warp_lane]; + } +#else y[row] = sum; +#endif } } + + +template +__launch_bounds__(BLOCK_SIZE, 1) __global__ void +ktt_ell_kernel(const IndexType num_rows, + const IndexType num_cols, + const IndexType num_cols_per_row, + const IndexType pitch, + const IndexType* __restrict__ Aj, + const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + ValueType* __restrict__ y) +{ + ktt_ell_kernel_basic(num_rows, num_cols, num_cols_per_row, pitch, + Aj, Ax, x, y); +} From 4376fd1aff2c671d66d66dfcddfe51a17353b40c Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Sun, 5 Mar 2023 20:31:12 +0100 Subject: [PATCH 23/49] Shift all non-zeros to the start when converting from dia to ell --- .../detail/generic/conversions/dia_to_other.h | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/cusp/system/detail/generic/conversions/dia_to_other.h b/cusp/system/detail/generic/conversions/dia_to_other.h index 45c26f47..bc30fb5a 100644 --- a/cusp/system/detail/generic/conversions/dia_to_other.h +++ b/cusp/system/detail/generic/conversions/dia_to_other.h @@ -41,6 +41,7 @@ #include #include +#include namespace cusp { @@ -211,6 +212,62 @@ convert(thrust::execution_policy& exec, thrust::copy(exec, src.values.values.begin(), src.values.values.end(), dst.values.values.begin()); + using StridedIndexIterator = thrust::transform_iterator< + cusp::multiplies_value, + IndexIterator>; + using RowElemIndicesIterator = 
thrust::transform_iterator< + cusp::plus_value, + StridedIndexIterator>; + using RowColsIterator = thrust::permutation_iterator< + decltype(dst.column_indices.values.begin()), + RowElemIndicesIterator>; + using RowValsIterator = thrust::permutation_iterator< + decltype(dst.values.values.begin()), + RowElemIndicesIterator>; + + for (size_t row_idx = 0; row_idx < src.num_rows; ++row_idx) + { + StridedIndexIterator strided_cols( + IndexIterator(0), + cusp::multiplies_value(pitch)); + RowElemIndicesIterator elem_indices( + strided_cols, + cusp::plus_value((IndexType)row_idx)); + + RowColsIterator row_cols(dst.column_indices.values.begin(), elem_indices); + RowValsIterator row_vals(dst.values.values.begin(), elem_indices); + + thrust::stable_partition(exec, + row_vals, + row_vals + dst.column_indices.num_cols, + row_cols, + thrust::placeholders::_1 != IndexType(-1)); + + thrust::stable_partition(exec, + row_cols, + row_cols + dst.column_indices.num_cols, + thrust::placeholders::_1 != IndexType(-1)); + } + + // for (int row = 0; row < dst.num_rows; ++row) + // { + // int last = 0; + // for (int col = 0; col < dst.column_indices.num_cols; ++col) + // { + // if (dst.column_indices(row, col) >= 0) + // { + // auto tmp1 = dst.column_indices(row, last); + // dst.column_indices(row, last) = dst.column_indices(row, col); + // dst.column_indices(row, col) = tmp1; + + // auto tmp2 = dst.values(row, last); + // dst.values(row, last) = dst.values(row, col); + // dst.values(row, col) = tmp2; + + // last++; + // } + // } + // } } } // end namespace generic From 17342e1188cc21b04af9ccacd3d8f943d8b89345 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 7 Mar 2023 09:02:22 +0100 Subject: [PATCH 24/49] Simplify the ell kernel --- cusp/system/cuda/ktt/ell_multiply.h | 9 ++-- cusp/system/cuda/ktt/kernels/ell_kernel.h | 59 ++++++----------------- 2 files changed, 22 insertions(+), 46 deletions(-) diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 19456107..8b217bb9 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -160,10 +160,13 @@ auto get_launcher(const kernel_context& ctx, const auto& conf = interface.GetCurrentConfiguration(); auto threads_per_row = get_parameter_uint(conf, "THREADS_PER_ROW"); - ::ktt::DimensionVector block_size = - interface.GetCurrentLocalSize(ctx.definition_ids[0]); + auto threads_in_block = + interface.GetCurrentLocalSize(ctx.definition_ids[0]).GetSizeX(); + + ::ktt::DimensionVector block_size( + threads_in_block/threads_per_row, threads_per_row); ::ktt::DimensionVector grid_size( - threads_per_row * DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); + DIVIDE_INTO(A.num_rows, threads_in_block/threads_per_row)); if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h index 3f42ce92..b6a29324 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -95,56 +95,33 @@ ktt_ell_kernel_basic(const IndexType num_rows, const ValueType* __restrict__ x, ValueType* __restrict__ y) { - -#if THREADS_PER_ROW > 1 - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - const IndexType warp_id = thread_id/32; - const IndexType stripe_id = warp_id/THREADS_PER_ROW; - const IndexType warp_lane = thread_id % 32; - const IndexType stripe_lane = warp_id % THREADS_PER_ROW; - const IndexType block_lane = threadIdx.x/(32*THREADS_PER_ROW); - 
- const IndexType row = stripe_id*32 + warp_lane; - const IndexType start_offset = row + stripe_lane * pitch; + const IndexType row = blockDim.x*blockIdx.x + threadIdx.x; const IndexType stride = THREADS_PER_ROW * pitch; - const IndexType rounded_up_count = - (num_cols_per_row + THREADS_PER_ROW - 1)/THREADS_PER_ROW; - const IndexType remainder = num_cols_per_row % THREADS_PER_ROW; - - const IndexType elem_count = - remainder == 0 || stripe_lane < remainder - ? rounded_up_count - : rounded_up_count - 1; - - //if (row == 1) - // printf("%d -> %d %d %d %d\n", thread_id, rounded_up_count, remainder, stripe_lane, elem_count); - - const int stripes_in_block = BLOCK_SIZE/(32*THREADS_PER_ROW); - __shared__ ValueType sums[stripes_in_block][32]; +#if THREADS_PER_ROW > 1 + __shared__ ValueType sums[BLOCK_SIZE/THREADS_PER_ROW]; - if (stripe_lane == 0) + if (threadIdx.y == 0) { - sums[block_lane][warp_lane] = 0; + sums[threadIdx.x] = 0; } + __syncthreads(); -#else - const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x; - const IndexType start_offset = row; - const IndexType stride = pitch; - const IndexType elem_count = num_cols_per_row; #endif if (row < num_rows) { ValueType sum = 0; - IndexType offset = start_offset; + IndexType n = threadIdx.y; + IndexType offset = row + n * pitch; #if PREFETCH_FACTOR > 0 - IndexType num_iters = elem_count/PREFETCH_FACTOR; + IndexType bound = num_cols_per_row + - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW; + IndexType step = PREFETCH_FACTOR * THREADS_PER_ROW; #pragma unroll UNROLL - for(IndexType n = 0; n < num_iters; ++n) + for(; n < bound; n += step) { Prefetcher prefetcher; @@ -154,14 +131,10 @@ ktt_ell_kernel_basic(const IndexType num_rows, offset += PREFETCH_FACTOR * stride; } - - IndexType remaining_iters = elem_count % PREFETCH_FACTOR; -#else - IndexType remaining_iters = elem_count; #endif #pragma unroll UNROLL - for (int n = 0; n < remaining_iters; ++n) + for (; n < num_cols_per_row; n += THREADS_PER_ROW) { const IndexType col = load(Aj + offset); @@ -185,13 +158,13 @@ ktt_ell_kernel_basic(const IndexType num_rows, } #if THREADS_PER_ROW > 1 - atomicAdd(&sums[block_lane][warp_lane], sum); + atomicAdd(&sums[threadIdx.x], sum); __syncthreads(); - if (stripe_lane == 0) + if (threadIdx.y == 0) { - y[row] = sums[block_lane][warp_lane]; + y[row] = sums[threadIdx.x]; } #else y[row] = sum; From 39da0d7cb42dc873a648442f653f33f6572eaf80 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 8 Mar 2023 11:29:45 +0100 Subject: [PATCH 25/49] Instrument csr and ell kernels --- .../cuda/detail/multiply/csr_vector_spmv.h | 18 ++++++++++++++++++ cusp/system/cuda/detail/multiply/ell_spmv.h | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/cusp/system/cuda/detail/multiply/csr_vector_spmv.h b/cusp/system/cuda/detail/multiply/csr_vector_spmv.h index 7bf4b98d..6f9e2c5c 100644 --- a/cusp/system/cuda/detail/multiply/csr_vector_spmv.h +++ b/cusp/system/cuda/detail/multiply/csr_vector_spmv.h @@ -24,6 +24,13 @@ #include +#ifdef TIME_CSR +#include +#include + +namespace krn = std::chrono; +#endif + namespace cusp { namespace system @@ -190,11 +197,22 @@ void __spmv_csr_vector(cuda::execution_policy& exec, cudaStream_t s = stream(thrust::detail::derived_cast(exec)); + #ifdef TIME_CSR + auto start = krn::steady_clock::now(); + #endif + spmv_csr_vector_kernel <<>> (A.num_rows, A.row_offsets.begin(), A.column_indices.begin(), A.values.begin(), x.begin(), y.begin(), initialize, combine, reduce); + + #ifdef TIME_CSR + cudaStreamSynchronize(s); + auto end = 
krn::steady_clock::now(); + size_t time = krn::duration_cast(end - start).count(); + std::cout << "cusp csr kernel: " << time << " us\n"; + #endif } template #include +#ifdef TIME_ELL +#include +#include + +namespace krn = std::chrono; +#endif + namespace cusp { namespace system @@ -132,8 +139,19 @@ void multiply(cuda::execution_policy& exec, cudaStream_t s = stream(thrust::detail::derived_cast(exec)); +#ifdef TIME_ELL + auto start = krn::steady_clock::now(); +#endif + spmv_ell_kernel <<>> (A.num_rows, A.num_cols, num_entries_per_row, pitch, J, V, x_ptr, y_ptr, initialize, combine, reduce); + +#ifdef TIME_ELL + cudaStreamSynchronize(s); + auto end = krn::steady_clock::now(); + size_t time = krn::duration_cast(end - start).count(); + std::cout << "cusp ell kernel: " << time << " us\n"; +#endif } } // end namespace detail From b09f3df268353b54b42301d9df3d17d4d30b48e4 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 13 Mar 2023 15:02:44 +0100 Subject: [PATCH 26/49] Implement ellr_matrix --- cusp/detail/ellr_matrix.inl | 131 ++++++++++++++++++++++++++++++++++++ cusp/ellr_matrix.h | 89 ++++++++++++++++++++++++ 2 files changed, 220 insertions(+) create mode 100644 cusp/detail/ellr_matrix.inl create mode 100644 cusp/ellr_matrix.h diff --git a/cusp/detail/ellr_matrix.inl b/cusp/detail/ellr_matrix.inl new file mode 100644 index 00000000..2bc5ee4b --- /dev/null +++ b/cusp/detail/ellr_matrix.inl @@ -0,0 +1,131 @@ +#include +#include +#include + +#include +#include +#include // raw_pointer_cast + +namespace cusp { + + +template +struct ell_row_length +{ + const IndexType* column_indices; + size_t num_cols_per_row; + size_t pitch; + + __host__ __device__ size_t operator()(IndexType row_idx) + { + size_t len = 0; + + while (len < num_cols_per_row && row_idx + len*pitch < 8 + && column_indices[row_idx + len*pitch] >= 0) + { + len++; + } + + return len; + } +}; + +template +void +compute_row_lengths(cusp::ellr_matrix& A) +{ + thrust::counting_iterator row_idx_it(0); + + thrust::transform( + row_idx_it, + row_idx_it + A.num_rows, + A.row_lengths.begin(), + ell_row_length{ + thrust::raw_pointer_cast(&A.column_indices(0, 0)), + A.column_indices.num_cols, + A.column_indices.pitch } + ); +} + + +////////////////// +// Constructors // +////////////////// + +template +ellr_matrix +::ellr_matrix(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row, const size_t alignment) + : Parent(num_rows, num_cols, num_entries, num_entries_per_row, alignment) +{ + row_lengths.resize(num_rows); +} + +// construct from a different matrix +template +template +ellr_matrix +::ellr_matrix(const MatrixType& matrix) +{ + cusp::convert(matrix, *this); + update_row_lengths(); +} + +////////////////////// +// Member Functions // +////////////////////// + +template +void +ellr_matrix +::swap(ellr_matrix& matrix) +{ + Parent::swap(matrix); + row_lengths.swap(matrix.row_lengths); +} + +template +void +ellr_matrix +::resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row) +{ + Parent::resize(num_rows, num_cols, num_entries, num_entries_per_row); + row_lengths.resize(num_rows); +} + +template +void +ellr_matrix +::resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row, const size_t alignment) +{ + Parent::resize(num_rows, num_cols, num_entries, + num_entries_per_row, alignment); + row_lengths.resize(num_rows); +} + +// assignment from another matrix +template 
+template +ellr_matrix& +ellr_matrix +::operator=(const MatrixType& matrix) +{ + cusp::convert(matrix, *this); + update_row_lengths(); + + return *this; +} + +template +void ellr_matrix::update_row_lengths() +{ + if (row_lengths.size() != this->num_rows) + row_lengths.resize(this->num_rows); + + compute_row_lengths(*this); +} + + +} // namespace cusp diff --git a/cusp/ellr_matrix.h b/cusp/ellr_matrix.h new file mode 100644 index 00000000..92f62cc9 --- /dev/null +++ b/cusp/ellr_matrix.h @@ -0,0 +1,89 @@ +#include + +#include + +#include + +namespace cusp +{ + + +template +class ellr_matrix : public cusp::ell_matrix +{ + + using Parent = cusp::ell_matrix; + +public: + + using row_lengths_array_type = cusp::array1d; + + row_lengths_array_type row_lengths; + + /*! Construct an empty \p ellr_matrix. + */ + ellr_matrix(void) {} + + /*! Construct an \p ell_matrix with a specific shape, number of nonzero entries, + * and maximum number of nonzero entries per row. + * + * \param num_rows Number of rows. + * \param num_cols Number of columns. + * \param num_entries Number of nonzero matrix entries. + * \param num_entries_per_row Maximum number of nonzeros per row. + * \param alignment Amount of padding used to align the data structure (default 32). + */ + ellr_matrix(const size_t num_rows, const size_t num_cols, + const size_t num_entries, const size_t num_entries_per_row, + const size_t alignment = 32); + + /*! Construct an \p ell_matrix from another matrix. + * + * \param matrix Another sparse or dense matrix. + */ + template + ellr_matrix(const MatrixType& matrix); + + /*! Resize matrix dimensions and underlying storage + * + * \param num_rows Number of rows. + * \param num_cols Number of columns. + * \param num_entries Number of nonzero matrix entries. + * \param num_entries_per_row Maximum number of nonzeros per row. + */ + void resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row); + + /*! Resize matrix dimensions and underlying storage + * + * \param num_rows Number of rows. + * \param num_cols Number of columns. + * \param num_entries Number of nonzero matrix entries. + * \param num_entries_per_row Maximum number of nonzeros per row. + * \param alignment Amount of padding used to align the data structure (default 32). + */ + void resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row, const size_t alignment); + + /*! Swap the contents of two \p ell_matrix objects. + * + * \param matrix Another \p ell_matrix with the same IndexType and ValueType. + */ + void swap(ellr_matrix& matrix); + + /*! Assignment from another matrix. + * + * \tparam MatrixType Format type of input matrix. + * + * \param matrix Another sparse or dense matrix. 
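+     *
+     * The row_lengths array is recomputed (via update_row_lengths()) after
+     * the conversion.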
+ */ + template + ellr_matrix& operator=(const MatrixType& matrix); + + void update_row_lengths(); +}; + + +} // namespace cusp + +#include From bdcdcf3b2d3589e3821481159115b3f19a51fb68 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 13 Mar 2023 15:08:36 +0100 Subject: [PATCH 27/49] Move ellr_matrix to ktt namespace --- cusp/{ => ktt}/detail/ellr_matrix.inl | 10 ++++++++-- cusp/{ => ktt}/ellr_matrix.h | 8 +++++++- 2 files changed, 15 insertions(+), 3 deletions(-) rename cusp/{ => ktt}/detail/ellr_matrix.inl (96%) rename cusp/{ => ktt}/ellr_matrix.h (97%) diff --git a/cusp/detail/ellr_matrix.inl b/cusp/ktt/detail/ellr_matrix.inl similarity index 96% rename from cusp/detail/ellr_matrix.inl rename to cusp/ktt/detail/ellr_matrix.inl index 2bc5ee4b..6f5888cc 100644 --- a/cusp/detail/ellr_matrix.inl +++ b/cusp/ktt/detail/ellr_matrix.inl @@ -6,7 +6,11 @@ #include #include // raw_pointer_cast -namespace cusp { +namespace cusp +{ + +namespace ktt +{ template @@ -32,7 +36,7 @@ struct ell_row_length template void -compute_row_lengths(cusp::ellr_matrix& A) +compute_row_lengths(cusp::ktt::ellr_matrix& A) { thrust::counting_iterator row_idx_it(0); @@ -128,4 +132,6 @@ void ellr_matrix::update_row_lengths() } +} // namespace ktt + } // namespace cusp diff --git a/cusp/ellr_matrix.h b/cusp/ktt/ellr_matrix.h similarity index 97% rename from cusp/ellr_matrix.h rename to cusp/ktt/ellr_matrix.h index 92f62cc9..f887dcea 100644 --- a/cusp/ellr_matrix.h +++ b/cusp/ktt/ellr_matrix.h @@ -4,9 +4,13 @@ #include + namespace cusp { +namespace ktt +{ + template class ellr_matrix : public cusp::ell_matrix @@ -84,6 +88,8 @@ class ellr_matrix : public cusp::ell_matrix }; +} // namespace ktt + } // namespace cusp -#include +#include From 0fa834f2d3b881671eb330fac81e40f35cafc5ae Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 13 Mar 2023 17:19:44 +0100 Subject: [PATCH 28/49] Add include guard --- cusp/ktt/ellr_matrix.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cusp/ktt/ellr_matrix.h b/cusp/ktt/ellr_matrix.h index f887dcea..1ba60ff8 100644 --- a/cusp/ktt/ellr_matrix.h +++ b/cusp/ktt/ellr_matrix.h @@ -1,3 +1,5 @@ +#pragma once + #include #include From 2d43d7df6048f9f8ea5a3bcf8d4ec282358119a1 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 14 Mar 2023 14:47:11 +0100 Subject: [PATCH 29/49] Implement kernel for ellr matrix --- cusp/ktt/detail/ellr_matrix.inl | 6 +- cusp/ktt/detail/ktt.inl | 24 +-- cusp/ktt/ellr_matrix.h | 2 +- cusp/system/cuda/detail/multiply/dia_spmv.h | 18 ++ cusp/system/cuda/ktt/dia_multiply.h | 221 ++++++++++---------- cusp/system/cuda/ktt/ell_multiply.h | 114 ++++++++-- cusp/system/cuda/ktt/kernels/ell_kernel.h | 39 +++- cusp/system/cuda/ktt/multiply.h | 2 +- testing/ktt.cu | 11 +- 9 files changed, 282 insertions(+), 155 deletions(-) diff --git a/cusp/ktt/detail/ellr_matrix.inl b/cusp/ktt/detail/ellr_matrix.inl index 6f5888cc..8a67949d 100644 --- a/cusp/ktt/detail/ellr_matrix.inl +++ b/cusp/ktt/detail/ellr_matrix.inl @@ -20,11 +20,11 @@ struct ell_row_length size_t num_cols_per_row; size_t pitch; - __host__ __device__ size_t operator()(IndexType row_idx) + __host__ __device__ IndexType operator()(IndexType row_idx) { - size_t len = 0; + IndexType len = 0; - while (len < num_cols_per_row && row_idx + len*pitch < 8 + while (len < num_cols_per_row && column_indices[row_idx + len*pitch] >= 0) { len++; diff --git a/cusp/ktt/detail/ktt.inl b/cusp/ktt/detail/ktt.inl index e02b4529..9e47d9be 100644 --- a/cusp/ktt/detail/ktt.inl +++ b/cusp/ktt/detail/ktt.inl @@ -116,18 +116,18 @@ 
tune(const Matrix& A, } -template -void reset_tuning() -{ - using namespace cusp::system::cuda::ktt; +// template +// void reset_tuning() +// { +// using namespace cusp::system::cuda::ktt; - kernel_context kernel = get_kernel(get_tuner(), Format{}); - detail::tuner->ClearData(kernel.kernel_id); -} +// kernel_context kernel = get_kernel(get_tuner(), Format{}); +// detail::tuner->ClearData(kernel.kernel_id); +// } template (); + return reset_tuning(A); } diff --git a/cusp/ktt/ellr_matrix.h b/cusp/ktt/ellr_matrix.h index 1ba60ff8..7f8d932c 100644 --- a/cusp/ktt/ellr_matrix.h +++ b/cusp/ktt/ellr_matrix.h @@ -22,7 +22,7 @@ class ellr_matrix : public cusp::ell_matrix public: - using row_lengths_array_type = cusp::array1d; + using row_lengths_array_type = cusp::array1d; row_lengths_array_type row_lengths; diff --git a/cusp/system/cuda/detail/multiply/dia_spmv.h b/cusp/system/cuda/detail/multiply/dia_spmv.h index bf381825..81dbe069 100644 --- a/cusp/system/cuda/detail/multiply/dia_spmv.h +++ b/cusp/system/cuda/detail/multiply/dia_spmv.h @@ -26,6 +26,13 @@ #include +#ifdef TIME_DIA +#include +#include + +namespace krn = std::chrono; +#endif + namespace cusp { namespace system @@ -165,8 +172,19 @@ void multiply(cuda::execution_policy& exec, cudaStream_t s = stream(thrust::detail::derived_cast(exec)); +#ifdef TIME_DIA + auto start = krn::steady_clock::now(); +#endif + spmv_dia_kernel <<>> (A.num_rows, A.num_cols, num_diagonals, pitch, A.diagonal_offsets.begin(), A.values.values.begin(), x.begin(), y.begin(), initialize, combine, reduce); + +#ifdef TIME_DIA + cudaStreamSynchronize(s); + auto end = krn::steady_clock::now(); + size_t time = krn::duration_cast(end - start).count(); + std::cout << "cusp dia kernel: " << time << " us\n"; +#endif } } // end namespace detail diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 5e9a7414..88cd254e 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -87,11 +87,8 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k }); } -} // namespace dia - - template -kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) +kernel_context initialize_kernel(::ktt::Tuner& tuner) { kernel_context kernel(tuner); @@ -122,13 +119,20 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) return kernel; } + +} // namespace dia + + template -const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::dia_format format) + typename ValueType3, + typename MemorySpace> +const kernel_context& get_kernel( + ::ktt::Tuner& tuner, + const cusp::dia_matrix& A) { - static kernel_context kernel = initialize_kernel(tuner, format); + static kernel_context kernel = dia::initialize_kernel(tuner); return kernel; } @@ -158,15 +162,22 @@ inline ::ktt::ArgumentId get_output_argument(const std::vector<::ktt::ArgumentId return arguments[7]; } -auto get_launcher(const kernel_context& ctx, - size_t num_rows, - size_t num_cols, - bool profile = false) + +template +auto get_launcher( + const kernel_context& ctx, + const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + bool profile = false) { - return [=] (::ktt::ComputeInterface& interface) + return [&, profile] (::ktt::ComputeInterface& interface) { ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]); - ::ktt::DimensionVector grid_size( DIVIDE_INTO(num_rows, block_size.GetSizeX()) ); + ::ktt::DimensionVector grid_size( DIVIDE_INTO(A.num_rows, 
block_size.GetSizeX()) ); auto conf = interface.GetCurrentConfiguration(); for (const auto& pair : conf.GetPairs()) @@ -181,98 +192,98 @@ auto get_launcher(const kernel_context& ctx, }; } -template -::ktt::KernelResult multiply(::ktt::Tuner& tuner, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y) -{ - if (A.num_entries == 0) { - thrust::fill(y.begin(), y.end(), ValueType3(0)); - ::ktt::KernelResult result("DiaKernel", {}); - result.SetStatus(::ktt::ResultStatus::Ok); - return result; - } - - const kernel_context& kernel = get_kernel(tuner, cusp::dia_format{}); - auto args = add_arguments(kernel, A, x, y); - - tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols)); - - auto res = tuner.TuneIteration(kernel.kernel_id, {}); - remove_arguments(kernel, args); - - return res; -} - -template -::ktt::KernelResult multiply(::ktt::Tuner& tuner, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& configuration, - bool run_with_profiling = false) -{ - if (A.num_entries == 0) { - thrust::fill(y.begin(), y.end(), ValueType3(0)); - ::ktt::KernelResult result("DiaKernel", {}); - result.SetStatus(::ktt::ResultStatus::Ok); - return result; - } - - kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); - auto args = add_arguments(kernel, A, x, y); - - tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols, run_with_profiling)); - - auto result = tuner.Run(kernel.kernel_id, configuration, {}); - remove_arguments(kernel, args); - - return result; -} - -template -std::vector<::ktt::KernelResult> -tune(::ktt::Tuner& tuner, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt) -{ - if (A.num_entries == 0) { - return {}; - } - - kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); - auto args = add_arguments(kernel, A, x, y); - - if (reference_computation) { - tuner.SetReferenceComputation(get_output_argument(args, dia_format{}), reference_computation.value()); - } - - cusp::array1d host_y = y; - tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface) { - // clear y so previous results don't affect validation - y = host_y; - auto launcher = cusp::system::cuda::ktt::get_launcher(kernel, A.num_rows, A.num_cols); - launcher(interface); - }); - - auto results = tuner.Tune(kernel.kernel_id); - - remove_arguments(kernel, args); - - return results; -} +// template +// ::ktt::KernelResult multiply(::ktt::Tuner& tuner, +// const cusp::dia_matrix& A, +// const cusp::array1d& x, +// cusp::array1d& y) +// { +// if (A.num_entries == 0) { +// thrust::fill(y.begin(), y.end(), ValueType3(0)); +// ::ktt::KernelResult result("DiaKernel", {}); +// result.SetStatus(::ktt::ResultStatus::Ok); +// return result; +// } + +// const kernel_context& kernel = get_kernel(tuner, cusp::dia_format{}); +// auto args = add_arguments(kernel, A, x, y); + +// tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols)); + +// auto res = tuner.TuneIteration(kernel.kernel_id, {}); +// remove_arguments(kernel, args); + +// return res; +// } + +// template +// ::ktt::KernelResult multiply(::ktt::Tuner& tuner, +// const cusp::dia_matrix& A, +// const cusp::array1d& x, +// cusp::array1d& y, +// const ::ktt::KernelConfiguration& configuration, +// bool run_with_profiling = false) +// { +// if (A.num_entries == 0) { +// thrust::fill(y.begin(), y.end(), 
ValueType3(0)); +// ::ktt::KernelResult result("DiaKernel", {}); +// result.SetStatus(::ktt::ResultStatus::Ok); +// return result; +// } + +// kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); +// auto args = add_arguments(kernel, A, x, y); + +// tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols, run_with_profiling)); + +// auto result = tuner.Run(kernel.kernel_id, configuration, {}); +// remove_arguments(kernel, args); + +// return result; +// } + +// template +// std::vector<::ktt::KernelResult> +// tune(::ktt::Tuner& tuner, +// const cusp::dia_matrix& A, +// const cusp::array1d& x, +// cusp::array1d& y, +// std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt) +// { +// if (A.num_entries == 0) { +// return {}; +// } + +// kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); +// auto args = add_arguments(kernel, A, x, y); + +// if (reference_computation) { +// tuner.SetReferenceComputation(get_output_argument(args, dia_format{}), reference_computation.value()); +// } + +// cusp::array1d host_y = y; +// tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface) { +// // clear y so previous results don't affect validation +// y = host_y; +// auto launcher = cusp::system::cuda::ktt::get_launcher(kernel, A.num_rows, A.num_cols); +// launcher(interface); +// }); + +// auto results = tuner.Tune(kernel.kernel_id); + +// remove_arguments(kernel, args); + +// return results; +// } } // namespace ktt diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 8b217bb9..036329aa 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -1,7 +1,10 @@ #pragma once +#include + #include #include +#include namespace cusp { @@ -14,13 +17,12 @@ namespace ktt { namespace ell { -inline void setup_tuning_parameters(const kernel_context& kernel) +inline void setup_common_tuning_parameters(const kernel_context& kernel) { auto& tuner = *kernel.tuner; auto kernel_id = kernel.kernel_id; tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128 }); - tuner.AddParameter(kernel_id, "BREAK", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "UNCACHED_LOADS", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "DISABLE_UNROLL", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 }); @@ -37,22 +39,31 @@ inline void setup_tuning_parameters(const kernel_context& kernel) } ); - // BREAK can be used only when there is no prefetching - // TODO(KTT): Implement prefetching when using early break and remove this + // All the warps working on one stripe of the matrix fit in a thread block. tuner.AddConstraint( kernel.kernel_id, - { "BREAK", "PREFETCH_FACTOR" }, + { "THREADS_PER_ROW", "BLOCK_SIZE" }, [] (const std::vector& values) { - return values[0] == 0 || values[1] == 0; + return 32*values[0] <= values[1]; }); +} +inline void setup_ell_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; - // All the warps working on one stripe of the matrix fit in a thread block. 
+ setup_common_tuning_parameters(kernel); + + tuner.AddParameter(kernel_id, "BREAK", u64_vec{ 0, 1 }); + + // BREAK can be used only when there is no prefetching + // TODO(KTT): Implement prefetching when using early break and remove this tuner.AddConstraint( kernel.kernel_id, - { "THREADS_PER_ROW", "BLOCK_SIZE" }, + { "BREAK", "PREFETCH_FACTOR" }, [] (const std::vector& values) { - return 32*values[0] <= values[1]; + return values[0] == 0 || values[1] == 0; }); } @@ -60,8 +71,12 @@ inline void setup_tuning_parameters(const kernel_context& kernel) template -kernel_context initialize_kernel(::ktt::Tuner& tuner) + typename ValueType3, + typename TuningParamsInit> +kernel_context initialize_kernel(::ktt::Tuner& tuner, + const std::string& kernel_function, + const std::string& kernel_name, + TuningParamsInit init) { std::string kernel_path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/ell_kernel.h"; @@ -80,7 +95,7 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner) ::ktt::DimensionVector grid_size(0); auto definition_id = tuner.AddKernelDefinitionFromFile( - "ktt_ell_kernel", + kernel_function, kernel_path, grid_size, block_size, @@ -88,9 +103,9 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner) ); kernel.definition_ids.push_back(definition_id); - kernel.kernel_id = tuner.CreateSimpleKernel("EllKernel", definition_id); + kernel.kernel_id = tuner.CreateSimpleKernel(kernel_name, definition_id); - setup_tuning_parameters(kernel); + init(kernel); return kernel; } @@ -102,15 +117,56 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner) template -const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::ell_format format) + typename ValueType3, + typename MemorySpace> +const kernel_context& get_kernel( + ::ktt::Tuner& tuner, + const cusp::ell_matrix& A) { static kernel_context kernel = - ell::initialize_kernel(tuner); + ell::initialize_kernel( + tuner, "ktt_ell_kernel", "EllKernel", + ell::setup_ell_tuning_parameters); return kernel; } +template +const kernel_context& get_kernel( + ::ktt::Tuner& tuner, + const cusp::ktt::ellr_matrix& A) +{ + static kernel_context kernel = + ell::initialize_kernel( + tuner, "ktt_ellr_kernel", "EllrKernel", + ell::setup_common_tuning_parameters); + + return kernel; +} + + +template +std::vector<::ktt::ArgumentId> +add_common_arguments( + const kernel_context& kernel, + const cusp::ell_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + IndexType pitch = A.column_indices.pitch; + IndexType num_entries_per_row = A.column_indices.num_cols; + + return add_arguments(*kernel.tuner, A.num_rows, A.num_cols, + num_entries_per_row, pitch, A.column_indices.values, + A.values.values, x, y); +} template & x, cusp::array1d& y) { - IndexType pitch = A.column_indices.pitch; - IndexType num_entries_per_row = A.column_indices.num_cols; + auto args = add_common_arguments(kernel, A, x, y); + kernel.tuner->SetArguments(kernel.definition_ids[0], args); + return args; +} + +template +std::vector<::ktt::ArgumentId> +add_arguments( + const kernel_context& kernel, + const cusp::ktt::ellr_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + auto args = add_common_arguments(kernel, A, x, y); + auto aditional_args = add_arguments(*kernel.tuner, A.row_lengths); - auto args = add_arguments(*kernel.tuner, - A.num_rows, A.num_cols, num_entries_per_row, pitch, - A.column_indices.values, A.values.values, x, y); + args.push_back(aditional_args[0]); kernel.tuner->SetArguments(kernel.definition_ids[0], args); diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h 
b/cusp/system/cuda/ktt/kernels/ell_kernel.h index b6a29324..98f605f5 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -88,36 +88,37 @@ template 1 - __shared__ ValueType sums[BLOCK_SIZE/THREADS_PER_ROW]; + __shared__ ValueType sums[BLOCK_SIZE / THREADS_PER_ROW]; if (threadIdx.y == 0) - { sums[threadIdx.x] = 0; - } __syncthreads(); #endif if (row < num_rows) { + const IndexType num_cols = row_lengths == NULL + ? max_cols_per_row + : row_lengths[row]; ValueType sum = 0; IndexType n = threadIdx.y; - IndexType offset = row + n * pitch; + IndexType offset = row + n*pitch; #if PREFETCH_FACTOR > 0 - IndexType bound = num_cols_per_row - - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW; + IndexType bound = num_cols - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW; IndexType step = PREFETCH_FACTOR * THREADS_PER_ROW; #pragma unroll UNROLL @@ -134,7 +135,7 @@ ktt_ell_kernel_basic(const IndexType num_rows, #endif #pragma unroll UNROLL - for (; n < num_cols_per_row; n += THREADS_PER_ROW) + for (; n < num_cols; n += THREADS_PER_ROW) { const IndexType col = load(Aj + offset); @@ -186,5 +187,23 @@ ktt_ell_kernel(const IndexType num_rows, ValueType* __restrict__ y) { ktt_ell_kernel_basic(num_rows, num_cols, num_cols_per_row, pitch, - Aj, Ax, x, y); + Aj, Ax, x, y, (const IndexType*)NULL); +} + + +template +__launch_bounds__(BLOCK_SIZE, 1) __global__ void +ktt_ellr_kernel(const IndexType num_rows, + const IndexType num_cols, + const IndexType num_cols_per_row, + const IndexType pitch, + const IndexType* __restrict__ Aj, + const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + ValueType* __restrict__ y, + const IndexType* __restrict__ row_lengths) +{ + ktt_ell_kernel_basic(num_rows, num_cols, num_cols_per_row, pitch, + Aj, Ax, x, y, row_lengths); } diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h index 4f5a0b84..18ff8bd0 100644 --- a/cusp/system/cuda/ktt/multiply.h +++ b/cusp/system/cuda/ktt/multiply.h @@ -33,7 +33,7 @@ kernel_context get_kernel(::ktt::Tuner& tuner, using ValueType = typename MatrixType::value_type; using FormatType = typename MatrixType::format; - return get_kernel(tuner, FormatType{}); + return get_kernel(tuner, A); } diff --git a/testing/ktt.cu b/testing/ktt.cu index aac96e34..d3bf2088 100644 --- a/testing/ktt.cu +++ b/testing/ktt.cu @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -29,9 +30,17 @@ } \ DECLARE_UNITTEST(VTEST##Ktt##Fmt##Matrix); +#define DECLARE_KTT_ELLR_UNITTEST(VTEST) \ + void VTEST##Ktt##Ellr##Matrix(void) \ + { \ + VTEST< cusp::ktt::ellr_matrix >(); \ + } \ + DECLARE_UNITTEST(VTEST##Ktt##Ellr##Matrix); + #define DECLARE_KTT_UNITTEST(VTEST) \ DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Dia, dia) \ - DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Ell, ell) + DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Ell, ell) \ + DECLARE_KTT_ELLR_UNITTEST(VTEST) struct UnitTestStopCondition : ::ktt::StopCondition From de428af797f3c2d3b662fef2feb621b9918a1a02 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 20 Mar 2023 09:28:22 +0100 Subject: [PATCH 30/49] Add option to disable conversion exceptions --- cusp/system/detail/generic/conversions/coo_to_other.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cusp/system/detail/generic/conversions/coo_to_other.h b/cusp/system/detail/generic/conversions/coo_to_other.h index 06a88b7f..b25fe8f8 100644 --- a/cusp/system/detail/generic/conversions/coo_to_other.h +++ b/cusp/system/detail/generic/conversions/coo_to_other.h 
@@ -137,7 +137,8 @@ convert(thrust::execution_policy& exec, DestinationType& dst, cusp::coo_format&, cusp::dia_format&, - size_t alignment = 32) + size_t alignment = 32, + bool dont_throw = false) { typedef typename DestinationType::index_type IndexType; typedef typename DestinationType::value_type ValueType; @@ -156,7 +157,7 @@ convert(thrust::execution_policy& exec, const float size = float(occupied_diagonals) * float(src.num_rows); const float fill_ratio = size / std::max(1.0f, float(src.num_entries)); - if (max_fill < fill_ratio && size > threshold) + if (max_fill < fill_ratio && size > threshold && !dont_throw) throw cusp::format_conversion_exception("dia_matrix fill-in would exceed maximum tolerance"); // compute number of occupied diagonals and enumerate them @@ -219,7 +220,8 @@ convert(thrust::execution_policy& exec, cusp::coo_format&, cusp::ell_format&, size_t num_entries_per_row = 0, - size_t alignment = 32) + size_t alignment = 32, + bool dont_throw = false) { typedef typename DestinationType::index_type IndexType; typedef typename DestinationType::value_type ValueType; @@ -242,7 +244,7 @@ convert(thrust::execution_policy& exec, const float size = float(max_entries_per_row) * float(src.num_rows); const float fill_ratio = size / std::max(1.0f, float(src.num_entries)); - if (max_fill < fill_ratio && size > threshold) + if (max_fill < fill_ratio && size > threshold && !dont_throw) throw cusp::format_conversion_exception("ell_matrix fill-in would exceed maximum tolerance"); num_entries_per_row = max_entries_per_row; From 4ad20a90b53e3c0f3681f555c9018e9c8b26053f Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 20 Mar 2023 09:29:51 +0100 Subject: [PATCH 31/49] Remove ifs from ellr kernel --- cusp/system/cuda/ktt/kernels/ell_kernel.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h index 98f605f5..3d89bd60 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -49,11 +49,15 @@ struct Prefetcher : Prefetcher IndexType pitch, IndexType i) { +#ifndef ELLR if (col >= 0) { +#endif A_val = load(Ax + offset + i*pitch); x_val = x[col]; +#ifndef ELLR } +#endif parent::prefetch_vals(Ax, x, offset, pitch, i + 1); } @@ -110,7 +114,7 @@ ktt_ell_kernel_basic(const IndexType num_rows, if (row < num_rows) { - const IndexType num_cols = row_lengths == NULL + const IndexType num_elems = row_lengths == NULL ? 
max_cols_per_row : row_lengths[row]; ValueType sum = 0; @@ -118,7 +122,7 @@ ktt_ell_kernel_basic(const IndexType num_rows, IndexType offset = row + n*pitch; #if PREFETCH_FACTOR > 0 - IndexType bound = num_cols - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW; + IndexType bound = num_elems - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW; IndexType step = PREFETCH_FACTOR * THREADS_PER_ROW; #pragma unroll UNROLL @@ -135,18 +139,21 @@ ktt_ell_kernel_basic(const IndexType num_rows, #endif #pragma unroll UNROLL - for (; n < num_cols; n += THREADS_PER_ROW) + for (; n < num_elems; n += THREADS_PER_ROW) { const IndexType col = load(Aj + offset); // This assumes that // cusp::ell_matrix<...>::invalid_index is always < 0 +#ifndef ELLR if (col >= 0) { +#endif const ValueType x_j = x[col]; const ValueType A_ij = load(Ax + offset); sum += A_ij * x_j; +#ifndef ELLR } else { @@ -154,6 +161,7 @@ ktt_ell_kernel_basic(const IndexType num_rows, break; #endif } +#endif offset += stride; } From fa3e6a27af92a0045dcdfd62c4877cb3766f0f98 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 20 Mar 2023 09:33:21 +0100 Subject: [PATCH 32/49] Define ELLR parameter for the ellr kernel --- cusp/system/cuda/ktt/ell_multiply.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 036329aa..957ba961 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -67,6 +67,16 @@ inline void setup_ell_tuning_parameters(const kernel_context& kernel) }); } +inline void setup_ellr_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + setup_common_tuning_parameters(kernel); + + tuner.AddParameter(kernel_id, "ELLR", u64_vec{ 1 }); +} + template( tuner, "ktt_ellr_kernel", "EllrKernel", - ell::setup_common_tuning_parameters); + ell::setup_ellr_tuning_parameters); return kernel; } From 8927db4be5407209fcbc9b70fd802cad92bdcdb6 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 20 Mar 2023 09:33:44 +0100 Subject: [PATCH 33/49] Add more block sizes to ell and ellr kernels --- cusp/system/cuda/ktt/ell_multiply.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 957ba961..a491a473 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -22,7 +22,7 @@ inline void setup_common_tuning_parameters(const kernel_context& kernel) auto& tuner = *kernel.tuner; auto kernel_id = kernel.kernel_id; - tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128 }); + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); tuner.AddParameter(kernel_id, "UNCACHED_LOADS", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "DISABLE_UNROLL", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 }); From 27529520354cd280f9edc2f967bb6ccfb3e7da00 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 20 Mar 2023 09:34:14 +0100 Subject: [PATCH 34/49] Fix having two kernels per matrix type --- cusp/system/cuda/ktt/ell_multiply.h | 14 ++++++-------- cusp/system/cuda/ktt/multiply.h | 14 ++++++++++++-- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index a491a473..3981156f 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -127,11 +127,10 @@ kernel_context
initialize_kernel(::ktt::Tuner& tuner, template + typename ValueType3> const kernel_context& get_kernel( - ::ktt::Tuner& tuner, - const cusp::ell_matrix& A) + ::ktt::Tuner& tuner, + const cusp::ell_matrix& A) { static kernel_context kernel = ell::initialize_kernel( @@ -144,11 +143,10 @@ const kernel_context& get_kernel( template + typename ValueType3> const kernel_context& get_kernel( - ::ktt::Tuner& tuner, - const cusp::ktt::ellr_matrix& A) + ::ktt::Tuner& tuner, + const cusp::ktt::ellr_matrix& A) { static kernel_context kernel = ell::initialize_kernel( diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h index 18ff8bd0..59729596 100644 --- a/cusp/system/cuda/ktt/multiply.h +++ b/cusp/system/cuda/ktt/multiply.h @@ -31,9 +31,19 @@ kernel_context get_kernel(::ktt::Tuner& tuner, { using IndexType = typename MatrixType::index_type; using ValueType = typename MatrixType::value_type; - using FormatType = typename MatrixType::format; - return get_kernel(tuner, A); + if constexpr (std::is_same_v) + { + using DeviceMatrix = + typename MatrixType::rebind::type; + return get_kernel( + tuner, DeviceMatrix{}); + } + else + { + return get_kernel(tuner, A); + } } From eb9fa2039c5d00cea5bb197a2431a51160600538 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 21 Mar 2023 08:58:01 +0100 Subject: [PATCH 35/49] Remove commented code --- cusp/system/cuda/ktt/dia_multiply.h | 92 ----------------------------- 1 file changed, 92 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 88cd254e..a82137bf 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -192,98 +192,6 @@ auto get_launcher( }; } -// template -// ::ktt::KernelResult multiply(::ktt::Tuner& tuner, -// const cusp::dia_matrix& A, -// const cusp::array1d& x, -// cusp::array1d& y) -// { -// if (A.num_entries == 0) { -// thrust::fill(y.begin(), y.end(), ValueType3(0)); -// ::ktt::KernelResult result("DiaKernel", {}); -// result.SetStatus(::ktt::ResultStatus::Ok); -// return result; -// } - -// const kernel_context& kernel = get_kernel(tuner, cusp::dia_format{}); -// auto args = add_arguments(kernel, A, x, y); - -// tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols)); - -// auto res = tuner.TuneIteration(kernel.kernel_id, {}); -// remove_arguments(kernel, args); - -// return res; -// } - -// template -// ::ktt::KernelResult multiply(::ktt::Tuner& tuner, -// const cusp::dia_matrix& A, -// const cusp::array1d& x, -// cusp::array1d& y, -// const ::ktt::KernelConfiguration& configuration, -// bool run_with_profiling = false) -// { -// if (A.num_entries == 0) { -// thrust::fill(y.begin(), y.end(), ValueType3(0)); -// ::ktt::KernelResult result("DiaKernel", {}); -// result.SetStatus(::ktt::ResultStatus::Ok); -// return result; -// } - -// kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); -// auto args = add_arguments(kernel, A, x, y); - -// tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols, run_with_profiling)); - -// auto result = tuner.Run(kernel.kernel_id, configuration, {}); -// remove_arguments(kernel, args); - -// return result; -// } - -// template -// std::vector<::ktt::KernelResult> -// tune(::ktt::Tuner& tuner, -// const cusp::dia_matrix& A, -// const cusp::array1d& x, -// cusp::array1d& y, -// std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt) -// { -// if (A.num_entries == 0) { -// return {}; -// } - -// kernel_context kernel 
= get_kernel(tuner, cusp::dia_format{}); -// auto args = add_arguments(kernel, A, x, y); - -// if (reference_computation) { -// tuner.SetReferenceComputation(get_output_argument(args, dia_format{}), reference_computation.value()); -// } - -// cusp::array1d host_y = y; -// tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface) { -// // clear y so previous results don't affect validation -// y = host_y; -// auto launcher = cusp::system::cuda::ktt::get_launcher(kernel, A.num_rows, A.num_cols); -// launcher(interface); -// }); - -// auto results = tuner.Tune(kernel.kernel_id); - -// remove_arguments(kernel, args); - -// return results; -// } } // namespace ktt From e48d543159ca998eb3d3b32eb97c172cce496444 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 21 Mar 2023 09:24:05 +0100 Subject: [PATCH 36/49] Fix formatting --- cusp/system/cuda/ktt/dia_multiply.h | 154 ++++++++++++++++------------ 1 file changed, 91 insertions(+), 63 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index a82137bf..02df5bd8 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -6,12 +6,9 @@ #include #include -#include // max_active_blocks #include #include -#include -#include namespace cusp { @@ -23,23 +20,28 @@ namespace ktt { namespace dia { -inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) + +inline void setup_tuning_parameters(const kernel_context& kernel) { - tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1, 2 }); - tuner.AddParameter(kernel.kernel_id, "SHARED_PREFETCH_FACTOR", std::vector{ 0, 2, 4, 8 }); - tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_FACTOR", std::vector{ 0, 2, 3, 4 }); - tuner.AddParameter(kernel.kernel_id, "REGISTER_PREFETCH_TYPE", std::vector{ 0, 1 }); - tuner.AddParameter(kernel.kernel_id, "LOAD_TYPE", std::vector{ 0, 1 }); - tuner.AddParameter(kernel.kernel_id, "STRIPING_FACTOR", std::vector{ 2, 4, 8 }); - tuner.AddParameter(kernel.kernel_id, "BLOCK_SIZE", std::vector{ 256, 512 }); - - // START OF HACKS --------------------- - tuner.AddParameter(kernel.kernel_id, "BLOCKS_PER_SECTOR", std::vector{ 0 }); - tuner.AddParameter(kernel.kernel_id, "CHUNK_SIZE", std::vector{ 0 }); - tuner.AddParameter(kernel.kernel_id, "STRIPE_SIZE", std::vector{ 0 }); - tuner.AddParameter(kernel.kernel_id, "GRID_SIZE", std::vector{ 0 }); - tuner.AddParameter(kernel.kernel_id, "SECTOR_MAPPING_TYPE", - std::vector{ 0, 1 }); + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + tuner.AddParameter(kernel_id, "KERNEL_TYPE", u64_vec{ 0, 1, 2 }); + tuner.AddParameter(kernel_id, "SHARED_PREFETCH_FACTOR", + u64_vec{ 0, 2, 4, 8 }); + tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_FACTOR", + u64_vec{ 0, 2, 3, 4 }); + tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_TYPE", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "LOAD_TYPE", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "STRIPING_FACTOR", u64_vec{ 2, 4, 8 }); + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 256, 512 }); + + // START OF HACKS to test striping ---- + tuner.AddParameter(kernel_id, "BLOCKS_PER_SECTOR", u64_vec{ 0 }); + tuner.AddParameter(kernel_id, "CHUNK_SIZE", u64_vec{ 0 }); + tuner.AddParameter(kernel_id, "STRIPE_SIZE", u64_vec{ 0 }); + tuner.AddParameter(kernel_id, "GRID_SIZE", u64_vec{ 0 }); + tuner.AddParameter(kernel_id, "SECTOR_MAPPING_TYPE", u64_vec{ 0, 1 }); tuner.AddConstraint(kernel.kernel_id, { "SECTOR_MAPPING_TYPE" }, [] 
(const std::vector& values) { @@ -53,46 +55,55 @@ inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& k ::ktt::ModifierType::Local, ::ktt::ModifierDimension::X, { std::string("BLOCK_SIZE") }, - [] (const uint64_t defaultSize, const std::vector& parameters) { + [] (const uint64_t defaultSize, const u64_vec& parameters) { return parameters[0]; - } - ); - - // only one type of prefetching can be applied at once - tuner.AddConstraint(kernel.kernel_id, - { "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 0 || values[1] == 0; - }); + }); - // prefetching is used only for the blocked offsets kernel - tuner.AddConstraint(kernel.kernel_id, - { "KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 1 || (values[1] == 0 && values[2] == 0); - }); + // Only one type of prefetching can be applied at once. + tuner.AddConstraint( + kernel.kernel_id, + { "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, + [] (const std::vector& values) { + return values[0] == 0 || values[1] == 0; + }); - // different register prefetching implementations are used only when register prefetching is enabled - tuner.AddConstraint(kernel.kernel_id, - { "REGISTER_PREFETCH_FACTOR", "REGISTER_PREFETCH_TYPE" }, - [] (const std::vector& values) { - return values[0] > 0 || values[1] == 0; - }); + // Prefetching is used only for the blocked offsets kernel. + tuner.AddConstraint( + kernel.kernel_id, + { "KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, + [] (const std::vector& values) { + return values[0] == 1 || (values[1] == 0 && values[2] == 0); + }); + + // Different register prefetching implementations are used + // only when register prefetching is actually enabled. + tuner.AddConstraint( + kernel.kernel_id, + { "REGISTER_PREFETCH_FACTOR", "REGISTER_PREFETCH_TYPE" }, + [] (const std::vector& values) { + return values[0] > 0 || values[1] == 0; + }); - // don't try different striping factors for non-striped kernels - tuner.AddConstraint(kernel.kernel_id, - { "KERNEL_TYPE", "STRIPING_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 2 || values[1] == 2; - }); + // Don't try different striping factors for non-striped kernels. + tuner.AddConstraint( + kernel.kernel_id, + { "KERNEL_TYPE", "STRIPING_FACTOR" }, + [] (const std::vector& values) { + return values[0] == 2 || values[1] == 2; + }); } -template + +template kernel_context initialize_kernel(::ktt::Tuner& tuner) { kernel_context kernel(tuner); - std::string kernel_path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h"; + std::string kernel_path = + STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h"; std::vector< std::string > type_names { std::string(NAMEOF_TYPE(IndexType)), @@ -101,6 +112,9 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner) std::string(NAMEOF_TYPE(ValueType3)) }; + // NOTE: These can be anything since they are always set in the launcher. + // So use some values that will hopefully cause a crash if not set properly + // in the launcher.
const ::ktt::DimensionVector blockDimensions(0); const ::ktt::DimensionVector gridDimensions(0); @@ -114,7 +128,7 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner) kernel.definition_ids.push_back(definition_id); kernel.kernel_id = tuner.CreateSimpleKernel("DiaKernel", definition_id); - dia::setup_tuning_parameters(tuner, kernel); + dia::setup_tuning_parameters(kernel); return kernel; } @@ -132,7 +146,10 @@ const kernel_context& get_kernel( ::ktt::Tuner& tuner, const cusp::dia_matrix& A) { - static kernel_context kernel = dia::initialize_kernel(tuner); + constexpr auto init_kernel = + dia::initialize_kernel; + + static kernel_context kernel = init_kernel(tuner); return kernel; } @@ -141,24 +158,30 @@ template std::vector<::ktt::ArgumentId> add_arguments( - const kernel_context& kernel, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y) + const kernel_context& kernel, + const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) { const IndexType num_diagonals = A.values.num_cols; - const IndexType pitch = A.values.pitch; // length of one diagonal + possible padding + const IndexType pitch = A.values.pitch; + + auto args = + add_arguments(*kernel.tuner, A.num_rows, A.num_cols, num_diagonals, + pitch, A.diagonal_offsets, A.values.values, x, y); - auto args = add_arguments(*kernel.tuner, A.num_rows, A.num_cols, num_diagonals, pitch, A.diagonal_offsets, A.values.values, x, y); kernel.tuner->SetArguments(kernel.definition_ids[0], args); return args; } -// Returns the argument id of the y vector given a vector of arguments returned by a previous -// call of `add_arguments`. -inline ::ktt::ArgumentId get_output_argument(const std::vector<::ktt::ArgumentId>& arguments, cusp::dia_format) { +// Returns the argument id of the y vector given a vector of arguments +// returned by a previous call of `add_arguments`. 
+inline ::ktt::ArgumentId get_output_argument( + const std::vector<::ktt::ArgumentId>& arguments, + cusp::dia_format) +{ return arguments[7]; } @@ -176,18 +199,23 @@ auto get_launcher( { return [&, profile] (::ktt::ComputeInterface& interface) { - ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]); - ::ktt::DimensionVector grid_size( DIVIDE_INTO(A.num_rows, block_size.GetSizeX()) ); + ::ktt::DimensionVector block_size = + interface.GetCurrentLocalSize(ctx.definition_ids[0]); + ::ktt::DimensionVector grid_size( + DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); + // START OF HACKS to test striping ---- auto conf = interface.GetCurrentConfiguration(); for (const auto& pair : conf.GetPairs()) if (pair.GetName() == "GRID_SIZE" && pair.GetValue() != 0) grid_size = ::ktt::DimensionVector(pair.GetValue()); + // END OF HACKS ----------------------- if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { - interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); + interface.RunKernelWithProfiling(ctx.definition_ids[0], + grid_size, block_size); } }; } From 15d306114d00b2f7ae575e2cd4925fb7f85bd2ca Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 21 Mar 2023 13:23:40 +0100 Subject: [PATCH 37/49] Make prefetching macros more convenient to use --- cusp/system/cuda/ktt/kernels/dia_kernel.h | 67 +++++++++++------------ 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 7f26658d..2160b007 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -9,10 +9,6 @@ #define PREFETCH_FACTOR 0 #endif -#ifndef BLOCK_SIZE - #define BLOCK_SIZE 256 -#endif - template __device__ T min(T a, T b) { @@ -145,16 +141,36 @@ naive_dia_kernel(const int num_rows, } } -#define REGISTER_PREFETCH_ITERATION(iter) \ - IndexType col##iter = offsets[i + iter] + thread_id; \ - ValueType1 diag_val##iter = 0; \ - ValueType2 x_val##iter = 0; \ - if (col##iter >= 0 && col##iter < num_cols) \ - { \ - diag_val##iter = load_diag_val(&values[ (offset_base + i + iter)*pitch + thread_id ]); \ - x_val##iter = load_x_val(&x[col##iter]); \ + +#define PREFETCH_LOAD_ITERATION(iter) \ + IndexType col##iter = offsets[i + iter - 1] + thread_id; \ + ValueType1 diag_val##iter = 0; \ + ValueType2 x_val##iter = 0; \ + if (col##iter >= 0 && col##iter < num_cols) \ + { \ + IndexType idx = (offset_base + i + iter - 1)*pitch + thread_id; \ + diag_val##iter = load_diag_val(&values[idx]); \ + x_val##iter = load_x_val(&x[col##iter]); \ } +#define PREFETCH_ACCUM_ITERATION(iter) \ + sum += diag_val##iter * x_val##iter; + +#define PREFETCH1(ITER) ITER(1); +#define PREFETCH2(ITER) ITER(2); PREFETCH1(ITER) +#define PREFETCH3(ITER) ITER(3); PREFETCH2(ITER) +#define PREFETCH4(ITER) ITER(4); PREFETCH3(ITER) +#define PREFETCH5(ITER) ITER(5); PREFETCH4(ITER) +#define PREFETCH6(ITER) ITER(6); PREFETCH5(ITER) +#define PREFETCH7(ITER) ITER(7); PREFETCH6(ITER) +#define PREFETCH8(ITER) ITER(8); PREFETCH7(ITER) + +#define PREFETCH_IMPL(factor, ITER) PREFETCH##factor(ITER) + +#define PREFETCH_LOAD_VALS(factor) PREFETCH_IMPL(factor, PREFETCH_LOAD_ITERATION) +#define PREFETCH_ACCUMULATE(factor) PREFETCH_IMPL(factor, PREFETCH_ACCUM_ITERATION) + + template num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; + int batch_size = BLOCK_SIZE > num_diagonals - offset_base + ? 
num_diagonals - offset_base + : BLOCK_SIZE; if (thread_id < num_rows) { @@ -218,23 +234,8 @@ blocked_offsets_dia_kernel(const int num_rows, #elif REGISTER_PREFETCH_TYPE == 1 - REGISTER_PREFETCH_ITERATION(0); - REGISTER_PREFETCH_ITERATION(1); -#if REGISTER_PREFETCH_FACTOR > 2 - REGISTER_PREFETCH_ITERATION(2); -#endif -#if REGISTER_PREFETCH_FACTOR > 3 - REGISTER_PREFETCH_ITERATION(3); -#endif - - sum += diag_val0 * x_val0; - sum += diag_val1 * x_val1; -#if REGISTER_PREFETCH_FACTOR > 2 - sum += diag_val2 * x_val2; -#endif -#if REGISTER_PREFETCH_FACTOR > 3 - sum += diag_val3 * x_val3; -#endif + PREFETCH_LOAD_VALS(PREFETCH_FACTOR); + PREFETCH_ACCUMULATE(PREFETCH_FACTOR); #endif // REGISTER_PREFETCH_TYPE == 1 #elif SHARED_PREFETCH_FACTOR > 0 @@ -282,9 +283,7 @@ blocked_offsets_dia_kernel(const int num_rows, } if (thread_id < num_rows) - { y[thread_id] = sum; - } } template Date: Tue, 21 Mar 2023 13:46:23 +0100 Subject: [PATCH 38/49] Use a more readable implementation of prefetching --- cusp/system/cuda/ktt/kernels/dia_kernel.h | 43 +++++++---------------- 1 file changed, 12 insertions(+), 31 deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 2160b007..8f2e144a 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -100,10 +100,7 @@ struct UnrolledLoop IndexType i) {} template - __device__ ValueType3 do_iter() - { - return 0; - } + __device__ ValueType3 do_iter() { return 0; } }; @@ -208,48 +205,32 @@ blocked_offsets_dia_kernel(const int num_rows, if (thread_id < num_rows) { -#if SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 - for (IndexType i = 0; i < batch_size; ++i) - { - IndexType col = offsets[i] + thread_id; - if (col >= 0 && col < num_cols) - { - auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + thread_id ]); - auto x_val = load_x_val(&x[col]); - - sum += diag_val*x_val; - } - } -#else // SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 - int end = batch_size - (batch_size % PREFETCH_FACTOR); + IndexType i = 0; - for (IndexType i = 0; i < end; i += PREFETCH_FACTOR) +#if PREFETCH_FACTOR > 0 + for (; i < batch_size - PREFETCH_FACTOR + 1; i += PREFETCH_FACTOR) { #if REGISTER_PREFETCH_FACTOR > 0 -#if REGISTER_PREFETCH_TYPE == 0 +#if REGISTER_PREFETCH_TYPE == 0 UnrolledLoop loop; loop.prefetch(offsets, values, x, pitch, num_cols, thread_id, offset_base, i); sum += loop.template do_iter(); - #elif REGISTER_PREFETCH_TYPE == 1 - PREFETCH_LOAD_VALS(PREFETCH_FACTOR); PREFETCH_ACCUMULATE(PREFETCH_FACTOR); - #endif // REGISTER_PREFETCH_TYPE == 1 -#elif SHARED_PREFETCH_FACTOR > 0 +#elif SHARED_PREFETCH_FACTOR > 0 for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) { IndexType col = offsets[i + j] + thread_id; if (col >= 0 && col < num_cols) { - auto diag_val = load_diag_val(&values[ (offset_base + i + j)*pitch + thread_id ]); - auto x_val = load_x_val(&x[col]); - - values_prefetched[j*BLOCK_SIZE + threadIdx.x] = diag_val; - x_prefetched[j*BLOCK_SIZE + threadIdx.x] = x_val; + values_prefetched[j*BLOCK_SIZE + threadIdx.x] = + load_diag_val(&values[ (offset_base + i + j)*pitch + thread_id ]);; + x_prefetched[j*BLOCK_SIZE + threadIdx.x] = + load_x_val(&x[col]); } else { @@ -264,8 +245,9 @@ blocked_offsets_dia_kernel(const int num_rows, } #endif // SHARED_PREFETCH_FACTOR > 0 } +#endif // PREFETCH_FACTOR > 0 - for (IndexType i = end; i < batch_size; ++i) + for (; i < batch_size; ++i) { IndexType col = offsets[i] + thread_id; if (col >= 0 && col < 
num_cols) @@ -276,7 +258,6 @@ blocked_offsets_dia_kernel(const int num_rows, sum += diag_val*x_val; } } -#endif // SHARED_PREFETCH_FACTOR == 0 && REGISTER_PREFETCH_FACTOR == 0 } __syncthreads(); From 1dd0488a6038c6535ec4fc12ffdc5369a77c51ed Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Tue, 21 Mar 2023 13:49:28 +0100 Subject: [PATCH 39/49] Remove old unused kernels --- cusp/system/cuda/ktt/kernels/dia_kernel.h | 292 ++-------------------- 1 file changed, 22 insertions(+), 270 deletions(-) diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 8f2e144a..0b7bc0a3 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -104,41 +104,6 @@ struct UnrolledLoop }; -template -__device__ void -naive_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) -{ - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - - if (thread_id < num_rows) - { - ValueType3 sum = ValueType3(0); - for (IndexType i = 0; i < num_diagonals; ++i) - { - IndexType col = diagonal_offsets[i] + thread_id; - if (col >= 0 && col < num_cols) - { - auto diag_val = load_diag_val(&values[i*pitch + thread_id]); - auto x_val = load_x_val(&x[col]); - - sum += diag_val * x_val; - } - } - y[thread_id] = sum; - } -} - - #define PREFETCH_LOAD_ITERATION(iter) \ IndexType col##iter = offsets[i + iter - 1] + thread_id; \ ValueType1 diag_val##iter = 0; \ @@ -267,6 +232,7 @@ blocked_offsets_dia_kernel(const int num_rows, y[thread_id] = sum; } + template -__global__ void -timed_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y, - long long int* times, - const int times_pitch) -{ - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - const IndexType row = thread_id; - - __shared__ IndexType offsets[BLOCK_SIZE]; - ValueType3 sum = ValueType3(0); - - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) - { - if (offset_base + threadIdx.x < num_diagonals) - { - offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; - } - - __syncthreads(); - - int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; - - if (row < num_rows) - { - for (IndexType i = 0; i < batch_size; ++i) - { - IndexType col = offsets[i] + row; - if (col >= 0 && col < num_cols) - { - auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); - auto to_load = &x[col]; - - long long int start = clock64(); - auto x_val = load_x_val(to_load); - long long int end = clock64(); - - if (threadIdx.x % 32 == 0) - { - IndexType k = row / 32; - __stwt(times + (offset_base + i)*times_pitch + k, end - start + 1); - } - - sum += diag_val*x_val; - } - } - } - - __syncthreads(); - } - - if (row < num_rows) - y[row] = sum; -} - - - template __device__ void -cached_x_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) -{ - using ValueType = ValueType3; - - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - - __shared__ IndexType offsets[BLOCK_SIZE]; - __shared__ ValueType x_cache[BLOCK_SIZE]; - IndexType cache_start = -IndexType(BLOCK_SIZE); - IndexType cache_end = 0; - - IndexType first_row = BLOCK_SIZE * blockIdx.x; - IndexType end_row = min(first_row + BLOCK_SIZE, num_rows); - - ValueType sum = ValueType(0); - - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { - if (offset_base + threadIdx.x < num_diagonals) { - offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; - } - - __syncthreads(); - - int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; - - if (thread_id < num_rows) - { - for (IndexType i = 0; i < batch_size; ++i) - { - IndexType col = offsets[i] + thread_id; - ValueType x_val = -1; - - if (col >= 0 && col < num_cols) - { - x_val = col < cache_end ? x_cache[col - cache_start] : x[col]; - ValueType diag_val = values[(offset_base + i)*pitch + thread_id]; - sum += x_val * diag_val; - } - - __syncthreads(); - - x_cache[threadIdx.x] = x_val; - cache_start = first_row + offsets[i]; - cache_end = end_row + offsets[i]; - - __syncthreads(); - } - } - - __syncthreads(); - } - - if (thread_id < num_rows) { - y[thread_id] = sum; - } -} - -template -__device__ void -experimental_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) -{ - using ValueType = ValueType3; - - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - - __shared__ IndexType offsets[BLOCK_SIZE]; - __shared__ ValueType x_cache[2*BLOCK_SIZE]; - - IndexType first_row = BLOCK_SIZE * blockIdx.x; - IndexType first_col = offsets[0] + first_row; - - if (0 <= first_col + threadIdx.x && first_col + threadIdx.x < num_cols) { - x_cache[threadIdx.x] = x[first_col + threadIdx.x]; - } - - if (0 <= first_col + BLOCK_SIZE + threadIdx.x && first_col + BLOCK_SIZE + threadIdx.x < num_cols) { - x_cache[BLOCK_SIZE + threadIdx.x] = x[first_col + BLOCK_SIZE + threadIdx.x]; - } - - ValueType sum = ValueType(0); - - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { - if (offset_base + threadIdx.x < num_diagonals) { - offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; - } - - __syncthreads(); - - int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; - - if (thread_id < num_rows) - { - for (IndexType i = 0; i < batch_size; ++i) - { - IndexType col = offsets[i] + thread_id; - - if (col >= 0 && col < num_cols) - { - ValueType x_val = x_cache[col - first_col]; - ValueType diag_val = values[(offset_base + i)*pitch + thread_id]; - sum += x_val * diag_val; - } - } - } - - __syncthreads(); - } - - if (thread_id < num_rows) { - y[thread_id] = sum; - } -} - -template -__device__ void -cached_x_naive_dia_kernel(const int num_rows, +naive_dia_kernel(const int num_rows, const int num_cols, const int num_diagonals, const int pitch, @@ -645,31 +409,19 @@ cached_x_naive_dia_kernel(const int num_rows, const ValueType2* x, ValueType3* y) { - using ValueType = ValueType3; - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - __shared__ ValueType x_cache[2*BLOCK_SIZE]; - - IndexType first_row = BLOCK_SIZE * blockIdx.x; - IndexType first_col = diagonal_offsets[0] + first_row; - - if (0 <= first_col + threadIdx.x && first_col + threadIdx.x < num_cols) { - x_cache[threadIdx.x] = x[first_col + threadIdx.x]; - } - - if (0 <= first_col + BLOCK_SIZE + threadIdx.x && first_col + BLOCK_SIZE + threadIdx.x < num_cols) { - x_cache[BLOCK_SIZE + threadIdx.x] = x[first_col + BLOCK_SIZE + threadIdx.x]; - } - - - if (thread_id < num_rows) { - ValueType sum = ValueType(0); - for (IndexType i = 0; i < num_diagonals; ++i) { + if (thread_id < num_rows) + { + ValueType3 sum = ValueType3(0); + for (IndexType i = 0; i < num_diagonals; ++i) + { IndexType col = diagonal_offsets[i] + thread_id; - if (col >= 0 && col < num_cols) { - auto diag_val = values[i*pitch + thread_id]; - auto x_val = x_cache[col - first_col]; + if (col >= 0 && col < num_cols) + { + auto diag_val = load_diag_val(&values[i*pitch + thread_id]); + auto x_val = load_x_val(&x[col]); + sum += diag_val * x_val; } } @@ -677,20 +429,20 @@ cached_x_naive_dia_kernel(const int num_rows, } } + template -__launch_bounds__(BLOCK_SIZE,1) __global__ void -ktt_dia_vector_kernel( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) +__launch_bounds__(BLOCK_SIZE, 1) __global__ void +ktt_dia_vector_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* diagonal_offsets, + const ValueType1* values, + const ValueType2* x, + ValueType3* y) { #if KERNEL_TYPE == 0 naive_dia_kernel From 1c6b3cf8d1a04475931a4d2f8868ba7644a52911 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Sun, 7 May 2023 13:19:41 +0200 Subject: [PATCH 40/49] Add more general function for matrix generation --- cusp/ktt/matrix_generation.h | 79 ++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/cusp/ktt/matrix_generation.h b/cusp/ktt/matrix_generation.h index b30e616a..712e0b55 100644 --- a/cusp/ktt/matrix_generation.h +++ b/cusp/ktt/matrix_generation.h @@ -4,52 +4,35 @@ #include // min #include +#include namespace cusp { namespace ktt { -/** - * @brief Create a symetric matrix with given number of diagonals filled with ones. - * - * The main diagonal is always filled and then equal number of diagonals is placed - * on both sides of it. That means the matrix is actually symetric only for - * odd numbers of diagonals. - * - * If the number of diagonals is greater then would fit into the matrix, - * the extra diagonals are just discarded. 
- * - * @param rows The number of rows of the resulting matrix. - * @param cols The number of columns of the resulting matrix. - * @param offset_step Distance between two diagonals. - * @param diagonal_count The number of non-zero diagonals in the matrix. - * - * @return The matrix. - */ + cusp::dia_matrix -make_diagonal_symmetric_matrix(int rows, - int cols, - int offset_step, - int diagonal_count) +make_diagonal_matrix(int rows, int cols, const std::vector& diag_offsets) { using DiaMatrix = cusp::dia_matrix; + int diagonal_count = diag_offsets.size(); + DiaMatrix::diagonal_offsets_array_type offsets; offsets.reserve(diagonal_count); DiaMatrix::values_array_type values(rows, diagonal_count, 0); - int starting_offset = -offset_step * diagonal_count/2; size_t num_entries = 0; - for (int i = 0; i < diagonal_count; ++i) { - int offset = starting_offset + offset_step*i; + int offset = diag_offsets[i]; int starting_row = offset < 0 ? -offset : 0; int starting_col = offset < 0 ? 0 : offset; if (starting_row >= rows || starting_col >= cols) - throw std::runtime_error("make_diagonal_symmetric_matrix: Too many diagonals."); + throw std::runtime_error( + "make_diagonal_matrix: Diagonal out of bounds."); offsets.push_back(offset); @@ -73,6 +56,52 @@ make_diagonal_symmetric_matrix(int rows, return res; } + +/** + * @brief Create a symmetric matrix with a given number of diagonals filled with ones. + * + * The main diagonal is always filled and then an equal number of diagonals is placed + * on both sides of it. That means the matrix is actually symmetric only for + * odd numbers of diagonals. + * + * If the number of diagonals is greater than would fit into the matrix, + * an exception is thrown. + * + * @param rows The number of rows of the resulting matrix. + * @param cols The number of columns of the resulting matrix. + * @param offset_step Distance between two diagonals. + * @param diagonal_count The number of non-zero diagonals in the matrix. + * + * @return The matrix. + */ +cusp::dia_matrix +make_diagonal_symmetric_matrix(int rows, + int cols, + int offset_step, + int diagonal_count) +{ + std::vector offsets; + offsets.reserve(diagonal_count); + + int starting_offset = -offset_step * diagonal_count/2; + + for (int i = 0; i < diagonal_count; ++i) { + int offset = starting_offset + offset_step*i; + + int starting_row = offset < 0 ? -offset : 0; + int starting_col = offset < 0 ?
0 : offset; + + if (starting_row >= rows || starting_col >= cols) + throw std::runtime_error( + "make_diagonal_symmetric_matrix: Too many diagonals."); + + offsets.push_back(offset); + } + + return make_diagonal_matrix(rows, cols, offsets); +} + + } // namespace ktt } // namespace cusp From db83e9508b72e5c1ea1fc99bd86527931ec7d891 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 8 May 2023 19:25:05 +0200 Subject: [PATCH 41/49] Add check for builtin types --- cusp/system/detail/generic/multiply.inl | 47 ++++++++++++++++--------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/cusp/system/detail/generic/multiply.inl b/cusp/system/detail/generic/multiply.inl index 99909c51..75e4c85c 100644 --- a/cusp/system/detail/generic/multiply.inl +++ b/cusp/system/detail/generic/multiply.inl @@ -27,10 +27,13 @@ #include #include +#include + #include #include -#include +#include + namespace cusp { @@ -107,6 +110,8 @@ multiply(thrust::execution_policy &exec, cusp::multiply(exec, A, B, C, initialize, combine, reduce); } + +// The has_rebind type traits is used to check whether matrix is a view template struct has_rebind : std::false_type {}; @@ -116,38 +121,48 @@ struct has_rebind>> : std::true_type {}; template constexpr bool has_rebind_v = has_rebind::value; + template -typename thrust::detail::disable_if_convertible::type +typename thrust::detail::disable_if_convertible< + typename Matrix::format, + cusp::unknown_format +>::type multiply(cusp::system::cuda::detail::execution_policy &exec, - const LinearOperator& A, - const cusp::array1d& B, - cusp::array1d& C) + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y) { - typedef typename LinearOperator::format MatrixFormat; - - if constexpr ((cusp::detail::is_ell::value - || cusp::detail::is_dia::value) - && has_rebind_v) + using IndexType = typename Matrix::index_type; + using ValueType = typename Matrix::value_type; + + if constexpr ((cusp::detail::is_ell::value + || cusp::detail::is_dia::value + && std::is_fundamental_v + && std::is_fundamental_v + && std::is_fundamental_v + && std::is_fundamental_v) + && has_rebind_v) { - if (cusp::ktt::detail::is_enabled) { - cusp::ktt::detail::lazy_init(); - cusp::system::cuda::ktt::multiply(*cusp::ktt::detail::tuner, A, B, C); + if (cusp::ktt::detail::is_enabled) + { + cusp::system::cuda::ktt::multiply(cusp::ktt::get_tuner(), A, x, y); return; } } - typedef typename LinearOperator::value_type ValueType; + // The call is not meant for KTT. Pass it on... 
cusp::constant_functor initialize(0); thrust::multiplies combine; thrust::plus reduce; - cusp::multiply(exec, A, B, C, initialize, combine, reduce); + cusp::multiply(exec, A, x, y, initialize, combine, reduce); } + template Date: Mon, 8 May 2023 19:25:17 +0200 Subject: [PATCH 42/49] Modify the tuning space --- cusp/system/cuda/ktt/dia_multiply.h | 45 +++++++++++++++++------------ 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 02df5bd8..ff1ab7f6 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -26,27 +26,27 @@ inline void setup_tuning_parameters(const kernel_context& kernel) auto& tuner = *kernel.tuner; auto kernel_id = kernel.kernel_id; - tuner.AddParameter(kernel_id, "KERNEL_TYPE", u64_vec{ 0, 1, 2 }); - tuner.AddParameter(kernel_id, "SHARED_PREFETCH_FACTOR", - u64_vec{ 0, 2, 4, 8 }); + tuner.AddParameter(kernel_id, "KERNEL_TYPE", u64_vec{ 1, 2 }); + // tuner.AddParameter(kernel_id, "SHARED_PREFETCH_FACTOR", + // u64_vec{ 0, 2, 4, 8 }); tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_FACTOR", u64_vec{ 0, 2, 3, 4 }); tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_TYPE", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "LOAD_TYPE", u64_vec{ 0, 1 }); - tuner.AddParameter(kernel_id, "STRIPING_FACTOR", u64_vec{ 2, 4, 8 }); - tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 256, 512 }); + tuner.AddParameter(kernel_id, "STRIPING_FACTOR", u64_vec{ 2, 4, 8, 16 }); + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); // START OF HACKS to test striping ---- - tuner.AddParameter(kernel_id, "BLOCKS_PER_SECTOR", u64_vec{ 0 }); - tuner.AddParameter(kernel_id, "CHUNK_SIZE", u64_vec{ 0 }); - tuner.AddParameter(kernel_id, "STRIPE_SIZE", u64_vec{ 0 }); - tuner.AddParameter(kernel_id, "GRID_SIZE", u64_vec{ 0 }); - tuner.AddParameter(kernel_id, "SECTOR_MAPPING_TYPE", u64_vec{ 0, 1 }); - - tuner.AddConstraint(kernel.kernel_id, { "SECTOR_MAPPING_TYPE" }, - [] (const std::vector& values) { - return values[0] == 0; - }); + // tuner.AddParameter(kernel_id, "BLOCKS_PER_SECTOR", u64_vec{ 0 }); + // tuner.AddParameter(kernel_id, "CHUNK_SIZE", u64_vec{ 0 }); + // tuner.AddParameter(kernel_id, "STRIPE_SIZE", u64_vec{ 0 }); + // tuner.AddParameter(kernel_id, "GRID_SIZE", u64_vec{ 0 }); + // tuner.AddParameter(kernel_id, "SECTOR_MAPPING_TYPE", u64_vec{ 0, 1 }); + + // tuner.AddConstraint(kernel.kernel_id, { "SECTOR_MAPPING_TYPE" }, + // [] (const std::vector& values) { + // return values[0] == 0; + // }); // END OF HACKS ----------------------- tuner.AddThreadModifier( @@ -91,6 +91,13 @@ inline void setup_tuning_parameters(const kernel_context& kernel) [] (const std::vector& values) { return values[0] == 2 || values[1] == 2; }); + + tuner.AddConstraint( + kernel.kernel_id, + { "BLOCK_SIZE", "STRIPING_FACTOR" }, + [] (const std::vector& values) { + return values[0]/values[1] >= 32; + }); } @@ -205,10 +212,10 @@ auto get_launcher( DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); // START OF HACKS to test striping ---- - auto conf = interface.GetCurrentConfiguration(); - for (const auto& pair : conf.GetPairs()) - if (pair.GetName() == "GRID_SIZE" && pair.GetValue() != 0) - grid_size = ::ktt::DimensionVector(pair.GetValue()); + // auto conf = interface.GetCurrentConfiguration(); + // for (const auto& pair : conf.GetPairs()) + // if (pair.GetName() == "GRID_SIZE" && pair.GetValue() != 0) + // grid_size = ::ktt::DimensionVector(pair.GetValue()); // END OF HACKS 
----------------------- if (!profile) { From 2e529a3222f0689b480adbb8edbf0d06f256499d Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Wed, 10 May 2023 10:49:57 +0200 Subject: [PATCH 43/49] Fix tuning reseting --- cusp/ktt/detail/ktt.inl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cusp/ktt/detail/ktt.inl b/cusp/ktt/detail/ktt.inl index 9e47d9be..cfe25886 100644 --- a/cusp/ktt/detail/ktt.inl +++ b/cusp/ktt/detail/ktt.inl @@ -137,11 +137,9 @@ void reset_tuning(const MatrixType& A, const cusp::array1d& x, cusp::array1d& y) { - using IndexType = typename MatrixType::index_type; - using ValueType = typename MatrixType::value_type; - using Format = typename MatrixType::format; - - return reset_tuning(A); + auto& tuner = get_tuner(); + const auto& kernel = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + tuner.ClearData(kernel.kernel_id); } From f3157a3bd016cc23e3f6b4d05913bf9895d19a9d Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Fri, 12 May 2023 09:14:53 +0200 Subject: [PATCH 44/49] Set the final values of tuning parameters --- cusp/system/cuda/ktt/dia_multiply.h | 18 +----------------- cusp/system/cuda/ktt/ell_multiply.h | 2 +- cusp/system/cuda/ktt/kernels/dia_kernel.h | 11 ----------- 3 files changed, 2 insertions(+), 29 deletions(-) diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index ff1ab7f6..60cee249 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -26,28 +26,12 @@ inline void setup_tuning_parameters(const kernel_context& kernel) auto& tuner = *kernel.tuner; auto kernel_id = kernel.kernel_id; - tuner.AddParameter(kernel_id, "KERNEL_TYPE", u64_vec{ 1, 2 }); - // tuner.AddParameter(kernel_id, "SHARED_PREFETCH_FACTOR", - // u64_vec{ 0, 2, 4, 8 }); + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_FACTOR", u64_vec{ 0, 2, 3, 4 }); tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_TYPE", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "LOAD_TYPE", u64_vec{ 0, 1 }); - tuner.AddParameter(kernel_id, "STRIPING_FACTOR", u64_vec{ 2, 4, 8, 16 }); - tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); - // START OF HACKS to test striping ---- - // tuner.AddParameter(kernel_id, "BLOCKS_PER_SECTOR", u64_vec{ 0 }); - // tuner.AddParameter(kernel_id, "CHUNK_SIZE", u64_vec{ 0 }); - // tuner.AddParameter(kernel_id, "STRIPE_SIZE", u64_vec{ 0 }); - // tuner.AddParameter(kernel_id, "GRID_SIZE", u64_vec{ 0 }); - // tuner.AddParameter(kernel_id, "SECTOR_MAPPING_TYPE", u64_vec{ 0, 1 }); - - // tuner.AddConstraint(kernel.kernel_id, { "SECTOR_MAPPING_TYPE" }, - // [] (const std::vector& values) { - // return values[0] == 0; - // }); - // END OF HACKS ----------------------- tuner.AddThreadModifier( kernel.kernel_id, diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 3981156f..4ea01dc5 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -26,7 +26,7 @@ inline void setup_common_tuning_parameters(const kernel_context& kernel) tuner.AddParameter(kernel_id, "UNCACHED_LOADS", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "DISABLE_UNROLL", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 }); - tuner.AddParameter(kernel_id, "THREADS_PER_ROW", u64_vec{ 1, 2, 4 }); + tuner.AddParameter(kernel_id, "THREADS_PER_ROW", u64_vec{ 1, 2, 4, 8 }); tuner.AddThreadModifier( kernel_id, diff --git 
a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 0b7bc0a3..4a07ac69 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -444,17 +444,6 @@ ktt_dia_vector_kernel(const int num_rows, const ValueType2* x, ValueType3* y) { -#if KERNEL_TYPE == 0 - naive_dia_kernel - (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -#elif KERNEL_TYPE == 1 blocked_offsets_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -#elif KERNEL_TYPE == 2 - striped_dia_kernel - (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -#elif KERNEL_TYPE == 3 - double_striped_dia_kernel - (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -#endif } From c4039d862741605326093e218f5a3967c54b23c4 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Fri, 12 May 2023 13:26:01 +0200 Subject: [PATCH 45/49] Use profiling only conditionally --- cusp/system/cuda/ktt/csr_multiply.h | 2 ++ cusp/system/cuda/ktt/dia_multiply.h | 2 ++ cusp/system/cuda/ktt/ell_multiply.h | 2 ++ 3 files changed, 6 insertions(+) diff --git a/cusp/system/cuda/ktt/csr_multiply.h b/cusp/system/cuda/ktt/csr_multiply.h index 3303fe93..3e117fe6 100644 --- a/cusp/system/cuda/ktt/csr_multiply.h +++ b/cusp/system/cuda/ktt/csr_multiply.h @@ -159,8 +159,10 @@ auto get_launcher(const kernel_context& ctx, if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { +#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); +#endif } }; } diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 60cee249..3c02a8e8 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -205,8 +205,10 @@ auto get_launcher( if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { +#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); +#endif } }; } diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 4ea01dc5..88d5cd3e 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -249,8 +249,10 @@ auto get_launcher(const kernel_context& ctx, if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { +#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); +#endif } }; } From cf135992b3d624144899ebaa77576975ea901210 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Fri, 12 May 2023 13:56:39 +0200 Subject: [PATCH 46/49] Fix dangling reference --- cusp/system/cuda/ktt/csr_multiply.h | 4 +--- cusp/system/cuda/ktt/dia_multiply.h | 2 -- cusp/system/cuda/ktt/ell_multiply.h | 4 +--- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/cusp/system/cuda/ktt/csr_multiply.h b/cusp/system/cuda/ktt/csr_multiply.h index 3e117fe6..d35a2fd8 100644 --- a/cusp/system/cuda/ktt/csr_multiply.h +++ b/cusp/system/cuda/ktt/csr_multiply.h @@ -143,7 +143,7 @@ auto get_launcher(const kernel_context& ctx, cusp::array1d& y, bool profile = false) { - return [&] (::ktt::ComputeInterface& interface) + return [&, profile] (::ktt::ComputeInterface& interface) { auto conf = interface.GetCurrentConfiguration(); @@ -159,10 +159,8 @@ auto get_launcher(const kernel_context& ctx, if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } 
else { -#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); -#endif } }; } diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 3c02a8e8..60cee249 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -205,10 +205,8 @@ auto get_launcher( if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { -#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); -#endif } }; } diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 88d5cd3e..92e31627 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -233,7 +233,7 @@ auto get_launcher(const kernel_context& ctx, cusp::array1d& y, bool profile = false) { - return [&] (::ktt::ComputeInterface& interface) + return [&, profile] (::ktt::ComputeInterface& interface) { const auto& conf = interface.GetCurrentConfiguration(); auto threads_per_row = get_parameter_uint(conf, "THREADS_PER_ROW"); @@ -249,10 +249,8 @@ auto get_launcher(const kernel_context& ctx, if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { -#ifdef PROFILE interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); -#endif } }; } From d6561fbfc005640c1308fcdb3bc5a757c07061ac Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 15 May 2023 19:04:10 +0200 Subject: [PATCH 47/49] Add documentation --- cusp/ktt/ellr_matrix.h | 8 +-- cusp/ktt/ktt.h | 112 +++++++++++++++++++++++------------------ 2 files changed, 68 insertions(+), 52 deletions(-) diff --git a/cusp/ktt/ellr_matrix.h b/cusp/ktt/ellr_matrix.h index 7f8d932c..829f5dfc 100644 --- a/cusp/ktt/ellr_matrix.h +++ b/cusp/ktt/ellr_matrix.h @@ -30,7 +30,7 @@ class ellr_matrix : public cusp::ell_matrix */ ellr_matrix(void) {} - /*! Construct an \p ell_matrix with a specific shape, number of nonzero entries, + /*! Construct an \p ellr_matrix with a specific shape, number of nonzero entries, * and maximum number of nonzero entries per row. * * \param num_rows Number of rows. @@ -43,7 +43,7 @@ class ellr_matrix : public cusp::ell_matrix const size_t num_entries, const size_t num_entries_per_row, const size_t alignment = 32); - /*! Construct an \p ell_matrix from another matrix. + /*! Construct an \p ellr_matrix from another matrix. * * \param matrix Another sparse or dense matrix. */ @@ -71,9 +71,9 @@ class ellr_matrix : public cusp::ell_matrix void resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, const size_t num_entries_per_row, const size_t alignment); - /*! Swap the contents of two \p ell_matrix objects. + /*! Swap the contents of two \p ellr_matrix objects. * - * \param matrix Another \p ell_matrix with the same IndexType and ValueType. + * \param matrix Another \p ellr_matrix with the same IndexType and ValueType. */ void swap(ellr_matrix& matrix); diff --git a/cusp/ktt/ktt.h b/cusp/ktt/ktt.h index 88a5d026..37c89de5 100644 --- a/cusp/ktt/ktt.h +++ b/cusp/ktt/ktt.h @@ -15,94 +15,110 @@ inline void enable(); inline ::ktt::Tuner& get_tuner(); + /** * @brief Perform the multiplication y = A*x using an autotuned kernel. + * This function performs a single step of dynamic autotuning. * * @tparam Matrix The type of the sparse matrix to use. - * @tparam ValueType1 The value type of the x vector. - * @tparam ValueType2 The value type of the y vector. 
+ * @tparam IndexType The type used for indices. + * @tparam ValueType1 The value type of the matrix. + * @tparam ValueType2 The value type of the x vector. + * @tparam ValueType3 The value type of the y vector. * * @param A The matrix. * @param x The input vector. * @param y The output vector. */ -template typename Matrix, + typename IndexType, typename ValueType1, - typename ValueType2> -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y); + typename ValueType2, + typename ValueType3> +::ktt::KernelResult multiply( + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y); + -// NOTE: Should we have this? /** * @brief Perform the multiplication y = A*x without autotuning, * instead just run the kernel in the given configuration. * * @tparam Matrix The type of the sparse matrix to use. - * @tparam ValueType1 The value type of the x vector. - * @tparam ValueType2 The value type of the y vector. + * @tparam IndexType The type used for indices. + * @tparam ValueType1 The value type of the matrix. + * @tparam ValueType2 The value type of the x vector. + * @tparam ValueType3 The value type of the y vector. * - * @param A The matrix. - * @param x The input vector. - * @param y The output vector. + * @param A The matrix. + * @param x The input vector. + * @param y The output vector. + * @param configuration The configuration of the kernel. + * @param run_with_profiling Flag used to toggle profiling. */ -template -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& configuration, - bool run_with_profiling = false); - template typename Matrix, typename IndexType, typename ValueType1, typename ValueType2, typename ValueType3> -std::vector<::ktt::KernelResult> -tune(const Matrix& A, +::ktt::KernelResult multiply( + const Matrix& A, const cusp::array1d& x, cusp::array1d& y, - std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt); + const ::ktt::KernelConfiguration& configuration, + bool run_with_profiling = false); + /** - * TODO: Add some documentation. + * @brief Perform offline autotuning on the specified matrix and vectors. * - * @brief + * @tparam Matrix The type of the sparse matrix to use. + * @tparam IndexType The type used for indices. + * @tparam ValueType1 The value type of the matrix. + * @tparam ValueType2 The value type of the x vector. + * @tparam ValueType3 The value type of the y vector. * - * @tparam IndexType - * @tparam ValueType1 - * @tparam ValueType2 - * @tparam ValueType3 - * @tparam Format + * @param A The matrix. + * @param x The input vector. + * @param y The output vector. + * @param reference_computation The reference computation that computes + the correct result. */ -template typename Matrix, + typename IndexType, typename ValueType1, typename ValueType2, - typename ValueType3, - typename Format> -void reset_tuning(); + typename ValueType3> +std::vector<::ktt::KernelResult> +tune(const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt); + /** - * TODO: Add some documentation. + * @brief Resets the tuning for the kernel instantiation used for the given arguments. * - * @brief + * @tparam MatrixType Type of the sparse matrix. + * @tparam ValueType1 The value type of the x vector + * @tparam ValueType2 The value type of the y vector. + * @tparam MemorySpace1 The memory space of the x vector. 
+ * @tparam MemorySpace2 The memory space of the y vector. * - * @tparam MatrixType - * @tparam ValueType1 - * @tparam ValueType2 - * - * @param A - * @param x - * @param y + * @param A The matrix. + * @param x The input vector. + * @param y The output vector. */ template + typename ValueType2, + typename MemorySpace1, + typename MemorySpace2> void reset_tuning(const MatrixType& A, - const cusp::array1d& x, - cusp::array1d& y); + const cusp::array1d& x, + cusp::array1d& y); + } // namespace ktt From 9cc1988612c8dfca8ed8aacab778ea2b9724fed3 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 15 May 2023 19:05:25 +0200 Subject: [PATCH 48/49] Make last minute changes --- cusp/ktt/detail/ktt.inl | 61 ++-- cusp/system/cuda/ktt/dia_multiply.h | 45 +-- cusp/system/cuda/ktt/ell_multiply.h | 3 +- cusp/system/cuda/ktt/kernel.h | 10 +- cusp/system/cuda/ktt/kernels/dia_kernel.h | 337 +++++----------------- cusp/system/cuda/ktt/kernels/ell_kernel.h | 24 +- cusp/system/cuda/ktt/multiply.h | 33 ++- cusp/system/cuda/ktt/utils.h | 4 +- 8 files changed, 143 insertions(+), 374 deletions(-) diff --git a/cusp/ktt/detail/ktt.inl b/cusp/ktt/detail/ktt.inl index cfe25886..63f97368 100644 --- a/cusp/ktt/detail/ktt.inl +++ b/cusp/ktt/detail/ktt.inl @@ -15,6 +15,7 @@ namespace ktt { namespace detail { + inline std::unique_ptr<::ktt::Tuner> tuner; inline bool is_enabled = true; @@ -41,9 +42,11 @@ inline void lazy_init() CUstream stream; cuStreamCreate(&stream, CU_STREAM_DEFAULT); - ::ktt::ComputeApiInitializer initializer(context, std::vector<::ktt::ComputeQueue>{ stream }); + std::vector<::ktt::ComputeQueue> queues = { stream }; + ::ktt::ComputeApiInitializer initializer(context, queues); - tuner = std::make_unique<::ktt::Tuner>(::ktt::ComputeApi::CUDA, initializer); + tuner = std::make_unique<::ktt::Tuner>(::ktt::ComputeApi::CUDA, + initializer); } std::string compiler_flags = "-std=c++17 "; @@ -51,7 +54,6 @@ inline void lazy_init() compiler_flags += "-lineinfo "; #endif tuner->SetCompilerOptions(compiler_flags); - tuner->SetValidationMode(::ktt::ValidationMode::OfflineTuning); std::atexit(cleanup); @@ -77,25 +79,31 @@ inline ::ktt::Tuner& get_tuner() } -template typename Matrix, + typename IndexType, typename ValueType1, - typename ValueType2> -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y) + typename ValueType2, + typename ValueType3> +::ktt::KernelResult multiply( + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y) { return cusp::system::cuda::ktt::multiply(get_tuner(), A, x, y); } -template typename Matrix, + typename IndexType, typename ValueType1, - typename ValueType2> -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& configuration, - bool run_with_profiling) + typename ValueType2, + typename ValueType3> +::ktt::KernelResult multiply( + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + const ::ktt::KernelConfiguration& configuration, + bool run_with_profiling) { return cusp::system::cuda::ktt::multiply(get_tuner(), A, x, y, configuration, run_with_profiling); } @@ -115,27 +123,14 @@ tune(const Matrix& A, return cusp::system::cuda::ktt::tune(get_tuner(), A, x, y, reference_computation); } - -// template -// void reset_tuning() -// { -// using namespace cusp::system::cuda::ktt; - -// kernel_context kernel = get_kernel(get_tuner(), Format{}); -// detail::tuner->ClearData(kernel.kernel_id); -// } - - template + typename ValueType2, + typename MemorySpace1, + 
typename MemorySpace2> void reset_tuning(const MatrixType& A, - const cusp::array1d& x, - cusp::array1d& y) + const cusp::array1d& x, + cusp::array1d& y) { auto& tuner = get_tuner(); const auto& kernel = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 60cee249..8380b6f2 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -27,10 +27,10 @@ inline void setup_tuning_parameters(const kernel_context& kernel) auto kernel_id = kernel.kernel_id; tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); - tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_FACTOR", + tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 3, 4 }); - tuner.AddParameter(kernel_id, "REGISTER_PREFETCH_TYPE", u64_vec{ 0, 1 }); - tuner.AddParameter(kernel_id, "LOAD_TYPE", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "PREFETCH_TYPE", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "SPECIAL_LOADS", u64_vec{ 0, 1 }); tuner.AddThreadModifier( @@ -43,45 +43,15 @@ inline void setup_tuning_parameters(const kernel_context& kernel) return parameters[0]; }); - // Only one type of prefetching can be applied at once. - tuner.AddConstraint( - kernel.kernel_id, - { "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 0 || values[1] == 0; - }); - - // Prefetching is used only for the blocked offsets kernel. - tuner.AddConstraint( - kernel.kernel_id, - { "KERNEL_TYPE", "SHARED_PREFETCH_FACTOR", "REGISTER_PREFETCH_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 1 || (values[1] == 0 && values[2] == 0); - }); // Different register prefetching implementations are used // only when register prefetching is actually enabled tuner.AddConstraint( kernel.kernel_id, - { "REGISTER_PREFETCH_FACTOR", "REGISTER_PREFETCH_TYPE" }, + { "PREFETCH_FACTOR", "PREFETCH_TYPE" }, [] (const std::vector& values) { return values[0] > 0 || values[1] == 0; }); - - // Don't try different striping factors for non-striped kernels. 
- tuner.AddConstraint( - kernel.kernel_id, - { "KERNEL_TYPE", "STRIPING_FACTOR" }, - [] (const std::vector& values) { - return values[0] == 2 || values[1] == 2; - }); - - tuner.AddConstraint( - kernel.kernel_id, - { "BLOCK_SIZE", "STRIPING_FACTOR" }, - [] (const std::vector& values) { - return values[0]/values[1] >= 32; - }); } @@ -195,13 +165,6 @@ auto get_launcher( ::ktt::DimensionVector grid_size( DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); - // START OF HACKS to test striping ---- - // auto conf = interface.GetCurrentConfiguration(); - // for (const auto& pair : conf.GetPairs()) - // if (pair.GetName() == "GRID_SIZE" && pair.GetValue() != 0) - // grid_size = ::ktt::DimensionVector(pair.GetValue()); - // END OF HACKS ----------------------- - if (!profile) { interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); } else { diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h index 92e31627..d4ff4703 100644 --- a/cusp/system/cuda/ktt/ell_multiply.h +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -23,7 +23,7 @@ inline void setup_common_tuning_parameters(const kernel_context& kernel) auto kernel_id = kernel.kernel_id; tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); - tuner.AddParameter(kernel_id, "UNCACHED_LOADS", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "SPECIAL_LOADS", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "DISABLE_UNROLL", u64_vec{ 0, 1 }); tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 }); tuner.AddParameter(kernel_id, "THREADS_PER_ROW", u64_vec{ 1, 2, 4, 8 }); @@ -58,7 +58,6 @@ inline void setup_ell_tuning_parameters(const kernel_context& kernel) tuner.AddParameter(kernel_id, "BREAK", u64_vec{ 0, 1 }); // BREAK can be used only when there is no prefetching - // TODO(KTT): Implement prefetching when using early break and remove this tuner.AddConstraint( kernel.kernel_id, { "BREAK", "PREFETCH_FACTOR" }, diff --git a/cusp/system/cuda/ktt/kernel.h b/cusp/system/cuda/ktt/kernel.h index a34c8b16..4061c558 100644 --- a/cusp/system/cuda/ktt/kernel.h +++ b/cusp/system/cuda/ktt/kernel.h @@ -16,15 +16,17 @@ namespace ktt { struct kernel_context { ::ktt::Tuner* tuner = nullptr; ::ktt::KernelId kernel_id = ::ktt::InvalidKernelId; - + // This is a vector to allow for composite kernels. 
std::vector<::ktt::KernelDefinitionId> definition_ids; kernel_context(::ktt::Tuner& tuner) : tuner(&tuner) { } - - kernel_context(::ktt::Tuner& tuner, const std::vector<::ktt::KernelDefinitionId>& definition_ids, ::ktt::KernelId kernel_id) + + kernel_context(::ktt::Tuner& tuner, + const std::vector<::ktt::KernelDefinitionId>& definition_ids, + ::ktt::KernelId kernel_id) : tuner(&tuner), - definition_ids(definition_ids), + definition_ids(definition_ids), kernel_id(kernel_id) { } }; diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index 4a07ac69..9c64f6e1 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -1,14 +1,3 @@ -#pragma once - - -#if REGISTER_PREFETCH_FACTOR > 0 - #define PREFETCH_FACTOR REGISTER_PREFETCH_FACTOR -#elif SHARED_PREFETCH_FACTOR > 0 - #define PREFETCH_FACTOR SHARED_PREFETCH_FACTOR -#else - #define PREFETCH_FACTOR 0 -#endif - template __device__ T min(T a, T b) { @@ -24,11 +13,12 @@ __device__ T max(T a, T b) { template __device__ T load_diag_val(const T* val) { -#if LOAD_TYPE == 0 +#if SPECIAL_LOADS == 0 return *val; -#elif LOAD_TYPE == 1 +#elif SPECIAL_LOADS == 1 // Cache streaming, likely to be accessed once. - // The ld.cs load cached streaming operation allocates global lines with evict-first policy in L1 and L2 + // The ld.cs load cached streaming operation allocates global lines with + // evict-first policy in L1 and L2 // to limit cache pollution by temporary streaming data return __ldcs(val); #endif @@ -37,9 +27,9 @@ __device__ T load_diag_val(const T* val) template __device__ T load_x_val(const T* val) { -#if LOAD_TYPE == 0 +#if SPECIAL_LOADS == 0 return *val; -#else +#elif SPECIAL_LOADS == 1 // Needs compute capability at least 3.5 return __ldg(val); #endif @@ -58,14 +48,14 @@ struct UnrolledLoop : UnrolledLoop ValueType1 diag_val = 0; ValueType2 x_val = 0; - __device__ void prefetch(const IndexType* offsets, - const ValueType1* values, - const ValueType2* x, - int pitch, - int cols, - IndexType row, - IndexType offset_base, - IndexType i) + __device__ void prefetch_vals(const IndexType* offsets, + const ValueType1* values, + const ValueType2* x, + int pitch, + int cols, + IndexType row, + IndexType offset_base, + IndexType i) { col = offsets[i] + row; @@ -75,13 +65,15 @@ struct UnrolledLoop : UnrolledLoop x_val = load_x_val(&x[col]); } - parent::prefetch(offsets, values, x, pitch, cols, row, offset_base, i + 1); + parent::prefetch_vals(offsets, values, x, pitch, cols, row, + offset_base, i + 1); } template - __device__ ValueType3 do_iter() + __device__ ValueType3 accumulate() { - return static_cast(diag_val*x_val) + parent::template do_iter(); + return static_cast(diag_val*x_val) + + parent::template accumulate(); } }; @@ -90,47 +82,48 @@ template struct UnrolledLoop { - __device__ void prefetch(const IndexType* offsets, - const ValueType1* values, - const ValueType2* x, - int pitch, - int cols, - IndexType row, - IndexType offset_base, - IndexType i) {} + __device__ void prefetch_vals(const IndexType* offsets, + const ValueType1* values, + const ValueType2* x, + int pitch, + int cols, + IndexType row, + IndexType offset_base, + IndexType i) {} template - __device__ ValueType3 do_iter() { return 0; } + __device__ ValueType3 accumulate() { return 0; } }; -#define PREFETCH_LOAD_ITERATION(iter) \ - IndexType col##iter = offsets[i + iter - 1] + thread_id; \ +#define PREFETCH_VALS_ITERATION(iter) \ + IndexType col##iter = offsets[i + iter - 1] + row; \ 
ValueType1 diag_val##iter = 0; \ ValueType2 x_val##iter = 0; \ if (col##iter >= 0 && col##iter < num_cols) \ { \ - IndexType idx = (offset_base + i + iter - 1)*pitch + thread_id; \ + IndexType idx = (offset_base + i + iter - 1)*pitch + row ; \ diag_val##iter = load_diag_val(&values[idx]); \ x_val##iter = load_x_val(&x[col##iter]); \ } -#define PREFETCH_ACCUM_ITERATION(iter) \ +#define ACCUMULATE_ITERATION(iter) \ sum += diag_val##iter * x_val##iter; -#define PREFETCH1(ITER) ITER(1); -#define PREFETCH2(ITER) ITER(2); PREFETCH1(ITER) -#define PREFETCH3(ITER) ITER(3); PREFETCH2(ITER) -#define PREFETCH4(ITER) ITER(4); PREFETCH3(ITER) -#define PREFETCH5(ITER) ITER(5); PREFETCH4(ITER) -#define PREFETCH6(ITER) ITER(6); PREFETCH5(ITER) -#define PREFETCH7(ITER) ITER(7); PREFETCH6(ITER) -#define PREFETCH8(ITER) ITER(8); PREFETCH7(ITER) +#define REPEAT1(ITER) ITER(1); +#define REPEAT2(ITER) ITER(2); REPEAT1(ITER) +#define REPEAT3(ITER) ITER(3); REPEAT2(ITER) +#define REPEAT4(ITER) ITER(4); REPEAT3(ITER) +#define REPEAT5(ITER) ITER(5); REPEAT4(ITER) +#define REPEAT6(ITER) ITER(6); REPEAT5(ITER) +#define REPEAT7(ITER) ITER(7); REPEAT6(ITER) +#define REPEAT8(ITER) ITER(8); REPEAT7(ITER) -#define PREFETCH_IMPL(factor, ITER) PREFETCH##factor(ITER) +#define PREFETCH_VALS_IMPL(factor) REPEAT##factor(PREFETCH_VALS_ITERATION) +#define ACCUMULATE_IMPL(factor) REPEAT##factor(ACCUMULATE_ITERATION) -#define PREFETCH_LOAD_VALS(factor) PREFETCH_IMPL(factor, PREFETCH_LOAD_ITERATION) -#define PREFETCH_ACCUMULATE(factor) PREFETCH_IMPL(factor, PREFETCH_ACCUM_ITERATION) +#define PREFETCH_VALS(factor) PREFETCH_VALS_IMPL(factor) +#define ACCUMULATE(factor) ACCUMULATE_IMPL(factor) template 0 - __shared__ ValueType1 values_prefetched[SHARED_PREFETCH_FACTOR*BLOCK_SIZE]; - __shared__ ValueType2 x_prefetched[SHARED_PREFETCH_FACTOR*BLOCK_SIZE]; -#endif + const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x; __shared__ IndexType offsets[BLOCK_SIZE]; ValueType3 sum = ValueType3(0); @@ -168,218 +156,33 @@ blocked_offsets_dia_kernel(const int num_rows, ? 
num_diagonals - offset_base : BLOCK_SIZE; - if (thread_id < num_rows) + if (row < num_rows) { IndexType i = 0; #if PREFETCH_FACTOR > 0 for (; i < batch_size - PREFETCH_FACTOR + 1; i += PREFETCH_FACTOR) { -#if REGISTER_PREFETCH_FACTOR > 0 - -#if REGISTER_PREFETCH_TYPE == 0 - UnrolledLoop loop; - loop.prefetch(offsets, values, x, pitch, num_cols, thread_id, offset_base, i); - sum += loop.template do_iter(); -#elif REGISTER_PREFETCH_TYPE == 1 - PREFETCH_LOAD_VALS(PREFETCH_FACTOR); - PREFETCH_ACCUMULATE(PREFETCH_FACTOR); -#endif // REGISTER_PREFETCH_TYPE == 1 - -#elif SHARED_PREFETCH_FACTOR > 0 - for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) - { - IndexType col = offsets[i + j] + thread_id; - if (col >= 0 && col < num_cols) - { - values_prefetched[j*BLOCK_SIZE + threadIdx.x] = - load_diag_val(&values[ (offset_base + i + j)*pitch + thread_id ]);; - x_prefetched[j*BLOCK_SIZE + threadIdx.x] = - load_x_val(&x[col]); - } - else - { - values_prefetched[j*BLOCK_SIZE + threadIdx.x] = ValueType1(0); - x_prefetched[j*BLOCK_SIZE + threadIdx.x] = ValueType2(0); - } - } - - for (int j = 0; j < SHARED_PREFETCH_FACTOR; ++j) - { - sum += values_prefetched[j*BLOCK_SIZE + threadIdx.x] * x_prefetched[j*BLOCK_SIZE + threadIdx.x]; - } -#endif // SHARED_PREFETCH_FACTOR > 0 - } -#endif // PREFETCH_FACTOR > 0 - - for (; i < batch_size; ++i) - { - IndexType col = offsets[i] + thread_id; - if (col >= 0 && col < num_cols) - { - auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + thread_id ]); - auto x_val = load_x_val(&x[col]); - - sum += diag_val*x_val; - } - } - } - - __syncthreads(); - } - - if (thread_id < num_rows) - y[thread_id] = sum; -} - - -template -__device__ void -striped_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) -{ -#ifdef STRIPING_FACTOR - const IndexType num_groups = STRIPING_FACTOR; -#else - const IndexType num_groups = 1; +#if PREFETCH_TYPE == 0 + UnrolledLoop loop; + loop.prefetch_vals(offsets, values, x, pitch, num_cols, + row, offset_base, i); + sum += loop.template accumulate(); +#elif PREFETCH_TYPE == 1 + PREFETCH_VALS(PREFETCH_FACTOR); + ACCUMULATE(PREFETCH_FACTOR); #endif - - const IndexType group_size = BLOCK_SIZE/num_groups; - const IndexType stride = num_rows/num_groups; - - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - - const IndexType group_id = threadIdx.x / group_size; - const IndexType group_lane_id = threadIdx.x % group_size; - - const IndexType row = group_id*stride + blockIdx.x*group_size - + group_lane_id; - - // printf("(%d, %d, %d)-> %d\n", blockIdx.x, threadIdx.x, thread_id, row); - - __shared__ IndexType offsets[BLOCK_SIZE]; - ValueType3 sum = ValueType3(0); - - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) - { - if (offset_base + threadIdx.x < num_diagonals) - { - offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; - } - - __syncthreads(); - - int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; - - if (row < num_rows) - { - for (IndexType i = 0; i < batch_size; ++i) - { - IndexType col = offsets[i] + row; - if (col >= 0 && col < num_cols) - { - auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); - auto x_val = load_x_val(&x[col]); - - sum += diag_val*x_val; - } } - } - - __syncthreads(); - } - - if (row < num_rows) - y[row] = sum; -} - - -template -__device__ void -double_striped_dia_kernel(const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) -{ -#if KERNEL_TYPE == 3 - IndexType blocks_per_sector = BLOCKS_PER_SECTOR; - IndexType stripe_size = STRIPE_SIZE; - IndexType chunk_size = CHUNK_SIZE; -#else - IndexType blocks_per_sector = 0; - IndexType stripe_size = 0; - IndexType chunk_size = 0; - assert(false); #endif -#if SECTOR_MAPPING_TYPE == 0 - IndexType sector_id = blockIdx.x / blocks_per_sector; - IndexType sector_lane_id = blockIdx.x % blocks_per_sector; - - IndexType stripe_id = threadIdx.x / (chunk_size*32); - IndexType stripe_lane_id = threadIdx.x % (chunk_size*32); - - IndexType row = sector_id*blocks_per_sector*BLOCK_SIZE - + stripe_id*stripe_size - + sector_lane_id*chunk_size*32 - + stripe_lane_id; -#elif SECTOR_MAPPING_TYPE == 1 - IndexType num_sectors = gridDim.x / blocks_per_sector; - - IndexType sector_id = blockIdx.x % num_sectors; - IndexType sector_lane_id = blockIdx.x / num_sectors; - - IndexType stripe_id = threadIdx.x / (chunk_size*32); - IndexType stripe_lane_id = threadIdx.x % (chunk_size*32); - - IndexType row = sector_id*blocks_per_sector*BLOCK_SIZE - + stripe_id*stripe_size - + sector_lane_id*chunk_size*32 - + stripe_lane_id; -#endif - - IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - - //printf("(%d, %d, %d) -> %d\n", blockIdx.x, threadIdx.x, thread_id, row); - - __shared__ IndexType offsets[BLOCK_SIZE]; - ValueType3 sum = ValueType3(0); - - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) - { - if (offset_base + threadIdx.x < num_diagonals) - { - offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; - } - - __syncthreads(); - - int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? 
num_diagonals - offset_base : BLOCK_SIZE; - - if (row < num_rows) - { - for (IndexType i = 0; i < batch_size; ++i) + for (; i < batch_size; ++i) { IndexType col = offsets[i] + row; if (col >= 0 && col < num_cols) { - auto diag_val = load_diag_val(&values[ (offset_base + i)*pitch + row ]); + IndexType idx = (offset_base + i)*pitch + row; + + auto diag_val = load_diag_val(&values[idx]); auto x_val = load_x_val(&x[col]); sum += diag_val*x_val; @@ -404,10 +207,10 @@ naive_dia_kernel(const int num_rows, const int num_cols, const int num_diagonals, const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) + const IndexType* __restrict__ diagonal_offsets, + const ValueType1* __restrict__ values, + const ValueType2* __restrict__ x, + ValueType3* __restrict__ y) { const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; @@ -439,10 +242,10 @@ ktt_dia_vector_kernel(const int num_rows, const int num_cols, const int num_diagonals, const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) + const IndexType* __restrict__ diagonal_offsets, + const ValueType1* __restrict__ values, + const ValueType2 *__restrict__ x, + ValueType3* __restrict__ y) { blocked_offsets_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h index 3d89bd60..f71431a9 100644 --- a/cusp/system/cuda/ktt/kernels/ell_kernel.h +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -9,14 +9,10 @@ template __device__ T load(const T* addr) { -#if UNCACHED_LOADS == 0 +#if SPECIAL_LOADS == 0 return *addr; -#elif UNCACHED_LOADS == 1 +#elif SPECIAL_LOADS == 1 return __ldcv(addr); -#elif UNCACHED_LOADS == 2 - return __ldcs(addr); -#elif UNCACHED_LOADS == 3 - return __ldlu(addr); #endif } @@ -35,30 +31,30 @@ struct Prefetcher : Prefetcher __device__ void prefetch_cols(const IndexType* __restrict__ Aj, IndexType offset, - IndexType pitch, + IndexType stride, IndexType i) { - col = load(Aj + offset + i*pitch); - parent::prefetch_cols(Aj, offset, pitch, i + 1); + col = load(Aj + offset + i*stride); + parent::prefetch_cols(Aj, offset, stride, i + 1); } __device__ void prefetch_vals(const ValueType* __restrict__ Ax, const ValueType* __restrict__ x, IndexType offset, - IndexType pitch, + IndexType stride, IndexType i) { #ifndef ELLR if (col >= 0) { #endif - A_val = load(Ax + offset + i*pitch); + A_val = load(Ax + offset + i*stride); x_val = x[col]; #ifndef ELLR } #endif - parent::prefetch_vals(Ax, x, offset, pitch, i + 1); + parent::prefetch_vals(Ax, x, offset, stride, i + 1); } __device__ ValueType accumulate_results() @@ -73,12 +69,12 @@ struct Prefetcher { __device__ void prefetch_cols(const IndexType* __restrict__ Aj, IndexType offset, - IndexType pitch, IndexType i) { } + IndexType stride, IndexType i) { } __device__ void prefetch_vals(const ValueType* __restrict__ Ax, const ValueType* __restrict__ x, IndexType offset, - IndexType pitch, IndexType i) { } + IndexType stride, IndexType i) { } __device__ ValueType accumulate_results() { diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h index 59729596..3f79d14b 100644 --- a/cusp/system/cuda/ktt/multiply.h +++ b/cusp/system/cuda/ktt/multiply.h @@ -23,11 +23,14 @@ namespace ktt { // instead of having to manually specify the types. 
template -kernel_context get_kernel(::ktt::Tuner& tuner, - const MatrixType& A, - const cusp::array1d& x, - cusp::array1d& y) + typename ValueType2, + typename MemorySpace1, + typename MemorySpace2> +kernel_context get_kernel( + ::ktt::Tuner& tuner, + const MatrixType& A, + const cusp::array1d& x, + cusp::array1d& y) { using IndexType = typename MatrixType::index_type; using ValueType = typename MatrixType::value_type; @@ -50,8 +53,11 @@ kernel_context get_kernel(::ktt::Tuner& tuner, template -::ktt::KernelResult multiply(::ktt::Tuner& tuner, const MatrixType& A, - const VectorType1& x, VectorType2& y) +::ktt::KernelResult +multiply(::ktt::Tuner& tuner, + const MatrixType& A, + const VectorType1& x, + VectorType2& y) { const kernel_context& kernel = get_kernel(tuner, A, x, y); @@ -72,8 +78,11 @@ template ::ktt::KernelResult -multiply(::ktt::Tuner& tuner, const MatrixType& A, const VectorType1& x, - VectorType2& y, const ::ktt::KernelConfiguration& configuration, +multiply(::ktt::Tuner& tuner, + const MatrixType& A, + const VectorType1& x, + VectorType2& y, + const ::ktt::KernelConfiguration& configuration, bool run_with_profiling = false) { const kernel_context& kernel = get_kernel(tuner, A, x, y); @@ -95,8 +104,10 @@ template std::vector<::ktt::KernelResult> -tune(::ktt::Tuner& tuner, const MatrixType& A, - const VectorType1& x, VectorType2& y, +tune(::ktt::Tuner& tuner, + const MatrixType& A, + const VectorType1& x, + VectorType2& y, std::optional<::ktt::ReferenceComputation> ref_computation = std::nullopt) { using Format = typename MatrixType::format; diff --git a/cusp/system/cuda/ktt/utils.h b/cusp/system/cuda/ktt/utils.h index a3f4af1e..e773b461 100644 --- a/cusp/system/cuda/ktt/utils.h +++ b/cusp/system/cuda/ktt/utils.h @@ -92,7 +92,7 @@ inline void remove_arguments(const kernel_context& kernel, const std::vector<::k } -uint64_t get_parameter_uint(const ::ktt::KernelConfiguration& conf, +inline uint64_t get_parameter_uint(const ::ktt::KernelConfiguration& conf, const std::string& name) { for (const auto& pair : conf.GetPairs()) @@ -102,7 +102,7 @@ uint64_t get_parameter_uint(const ::ktt::KernelConfiguration& conf, throw std::runtime_error("No paramater with name: " + name); } -double get_parameter_double(const ::ktt::KernelConfiguration& conf, +inline double get_parameter_double(const ::ktt::KernelConfiguration& conf, const std::string& name) { for (const auto& pair : conf.GetPairs()) From b65e416f66d1715067f0facbf53b23ef881d53b9 Mon Sep 17 00:00:00 2001 From: Miroslav Demek Date: Mon, 15 May 2023 20:42:19 +0200 Subject: [PATCH 49/49] Remove warnings --- cusp/detail/temporary_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cusp/detail/temporary_array.h b/cusp/detail/temporary_array.h index 08d27908..8dbed6b0 100644 --- a/cusp/detail/temporary_array.h +++ b/cusp/detail/temporary_array.h @@ -28,7 +28,7 @@ #include #if THRUST_VERSION >= 100800 -#define TEMP_HOST_DEVICE_DECORATORS __host__ __device__ +#define TEMP_HOST_DEVICE_DECORATORS __host__ #else #define TEMP_HOST_DEVICE_DECORATORS #endif
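The capture-list fix in patch 46/49 deserves a word of explanation: get_launcher takes the profile flag by value and returns a lambda that the tuner invokes only later, so the original [&] default capture left the closure holding a reference to a parameter that had already been destroyed. A minimal sketch of the bug and of the [&, profile] fix, using hypothetical names rather than the real launcher signature:

#include <functional>
#include <iostream>

// Hypothetical stand-in for get_launcher: the returned callable outlives the
// call that created it, just as the KTT launcher lambda does.
std::function<void()> make_launcher(bool profile)
{
    // Buggy variant: [&] captures the parameter 'profile' by reference, and
    // that reference dangles as soon as make_launcher returns.
    //
    //   return [&] { std::cout << (profile ? "profiling run\n" : "plain run\n"); };

    // Fixed variant, mirroring the [&, profile] capture from patch 46: the
    // long-lived arguments may still be captured by reference, but the flag
    // is copied into the closure.
    return [&, profile] {
        std::cout << (profile ? "profiling run\n" : "plain run\n");
    };
}

int main()
{
    auto launcher = make_launcher(true);
    launcher();  // safe: reads the copied flag, not a dangling reference
    return 0;
}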
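Patch 47/49 documents the public entry points in cusp/ktt/ktt.h; a short usage sketch may still help. The snippet below assumes the build setup from build.sh (CUSP and KTT on the include and link paths) and uses cusp::gallery::poisson5pt only as a convenient way to obtain a test matrix; it is an illustration of the documented API, not code taken from the repository:

#include <cusp/array1d.h>
#include <cusp/dia_matrix.h>
#include <cusp/gallery/poisson.h>
#include <cusp/ktt/ktt.h>

#include <iostream>

int main()
{
    // A small 5-point Poisson matrix in DIA format, stored on the device.
    cusp::dia_matrix<int, float, cusp::device_memory> A;
    cusp::gallery::poisson5pt(A, 256, 256);

    cusp::array1d<float, cusp::device_memory> x(A.num_cols, 1.0f);
    cusp::array1d<float, cusp::device_memory> y(A.num_rows, 0.0f);

    // One step of dynamic autotuning: the tuner picks the next configuration
    // and runs the SpMV kernel with it (the returned ::ktt::KernelResult is
    // ignored here).
    cusp::ktt::multiply(A, x, y);

    // Offline autotuning over the whole configuration space.
    auto results = cusp::ktt::tune(A, x, y);
    std::cout << results.size() << " configurations tried\n";

    return 0;
}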
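The REPEAT / PREFETCH_VALS / ACCUMULATE macros introduced for the DIA kernel in patch 48/49 are easier to read once expanded. Inside the prefetch loop of blocked_offsets_dia_kernel, with PREFETCH_TYPE == 1 and PREFETCH_FACTOR == 2, the pair of calls "PREFETCH_VALS(2); ACCUMULATE(2);" expands roughly to the fragment below (the REPEAT chain emits iteration 2 before iteration 1; stray semicolons and the exact template arguments of the load helpers are omitted). This is only the preprocessed form of code already present in the kernel, shown to make the macro machinery concrete:

// iteration 2
IndexType col2 = offsets[i + 2 - 1] + row;            // i.e. offsets[i + 1] + row
ValueType1 diag_val2 = 0;
ValueType2 x_val2 = 0;
if (col2 >= 0 && col2 < num_cols)
{
    IndexType idx = (offset_base + i + 2 - 1)*pitch + row;
    diag_val2 = load_diag_val(&values[idx]);
    x_val2 = load_x_val(&x[col2]);
}

// iteration 1
IndexType col1 = offsets[i + 1 - 1] + row;            // i.e. offsets[i] + row
ValueType1 diag_val1 = 0;
ValueType2 x_val1 = 0;
if (col1 >= 0 && col1 < num_cols)
{
    IndexType idx = (offset_base + i + 1 - 1)*pitch + row;
    diag_val1 = load_diag_val(&values[idx]);
    x_val1 = load_x_val(&x[col1]);
}

// accumulation
sum += diag_val2 * x_val2;
sum += diag_val1 * x_val1;

The two-phase shape issues all loads for a batch of PREFETCH_FACTOR diagonals before any of the multiply-adds that consume them, which gives the compiler room to overlap the global-memory latency with independent work.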
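Patch 48/49 also marks get_parameter_uint and get_parameter_double in cusp/system/cuda/ktt/utils.h as inline. Because these are non-template functions defined in a header of a header-only library, every translation unit that includes utils.h would otherwise emit its own external definition, and linking two such units fails with multiple-definition errors; inline permits the duplicate definitions and lets the linker merge them. A stripped-down illustration with a hypothetical header and function:

// util.h (hypothetical)
#pragma once

// Without 'inline' this is an ordinary external definition: every source file
// that includes util.h emits its own copy of the symbol and the link fails
// with "multiple definition of answer()".
// With 'inline' the duplicate definitions are permitted and merged.
inline unsigned answer() { return 42; }

With two translation units, say a.cpp and b.cpp, both including util.h and calling answer(), the program links only in the inline form; the same situation arises as soon as more than one .cu file includes utils.h.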