diff --git a/.gitignore b/.gitignore index 769857d7..ff73601b 100644 --- a/.gitignore +++ b/.gitignore @@ -122,3 +122,4 @@ performance/spmm/spmm *.user .vscode +a.out diff --git a/build.sh b/build.sh index 4f53b52f..9809c63c 100644 --- a/build.sh +++ b/build.sh @@ -1,8 +1,8 @@ nvcc -DCUSP_PATH=$(realpath .) \ -I . -I ../KTT/Source \ - -l cuda -l ktt -L ../KTT/Build/x86_64_Debug/ \ - --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Debug/) \ - -std=c++17 -g -O3 -DKTT_LINE_INFO -lineinfo main.cu + -l cuda -l ktt -L ../KTT/Build/x86_64_Release/ \ + --linker-options=-rpath,$(realpath ../KTT/Build/x86_64_Release/) \ + -std=c++17 -g -O3 -lineinfo main.cu [ "$?" -eq "0" ] || exit 1 diff --git a/build/build-env.py b/build/build-env.py index fbd7f216..3d19bc53 100644 --- a/build/build-env.py +++ b/build/build-env.py @@ -277,6 +277,10 @@ def addKTT(env): env.Append(RPATH = [ ktt_lib_path ]) env.Append(LIBS = [ "ktt" ]) + if env['mode'] == 'debug': + env.Append(CFLAGS = ['-DKTT_LINE_INFO']) + env.Append(CXXFLAGS = ['-DKTT_LINE_INFO']) + def Environment(buildDir): # allow the user discretion to choose the MSVC version @@ -465,7 +469,7 @@ def Environment(buildDir): env['ENV']['DYLD_LIBRARY_PATH'] = os.environ['DYLD_LIBRARY_PATH'] elif 'LD_LIBRARY_PATH' in os.environ: env['ENV']['LD_LIBRARY_PATH'] = os.environ['LD_LIBRARY_PATH'] - + addKTT(env) # generate help text diff --git a/cusp/detail/temporary_array.h b/cusp/detail/temporary_array.h index 08d27908..8dbed6b0 100644 --- a/cusp/detail/temporary_array.h +++ b/cusp/detail/temporary_array.h @@ -28,7 +28,7 @@ #include #if THRUST_VERSION >= 100800 -#define TEMP_HOST_DEVICE_DECORATORS __host__ __device__ +#define TEMP_HOST_DEVICE_DECORATORS __host__ #else #define TEMP_HOST_DEVICE_DECORATORS #endif diff --git a/cusp/ktt/detail/ellr_matrix.inl b/cusp/ktt/detail/ellr_matrix.inl new file mode 100644 index 00000000..8a67949d --- /dev/null +++ b/cusp/ktt/detail/ellr_matrix.inl @@ -0,0 +1,137 @@ +#include +#include +#include + +#include +#include +#include // raw_pointer_cast + +namespace cusp +{ + +namespace ktt +{ + + +template +struct ell_row_length +{ + const IndexType* column_indices; + size_t num_cols_per_row; + size_t pitch; + + __host__ __device__ IndexType operator()(IndexType row_idx) + { + IndexType len = 0; + + while (len < num_cols_per_row + && column_indices[row_idx + len*pitch] >= 0) + { + len++; + } + + return len; + } +}; + +template +void +compute_row_lengths(cusp::ktt::ellr_matrix& A) +{ + thrust::counting_iterator row_idx_it(0); + + thrust::transform( + row_idx_it, + row_idx_it + A.num_rows, + A.row_lengths.begin(), + ell_row_length{ + thrust::raw_pointer_cast(&A.column_indices(0, 0)), + A.column_indices.num_cols, + A.column_indices.pitch } + ); +} + + +////////////////// +// Constructors // +////////////////// + +template +ellr_matrix +::ellr_matrix(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row, const size_t alignment) + : Parent(num_rows, num_cols, num_entries, num_entries_per_row, alignment) +{ + row_lengths.resize(num_rows); +} + +// construct from a different matrix +template +template +ellr_matrix +::ellr_matrix(const MatrixType& matrix) +{ + cusp::convert(matrix, *this); + update_row_lengths(); +} + +////////////////////// +// Member Functions // +////////////////////// + +template +void +ellr_matrix +::swap(ellr_matrix& matrix) +{ + Parent::swap(matrix); + row_lengths.swap(matrix.row_lengths); +} + +template +void +ellr_matrix +::resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row) +{ + Parent::resize(num_rows, num_cols, num_entries, num_entries_per_row); + row_lengths.resize(num_rows); +} + +template +void +ellr_matrix +::resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row, const size_t alignment) +{ + Parent::resize(num_rows, num_cols, num_entries, + num_entries_per_row, alignment); + row_lengths.resize(num_rows); +} + +// assignment from another matrix +template +template +ellr_matrix& +ellr_matrix +::operator=(const MatrixType& matrix) +{ + cusp::convert(matrix, *this); + update_row_lengths(); + + return *this; +} + +template +void ellr_matrix::update_row_lengths() +{ + if (row_lengths.size() != this->num_rows) + row_lengths.resize(this->num_rows); + + compute_row_lengths(*this); +} + + +} // namespace ktt + +} // namespace cusp diff --git a/cusp/ktt/detail/ktt.inl b/cusp/ktt/detail/ktt.inl index f2b3c745..63f97368 100644 --- a/cusp/ktt/detail/ktt.inl +++ b/cusp/ktt/detail/ktt.inl @@ -15,6 +15,7 @@ namespace ktt { namespace detail { + inline std::unique_ptr<::ktt::Tuner> tuner; inline bool is_enabled = true; @@ -41,9 +42,11 @@ inline void lazy_init() CUstream stream; cuStreamCreate(&stream, CU_STREAM_DEFAULT); - ::ktt::ComputeApiInitializer initializer(context, std::vector<::ktt::ComputeQueue>{ stream }); + std::vector<::ktt::ComputeQueue> queues = { stream }; + ::ktt::ComputeApiInitializer initializer(context, queues); - tuner = std::make_unique<::ktt::Tuner>(::ktt::ComputeApi::CUDA, initializer); + tuner = std::make_unique<::ktt::Tuner>(::ktt::ComputeApi::CUDA, + initializer); } std::string compiler_flags = "-std=c++17 "; @@ -51,7 +54,6 @@ inline void lazy_init() compiler_flags += "-lineinfo "; #endif tuner->SetCompilerOptions(compiler_flags); - tuner->SetValidationMode(::ktt::ValidationMode::OfflineTuning); std::atexit(cleanup); @@ -77,36 +79,43 @@ inline ::ktt::Tuner& get_tuner() } -template typename Matrix, + typename IndexType, typename ValueType1, - typename ValueType2> -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y) + typename ValueType2, + typename ValueType3> +::ktt::KernelResult multiply( + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y) { return cusp::system::cuda::ktt::multiply(get_tuner(), A, x, y); } -template typename Matrix, + typename IndexType, typename ValueType1, - typename ValueType2> -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& configuration, - bool run_with_profiling) + typename ValueType2, + typename ValueType3> +::ktt::KernelResult multiply( + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + const ::ktt::KernelConfiguration& configuration, + bool run_with_profiling) { return cusp::system::cuda::ktt::multiply(get_tuner(), A, x, y, configuration, run_with_profiling); } -template typename Matrix, + typename IndexType, typename ValueType1, typename ValueType2, typename ValueType3> std::vector<::ktt::KernelResult> -tune(const cusp::dia_matrix& A, +tune(const Matrix& A, const cusp::array1d& x, cusp::array1d& y, std::optional<::ktt::ReferenceComputation> reference_computation) @@ -114,33 +123,18 @@ tune(const cusp::dia_matrix& A, return cusp::system::cuda::ktt::tune(get_tuner(), A, x, y, reference_computation); } - -template -void reset_tuning() -{ - using namespace cusp::system::cuda::ktt; - - kernel_context kernel = get_kernel(get_tuner(), Format{}); - detail::tuner->ClearData(kernel.kernel_id); -} - - template + typename ValueType2, + typename MemorySpace1, + typename MemorySpace2> void reset_tuning(const MatrixType& A, - const cusp::array1d& x, - cusp::array1d& y) + const cusp::array1d& x, + cusp::array1d& y) { - using IndexType = typename MatrixType::index_type; - using ValueType = typename MatrixType::value_type; - using Format = typename MatrixType::format; - - return reset_tuning(); + auto& tuner = get_tuner(); + const auto& kernel = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + tuner.ClearData(kernel.kernel_id); } diff --git a/cusp/ktt/ellr_matrix.h b/cusp/ktt/ellr_matrix.h new file mode 100644 index 00000000..829f5dfc --- /dev/null +++ b/cusp/ktt/ellr_matrix.h @@ -0,0 +1,97 @@ +#pragma once + +#include + +#include + +#include + + +namespace cusp +{ + +namespace ktt +{ + + +template +class ellr_matrix : public cusp::ell_matrix +{ + + using Parent = cusp::ell_matrix; + +public: + + using row_lengths_array_type = cusp::array1d; + + row_lengths_array_type row_lengths; + + /*! Construct an empty \p ellr_matrix. + */ + ellr_matrix(void) {} + + /*! Construct an \p ellr_matrix with a specific shape, number of nonzero entries, + * and maximum number of nonzero entries per row. + * + * \param num_rows Number of rows. + * \param num_cols Number of columns. + * \param num_entries Number of nonzero matrix entries. + * \param num_entries_per_row Maximum number of nonzeros per row. + * \param alignment Amount of padding used to align the data structure (default 32). + */ + ellr_matrix(const size_t num_rows, const size_t num_cols, + const size_t num_entries, const size_t num_entries_per_row, + const size_t alignment = 32); + + /*! Construct an \p ellr_matrix from another matrix. + * + * \param matrix Another sparse or dense matrix. + */ + template + ellr_matrix(const MatrixType& matrix); + + /*! Resize matrix dimensions and underlying storage + * + * \param num_rows Number of rows. + * \param num_cols Number of columns. + * \param num_entries Number of nonzero matrix entries. + * \param num_entries_per_row Maximum number of nonzeros per row. + */ + void resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row); + + /*! Resize matrix dimensions and underlying storage + * + * \param num_rows Number of rows. + * \param num_cols Number of columns. + * \param num_entries Number of nonzero matrix entries. + * \param num_entries_per_row Maximum number of nonzeros per row. + * \param alignment Amount of padding used to align the data structure (default 32). + */ + void resize(const size_t num_rows, const size_t num_cols, const size_t num_entries, + const size_t num_entries_per_row, const size_t alignment); + + /*! Swap the contents of two \p ellr_matrix objects. + * + * \param matrix Another \p ellr_matrix with the same IndexType and ValueType. + */ + void swap(ellr_matrix& matrix); + + /*! Assignment from another matrix. + * + * \tparam MatrixType Format type of input matrix. + * + * \param matrix Another sparse or dense matrix. + */ + template + ellr_matrix& operator=(const MatrixType& matrix); + + void update_row_lengths(); +}; + + +} // namespace ktt + +} // namespace cusp + +#include diff --git a/cusp/ktt/ktt.h b/cusp/ktt/ktt.h index cab2d637..37c89de5 100644 --- a/cusp/ktt/ktt.h +++ b/cusp/ktt/ktt.h @@ -1,9 +1,10 @@ #pragma once -#include - +#include #include +#include + namespace cusp { namespace ktt { @@ -14,93 +15,110 @@ inline void enable(); inline ::ktt::Tuner& get_tuner(); + /** * @brief Perform the multiplication y = A*x using an autotuned kernel. + * This function performs a single step of dynamic autotuning. * * @tparam Matrix The type of the sparse matrix to use. - * @tparam ValueType1 The value type of the x vector. - * @tparam ValueType2 The value type of the y vector. + * @tparam IndexType The type used for indices. + * @tparam ValueType1 The value type of the matrix. + * @tparam ValueType2 The value type of the x vector. + * @tparam ValueType3 The value type of the y vector. * * @param A The matrix. * @param x The input vector. * @param y The output vector. */ -template typename Matrix, + typename IndexType, typename ValueType1, - typename ValueType2> -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y); + typename ValueType2, + typename ValueType3> +::ktt::KernelResult multiply( + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y); + -// NOTE: Should we have this? /** * @brief Perform the multiplication y = A*x without autotuning, * instead just run the kernel in the given configuration. * * @tparam Matrix The type of the sparse matrix to use. - * @tparam ValueType1 The value type of the x vector. - * @tparam ValueType2 The value type of the y vector. + * @tparam IndexType The type used for indices. + * @tparam ValueType1 The value type of the matrix. + * @tparam ValueType2 The value type of the x vector. + * @tparam ValueType3 The value type of the y vector. * - * @param A The matrix. - * @param x The input vector. - * @param y The output vector. + * @param A The matrix. + * @param x The input vector. + * @param y The output vector. + * @param configuration The configuration of the kernel. + * @param run_with_profiling Flag used to toggle profiling. */ -template -::ktt::KernelResult multiply(const Matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& configuration, - bool run_with_profiling = false); - -template typename Matrix, + typename IndexType, typename ValueType1, typename ValueType2, typename ValueType3> -std::vector<::ktt::KernelResult> -tune(const cusp::dia_matrix& A, +::ktt::KernelResult multiply( + const Matrix& A, const cusp::array1d& x, cusp::array1d& y, - std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt); + const ::ktt::KernelConfiguration& configuration, + bool run_with_profiling = false); + /** - * TODO: Add some documentation. + * @brief Perform offline autotuning on the specified matrix and vectors. * - * @brief + * @tparam Matrix The type of the sparse matrix to use. + * @tparam IndexType The type used for indices. + * @tparam ValueType1 The value type of the matrix. + * @tparam ValueType2 The value type of the x vector. + * @tparam ValueType3 The value type of the y vector. * - * @tparam IndexType - * @tparam ValueType1 - * @tparam ValueType2 - * @tparam ValueType3 - * @tparam Format + * @param A The matrix. + * @param x The input vector. + * @param y The output vector. + * @param reference_computation The reference computation that computes + the correct result. */ -template typename Matrix, + typename IndexType, typename ValueType1, typename ValueType2, - typename ValueType3, - typename Format> -void reset_tuning(); + typename ValueType3> +std::vector<::ktt::KernelResult> +tune(const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt); + /** - * TODO: Add some documentation. - * - * @brief + * @brief Resets the tuning for the kernel instantiation used for the given arguments. * - * @tparam MatrixType - * @tparam ValueType1 - * @tparam ValueType2 + * @tparam MatrixType Type of the sparse matrix. + * @tparam ValueType1 The value type of the x vector + * @tparam ValueType2 The value type of the y vector. + * @tparam MemorySpace1 The memory space of the x vector. + * @tparam MemorySpace2 The memory space of the y vector. * - * @param A - * @param x - * @param y + * @param A The matrix. + * @param x The input vector. + * @param y The output vector. */ template + typename ValueType2, + typename MemorySpace1, + typename MemorySpace2> void reset_tuning(const MatrixType& A, - const cusp::array1d& x, - cusp::array1d& y); + const cusp::array1d& x, + cusp::array1d& y); + } // namespace ktt diff --git a/cusp/ktt/matrix_generation.h b/cusp/ktt/matrix_generation.h index 6c5e0b15..712e0b55 100644 --- a/cusp/ktt/matrix_generation.h +++ b/cusp/ktt/matrix_generation.h @@ -3,11 +3,60 @@ #include #include // min +#include +#include namespace cusp { namespace ktt { + +cusp::dia_matrix +make_diagonal_matrix(int rows, int cols, const std::vector& diag_offsets) +{ + using DiaMatrix = cusp::dia_matrix; + + int diagonal_count = diag_offsets.size(); + + DiaMatrix::diagonal_offsets_array_type offsets; + offsets.reserve(diagonal_count); + + DiaMatrix::values_array_type values(rows, diagonal_count, 0); + + size_t num_entries = 0; + for (int i = 0; i < diagonal_count; ++i) { + int offset = diag_offsets[i]; + + int starting_row = offset < 0 ? -offset : 0; + int starting_col = offset < 0 ? 0 : offset; + + if (starting_row >= rows || starting_col >= cols) + throw std::runtime_error( + "make_diagonal_matrix: Diagonal out of bounds."); + + offsets.push_back(offset); + + int ending_row = starting_row + std::min(rows - starting_row, cols - starting_col); + + for (int row = starting_row; row < ending_row; ++row) { + num_entries++; + values(row, i) = 1; + } + } + + DiaMatrix res; + + res.num_rows = rows; + res.num_cols = cols; + res.num_entries = num_entries; + + res.diagonal_offsets.swap(offsets); + res.values.swap(values); + + return res; +} + + /** * @brief Create a symetric matrix with given number of diagonals filled with ones. * @@ -15,8 +64,8 @@ namespace ktt { * on both sides of it. That means the matrix is actually symetric only for * odd numbers of diagonals. * - * If the number of diagonals is greater then would fit into the matrix, - * the extra diagonals are just discarded. + * If the number of diagonals is greater then would fit into the matrix + * an exception is thrown. * * @param rows The number of rows of the resulting matrix. * @param cols The number of columns of the resulting matrix. @@ -31,45 +80,28 @@ make_diagonal_symmetric_matrix(int rows, int offset_step, int diagonal_count) { - using DiaMatrix = cusp::dia_matrix; - - DiaMatrix::diagonal_offsets_array_type offsets; + std::vector offsets; offsets.reserve(diagonal_count); - DiaMatrix::values_array_type values(rows, diagonal_count, 0); - int starting_offset = -offset_step * diagonal_count/2; - size_t num_entries = 0; for (int i = 0; i < diagonal_count; ++i) { int offset = starting_offset + offset_step*i; - offsets.push_back(offset); int starting_row = offset < 0 ? -offset : 0; int starting_col = offset < 0 ? 0 : offset; - int ending_row = starting_row >= rows || starting_col >= cols - ? 0 - : starting_row + std::min(rows - starting_row, cols - starting_col); + if (starting_row >= rows || starting_col >= cols) + throw std::runtime_error( + "make_diagonal_symmetric_matrix: Too many diagonals."); - for (int row = starting_row; row < ending_row; ++row) { - num_entries++; - values(row, i) = 1; - } + offsets.push_back(offset); } - DiaMatrix res; - - res.num_rows = rows; - res.num_cols = cols; - res.num_entries = num_entries; - - res.diagonal_offsets.swap(offsets); - res.values.swap(values); - - return res; + return make_diagonal_matrix(rows, cols, offsets); } + } // namespace ktt } // namespace cusp diff --git a/cusp/system/cuda/detail/multiply/csr_vector_spmv.h b/cusp/system/cuda/detail/multiply/csr_vector_spmv.h index 7bf4b98d..6f9e2c5c 100644 --- a/cusp/system/cuda/detail/multiply/csr_vector_spmv.h +++ b/cusp/system/cuda/detail/multiply/csr_vector_spmv.h @@ -24,6 +24,13 @@ #include +#ifdef TIME_CSR +#include +#include + +namespace krn = std::chrono; +#endif + namespace cusp { namespace system @@ -190,11 +197,22 @@ void __spmv_csr_vector(cuda::execution_policy& exec, cudaStream_t s = stream(thrust::detail::derived_cast(exec)); + #ifdef TIME_CSR + auto start = krn::steady_clock::now(); + #endif + spmv_csr_vector_kernel <<>> (A.num_rows, A.row_offsets.begin(), A.column_indices.begin(), A.values.begin(), x.begin(), y.begin(), initialize, combine, reduce); + + #ifdef TIME_CSR + cudaStreamSynchronize(s); + auto end = krn::steady_clock::now(); + size_t time = krn::duration_cast(end - start).count(); + std::cout << "cusp csr kernel: " << time << " us\n"; + #endif } template +#ifdef TIME_DIA +#include +#include + +namespace krn = std::chrono; +#endif + namespace cusp { namespace system @@ -165,8 +172,19 @@ void multiply(cuda::execution_policy& exec, cudaStream_t s = stream(thrust::detail::derived_cast(exec)); +#ifdef TIME_DIA + auto start = krn::steady_clock::now(); +#endif + spmv_dia_kernel <<>> (A.num_rows, A.num_cols, num_diagonals, pitch, A.diagonal_offsets.begin(), A.values.values.begin(), x.begin(), y.begin(), initialize, combine, reduce); + +#ifdef TIME_DIA + cudaStreamSynchronize(s); + auto end = krn::steady_clock::now(); + size_t time = krn::duration_cast(end - start).count(); + std::cout << "cusp dia kernel: " << time << " us\n"; +#endif } } // end namespace detail diff --git a/cusp/system/cuda/detail/multiply/ell_spmv.h b/cusp/system/cuda/detail/multiply/ell_spmv.h index 2a5eab78..18bdecf5 100644 --- a/cusp/system/cuda/detail/multiply/ell_spmv.h +++ b/cusp/system/cuda/detail/multiply/ell_spmv.h @@ -28,6 +28,13 @@ #include #include +#ifdef TIME_ELL +#include +#include + +namespace krn = std::chrono; +#endif + namespace cusp { namespace system @@ -132,8 +139,19 @@ void multiply(cuda::execution_policy& exec, cudaStream_t s = stream(thrust::detail::derived_cast(exec)); +#ifdef TIME_ELL + auto start = krn::steady_clock::now(); +#endif + spmv_ell_kernel <<>> (A.num_rows, A.num_cols, num_entries_per_row, pitch, J, V, x_ptr, y_ptr, initialize, combine, reduce); + +#ifdef TIME_ELL + cudaStreamSynchronize(s); + auto end = krn::steady_clock::now(); + size_t time = krn::duration_cast(end - start).count(); + std::cout << "cusp ell kernel: " << time << " us\n"; +#endif } } // end namespace detail diff --git a/cusp/system/cuda/detail/synchronize.inl b/cusp/system/cuda/detail/synchronize.inl index 8256398e..73919e38 100644 --- a/cusp/system/cuda/detail/synchronize.inl +++ b/cusp/system/cuda/detail/synchronize.inl @@ -32,7 +32,7 @@ namespace detail void synchronize(const char *message) { - cudaError_t error = cudaThreadSynchronize(); + cudaError_t error = cudaDeviceSynchronize(); if(error) { throw thrust::system_error(error, thrust::cuda_category(), std::string("synchronize: ") + message); diff --git a/cusp/system/cuda/ktt/csr_multiply.h b/cusp/system/cuda/ktt/csr_multiply.h index 0c0deec8..d35a2fd8 100644 --- a/cusp/system/cuda/ktt/csr_multiply.h +++ b/cusp/system/cuda/ktt/csr_multiply.h @@ -1,25 +1,18 @@ -#pragma once +#pragma once -#include +#include #include -#include -#include // max_active_blocks - -#include #include -namespace cusp::system::cuda::ktt::csr { - inline ::ktt::KernelId get_kernel_id(::ktt::KernelDefinitionId id, std::string kernel_name) { - static ::ktt::KernelId kernel = cusp::ktt::detail::tuner->CreateSimpleKernel(kernel_name, id); - return kernel; - } -} +#include +#include +#include + +#include + +#include -template<> -inline ::ktt::KernelId cusp::ktt::detail::get_kernel_id() { - return cusp::system::cuda::ktt::csr::get_kernel_id(0, ""); -} namespace cusp { @@ -31,109 +24,148 @@ namespace ktt { namespace csr { -inline void setup_tuning_parameters(::ktt::Tuner& tuner, ::ktt::KernelId kernel) { - tuner.AddParameter(kernel, "TEST_PARAM", std::vector{0, 1, 2}); + +inline void setup_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + tuner.AddParameter(kernel_id, "BLOCK_SIZE", std::vector{ 128 }); + tuner.AddParameter(kernel_id, "THREADS_PER_VECTOR", std::vector{ 32 }); + + tuner.AddThreadModifier( + kernel.kernel_id, + { kernel.definition_ids[0] }, + ::ktt::ModifierType::Local, + ::ktt::ModifierDimension::X, + { std::string("BLOCK_SIZE") }, + [] (const uint64_t defaultSize, const std::vector& parameters) { + return parameters[0]; + } + ); } -} // namespace csr -template -void multiply(cuda::execution_policy& exec, - const MatrixType& A, - const VectorType1& x, - VectorType2& y, - cusp::csr_format) +template +kernel_context initialize_kernel(::ktt::Tuner& tuner) { - static_assert( std::is_same_v - && std::is_same_v - && std::is_same_v, - "All arguments must be in device memory." ); + std::string kernel_path = + STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/csr_kernel.h"; - using ValueType = typename MatrixType::value_type; + kernel_context kernel(tuner); - if (A.num_entries == 0) { - thrust::fill(y.begin(), y.end(), ValueType(0)); - return; - } + std::vector< std::string > type_names { + std::string(NAMEOF_TYPE(IndexType)), + std::string(NAMEOF_TYPE(ValueType1)), + std::string(NAMEOF_TYPE(ValueType2)), + std::string(NAMEOF_TYPE(ValueType3)), + }; - std::cout << "Hello from out thing\n"; + // NOTE: These can be anything since they are awlays set in the launcher. + // So use some values that will hopefully cause a crash if not set properly + // in the launcher. + ::ktt::DimensionVector block_size(0); + ::ktt::DimensionVector grid_size(0); - // copied from csr_vector_spmv.h - typedef typename MatrixType::row_offsets_array_type::const_iterator RowIterator; - typedef typename MatrixType::column_indices_array_type::const_iterator ColumnIterator; - typedef typename MatrixType::values_array_type::const_iterator ValueIterator1; + auto definition_id = tuner.AddKernelDefinitionFromFile( + "ktt_csr_vector_kernel", + kernel_path, + grid_size, + block_size, + type_names + ); - typedef typename VectorType1::const_iterator ValueIterator2; - typedef typename VectorType2::iterator ValueIterator3; + kernel.definition_ids.push_back(definition_id); + kernel.kernel_id = tuner.CreateSimpleKernel("CsrKernel", definition_id); - auto& tuner = *cusp::ktt::detail::tuner; + setup_tuning_parameters(kernel); - std::string path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/csr_kernel.h"; + return kernel; +} - const size_t THREADS_PER_BLOCK = 128; - const size_t THREADS_PER_VECTOR = 32; - const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR; - - std::vector< std::string > type_names { - std::string(NAMEOF_TYPE(typename MatrixType::index_type)), - std::string(NAMEOF_TYPE(typename MatrixType::value_type)), - std::string(NAMEOF_TYPE(typename VectorType1::value_type)), - std::string(NAMEOF_TYPE(typename VectorType2::value_type)), - std::to_string(VECTORS_PER_BLOCK), - std::to_string(THREADS_PER_VECTOR) - }; - // TODO: we probably wanna change this later - const size_t MAX_BLOCKS = cusp::system::cuda::detail::max_active_blocks( - ktt_csr_vector_kernel, THREADS_PER_BLOCK, (size_t) 0); - const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, VECTORS_PER_BLOCK)); +} // namespace csr + + +template +const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::csr_format format) +{ + static kernel_context kernel = + csr::initialize_kernel(tuner); + + return kernel; +} + - const ::ktt::DimensionVector blockDimensions(THREADS_PER_BLOCK); - const ::ktt::DimensionVector gridDimensions(NUM_BLOCKS); +template +std::vector<::ktt::ArgumentId> +add_arguments(const kernel_context& kernel, + const cusp::csr_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + auto args = add_arguments(*kernel.tuner, + A.num_rows, A.row_offsets, A.column_indices, + A.values, x, y); - ::ktt::KernelDefinitionId definition = tuner.GetKernelDefinitionId("ktt_csr_vector_kernel", type_names); + kernel.tuner->SetArguments(kernel.definition_ids[0], args); - bool called_first_time = definition == ::ktt::InvalidKernelDefinitionId; + return args; +} - if (called_first_time) { - definition = tuner.AddKernelDefinitionFromFile( - "ktt_csr_vector_kernel", - path, - gridDimensions, - blockDimensions, - type_names); - } - ::ktt::KernelId kernel = csr::get_kernel_id(definition, "CsrKernel"); +// Returns the argument id of the y vector given a vector of arguments +// returned by a previous call of `add_arguments`. +inline ::ktt::ArgumentId +get_output_argument(const std::vector<::ktt::ArgumentId>& arguments, + cusp::csr_format) +{ + return arguments[5]; +} - if (called_first_time) { - csr::setup_tuning_parameters(tuner, kernel); - } - auto num_rows_id = tuner.AddArgumentScalar(A.num_rows); - auto Ap_id = add_arg(A.row_offsets); - auto Aj_id = add_arg(A.column_indices); - auto Ax_id = add_arg(A.values); - auto x_id = add_arg(x); - auto y_id = add_arg(y, ::ktt::ArgumentAccessType::ReadWrite); +template +auto get_launcher(const kernel_context& ctx, + const cusp::csr_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + bool profile = false) +{ + return [&, profile] (::ktt::ComputeInterface& interface) + { + auto conf = interface.GetCurrentConfiguration(); - std::vector<::ktt::ArgumentId> args = { num_rows_id, Ap_id, Aj_id, Ax_id, x_id, y_id }; + ::ktt::DimensionVector block_size = + interface.GetCurrentLocalSize(ctx.definition_ids[0]); - tuner.SetArguments(definition, args); + auto threads_per_vector = get_parameter_uint(conf, "THREADS_PER_VECTOR"); + auto vectors_per_block = block_size.GetSizeX() / threads_per_vector; - tuner.TuneIteration(kernel, {}); + ::ktt::DimensionVector grid_size( + DIVIDE_INTO(A.num_rows, vectors_per_block)); - tuner.SetArguments(definition, {}); - for (auto arg : args) { - tuner.RemoveArgument(arg); - } + if (!profile) { + interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); + } else { + interface.RunKernelWithProfiling(ctx.definition_ids[0], + grid_size, block_size); + } + }; } + } // namespace ktt } // namespace cuda diff --git a/cusp/system/cuda/ktt/dia_multiply.h b/cusp/system/cuda/ktt/dia_multiply.h index 6a7f31b2..8380b6f2 100644 --- a/cusp/system/cuda/ktt/dia_multiply.h +++ b/cusp/system/cuda/ktt/dia_multiply.h @@ -1,14 +1,14 @@ #pragma once -#include -#include // max_active_blocks +#include -#include #include #include +#include + +#include +#include -#include -#include namespace cusp { @@ -20,36 +20,64 @@ namespace ktt { namespace dia { -inline void setup_tuning_parameters(::ktt::Tuner& tuner, const kernel_context& kernel) + +inline void setup_tuning_parameters(const kernel_context& kernel) { - tuner.AddParameter(kernel.kernel_id, "KERNEL_TYPE", std::vector{ 0, 1, 2 }); + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); + tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", + u64_vec{ 0, 2, 3, 4 }); + tuner.AddParameter(kernel_id, "PREFETCH_TYPE", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "SPECIAL_LOADS", u64_vec{ 0, 1 }); + + + tuner.AddThreadModifier( + kernel.kernel_id, + { kernel.definition_ids[0] }, + ::ktt::ModifierType::Local, + ::ktt::ModifierDimension::X, + { std::string("BLOCK_SIZE") }, + [] (const uint64_t defaultSize, const u64_vec& parameters) { + return parameters[0]; + }); + + + // Different register prefetching implementations are used + // only when register prefetching is actually enabled + tuner.AddConstraint( + kernel.kernel_id, + { "PREFETCH_FACTOR", "PREFETCH_TYPE" }, + [] (const std::vector& values) { + return values[0] > 0 || values[1] == 0; + }); } -} // namespace dia - -template -kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) +template +kernel_context initialize_kernel(::ktt::Tuner& tuner) { kernel_context kernel(tuner); - std::string kernel_path = STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h"; - - const size_t BLOCK_SIZE = 256; - const size_t MAX_BLOCKS = cusp::system::cuda::detail::max_active_blocks( - ktt_dia_vector_kernel, - BLOCK_SIZE, 0); + std::string kernel_path = + STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/dia_kernel.h"; std::vector< std::string > type_names { std::string(NAMEOF_TYPE(IndexType)), std::string(NAMEOF_TYPE(ValueType1)), std::string(NAMEOF_TYPE(ValueType2)), - std::string(NAMEOF_TYPE(ValueType3)), - std::to_string(BLOCK_SIZE) + std::string(NAMEOF_TYPE(ValueType3)) }; - const ::ktt::DimensionVector blockDimensions(BLOCK_SIZE); - const ::ktt::DimensionVector gridDimensions(MAX_BLOCKS); + // NOTE: These can be anything since they are awlays set in the launcher. + // So use some values that will hopefully cause a crash if not set properly + // in the launcher. + const ::ktt::DimensionVector blockDimensions(0); + const ::ktt::DimensionVector gridDimensions(0); auto definition_id = tuner.AddKernelDefinitionFromFile( "ktt_dia_vector_kernel", @@ -61,18 +89,28 @@ kernel_context initialize_kernel(::ktt::Tuner& tuner, cusp::dia_format) kernel.definition_ids.push_back(definition_id); kernel.kernel_id = tuner.CreateSimpleKernel("DiaKernel", definition_id); - dia::setup_tuning_parameters(tuner, kernel); + dia::setup_tuning_parameters(kernel); return kernel; } + +} // namespace dia + + template -const kernel_context& get_kernel(::ktt::Tuner& tuner, cusp::dia_format format) + typename ValueType3, + typename MemorySpace> +const kernel_context& get_kernel( + ::ktt::Tuner& tuner, + const cusp::dia_matrix& A) { - static kernel_context kernel = initialize_kernel(tuner, format); + constexpr auto init_kernel = + dia::initialize_kernel; + + static kernel_context kernel = init_kernel(tuner); return kernel; } @@ -81,138 +119,61 @@ template std::vector<::ktt::ArgumentId> add_arguments( - const kernel_context& kernel, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y) + const kernel_context& kernel, + const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) { const IndexType num_diagonals = A.values.num_cols; - const IndexType pitch = A.values.pitch; // length of one diagonal + possible padding + const IndexType pitch = A.values.pitch; + + auto args = + add_arguments(*kernel.tuner, A.num_rows, A.num_cols, num_diagonals, + pitch, A.diagonal_offsets, A.values.values, x, y); - auto args = add_arguments(*kernel.tuner, A.num_rows, A.num_cols, num_diagonals, pitch, A.diagonal_offsets, A.values.values, x, y); kernel.tuner->SetArguments(kernel.definition_ids[0], args); return args; } -// Returns the argument id of the y vector given a vector of arguments returned by a previous -// call of `add_arguments`. -inline ::ktt::ArgumentId get_output_argument(const std::vector<::ktt::ArgumentId>& arguments, cusp::dia_format) { - return arguments[7]; -} - -auto get_launcher(const kernel_context& ctx, - size_t num_rows, - size_t num_cols, - bool profile = false) +// Returns the argument id of the y vector given a vector of arguments +// returned by a previous call of `add_arguments`. +inline ::ktt::ArgumentId get_output_argument( + const std::vector<::ktt::ArgumentId>& arguments, + cusp::dia_format) { - return [=] (::ktt::ComputeInterface& interface) { - ::ktt::DimensionVector block_size = interface.GetCurrentLocalSize(ctx.definition_ids[0]); - ::ktt::DimensionVector grid_size( DIVIDE_INTO(num_rows, block_size.GetSizeX()) ); - - if (!profile) { - interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); - } else { - do { - interface.RunKernelWithProfiling(ctx.definition_ids[0], grid_size, block_size); - } while(interface.GetRemainingProfilingRuns(ctx.definition_ids[0]) > 0); - } - }; + return arguments[7]; } -template -::ktt::KernelResult multiply(::ktt::Tuner& tuner, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y) -{ - if (A.num_entries == 0) { - thrust::fill(y.begin(), y.end(), ValueType3(0)); - ::ktt::KernelResult result("DiaKernel", {}); - result.SetStatus(::ktt::ResultStatus::Ok); - return result; - } - - const kernel_context& kernel = get_kernel(tuner, cusp::dia_format{}); - auto args = add_arguments(kernel, A, x, y); - - tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols)); - - auto res = tuner.TuneIteration(kernel.kernel_id, {}); - remove_arguments(kernel, args); - - return res; -} template -::ktt::KernelResult multiply(::ktt::Tuner& tuner, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - const ::ktt::KernelConfiguration& configuration, - bool run_with_profiling = false) +auto get_launcher( + const kernel_context& ctx, + const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + bool profile = false) { - if (A.num_entries == 0) { - thrust::fill(y.begin(), y.end(), ValueType3(0)); - ::ktt::KernelResult result("DiaKernel", {}); - result.SetStatus(::ktt::ResultStatus::Ok); - return result; - } - - kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); - auto args = add_arguments(kernel, A, x, y); - - tuner.SetLauncher(kernel.kernel_id, get_launcher(kernel, A.num_rows, A.num_cols, run_with_profiling)); - - auto result = tuner.Run(kernel.kernel_id, configuration, {}); - remove_arguments(kernel, args); + return [&, profile] (::ktt::ComputeInterface& interface) + { + ::ktt::DimensionVector block_size = + interface.GetCurrentLocalSize(ctx.definition_ids[0]); + ::ktt::DimensionVector grid_size( + DIVIDE_INTO(A.num_rows, block_size.GetSizeX())); - return result; + if (!profile) { + interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); + } else { + interface.RunKernelWithProfiling(ctx.definition_ids[0], + grid_size, block_size); + } + }; } -template -std::vector<::ktt::KernelResult> -tune(::ktt::Tuner& tuner, - const cusp::dia_matrix& A, - const cusp::array1d& x, - cusp::array1d& y, - std::optional<::ktt::ReferenceComputation> reference_computation = std::nullopt) -{ - if (A.num_entries == 0) { - return {}; - } - - kernel_context kernel = get_kernel(tuner, cusp::dia_format{}); - auto args = add_arguments(kernel, A, x, y); - - if (reference_computation) { - tuner.SetReferenceComputation(get_output_argument(args, dia_format{}), reference_computation.value()); - } - - cusp::array1d host_y = y; - tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface) { - // clear y so previous results don't affect validation - y = host_y; - auto launcher = cusp::system::cuda::ktt::get_launcher(kernel, A.num_rows, A.num_cols); - launcher(interface); - }); - - auto results = tuner.Tune(kernel.kernel_id); - - remove_arguments(kernel, args); - - return results; -} } // namespace ktt diff --git a/cusp/system/cuda/ktt/ell_multiply.h b/cusp/system/cuda/ktt/ell_multiply.h new file mode 100644 index 00000000..d4ff4703 --- /dev/null +++ b/cusp/system/cuda/ktt/ell_multiply.h @@ -0,0 +1,264 @@ +#pragma once + +#include + +#include +#include +#include + +namespace cusp { + +namespace system { + +namespace cuda { + +namespace ktt { + +namespace ell { + + +inline void setup_common_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + tuner.AddParameter(kernel_id, "BLOCK_SIZE", u64_vec{ 128, 256, 512 }); + tuner.AddParameter(kernel_id, "SPECIAL_LOADS", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "DISABLE_UNROLL", u64_vec{ 0, 1 }); + tuner.AddParameter(kernel_id, "PREFETCH_FACTOR", u64_vec{ 0, 2, 4 }); + tuner.AddParameter(kernel_id, "THREADS_PER_ROW", u64_vec{ 1, 2, 4, 8 }); + + tuner.AddThreadModifier( + kernel_id, + { kernel.definition_ids[0] }, + ::ktt::ModifierType::Local, + ::ktt::ModifierDimension::X, + { std::string("BLOCK_SIZE") }, + [] (const uint64_t defaultSize, const std::vector& parameters) { + return parameters[0]; + } + ); + + // All the warps working on one stripe of the matrix fit in a thread block. + tuner.AddConstraint( + kernel.kernel_id, + { "THREADS_PER_ROW", "BLOCK_SIZE" }, + [] (const std::vector& values) { + return 32*values[0] <= values[1]; + }); +} + +inline void setup_ell_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + setup_common_tuning_parameters(kernel); + + tuner.AddParameter(kernel_id, "BREAK", u64_vec{ 0, 1 }); + + // BREAK can be used only when there is no prefetching + tuner.AddConstraint( + kernel.kernel_id, + { "BREAK", "PREFETCH_FACTOR" }, + [] (const std::vector& values) { + return values[0] == 0 || values[1] == 0; + }); +} + +inline void setup_ellr_tuning_parameters(const kernel_context& kernel) +{ + auto& tuner = *kernel.tuner; + auto kernel_id = kernel.kernel_id; + + setup_common_tuning_parameters(kernel); + + tuner.AddParameter(kernel_id, "ELLR", u64_vec{ 1 }); +} + + +template +kernel_context initialize_kernel(::ktt::Tuner& tuner, + const std::string& kernel_function, + const std::string& kernel_name, + TuningParamsInit init) +{ + std::string kernel_path = + STRING(CUSP_PATH) "/cusp/system/cuda/ktt/kernels/ell_kernel.h"; + + kernel_context kernel(tuner); + + std::vector< std::string > type_names { + std::string(NAMEOF_TYPE(IndexType)), + std::string(NAMEOF_TYPE(ValueType1)) + }; + + // NOTE: These can be anything since they are awlays set in the launcher. + // So use some values that will hopefully cause a crash if not set properly + // in the launcher. + ::ktt::DimensionVector block_size(0); + ::ktt::DimensionVector grid_size(0); + + auto definition_id = tuner.AddKernelDefinitionFromFile( + kernel_function, + kernel_path, + grid_size, + block_size, + type_names + ); + + kernel.definition_ids.push_back(definition_id); + kernel.kernel_id = tuner.CreateSimpleKernel(kernel_name, definition_id); + + init(kernel); + + return kernel; +} + + +} // namespace ell + + +template +const kernel_context& get_kernel( + ::ktt::Tuner& tuner, + const cusp::ell_matrix& A) +{ + static kernel_context kernel = + ell::initialize_kernel( + tuner, "ktt_ell_kernel", "EllKernel", + ell::setup_ell_tuning_parameters); + + return kernel; +} + +template +const kernel_context& get_kernel( + ::ktt::Tuner& tuner, + const cusp::ktt::ellr_matrix& A) +{ + static kernel_context kernel = + ell::initialize_kernel( + tuner, "ktt_ellr_kernel", "EllrKernel", + ell::setup_ellr_tuning_parameters); + + return kernel; +} + + +template +std::vector<::ktt::ArgumentId> +add_common_arguments( + const kernel_context& kernel, + const cusp::ell_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + IndexType pitch = A.column_indices.pitch; + IndexType num_entries_per_row = A.column_indices.num_cols; + + return add_arguments(*kernel.tuner, A.num_rows, A.num_cols, + num_entries_per_row, pitch, A.column_indices.values, + A.values.values, x, y); +} + +template +std::vector<::ktt::ArgumentId> +add_arguments(const kernel_context& kernel, + const cusp::ell_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + auto args = add_common_arguments(kernel, A, x, y); + kernel.tuner->SetArguments(kernel.definition_ids[0], args); + return args; +} + +template +std::vector<::ktt::ArgumentId> +add_arguments( + const kernel_context& kernel, + const cusp::ktt::ellr_matrix& A, + const cusp::array1d& x, + cusp::array1d& y) +{ + auto args = add_common_arguments(kernel, A, x, y); + auto aditional_args = add_arguments(*kernel.tuner, A.row_lengths); + + args.push_back(aditional_args[0]); + + kernel.tuner->SetArguments(kernel.definition_ids[0], args); + + return args; +} + + +// Returns the argument id of the y vector given a vector of arguments +// returned by a previous call of `add_arguments`. +inline ::ktt::ArgumentId +get_output_argument(const std::vector<::ktt::ArgumentId>& arguments, + cusp::ell_format) +{ + return arguments[7]; +} + + +template +auto get_launcher(const kernel_context& ctx, + const cusp::ell_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + bool profile = false) +{ + return [&, profile] (::ktt::ComputeInterface& interface) + { + const auto& conf = interface.GetCurrentConfiguration(); + auto threads_per_row = get_parameter_uint(conf, "THREADS_PER_ROW"); + + auto threads_in_block = + interface.GetCurrentLocalSize(ctx.definition_ids[0]).GetSizeX(); + + ::ktt::DimensionVector block_size( + threads_in_block/threads_per_row, threads_per_row); + ::ktt::DimensionVector grid_size( + DIVIDE_INTO(A.num_rows, threads_in_block/threads_per_row)); + + if (!profile) { + interface.RunKernel(ctx.definition_ids[0], grid_size, block_size); + } else { + interface.RunKernelWithProfiling(ctx.definition_ids[0], + grid_size, block_size); + } + }; +} + + +} // namespace ktt + +} // namespace cuda + +} // namespace system + +} // namespace cusp diff --git a/cusp/system/cuda/ktt/kernel.h b/cusp/system/cuda/ktt/kernel.h index a34c8b16..4061c558 100644 --- a/cusp/system/cuda/ktt/kernel.h +++ b/cusp/system/cuda/ktt/kernel.h @@ -16,15 +16,17 @@ namespace ktt { struct kernel_context { ::ktt::Tuner* tuner = nullptr; ::ktt::KernelId kernel_id = ::ktt::InvalidKernelId; - + // This is a vector to allow for composite kernels. std::vector<::ktt::KernelDefinitionId> definition_ids; kernel_context(::ktt::Tuner& tuner) : tuner(&tuner) { } - - kernel_context(::ktt::Tuner& tuner, const std::vector<::ktt::KernelDefinitionId>& definition_ids, ::ktt::KernelId kernel_id) + + kernel_context(::ktt::Tuner& tuner, + const std::vector<::ktt::KernelDefinitionId>& definition_ids, + ::ktt::KernelId kernel_id) : tuner(&tuner), - definition_ids(definition_ids), + definition_ids(definition_ids), kernel_id(kernel_id) { } }; diff --git a/cusp/system/cuda/ktt/kernels/csr_kernel.h b/cusp/system/cuda/ktt/kernels/csr_kernel.h index f28773a1..85cd6fa6 100644 --- a/cusp/system/cuda/ktt/kernels/csr_kernel.h +++ b/cusp/system/cuda/ktt/kernels/csr_kernel.h @@ -1,17 +1,11 @@ +#pragma once -inline __device__ int fib(int n) { - if (n < 2) { - return n; - } - return fib(n-1) + fib(n-2); -} +#define VECTORS_PER_BLOCK (BLOCK_SIZE/THREADS_PER_VECTOR) template + typename ValueType3> __global__ void ktt_csr_vector_kernel(const unsigned int num_rows, const IndexType* Ap, @@ -20,17 +14,6 @@ ktt_csr_vector_kernel(const unsigned int num_rows, const ValueType2* x, ValueType3* y) { - -#if TEST_PARAM == 0 - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("Test param is 0.\n"); - } -#else - if (threadIdx.x == 0 && blockIdx.x == 0) { - printf("Fib of 20 is %d.\n", fib(20)); - } -#endif - typedef ValueType1 ValueType; __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2]; // padded to avoid reduction conditionals diff --git a/cusp/system/cuda/ktt/kernels/dia_kernel.h b/cusp/system/cuda/ktt/kernels/dia_kernel.h index df7ade63..9c64f6e1 100644 --- a/cusp/system/cuda/ktt/kernels/dia_kernel.h +++ b/cusp/system/cuda/ktt/kernels/dia_kernel.h @@ -1,217 +1,252 @@ -#pragma once +template +__device__ T min(T a, T b) { + return a < b ? a : b; +} -template -__device__ void -ktt_dia_vector_kernel_simple( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) +template +__device__ T max(T a, T b) { + return a > b ? a : b; +} + + +template +__device__ T load_diag_val(const T* val) { - using ValueType = ValueType3; +#if SPECIAL_LOADS == 0 + return *val; +#elif SPECIAL_LOADS == 1 + // Cache streaming, likely to be accessed once. + // The ld.cs load cached streaming operation allocates global lines with + // evict-first policy in L1 and L2 + // to limit cache pollution by temporary streaming data + return __ldcs(val); +#endif +} - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; +template +__device__ T load_x_val(const T* val) +{ +#if SPECIAL_LOADS == 0 + return *val; +#elif SPECIAL_LOADS == 1 + // Needs compute capability at least 3.5 + return __ldg(val); +#endif +} - if (thread_id < num_rows) { - ValueType sum = ValueType(0); - for (IndexType i = 0; i < num_diagonals; ++i) { - IndexType col = diagonal_offsets[i] + thread_id; - if (col >= 0 && col < num_cols) { - auto val = values[i*pitch + thread_id]; - auto a = x[col]; - sum +=val*a; - } + +template +struct UnrolledLoop : UnrolledLoop +{ + using parent = UnrolledLoop; + + IndexType col; + ValueType1 diag_val = 0; + ValueType2 x_val = 0; + + __device__ void prefetch_vals(const IndexType* offsets, + const ValueType1* values, + const ValueType2* x, + int pitch, + int cols, + IndexType row, + IndexType offset_base, + IndexType i) + { + col = offsets[i] + row; + + if (col >= 0 && col < cols) + { + diag_val = load_diag_val(&values[(offset_base + i)*pitch + row]); + x_val = load_x_val(&x[col]); } - y[thread_id] = sum; + + parent::prefetch_vals(offsets, values, x, pitch, cols, row, + offset_base, i + 1); + } + + template + __device__ ValueType3 accumulate() + { + return static_cast(diag_val*x_val) + + parent::template accumulate(); + } +}; + +template +struct UnrolledLoop +{ + __device__ void prefetch_vals(const IndexType* offsets, + const ValueType1* values, + const ValueType2* x, + int pitch, + int cols, + IndexType row, + IndexType offset_base, + IndexType i) {} + + template + __device__ ValueType3 accumulate() { return 0; } +}; + + +#define PREFETCH_VALS_ITERATION(iter) \ + IndexType col##iter = offsets[i + iter - 1] + row; \ + ValueType1 diag_val##iter = 0; \ + ValueType2 x_val##iter = 0; \ + if (col##iter >= 0 && col##iter < num_cols) \ + { \ + IndexType idx = (offset_base + i + iter - 1)*pitch + row ; \ + diag_val##iter = load_diag_val(&values[idx]); \ + x_val##iter = load_x_val(&x[col##iter]); \ } -} + +#define ACCUMULATE_ITERATION(iter) \ + sum += diag_val##iter * x_val##iter; + +#define REPEAT1(ITER) ITER(1); +#define REPEAT2(ITER) ITER(2); REPEAT1(ITER) +#define REPEAT3(ITER) ITER(3); REPEAT2(ITER) +#define REPEAT4(ITER) ITER(4); REPEAT3(ITER) +#define REPEAT5(ITER) ITER(5); REPEAT4(ITER) +#define REPEAT6(ITER) ITER(6); REPEAT5(ITER) +#define REPEAT7(ITER) ITER(7); REPEAT6(ITER) +#define REPEAT8(ITER) ITER(8); REPEAT7(ITER) + +#define PREFETCH_VALS_IMPL(factor) REPEAT##factor(PREFETCH_VALS_ITERATION) +#define ACCUMULATE_IMPL(factor) REPEAT##factor(ACCUMULATE_ITERATION) + +#define PREFETCH_VALS(factor) PREFETCH_VALS_IMPL(factor) +#define ACCUMULATE(factor) ACCUMULATE_IMPL(factor) + template + typename ValueType3> __device__ void -ktt_dia_vector_kernel_blocked( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) +blocked_offsets_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* __restrict__ diagonal_offsets, + const ValueType1* __restrict__ values, + const ValueType2*__restrict__ x, + ValueType3* __restrict__ y) { - using ValueType = ValueType3; - - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; + const IndexType row = BLOCK_SIZE * blockIdx.x + threadIdx.x; __shared__ IndexType offsets[BLOCK_SIZE]; - ValueType sum = ValueType(0); + ValueType3 sum = ValueType3(0); - for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) { - if (offset_base + threadIdx.x < num_diagonals) { + for (int offset_base = 0; offset_base < num_diagonals; offset_base += BLOCK_SIZE) + { + if (offset_base + threadIdx.x < num_diagonals) offsets[threadIdx.x] = diagonal_offsets[offset_base + threadIdx.x]; - } __syncthreads(); - int batch_size = BLOCK_SIZE > num_diagonals - offset_base ? num_diagonals - offset_base : BLOCK_SIZE; + int batch_size = BLOCK_SIZE > num_diagonals - offset_base + ? num_diagonals - offset_base + : BLOCK_SIZE; + + if (row < num_rows) + { + IndexType i = 0; + +#if PREFETCH_FACTOR > 0 + for (; i < batch_size - PREFETCH_FACTOR + 1; i += PREFETCH_FACTOR) + { +#if PREFETCH_TYPE == 0 + UnrolledLoop loop; + loop.prefetch_vals(offsets, values, x, pitch, num_cols, + row, offset_base, i); + sum += loop.template accumulate(); +#elif PREFETCH_TYPE == 1 + PREFETCH_VALS(PREFETCH_FACTOR); + ACCUMULATE(PREFETCH_FACTOR); +#endif + } +#endif + + for (; i < batch_size; ++i) + { + IndexType col = offsets[i] + row; + if (col >= 0 && col < num_cols) + { + IndexType idx = (offset_base + i)*pitch + row; + + auto diag_val = load_diag_val(&values[idx]); + auto x_val = load_x_val(&x[col]); - if (thread_id < num_rows) { - for (IndexType i = 0; i < batch_size; ++i) { - IndexType col = offsets[i] + thread_id; - if (col >= 0 && col < num_cols) { - sum += values[(offset_base + i)*pitch + thread_id]*x[col]; + sum += diag_val*x_val; } } - y[thread_id] = sum; } __syncthreads(); } + + if (row < num_rows) + y[row] = sum; } -#define ITERATION(i, offset) \ - col = offset + thread_id; \ - if (col >= 0 && col < num_cols) { \ - sum += values[i*pitch + thread_id]*x[col]; \ - } template + typename ValueType3> __device__ void -ktt_dia_vector_kernel_crazy( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) +naive_dia_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* __restrict__ diagonal_offsets, + const ValueType1* __restrict__ values, + const ValueType2* __restrict__ x, + ValueType3* __restrict__ y) { - using ValueType = ValueType3; - const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; - ValueType sum = ValueType(0); - - if (thread_id < num_rows) { - IndexType col; - ITERATION(0, -4); - ITERATION(1, -3); - ITERATION(2, -2); - ITERATION(3, -1); - ITERATION(4, -0); - ITERATION(5, -1); - ITERATION(6, -2); - ITERATION(7, -3); - ITERATION(8, -4); - ITERATION(9, -5); + if (thread_id < num_rows) + { + ValueType3 sum = ValueType3(0); + for (IndexType i = 0; i < num_diagonals; ++i) + { + IndexType col = diagonal_offsets[i] + thread_id; + if (col >= 0 && col < num_cols) + { + auto diag_val = load_diag_val(&values[i*pitch + thread_id]); + auto x_val = load_x_val(&x[col]); + + sum += diag_val * x_val; + } + } y[thread_id] = sum; } - } + template -__launch_bounds__(BLOCK_SIZE,1) __global__ void -ktt_dia_vector_kernel( - const int num_rows, - const int num_cols, - const int num_diagonals, - const int pitch, - const IndexType* diagonal_offsets, - const ValueType1* values, - const ValueType2* x, - ValueType3* y) + typename ValueType3> +__launch_bounds__(BLOCK_SIZE, 1) __global__ void +ktt_dia_vector_kernel(const int num_rows, + const int num_cols, + const int num_diagonals, + const int pitch, + const IndexType* __restrict__ diagonal_offsets, + const ValueType1* __restrict__ values, + const ValueType2 *__restrict__ x, + ValueType3* __restrict__ y) { -#if KERNEL_TYPE == 0 - ktt_dia_vector_kernel_simple - (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -#elif KERNEL_TYPE == 1 - ktt_dia_vector_kernel_blocked + blocked_offsets_dia_kernel (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -#elif KERNEL_TYPE == 2 - ktt_dia_vector_kernel_crazy - (num_rows, num_cols, num_diagonals, pitch, diagonal_offsets, values, x, y); -#endif } - -// template -// __launch_bounds__(BLOCK_SIZE,1) __global__ void -// ktt_dia_vector_kernel( -// const int num_rows, -// const int num_cols, -// const int num_diagonals, -// const int pitch, -// const IndexType* diagonal_offsets, -// const ValueType1* values, -// const ValueType2* x, -// ValueType3* y) -// { -// typedef ValueType1 ValueType; - -// __shared__ IndexType offsets[BLOCK_SIZE]; - -// const IndexType thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; -// const IndexType grid_size = BLOCK_SIZE * gridDim.x; - -// for(IndexType base = 0; base < num_diagonals; base += BLOCK_SIZE) -// { -// // read a chunk of the diagonal offsets into shared memory -// const IndexType chunk_size = IndexType(BLOCK_SIZE) < num_diagonals - base ? IndexType(BLOCK_SIZE) : num_diagonals - base; - -// if(threadIdx.x < chunk_size) -// offsets[threadIdx.x] = diagonal_offsets[base + threadIdx.x]; - -// __syncthreads(); - -// // process chunk -// for(IndexType row = thread_id; row < num_rows; row += grid_size) -// { -// ValueType sum = (base == 0) ? ValueType(0) : ValueType(0); - -// // index into values array -// IndexType idx = row + pitch * base; - -// for(IndexType n = 0; n < chunk_size; n++) -// { -// const IndexType col = row + offsets[n]; - -// if(col >= 0 && col < num_cols) -// { -// const ValueType A_ij = values[idx]; -// sum = sum + (A_ij * x[col]); -// } - -// idx += pitch; -// } - -// y[row] = sum; -// } - -// // wait until all threads are done reading offsets -// __syncthreads(); -// } -// } \ No newline at end of file diff --git a/cusp/system/cuda/ktt/kernels/ell_kernel.h b/cusp/system/cuda/ktt/kernels/ell_kernel.h new file mode 100644 index 00000000..f71431a9 --- /dev/null +++ b/cusp/system/cuda/ktt/kernels/ell_kernel.h @@ -0,0 +1,213 @@ + +#if DISABLE_UNROLL == 1 +#define UNROLL 1 +#else +#define UNROLL +#endif + + +template +__device__ T load(const T* addr) +{ +#if SPECIAL_LOADS == 0 + return *addr; +#elif SPECIAL_LOADS == 1 + return __ldcv(addr); +#endif +} + + +template +struct Prefetcher : Prefetcher +{ + using parent = Prefetcher; + + IndexType col; + ValueType A_val = 0; + ValueType x_val = 0; + + __device__ + void prefetch_cols(const IndexType* __restrict__ Aj, + IndexType offset, + IndexType stride, + IndexType i) + { + col = load(Aj + offset + i*stride); + parent::prefetch_cols(Aj, offset, stride, i + 1); + } + + __device__ + void prefetch_vals(const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + IndexType offset, + IndexType stride, + IndexType i) + { +#ifndef ELLR + if (col >= 0) + { +#endif + A_val = load(Ax + offset + i*stride); + x_val = x[col]; +#ifndef ELLR + } +#endif + parent::prefetch_vals(Ax, x, offset, stride, i + 1); + } + + __device__ ValueType accumulate_results() + { + return x_val*A_val + parent::accumulate_results(); + } +}; + +template +struct Prefetcher +{ + __device__ + void prefetch_cols(const IndexType* __restrict__ Aj, IndexType offset, + IndexType stride, IndexType i) { } + + __device__ + void prefetch_vals(const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, IndexType offset, + IndexType stride, IndexType i) { } + + __device__ ValueType accumulate_results() + { + return 0; + } +}; + + +template +__device__ void +ktt_ell_kernel_basic(const IndexType num_rows, + const IndexType num_cols, + const IndexType max_cols_per_row, + const IndexType pitch, + const IndexType* __restrict__ Aj, + const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + ValueType* __restrict__ y, + const IndexType* __restrict__ row_lengths) +{ + const IndexType row = blockDim.x*blockIdx.x + threadIdx.x; + const IndexType stride = THREADS_PER_ROW * pitch; + +#if THREADS_PER_ROW > 1 + __shared__ ValueType sums[BLOCK_SIZE / THREADS_PER_ROW]; + + if (threadIdx.y == 0) + sums[threadIdx.x] = 0; + + __syncthreads(); +#endif + + if (row < num_rows) + { + const IndexType num_elems = row_lengths == NULL + ? max_cols_per_row + : row_lengths[row]; + ValueType sum = 0; + IndexType n = threadIdx.y; + IndexType offset = row + n*pitch; + +#if PREFETCH_FACTOR > 0 + IndexType bound = num_elems - (PREFETCH_FACTOR - 1)*THREADS_PER_ROW; + IndexType step = PREFETCH_FACTOR * THREADS_PER_ROW; + + #pragma unroll UNROLL + for(; n < bound; n += step) + { + Prefetcher prefetcher; + + prefetcher.prefetch_cols(Aj, offset, stride, 0); + prefetcher.prefetch_vals(Ax, x, offset, stride, 0); + sum += prefetcher.accumulate_results(); + + offset += PREFETCH_FACTOR * stride; + } +#endif + + #pragma unroll UNROLL + for (; n < num_elems; n += THREADS_PER_ROW) + { + const IndexType col = load(Aj + offset); + + // This assumes that + // cusp::ell_matrix<...>::invalid_index is always < 0 +#ifndef ELLR + if (col >= 0) + { +#endif + const ValueType x_j = x[col]; + const ValueType A_ij = load(Ax + offset); + + sum += A_ij * x_j; +#ifndef ELLR + } + else + { +#if BREAK == 1 + break; +#endif + } +#endif + + offset += stride; + } + +#if THREADS_PER_ROW > 1 + atomicAdd(&sums[threadIdx.x], sum); + + __syncthreads(); + + if (threadIdx.y == 0) + { + y[row] = sums[threadIdx.x]; + } +#else + y[row] = sum; +#endif + } +} + + +template +__launch_bounds__(BLOCK_SIZE, 1) __global__ void +ktt_ell_kernel(const IndexType num_rows, + const IndexType num_cols, + const IndexType num_cols_per_row, + const IndexType pitch, + const IndexType* __restrict__ Aj, + const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + ValueType* __restrict__ y) +{ + ktt_ell_kernel_basic(num_rows, num_cols, num_cols_per_row, pitch, + Aj, Ax, x, y, (const IndexType*)NULL); +} + + +template +__launch_bounds__(BLOCK_SIZE, 1) __global__ void +ktt_ellr_kernel(const IndexType num_rows, + const IndexType num_cols, + const IndexType num_cols_per_row, + const IndexType pitch, + const IndexType* __restrict__ Aj, + const ValueType* __restrict__ Ax, + const ValueType* __restrict__ x, + ValueType* __restrict__ y, + const IndexType* __restrict__ row_lengths) +{ + ktt_ell_kernel_basic(num_rows, num_cols, num_cols_per_row, pitch, + Aj, Ax, x, y, row_lengths); +} diff --git a/cusp/system/cuda/ktt/multiply.h b/cusp/system/cuda/ktt/multiply.h index 5e6accab..3f79d14b 100644 --- a/cusp/system/cuda/ktt/multiply.h +++ b/cusp/system/cuda/ktt/multiply.h @@ -1,8 +1,14 @@ -#pragma once +#pragma once #include -//#include + +#include #include +#include + +#include + +#include namespace cusp { @@ -12,24 +18,128 @@ namespace cuda { namespace ktt { -// TODO: should this be here?? + // Helpful function to get a kernel used for a given multiplication // instead of having to manually specify the types. template -kernel_context get_kernel(::ktt::Tuner& tuner, - const MatrixType& A, - const cusp::array1d& x, - cusp::array1d& y) + typename ValueType2, + typename MemorySpace1, + typename MemorySpace2> +kernel_context get_kernel( + ::ktt::Tuner& tuner, + const MatrixType& A, + const cusp::array1d& x, + cusp::array1d& y) { using IndexType = typename MatrixType::index_type; using ValueType = typename MatrixType::value_type; - using FormatType = typename MatrixType::format; - return get_kernel(tuner, FormatType{}); + if constexpr (std::is_same_v) + { + using DeviceMatrix = + typename MatrixType::rebind::type; + return get_kernel( + tuner, DeviceMatrix{}); + } + else + { + return get_kernel(tuner, A); + } +} + + +template +::ktt::KernelResult +multiply(::ktt::Tuner& tuner, + const MatrixType& A, + const VectorType1& x, + VectorType2& y) +{ + const kernel_context& kernel = get_kernel(tuner, A, x, y); + + auto args = add_arguments(kernel, A, x, y); + + auto launcher = get_launcher(kernel, A, x, y, false); + tuner.SetLauncher(kernel.kernel_id, launcher); + + auto res = tuner.TuneIteration(kernel.kernel_id, {}); + + remove_arguments(kernel, args); + + return res; +} + + +template +::ktt::KernelResult +multiply(::ktt::Tuner& tuner, + const MatrixType& A, + const VectorType1& x, + VectorType2& y, + const ::ktt::KernelConfiguration& configuration, + bool run_with_profiling = false) +{ + const kernel_context& kernel = get_kernel(tuner, A, x, y); + + auto args = add_arguments(kernel, A, x, y); + + auto launcher = get_launcher(kernel, A, x, y, run_with_profiling); + tuner.SetLauncher(kernel.kernel_id, launcher); + + auto result = tuner.Run(kernel.kernel_id, configuration, {}); + + remove_arguments(kernel, args); + + return result; +} + + +template +std::vector<::ktt::KernelResult> +tune(::ktt::Tuner& tuner, + const MatrixType& A, + const VectorType1& x, + VectorType2& y, + std::optional<::ktt::ReferenceComputation> ref_computation = std::nullopt) +{ + using Format = typename MatrixType::format; + using HostVector2 = + typename VectorType2::template rebind::type; + + kernel_context kernel = get_kernel(tuner, A, x, y); + auto args = add_arguments(kernel, A, x, y); + + if (ref_computation) + { + tuner.SetReferenceComputation(get_output_argument(args, Format{}), + *ref_computation); + } + + HostVector2 host_y = y; + tuner.SetLauncher(kernel.kernel_id, [&] (::ktt::ComputeInterface& interface) + { + // clear y so previous results don't affect validation + y = host_y; + auto launcher = get_launcher(kernel, A, x, y, false); + launcher(interface); + }); + + auto results = tuner.Tune(kernel.kernel_id); + + remove_arguments(kernel, args); + + return results; } + } // namespace ktt } // namespace cuda diff --git a/cusp/system/cuda/ktt/utils.h b/cusp/system/cuda/ktt/utils.h index a2d1c029..e773b461 100644 --- a/cusp/system/cuda/ktt/utils.h +++ b/cusp/system/cuda/ktt/utils.h @@ -3,6 +3,10 @@ #include #include +#include // uint64_t +#include // std::runtime_error + + #define STR(str) #str #define STRING(str) STR(str) @@ -14,9 +18,12 @@ namespace cuda { namespace ktt { +using u64_vec = std::vector; + + template void* cast(const T* ptr) -{ +{ return const_cast(static_cast(ptr)); } @@ -65,7 +72,7 @@ void add_arg(::ktt::Tuner& tuner, std::vector<::ktt::ArgumentId>& argument_ids, argument_ids.push_back(id); } -template +template std::vector<::ktt::ArgumentId> add_arguments(::ktt::Tuner& tuner, Args&&... args) { std::vector<::ktt::ArgumentId> argument_ids; @@ -85,6 +92,26 @@ inline void remove_arguments(const kernel_context& kernel, const std::vector<::k } +inline uint64_t get_parameter_uint(const ::ktt::KernelConfiguration& conf, + const std::string& name) +{ + for (const auto& pair : conf.GetPairs()) + if (pair.GetName() == name) + return pair.GetValue(); + + throw std::runtime_error("No paramater with name: " + name); +} + +inline double get_parameter_double(const ::ktt::KernelConfiguration& conf, + const std::string& name) +{ + for (const auto& pair : conf.GetPairs()) + if (pair.GetName() == name) + return pair.GetValueDouble(); + + throw std::runtime_error("No paramater with name: " + name); +} + } // namespace ktt } // namespace cuda diff --git a/cusp/system/detail/generic/conversions/coo_to_other.h b/cusp/system/detail/generic/conversions/coo_to_other.h index 06a88b7f..b25fe8f8 100644 --- a/cusp/system/detail/generic/conversions/coo_to_other.h +++ b/cusp/system/detail/generic/conversions/coo_to_other.h @@ -137,7 +137,8 @@ convert(thrust::execution_policy& exec, DestinationType& dst, cusp::coo_format&, cusp::dia_format&, - size_t alignment = 32) + size_t alignment = 32, + bool dont_throw = false) { typedef typename DestinationType::index_type IndexType; typedef typename DestinationType::value_type ValueType; @@ -156,7 +157,7 @@ convert(thrust::execution_policy& exec, const float size = float(occupied_diagonals) * float(src.num_rows); const float fill_ratio = size / std::max(1.0f, float(src.num_entries)); - if (max_fill < fill_ratio && size > threshold) + if (max_fill < fill_ratio && size > threshold && !dont_throw) throw cusp::format_conversion_exception("dia_matrix fill-in would exceed maximum tolerance"); // compute number of occupied diagonals and enumerate them @@ -219,7 +220,8 @@ convert(thrust::execution_policy& exec, cusp::coo_format&, cusp::ell_format&, size_t num_entries_per_row = 0, - size_t alignment = 32) + size_t alignment = 32, + bool dont_throw = false) { typedef typename DestinationType::index_type IndexType; typedef typename DestinationType::value_type ValueType; @@ -242,7 +244,7 @@ convert(thrust::execution_policy& exec, const float size = float(max_entries_per_row) * float(src.num_rows); const float fill_ratio = size / std::max(1.0f, float(src.num_entries)); - if (max_fill < fill_ratio && size > threshold) + if (max_fill < fill_ratio && size > threshold && !dont_throw) throw cusp::format_conversion_exception("ell_matrix fill-in would exceed maximum tolerance"); num_entries_per_row = max_entries_per_row; diff --git a/cusp/system/detail/generic/conversions/dia_to_other.h b/cusp/system/detail/generic/conversions/dia_to_other.h index 45c26f47..bc30fb5a 100644 --- a/cusp/system/detail/generic/conversions/dia_to_other.h +++ b/cusp/system/detail/generic/conversions/dia_to_other.h @@ -41,6 +41,7 @@ #include #include +#include namespace cusp { @@ -211,6 +212,62 @@ convert(thrust::execution_policy& exec, thrust::copy(exec, src.values.values.begin(), src.values.values.end(), dst.values.values.begin()); + using StridedIndexIterator = thrust::transform_iterator< + cusp::multiplies_value, + IndexIterator>; + using RowElemIndicesIterator = thrust::transform_iterator< + cusp::plus_value, + StridedIndexIterator>; + using RowColsIterator = thrust::permutation_iterator< + decltype(dst.column_indices.values.begin()), + RowElemIndicesIterator>; + using RowValsIterator = thrust::permutation_iterator< + decltype(dst.values.values.begin()), + RowElemIndicesIterator>; + + for (size_t row_idx = 0; row_idx < src.num_rows; ++row_idx) + { + StridedIndexIterator strided_cols( + IndexIterator(0), + cusp::multiplies_value(pitch)); + RowElemIndicesIterator elem_indices( + strided_cols, + cusp::plus_value((IndexType)row_idx)); + + RowColsIterator row_cols(dst.column_indices.values.begin(), elem_indices); + RowValsIterator row_vals(dst.values.values.begin(), elem_indices); + + thrust::stable_partition(exec, + row_vals, + row_vals + dst.column_indices.num_cols, + row_cols, + thrust::placeholders::_1 != IndexType(-1)); + + thrust::stable_partition(exec, + row_cols, + row_cols + dst.column_indices.num_cols, + thrust::placeholders::_1 != IndexType(-1)); + } + + // for (int row = 0; row < dst.num_rows; ++row) + // { + // int last = 0; + // for (int col = 0; col < dst.column_indices.num_cols; ++col) + // { + // if (dst.column_indices(row, col) >= 0) + // { + // auto tmp1 = dst.column_indices(row, last); + // dst.column_indices(row, last) = dst.column_indices(row, col); + // dst.column_indices(row, col) = tmp1; + + // auto tmp2 = dst.values(row, last); + // dst.values(row, last) = dst.values(row, col); + // dst.values(row, col) = tmp2; + + // last++; + // } + // } + // } } } // end namespace generic diff --git a/cusp/system/detail/generic/multiply.inl b/cusp/system/detail/generic/multiply.inl index 6e620701..75e4c85c 100644 --- a/cusp/system/detail/generic/multiply.inl +++ b/cusp/system/detail/generic/multiply.inl @@ -27,10 +27,13 @@ #include #include +#include + #include #include -#include +#include + namespace cusp { @@ -107,44 +110,59 @@ multiply(thrust::execution_policy &exec, cusp::multiply(exec, A, B, C, initialize, combine, reduce); } + +// The has_rebind type traits is used to check whether matrix is a view template struct has_rebind : std::false_type {}; - + template struct has_rebind>> : std::true_type {}; template constexpr bool has_rebind_v = has_rebind::value; + template -typename thrust::detail::disable_if_convertible::type +typename thrust::detail::disable_if_convertible< + typename Matrix::format, + cusp::unknown_format +>::type multiply(cusp::system::cuda::detail::execution_policy &exec, - const LinearOperator& A, - const cusp::array1d& B, - cusp::array1d& C) + const Matrix& A, + const cusp::array1d& x, + cusp::array1d& y) { - typedef typename LinearOperator::format MatrixFormat; - - if constexpr ((/*cusp::detail::is_csr::value ||*/ cusp::detail::is_dia::value) && has_rebind_v) { - if (cusp::ktt::detail::is_enabled) { - cusp::ktt::detail::lazy_init(); - cusp::system::cuda::ktt::multiply(*cusp::ktt::detail::tuner, A, B, C); - return; + using IndexType = typename Matrix::index_type; + using ValueType = typename Matrix::value_type; + + if constexpr ((cusp::detail::is_ell::value + || cusp::detail::is_dia::value + && std::is_fundamental_v + && std::is_fundamental_v + && std::is_fundamental_v + && std::is_fundamental_v) + && has_rebind_v) + { + if (cusp::ktt::detail::is_enabled) + { + cusp::system::cuda::ktt::multiply(cusp::ktt::get_tuner(), A, x, y); + return; } } - typedef typename LinearOperator::value_type ValueType; + // The call is not meant for KTT. Pass it on... cusp::constant_functor initialize(0); thrust::multiplies combine; thrust::plus reduce; - cusp::multiply(exec, A, B, C, initialize, combine, reduce); + cusp::multiply(exec, A, x, y, initialize, combine, reduce); } + template +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace krn = std::chrono; + +template +void write_csv(const std::vector>& data, + const std::vector& col_labels, + const std::vector& row_labels, + const std::string& path) +{ + if (row_labels.size() != data.size()) + throw std::runtime_error("Weird number of rows or row labels."); + + for (int i = 1; i < data.size(); ++i) + { + if (data[i].size() != data[i-1].size()) + throw std::runtime_error("Not all rows have same length."); + if (i == 1 && data[i].size() != col_labels.size()) + throw std::runtime_error("Bad number of column lables."); + } + + std::ofstream file(path); + + if (!file) + throw std::runtime_error("Cannot open: " + path); + + const char* sep = ""; + for (const auto& label : col_labels) + { + file << sep << label; + sep = ","; + } + file << "\n"; + + for (int i = 0; i < data.size(); ++i) + { + file << row_labels[i]; + for (const auto& x : data[i]) + { + file << "," << x; + } + file << "\n"; + } +} + + +struct Timer +{ + Timer() : start_(krn::steady_clock::now()) { } + + void restart() { start_ = krn::steady_clock::now(); } + + void print_elapsed(const std::string& label) + { + auto duration = krn::duration_cast(krn::steady_clock::now() - start_).count(); + std::cout << label << ": " << duration << "\n"; + } + + void checkpoint(const std::string& label) + { + print_elapsed(label); + restart(); + } + +private: + krn::steady_clock::time_point start_; + +}; + + +struct ConversionTimer +{ + cusp::dia_matrix m; + + ConversionTimer() { + m = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 512); + } + + ConversionTimer(const ConversionTimer& o) { + std::cout << "COPY!\n"; + } + + template + void operator()() + { + using Format1 = typename Matrix1::format; + using Format2 = typename Matrix2::format; + + Matrix1 m1 = m; + + auto start = krn::steady_clock::now(); + Matrix2 m2 = m1; + auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + + std::cout << NAMEOF_TYPE(Format1) << " to " << NAMEOF_TYPE(Format2) << ": " << duration << "\n"; + } +}; + + +template +struct type_list +{ + static constexpr bool is_empty = true; +}; + +template< + typename First, + typename... Args +> +struct type_list +{ + using head = First; + using tail = type_list; + + static constexpr bool is_empty = false; +}; + + +template< + typename T, + typename List, + typename Op +> +void for_each_type(Op&& op) +{ + if constexpr (!List::is_empty) + { + if (!std::is_same_v) + { + op.template operator()(); + } + for_each_type(std::forward(op)); + } +} + +template +void for_each_pair(Op&& op) +{ + if constexpr (!List1::is_empty && !List2::is_empty) + { + for_each_type(std::forward(op)); + for_each_pair(std::forward(op)); + } +} + +template +void for_each_pair(Op&& op) +{ + for_each_pair(std::forward(op)); +} + +template +using format_list = type_list< + cusp::dia_matrix, + cusp::csr_matrix, + cusp::coo_matrix, + cusp::ell_matrix +>; + + +void print_ktt_status(const ::ktt::ResultStatus& status) { + switch (status) + { + case ::ktt::ResultStatus::Ok: + std::cout << "OK\n"; + break; + case ::ktt::ResultStatus::CompilationFailed: + std::cout << "COMPILATION FAILED\n"; + break; + case ::ktt::ResultStatus::ComputationFailed: + std::cout << "COMPUTATION FAILED\n"; + break; + case ::ktt::ResultStatus::DeviceLimitsExceeded: + std::cout << "DEVICE LIMIT EXCEEDED\n"; + break; + case ::ktt::ResultStatus::ValidationFailed: + std::cout << "VALIDATION FAILED\n"; + break; + } +} + +template +cusp::dia_matrix +convert(const cusp::coo_matrix& matrix) { + using DiaMatrix = cusp::dia_matrix; + + cusp::array1d diag_map(matrix.num_rows + matrix.num_cols, 0); + + auto start = krn::steady_clock::now(); + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + diag_map[diag_idx] = 1; + } + auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + std::cout << "First loop: " << duration << "\n"; + + typename DiaMatrix::diagonal_offsets_array_type offsets; + offsets.reserve(512); // arbitrary number + + start = krn::steady_clock::now(); + int diag_count = 0; + for (int i = 0; i < diag_map.size(); ++i) { + if (diag_map[i] != 0) { + diag_map[i] = diag_count; + offsets.push_back(i - matrix.num_rows + 1); + diag_count++; + } + } + duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + std::cout << "Third loop: " << duration << "\n"; + + + //int pitch = matrix.num_rows % 2 == 0 ? matrix.num_rows + 1 : matrix.num_rows; + int pitch = matrix.num_rows + 64/sizeof(float); + typename DiaMatrix::values_array_type values(matrix.num_rows, diag_count, ValueType(0), pitch); + //std::cout << values.pitch << "\n"; + + start = krn::steady_clock::now(); + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + int values_array_idx = diag_map[diag_idx]; + values(matrix.row_indices[i], values_array_idx) = matrix.values[i]; + } + duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + std::cout << "Second loop: " << duration << "\n"; + + DiaMatrix res; + + res.num_rows = matrix.num_rows; + res.num_cols = matrix.num_cols; + res.num_entries = matrix.num_entries; + + res.diagonal_offsets.swap(offsets); + res.values.swap(values); + + return res; +} + +template +cusp::dia_matrix +convert2(const cusp::coo_matrix& matrix) { + using DiaMatrix = cusp::dia_matrix; + + Timer timer; + + cusp::array1d diag_map(matrix.num_rows + matrix.num_cols, 0); + + timer.checkpoint("init map"); + + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + diag_map[diag_idx] = 1; + } + + timer.checkpoint("find diags"); + + typename DiaMatrix::diagonal_offsets_array_type offsets; + offsets.reserve(512); // arbitrary number + + int diag_count = 0; + for (int i = 0; i < diag_map.size(); ++i) { + if (diag_map[i] != 0) { + diag_map[i] = diag_count; + offsets.push_back(i - matrix.num_rows + 1); + diag_count++; + } + } + + timer.checkpoint("fill map"); + + + //int pitch = matrix.num_rows % 2 == 0 ? matrix.num_rows + 1 : matrix.num_rows; + typename DiaMatrix::values_array_type values(diag_count, matrix.num_rows, ValueType(0)); + //std::cout << values.pitch << "\n"; + + timer.checkpoint("init values"); + + for (int i = 0; i < matrix.num_entries; ++i) { + int diag_idx = matrix.column_indices[i] - matrix.row_indices[i] + matrix.num_rows - 1; + int values_array_idx = diag_map[diag_idx]; + values(values_array_idx, matrix.row_indices[i]) = matrix.values[i]; + } + + timer.checkpoint("fill values"); + + DiaMatrix res; + + res.num_rows = matrix.num_rows; + res.num_cols = matrix.num_cols; + res.num_entries = matrix.num_entries; + + res.diagonal_offsets.swap(offsets); + res.values.swap(values); + + return res; +} + +template typename Matrix, + typename IndexType, + typename ValueType> +Matrix +send_to_device(const Matrix& host_matrix) +{ + return host_matrix; +} + +template typename Matrix, + typename IndexType, + typename ValueType, + typename MemorySpace> +auto make_test_device_x(const Matrix& matrix) +{ + return cusp::array1d(matrix.num_cols, 1); +} + +template typename Matrix, + typename IndexType, + typename ValueType, + typename MemorySpace> +auto make_test_device_y(const Matrix& matrix) +{ + return cusp::array1d(matrix.num_rows, 0); +} + +template +bool equals(const cusp::dia_matrix& A, + const cusp::dia_matrix& B) +{ + return A.values == B.values && A.diagonal_offsets == B.diagonal_offsets; +} + +std::string size_str(uint64_t bytes) +{ + float size = bytes; + + auto get_str = [](float x, const char* units) + { + std::stringstream s; + s << std::setprecision(3) << std::fixed << x << units; + return s.str(); + }; + + if (size < 1024) + { + return get_str(size, " B"); + } + + size /= 1024; + if (size < 1024) + { + return get_str(size, " KB"); + } + + size /= 1024; + if (size < 1024) + { + return get_str(size, " MB"); + } + + size /= 1024; + return get_str(size, " GB"); +} + +std::string counter_str(const ::ktt::KernelProfilingCounter& counter) +{ + switch (counter.GetType()) { + case ::ktt::ProfilingCounterType::Double: + return std::to_string(counter.GetValueDouble()); + case ::ktt::ProfilingCounterType::Int: + return std::to_string(counter.GetValueInt()); + case ::ktt::ProfilingCounterType::Percent: + return std::to_string(counter.GetValueDouble()) + " %"; + case ::ktt::ProfilingCounterType::Throughput: + return std::to_string(counter.GetValueUint()) + " bytes/sec\n"; + case ::ktt::ProfilingCounterType::UnsignedInt: + return std::to_string(counter.GetValueUint()); + case ::ktt::ProfilingCounterType::UtilizationLevel: + return std::to_string(counter.GetValueUint()); + } + + return ""; +} + +size_t dia_matrix_size(size_t num_rows, size_t num_cols, size_t num_diags) +{ + return sizeof(int)*num_diags + num_rows*num_diags*sizeof(float); +} + +size_t dia_problem_size(size_t num_rows, size_t num_cols, size_t num_diags) +{ + return sizeof(int)*num_diags + + num_rows*num_diags*sizeof(float) + + num_rows*sizeof(float) + + num_cols*sizeof(float); +} + +template +size_t min_read_bytes(const cusp::dia_matrix& A) +{ + // Assume that each column has at least one diagonal passing through it. + size_t result = A.num_cols * sizeof(ValueT); + + for (IndexT offset : A.diagonal_offsets) + { + IndexT start_row = offset >= 0 ? 0 : -offset; + IndexT start_col = offset >= 0 ? offset : 0; + IndexT end_row = start_row + std::min(A.num_cols - start_col, A.num_rows - start_row); + + // Assume that global reads are done in 32 byte transactions that must be aligned. + IndexT elems_per_read = 32 / sizeof(ValueT); + IndexT adjusted_start_row = start_row - (start_row % elems_per_read); + IndexT adjusted_end_row = end_row + elems_per_read - (end_row % elems_per_read); + + result += (adjusted_end_row - adjusted_start_row)*sizeof(ValueT); + } + + return result; +} + +std::vector<::ktt::KernelProfilingCounter> +profile_multiply(const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + const ::ktt::KernelConfiguration& conf, + const std::vector& counters, + ::ktt::KernelResult& result) +{ + auto& tuner = cusp::ktt::get_tuner(); + + auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + tuner.SetProfilingCounters(counters); + tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids); + tuner.SetProfiling(true); + + ::ktt::KernelResult kernel_result; + do { + kernel_result = cusp::ktt::multiply(A, x, y, conf, true); + } while(kernel_result.HasRemainingProfilingRuns()); + + result = kernel_result; + + for (const auto& computation_result : kernel_result.GetResults()) + { + if (!computation_result.HasProfilingData()) + { + std::cout << "No profiling data!\n"; + continue; + } + + const auto& profiling_data = computation_result.GetProfilingData(); + if (!profiling_data.IsValid()) + { + std::cout << "profiling data invalid\n"; + } + else + { + return profiling_data.GetCounters(); + } + } + + return {}; +} + +size_t get_actual_read_bytes(::ktt::Tuner& tuner, + const cusp::dia_matrix& A, + const cusp::array1d& x, + cusp::array1d& y, + const ::ktt::KernelConfiguration& conf, + ::ktt::KernelResult* res_p = nullptr) +{ + ::ktt::KernelResult res; + const auto& counters = profile_multiply(A, x, y, conf, {"dram_read_bytes"}, res); + + if (res_p) *res_p = res; + + if (counters.empty()) { + throw std::runtime_error("Profiling failed."); + } + + return counters[0].GetValueUint(); +} + +size_t get_actual_read_bytes(::ktt::Tuner& tuner, + const cusp::dia_matrix& host_matrix, + ::ktt::KernelResult* res_p = nullptr) +{ + cusp::dia_matrix A = host_matrix; + cusp::array1d x(A.num_cols, 1); + cusp::array1d y(A.num_rows); + + auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + // auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, + // { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, + { { std::string("KERNEL_TYPE"), uint64_t(1) }, + { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) }, + { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(0) } }); + + return get_actual_read_bytes(tuner, A, x, y, conf, res_p); +} + +int get_max_size(int num_diags, size_t global_size) +{ + // Assume we have a square matrix of size n. + // + // On the gpu we need: + // - offsets -> num_diags * sizeof(int) bytes + // - diagonals -> num_diags * n * sizeof(float) bytes + // - x -> n * sizeof(float) bytes + // - y -> n * sizeof(float) bytes + // + // It must hold: + // + // global_size >= num_diags*sizeof(int) + (num_diags + 2)*n*sizeof(float) + // + // So: + // + // n <= (global_size - num_diags*sizeof(int)) / (num_diags + 2)*sizeof(float) + // + return (global_size - num_diags*sizeof(int)) / ((num_diags + 2)*sizeof(float)); +} + +void test_poisson(::ktt::Tuner& tuner, + const cusp::dia_matrix& host_matrix, + const std::string& name) +{ + auto expected_bytes = min_read_bytes(host_matrix); + auto actual_bytes = get_actual_read_bytes(tuner, host_matrix); + std::cout << " " << name << ": " << size_str(actual_bytes) << " vs " << size_str(expected_bytes) << "\n"; +} + +void test_poisson_sizes(::ktt::Tuner& tuner, size_t global_size) +{ + std::cout << "Testing poisson\n"; + + int n = get_max_size(5, global_size); + int grid_size = std::sqrt(n); + cusp::dia_matrix dia_host_matrix; + cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "5pt"); + + n = get_max_size(9, global_size); + grid_size = std::sqrt(n); + cusp::gallery::poisson9pt(dia_host_matrix, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "9pt"); + + n = get_max_size(27, global_size); + grid_size = std::cbrt(n); + cusp::gallery::poisson27pt(dia_host_matrix, grid_size, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "27pt"); + + n = get_max_size(7, global_size); + grid_size = std::cbrt(n); + cusp::gallery::poisson7pt(dia_host_matrix, grid_size, grid_size, grid_size); + test_poisson(tuner, dia_host_matrix, "7pt"); + + + // std::cout << " Memory requirements: " + // << size_str(dia_problem_size(dia_host_matrix.num_rows, + // dia_host_matrix.num_cols, + // dia_host_matrix.diagonal_offsets.size())) + // << "\n"; +} + +void test_poisson_7pt(::ktt::Tuner& tuner, size_t global_limit) +{ + std::cout << "Testing different sizes of poisson 7pt problem (actual vs min)\n"; + + size_t mb = (size_t(1) << 20); + cusp::dia_matrix host_matrix; + + for (size_t mem = 512*mb; mem <= global_limit; mem += 512*mb) + { + int n = get_max_size(7, mem); + int grid_size = std::cbrt(n); + + cusp::gallery::poisson7pt(host_matrix, grid_size, grid_size, grid_size); + + auto expected_bytes = min_read_bytes(host_matrix); + auto actual_bytes = get_actual_read_bytes(tuner, host_matrix); + float ratio = float(actual_bytes)/expected_bytes; + + std::cout << " " + << size_str(mem) << " (" << grid_size << ") -> " + << size_str(actual_bytes) << " vs " + << size_str(expected_bytes) << " (" + << ratio << ")\n"; + } +} + +void test_x_caching_uniform(::ktt::Tuner& tuner, size_t global_limit) +{ + std::vector distances; + std::vector diag_counts; + + for (int dist = 64; dist <= 2048; dist *= 2) + distances.push_back(dist); + + for (int count = 64; count <= 2048; count *= 2) + diag_counts.push_back(count + 1); + + std::vector> bytes; + std::vector> times; + + std::cout << "TESTING UNIFORM MATRICES\n" + << "Taget device memory: " << size_str(global_limit) << "\n"; + + for (auto dist : distances) + { + std::cout << "Distance: " << dist << "\n"; + bytes.emplace_back(); + times.emplace_back(); + + ::ktt::KernelResult kernel_result; + size_t reference_time = 1; + + for (auto diag_count : diag_counts) + { + size_t n = get_max_size(diag_count, global_limit); + if (n < 1024) // That is too small. + { + std::cout << "skipping\n"; + bytes.back().push_back(-1); + times.back().push_back(0); + continue; + } + + auto host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, dist, diag_count); + + if (host_matrix.diagonal_offsets.size() != diag_count) + { + std::cout << "skipping\n"; + bytes.back().push_back(-1); + times.back().push_back(0); + continue; + } + + auto expected_bytes = min_read_bytes(host_matrix); + auto actual_bytes = get_actual_read_bytes(tuner, host_matrix, &kernel_result); + float ratio = float(actual_bytes)/expected_bytes; + + if (reference_time == 1) + { + reference_time = kernel_result.GetKernelDuration(); + } + + size_t time = kernel_result.GetKernelDuration(); + float time_ratio = float(time)/reference_time; + + std::cout << diag_count << "(" << n << ") -> " << ratio << "(" << actual_bytes << " : " << expected_bytes << ")\n"; + // << " (" << time_ratio << " : " << time << ")\n"; + + bytes.back().push_back(ratio); + times.back().push_back(time); + } + } + + write_csv(bytes, distances, diag_counts, "bytes_transfered.csv"); +} + +void test_l2() +{ + auto& tuner = cusp::ktt::get_tuner(); + tuner.SetLoggingLevel(::ktt::LoggingLevel::Off); + + cudaDeviceProp prop; + cudaError_t err = cudaGetDeviceProperties(&prop, 0); + if (err != cudaSuccess) + { + std::cout << "Failed to read device properties.\n"; + return; + } + + size_t max_bytes = std::min(2*(size_t(1) << 30), (size_t)prop.totalGlobalMem - 1024*(size_t(1) << 20)); + + std::cout << "L2 size: " << size_str(prop.l2CacheSize) << "\n"; + std::cout << "Global memory available: " << size_str(prop.totalGlobalMem) << "\n"; + std::cout << "Global limit: " << size_str(max_bytes) << "\n"; + + //test_poisson_sizes(tuner, max_bytes); + //test_poisson_7pt(tuner, max_bytes); + test_x_caching_uniform(tuner, max_bytes); + + // std::cout << "Using size: " << size_str(max_bytes) << "\n"; + + // // const size_t n = (1u << 20)*4; + + // for (int dist = 1; dist <= 1024; dist *= 2) + // { + // for (int diags = 4; diags <= 1024; diags *= 2) + // { + // int actual_diags = diags + 1; // to make it nicely symmetric + // int n = max_floats/(actual_diags + 1); + + // if (dist*diags > n) + // { + // std::cout << "skipping\n"; + // continue; + // } + + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, dist, actual_diags); + + // cusp::dia_matrix A = dia_host_matrix; + // cusp::array1d x(A.num_cols, 1); + // cusp::array1d y(A.num_rows); + + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + // auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, + // { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + + // auto expected_bytes = min_read_bytes(A, x); + // auto actual_bytes = get_actual_read_bytes(A, x, y, conf); + + // float ration = float(actual_bytes)/expected_bytes; + + // std::cout << "(" << size_str(n) << ", " << dist << ", " << actual_diags << ") -> " << ration << "\n"; + + // // std::cout << "Matrix size: " << size_str(dia_matrix_size(n, n, actual_diags)) << "\n"; + // // std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + // // std::cout << "Actual: " << size_str(actual_bytes) << "\n"; + // // std::cout << "------------------------------------------------\n"; + // } + // } + + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 11); + + // cusp::dia_matrix A = dia_host_matrix; + // cusp::array1d x(A.num_cols, 1); + // cusp::array1d y(A.num_rows); + + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + // auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + + // auto expected_bytes = min_read_bytes(A, x); + // auto actual_bytes = get_actual_read_bytes(A, x, y, conf); + + // std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + // std::cout << "Actual: " << size_str(actual_bytes) << "\n"; +} + +void test_matrix() +{ + auto& tuner = cusp::ktt::get_tuner(); + + int n = 130879; + + //n = get_max_size(5, 2*(size_t(1) << 30)); + //int grid_size = std::sqrt(n); + //cusp::dia_matrix dia_host_matrix; + //cusp::gallery::poisson5pt(dia_host_matrix, grid_size, grid_size); + + auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 512, 2049); + + std::cout << min_read_bytes(dia_host_matrix) << "\n"; + + cusp::dia_matrix A = dia_host_matrix; + cusp::array1d x(A.num_cols, 1); + cusp::array1d y(A.num_rows); + cusp::array1d yy(A.num_rows); + + // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); + + //auto dia_host_matrix2 = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1024, 11); + //cusp::dia_matrix A2 = dia_host_matrix; + + auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + // tuner.SetProfiling(true); + // tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids); + // tuner.SetProfilingCounters({ + // "dram_read_bytes" + // //"dram_write_bytes", + // //"l2_global_load_bytes" + // }); + + std::cout << "Device memory needed: " << size_str(dia_problem_size(A.num_rows, A.num_cols, A.diagonal_offsets.size())) << "\n"; + // cusp::ktt::tune(A, x, y); + + // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); + // cusp::ktt::multiply(A, x, y, conf1); + + auto conf = tuner.CreateConfiguration(kernel_ctx.kernel_id, + { { std::string("KERNEL_TYPE"), uint64_t(1) }, + { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) }, + { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(0) } }); + + auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) }, + { std::string("REGISTER_PREFETCH_FACTOR"), uint64_t(0) }, + { std::string("SHARED_PREFETCH_FACTOR"), uint64_t(4) } }); + // size_t non_cached = get_actual_read_bytes(tuner, A, x, y, conf2); + + // conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) }, + // { std::string("PREFETCH_FACTOR"), uint64_t(0) } }); + // size_t cached = get_actual_read_bytes(tuner, A, x, y, conf2); + + std::cout << A.diagonal_offsets[0] << "\n"; + + cusp::ktt::disable(); + cusp::multiply(A, x, yy); + cusp::ktt::enable(); + + cusp::ktt::multiply(A, x, y, conf); + + if (y == yy) { + std::cout << "GOOD!\n"; + } else { + std::cout << "WRONG!\n"; + } + + for (int i = 0; i < y.size(); ++i) + { + if (y[i] != yy[i]) + { + std::cout << i << " -> " << y[i] - yy[i] << "\n"; + } + } + + // std::cout << "non_cached: " << size_str(non_cached) << "\n"; + // std::cout << "cached: " << size_str(cached) << "\n"; + + // std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; + + // cusp::ktt::tune(A, x, y); + // cusp::ktt::multiply(A, x, y, conf2); + // for (int i = 0; i < 10; ++i) + // { + // std::cout << y[i] << "\n"; + // } + + // std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + + //auto expected_bytes = min_read_bytes(A, x); + //std::cout << "Expected: " << size_str(expected_bytes) << "\n"; + + //for (int i = 0; i < 10; ++i) { + // std::cout << y[i] << "\n"; + //} + + //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + //std::cout << size_str(get_actual_read_bytes(A, x, y, conf2)) << "\n"; + + // auto conf3 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) } }); + // cusp::ktt::multiply(A, x, y, conf3); +} + +int main(void) +{ + auto& tuner = cusp::ktt::get_tuner(); + tuner.SetTimeUnit(::ktt::TimeUnit::Microseconds); + + test_l2(); + // test_matrix(); + + // auto orig = cusp::ktt::make_diagonal_symmetric_matrix(4096, 4096, 4, 256); + // cusp::coo_matrix A = orig; + // cusp::coo_matrix B = orig; + + // auto start = krn::steady_clock::now(); + // cusp::dia_matrix B1 = convert(B); + // auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "ME host: " << duration << "\n\n"; + + // start = krn::steady_clock::now(); + // cusp::dia_matrix B2 = convert(A); + // duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "ME device: " << duration << "\n\n"; + + // start = krn::steady_clock::now(); + // cusp::dia_matrix B3 = A; + // duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "cusp device: " << duration << "\n\n"; + + // cusp::dia_matrix D = convert(A); + // if (!equals(D, orig)) { + // std::cout << "RIIIIIIIP!!!!\n"; + // } + + // auto start = krn::steady_clock::now(); + // cusp::dia_matrix B1 = convert(A); + // auto duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "ME: " << duration << "\n\n"; + + // start = krn::steady_clock::now(); + // cusp::dia_matrix B2 = convert2(A); + // duration = krn::duration_cast(krn::steady_clock::now() - start).count(); + // std::cout << "Transposed: " << duration << "\n\n"; + + // if (!equals(B1, B2)) { + // std::cout << "RIIIIIIP!!!\n"; + // } + + // using list = format_list; + // ConversionTimer timer; + // for_each_pair(timer); + + // cudaDeviceProp prop; + // cudaError_t err = cudaGetDeviceProperties(&prop, 0); + + // if (err == cudaSuccess) { + // std::cout << "Max warps per sm: " << prop.maxThreadsPerMultiProcessor << "\n"; + // } else { + // std::cout << "Failed to retrieve device properties\n"; + // } + + // { + // const int n = (1u << 20); + // //const int n = (1u << 20)/2; + + // //auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 256, 1024); + // auto dia_host_matrix = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1, 11); + // cusp::dia_matrix A = dia_host_matrix; + + // // cusp::io::write_matrix_market_file(dia_host_matrix, "matrix.mtx"); + + // cusp::array1d x(A.num_cols, 1); + // cusp::array1d y(A.num_rows); + + // //auto dia_host_matrix2 = cusp::ktt::make_diagonal_symmetric_matrix(n, n, 1024, 11); + // //cusp::dia_matrix A2 = dia_host_matrix; + + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + // //cusp::ktt::tune(A2, x, y); + // //cusp::ktt::tune(A, x, y); + + // // auto conf1 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(0) } }); + // // cusp::ktt::multiply(A, x, y, conf1); + + // auto conf2 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(1) } }); + // //std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0], conf2) << "\n"; + // cusp::ktt::multiply(A, x, y, conf2); + + // // auto conf3 = tuner.CreateConfiguration(kernel_ctx.kernel_id, { { std::string("KERNEL_TYPE"), uint64_t(2) } }); + // // cusp::ktt::multiply(A, x, y, conf3); + // } + + // { + // const int n = 1u << 15; + // auto coo_host_matrix = make_diagonal_symmetric_matrix(n, n, 512, 16); + // cusp::coo_matrix coo_device_matrix = coo_host_matrix; + // cusp::dia_matrix A = coo_host_matrix; + + // cusp::array1d x(coo_host_matrix.num_cols, 1); + // cusp::array1d y_ref(coo_host_matrix.num_rows); + // cusp::array1d y(coo_host_matrix.num_rows); + + // cusp::multiply(coo_device_matrix, x, y_ref); + + // auto& tuner = cusp::ktt::get_tuner(); + // auto kernel_ctx = cusp::system::cuda::ktt::get_kernel(tuner, A, x, y); + + // // std::cout << tuner.GetPtxSource(kernel_ctx.kernel_id, kernel_ctx.definition_ids[0]) << "\n"; + + // tuner.SetProfiling(true); + // tuner.SetProfiledDefinitions(kernel_ctx.kernel_id, kernel_ctx.definition_ids); + + // tuner.SetProfilingCounters({ + // "tex_cache_hit_rate", + // "tex_cache_transactions", + // "l1_cache_global_hit_rate", + // "l2_l1_read_hit_rate", + // "l2_l1_read_transactions", + // "l2_read_transactions", + // "l2_write_transactions", + // "dram_read_transactions", + // "dram_write_transactions", + // "dram_read_bytes", + // "dram_write_bytes" + // }); + + // auto result = cusp::ktt::multiply(A, x, y, {}, true); + + // for (const auto& res : result.GetResults()) { + // if (!res.HasProfilingData()) { + // std::cout << "No profiling data!\n"; + // continue; + // } + // const auto& prof = res.GetProfilingData(); + // if (!prof.IsValid()) { + // std::cout << "profiling data invalid\n"; + // } else { + // for (const auto& counter : prof.GetCounters()) { + // std::cout << counter.GetName() << " = "; + // switch (counter.GetType()) { + // case ::ktt::ProfilingCounterType::Double: + // std::cout << counter.GetValueDouble() << "\n"; + // break; + // case ::ktt::ProfilingCounterType::Int: + // std::cout << counter.GetValueInt() << "\n"; + // break; + // case ::ktt::ProfilingCounterType::Percent: + // std::cout << counter.GetValueDouble() << " %\n"; + // break; + // case ::ktt::ProfilingCounterType::Throughput: + // std::cout << counter.GetValueUint() << " bytes/sec\n"; + // break; + // case ::ktt::ProfilingCounterType::UnsignedInt: + // std::cout << counter.GetValueUint() << "\n"; + // break; + // case ::ktt::ProfilingCounterType::UtilizationLevel: + // std::cout << counter.GetValueUint() << "\n"; + // break; + // } + // } + // } + // } + + // if (y_ref != y) { + // std::cout << "SECOND DIFF!\n"; + // } + // } + + //cusp::ktt::multiply(A, x, y); + //std::cout << y[0] << "\n"; + + //std::cout << y_ref_host.size() << "\n"; + //std::cout << y.size() << "\n"; + + return 0; +} diff --git a/testing/ktt.cu b/testing/ktt.cu index 613e668a..d3bf2088 100644 --- a/testing/ktt.cu +++ b/testing/ktt.cu @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -29,8 +30,17 @@ } \ DECLARE_UNITTEST(VTEST##Ktt##Fmt##Matrix); -#define DECLARE_KTT_UNITTEST(VTEST) \ - DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST,Dia,dia) +#define DECLARE_KTT_ELLR_UNITTEST(VTEST) \ + void VTEST##Ktt##Ellr##Matrix(void) \ + { \ + VTEST< cusp::ktt::ellr_matrix >(); \ + } \ + DECLARE_UNITTEST(VTEST##Ktt##Ellr##Matrix); + +#define DECLARE_KTT_UNITTEST(VTEST) \ + DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Dia, dia) \ + DECLARE_KTT_SPARSE_FORMAT_UNITTEST(VTEST, Ell, ell) \ + DECLARE_KTT_ELLR_UNITTEST(VTEST) struct UnitTestStopCondition : ::ktt::StopCondition @@ -71,11 +81,12 @@ private: }; -void assert_tunning_results_valid(const std::vector<::ktt::KernelResult>& results, - const std::string& ktt_logs, - const std::string& arg_name, - const std::string& filename = "unknown", - int lineno = -1) +void assert_tunning_results_valid( + const std::vector<::ktt::KernelResult>& results, + const std::string& ktt_logs, + const std::string& arg_name, + const std::string& filename = "unknown", + int lineno = -1) { bool failed = false; unittest::UnitTestFailure f; @@ -106,8 +117,10 @@ void assert_tunning_results_valid(const std::vector<::ktt::KernelResult>& result failed = true; - f << "[" << filename << ":" << lineno << "] " << result.GetKernelName() << ": "; - f << "Encountered an error: " << reason << "\n\n"; + f << "[" << filename << ":" << lineno << "] "; + f << result.GetKernelName() << ": "; + f << reason << "\n\n"; + f << "On matrix: " << arg_name << "\n\n"; f << "In configuration:\n";