diff --git a/CHANGELOG b/CHANGELOG index 5fcb1a6..19c7709 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,10 @@ +Version 2.6.0 +- Changed timing measurements to now also include the (varying) kernel launch overhead +- It is now possible to set OpenCL compiler options through the env variable CLTUNE_BUILD_OPTIONS +- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0) +- Added an option to build a static version of the library + Version 2.5.0 - Updated to version 8.0 of the CLCudaAPI header - Made it possible to configure the number of times each kernel is run (to average results) diff --git a/CMakeLists.txt b/CMakeLists.txt index 14a106a..a3c3882 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ # # ================================================================================================== -cmake_minimum_required(VERSION 2.8.10) +cmake_minimum_required(VERSION 2.8.11) # Overrides for MSVC static runtime set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake) @@ -32,10 +32,11 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla # CMake project details project("cltune" CXX) set(cltune_VERSION_MAJOR 2) -set(cltune_VERSION_MINOR 5) +set(cltune_VERSION_MINOR 6) set(cltune_VERSION_PATCH 0) -# Options +# Options and their default values +option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON) option(SAMPLES "Enable compilation of sample programs" ON) option(TESTS "Enable compilation of the Google tests" OFF) @@ -85,6 +86,13 @@ elseif(MSVC) endif() endif() +# DLL Settings +if(MSVC) + if(BUILD_SHARED_LIBS) + add_definitions(" /DCLTUNE_DLL") + endif() +endif(MSVC) + # C++ compiler settings if(MSVC) set(FLAGS "/Ox") @@ -143,13 +151,32 @@ set(TUNER src/ml_models/neural_network.cc) # Creates and links the library -add_library(cltune SHARED ${TUNER}) +if(BUILD_SHARED_LIBS) + add_library(cltune SHARED ${TUNER}) +else(BUILD_SHARED_LIBS) + 
add_library(cltune STATIC ${TUNER}) +endif() target_link_libraries(cltune ${FRAMEWORK_LIBRARIES}) +# Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built +if(MSVC) + if(BUILD_SHARED_LIBS) + target_compile_definitions(cltune PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11 + endif() +endif() + # Installs the library install(TARGETS cltune DESTINATION lib) install(FILES include/cltune.h DESTINATION include) +# Install pkg-config file on Linux +if(UNIX) + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cltune.pc.in" + "${CMAKE_CURRENT_BINARY_DIR}/cltune.pc" @ONLY IMMEDIATE) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/cltune.pc + DESTINATION lib/pkgconfig) +endif() + # ================================================================================================== # Optional: Enables compilation of sample programs diff --git a/README.md b/README.md index 0445696..ea27d2d 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ CLTune can be compiled as a shared library using CMake. The pre-requisites are: - Clang 3.3 or newer - AppleClang 5.0 or newer - ICC 14.0 or newer - - MSVC (Visual Studio) 2015 or newer + - MSVC (Visual Studio) 2013 or newer * An OpenCL library. CLTune has been tested with: - Apple OpenCL - NVIDIA CUDA SDK (requires version 7.5 or newer for the CUDA back-end) @@ -48,6 +48,8 @@ You can then link your own programs against the CLTune library. An example for a export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/libcltune.so g++ example.cc -o example -L/path/to/libcltune.so -lcltune -lOpenCL +Furthermore, it is possible to optionally set an OS environmental variable `CLTUNE_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler at run-time. 
+ Example of using the tuner ------------- diff --git a/cltune.pc.in b/cltune.pc.in new file mode 100644 index 0000000..41dd746 --- /dev/null +++ b/cltune.pc.in @@ -0,0 +1,10 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +includedir=${prefix}/include +libdir=${exec_prefix}/lib + +Name: CLTune +Description: CLTune: An automatic OpenCL & CUDA kernel tuner +Version: @cltune_VERSION_MAJOR@.@cltune_VERSION_MINOR@.@cltune_VERSION_PATCH@ +Libs: -L${libdir} -lcltune +Cflags: -I${includedir} diff --git a/include/cltune.h b/include/cltune.h index 9837556..8af3dff 100644 --- a/include/cltune.h +++ b/include/cltune.h @@ -37,8 +37,12 @@ // Exports library functions under Windows when building a DLL. See also: // https://msdn.microsoft.com/en-us/library/a90k134d.aspx -#ifdef _WIN32 - #define PUBLIC_API __declspec(dllexport) +#if defined(_WIN32) && defined(CLTUNE_DLL) + #if defined(COMPILING_DLL) + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif #else #define PUBLIC_API #endif diff --git a/include/internal/kernel_info.h b/include/internal/kernel_info.h index 90c2fb2..27232f8 100644 --- a/include/internal/kernel_info.h +++ b/include/internal/kernel_info.h @@ -45,6 +45,7 @@ #endif #include "cltune.h" +#include "internal/msvc.h" namespace cltune { // ================================================================================================= @@ -100,7 +101,7 @@ class KernelInfo { }; // Initializes the class with a given name and a string of kernel source-code - explicit KernelInfo(const std::string name, const std::string source, const Device &device); + explicit PUBLIC_API KernelInfo(const std::string name, const std::string source, const Device &device); // Accessors (getters) std::string name() const { return name_; } @@ -117,36 +118,36 @@ class KernelInfo { void set_local_base(IntRange local) { local_base_ = local; local_ = local; } // Prepend to the source-code - void PrependSource(const std::string 
&extra_source); + void PUBLIC_API PrependSource(const std::string &extra_source); // Adds a new parameter with a name and a vector of possible values - void AddParameter(const std::string &name, const std::vector &values); + void PUBLIC_API AddParameter(const std::string &name, const std::vector &values); // Checks wheter a parameter exists, returns "true" if it does exist - bool ParameterExists(const std::string parameter_name); + bool PUBLIC_API ParameterExists(const std::string parameter_name); // Specifies a modifier in the form of a StringRange to the global/local thread-sizes. This // modifier has to contain (per-dimension) the name of a single parameter or an empty string. The // supported modifiers are given by the ThreadSizeModifierType enumeration. - void AddModifier(const StringRange range, const ThreadSizeModifierType type); + void PUBLIC_API AddModifier(const StringRange range, const ThreadSizeModifierType type); // Adds a new constraint to the set of parameters (e.g. must be equal or larger than). The // constraints come in the form of a function object which takes a number of tuning parameters, // given as a vector of strings (parameter names). Their names are later substituted by actual // values. - void AddConstraint(ConstraintFunction valid_if, const std::vector ¶meters); + void PUBLIC_API AddConstraint(ConstraintFunction valid_if, const std::vector ¶meters); // As above, but for local memory usage - void SetLocalMemoryUsage(LocalMemoryFunction amount, const std::vector ¶meters); + void PUBLIC_API SetLocalMemoryUsage(LocalMemoryFunction amount, const std::vector ¶meters); // Computes the global/local ranges (in NDRange-form) based on all global/local thread-sizes (in // StringRange-form) and a single permutation (i.e. a configuration) containing a list of all // parameter names and their current values. 
- void ComputeRanges(const Configuration &config); + void PUBLIC_API ComputeRanges(const Configuration &config); // Computes all permutations based on the parameters and their values (the configuration list). // The result is stored as a member variable. - void SetConfigurations(); + void PUBLIC_API SetConfigurations(); private: // Called recursively internally by SetConfigurations diff --git a/include/internal/msvc.h b/include/internal/msvc.h new file mode 100644 index 0000000..bf0fc81 --- /dev/null +++ b/include/internal/msvc.h @@ -0,0 +1,38 @@ + +// ================================================================================================= +// This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses +// a tab-size of two spaces and a max-width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file provides macros and definitions to make compilation work on Microsoft Visual Studio, +// in particular for versions older than 2015 with limited C++11 support. +// MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015) +// MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013) +// MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012) +// MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010) +// MSVC++ 9.0 _MSC_VER == 1500 (Visual Studio 2008) +// +// ================================================================================================= + +#ifndef CLTUNE_MSVC_H_ +#define CLTUNE_MSVC_H_ + +namespace cltune { +// ================================================================================================= +#ifdef _MSC_VER + +// No support for constexpr prior to 2015. Note that this only works with constants, not with +// constexpr functions (unused in this project).
+#if _MSC_VER < 1900 +#define constexpr const +#endif + +// _MSC_VER +#endif +// ================================================================================================= +} // namespace cltune + +// CLTUNE_MSVC_H_ +#endif diff --git a/include/internal/searchers/annealing.h b/include/internal/searchers/annealing.h index eb029c3..36f2808 100644 --- a/include/internal/searchers/annealing.h +++ b/include/internal/searchers/annealing.h @@ -42,10 +42,10 @@ class Annealing: public Searcher { // Maximum number of successive visits to already visited states. If this number is exceeded, the // algorithm ends - static constexpr auto kMaxAlreadyVisitedStates = size_t{10}; + static const size_t kMaxAlreadyVisitedStates; // Maximum number of differences to consider this still a neighbour - static constexpr auto kMaxDifferences = size_t{3}; + static const size_t kMaxDifferences; // Takes additionally a fraction of configurations to consider Annealing(const Configurations &configurations, diff --git a/include/internal/tuner_impl.h b/include/internal/tuner_impl.h index b471fd3..8b703a7 100644 --- a/include/internal/tuner_impl.h +++ b/include/internal/tuner_impl.h @@ -40,6 +40,7 @@ #endif #include "internal/kernel_info.h" +#include "internal/msvc.h" // Host data-type for half-precision floating-point (16-bit) #include "internal/half.h" @@ -73,7 +74,7 @@ class TunerImpl { public: // Parameters - static constexpr auto kMaxL2Norm = 1e-4; // This is the threshold for 'correctness' + static const double kMaxL2Norm; // This is the threshold for 'correctness' // Messages printed to stdout (in colours) static const std::string kMessageFull; diff --git a/samples/conv/conv.cc b/samples/conv/conv.cc index a774635..aedb93a 100644 --- a/samples/conv/conv.cc +++ b/samples/conv/conv.cc @@ -45,18 +45,18 @@ bool IsMultiple(size_t a, size_t b) { }; // Constants -constexpr auto kDefaultDevice = size_t{0}; -constexpr auto kDefaultPlatform = size_t{0}; -constexpr auto kDefaultSearchMethod = 
size_t{1}; -constexpr auto kDefaultSearchParameter1 = size_t{4}; +const auto kDefaultDevice = size_t{0}; +const auto kDefaultPlatform = size_t{0}; +const auto kDefaultSearchMethod = size_t{1}; +const auto kDefaultSearchParameter1 = size_t{4}; // Settings (synchronise these with "conv.cc", "conv.opencl" and "conv_reference.opencl") #define HFS (3) // Half filter size #define FS (HFS+HFS+1) // Filter size // Settings (sizes) -constexpr auto kSizeX = size_t{8192}; // Matrix dimension X -constexpr auto kSizeY = size_t{4096}; // Matrix dimension Y +const auto kSizeX = size_t{8192}; // Matrix dimension X +const auto kSizeY = size_t{4096}; // Matrix dimension Y // ================================================================================================= @@ -91,7 +91,7 @@ int main(int argc, char* argv[]) { } // Creates data structures - constexpr auto kExtraSize = size_t{FS*8}; + const auto kExtraSize = size_t{FS*8}; auto mat_a = std::vector((kExtraSize+kSizeX)*(kExtraSize+kSizeY)); auto mat_b = std::vector(kSizeX*kSizeY); auto coeff = std::vector(FS*FS); @@ -230,8 +230,8 @@ int main(int argc, char* argv[]) { tuner.PrintJSON("output.json", {{"sample","convolution"}}); // Also prints the performance of the best-case in terms of GB/s and GFLOPS - constexpr auto kMB = (sizeof(float)*2*kSizeX*kSizeY) * 1.0e-6; - constexpr auto kMFLOPS = ((1+2*FS*FS)*kSizeX*kSizeY) * 1.0e-6; + const auto kMB = (sizeof(float)*2*kSizeX*kSizeY) * 1.0e-6; + const auto kMFLOPS = ((1+2*FS*FS)*kSizeX*kSizeY) * 1.0e-6; if (time_ms != 0.0) { printf("[ -------> ] %.1lf ms or %.1lf GB/s or %1.lf GFLOPS\n", time_ms, kMB/time_ms, kMFLOPS/time_ms); diff --git a/samples/conv_simple/conv_simple.cc b/samples/conv_simple/conv_simple.cc index 4e3f19e..e6980ff 100644 --- a/samples/conv_simple/conv_simple.cc +++ b/samples/conv_simple/conv_simple.cc @@ -37,8 +37,8 @@ int main() { #endif // Input/output sizes - constexpr auto kSizeX = size_t{8192}; // Matrix dimension X - constexpr auto kSizeY = size_t{4096}; 
// Matrix dimension Y + const auto kSizeX = size_t{8192}; // Matrix dimension X + const auto kSizeY = size_t{4096}; // Matrix dimension Y // Creates the input/output matrices and fills them with some example data std::vector mat_a(kSizeX*kSizeY, 2.0f); diff --git a/samples/gemm/gemm.cc b/samples/gemm/gemm.cc index 5a2f111..f2c3c82 100644 --- a/samples/gemm/gemm.cc +++ b/samples/gemm/gemm.cc @@ -45,15 +45,15 @@ bool IsMultiple(size_t a, size_t b) { }; // Constants -constexpr auto kDefaultDevice = size_t{0}; -constexpr auto kDefaultPlatform = size_t{0}; -constexpr auto kDefaultSearchMethod = size_t{1}; -constexpr auto kDefaultSearchParameter1 = size_t{4}; +const auto kDefaultDevice = size_t{0}; +const auto kDefaultPlatform = size_t{0}; +const auto kDefaultSearchMethod = size_t{1}; +const auto kDefaultSearchParameter1 = size_t{4}; // Settings (sizes) -constexpr auto kSizeM = size_t{2048}; -constexpr auto kSizeN = size_t{2048}; -constexpr auto kSizeK = size_t{2048}; +const auto kSizeM = size_t{2048}; +const auto kSizeN = size_t{2048}; +const auto kSizeK = size_t{2048}; // ================================================================================================= @@ -203,7 +203,7 @@ int main(int argc, char* argv[]) { tuner.PrintFormatted(); // Also prints the performance of the best-case in terms of GFLOPS - constexpr auto kMGFLOP = (2*kSizeM*kSizeN*kSizeK) * 1.0e-6; + const auto kMGFLOP = (2*kSizeM*kSizeN*kSizeK) * 1.0e-6; if (time_ms != 0.0) { printf("[ -------> ] %.1lf ms or %.3lf GFLOPS\n", time_ms, kMGFLOP/time_ms); } diff --git a/samples/multiple_kernels/multiple_kernels.cc b/samples/multiple_kernels/multiple_kernels.cc index 6978e34..45730a3 100644 --- a/samples/multiple_kernels/multiple_kernels.cc +++ b/samples/multiple_kernels/multiple_kernels.cc @@ -51,8 +51,8 @@ int main() { #endif // Matrix size - constexpr auto kSizeM = size_t{2048}; - constexpr auto kSizeN = size_t{4096}; + const auto kSizeM = size_t{2048}; + const auto kSizeN = size_t{4096}; // 
Creates data structures std::vector mat_a(kSizeN*kSizeM); // Assumes matrix A is transposed diff --git a/samples/simple/simple.cc b/samples/simple/simple.cc index 0493746..5c957d6 100644 --- a/samples/simple/simple.cc +++ b/samples/simple/simple.cc @@ -28,7 +28,7 @@ int main() { #endif // Vector dimension - constexpr auto kVectorSize = size_t{16*1024*1024}; + const auto kVectorSize = size_t{16*1024*1024}; // Creates the vectors and fills them with some example data std::vector vec_a(kVectorSize, 1.0f); diff --git a/src/searchers/annealing.cc b/src/searchers/annealing.cc index 412cc99..cdbcad6 100644 --- a/src/searchers/annealing.cc +++ b/src/searchers/annealing.cc @@ -34,6 +34,13 @@ namespace cltune { // ================================================================================================= +// Maximum number of successive visits to already visited states. If this number is exceeded, the +// algorithm ends +const size_t Annealing::kMaxAlreadyVisitedStates = size_t{10}; + +// Maximum number of differences to consider this still a neighbour +const size_t Annealing::kMaxDifferences = size_t{3}; + // Initializes the simulated annealing searcher by specifying the fraction of the total search space // to consider and the maximum annealing 'temperature'. 
Annealing::Annealing(const Configurations &configurations, diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc index 58bb171..37df3f7 100644 --- a/src/tuner_impl.cc +++ b/src/tuner_impl.cc @@ -44,10 +44,14 @@ #include // std::min #include // std::unique_ptr #include // std::tuple +#include // std::getenv namespace cltune { // ================================================================================================= +// This is the threshold for 'correctness' +const double TunerImpl::kMaxL2Norm = 1e-4; + // Messages printed to stdout (in colours) const std::string TunerImpl::kMessageFull = "\x1b[32m[==========]\x1b[0m"; const std::string TunerImpl::kMessageHead = "\x1b[32m[----------]\x1b[0m"; @@ -250,13 +254,19 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker // In case of an exception, skip this run try { - - // Compiles the kernel and prints the compiler errors/warnings #ifdef VERBOSE fprintf(stdout, "%s Starting compilation\n", kMessageVerbose.c_str()); #endif + + // Sets the build options from an environmental variable (if set) + auto options = std::vector(); + const auto environment_variable = std::getenv("CLTUNE_BUILD_OPTIONS"); + if (environment_variable != nullptr) { + options.push_back(std::string(environment_variable)); + } + + // Compiles the kernel and prints the compiler errors/warnings auto program = Program(context_, source); - auto options = std::vector{}; auto build_status = program.Build(device_, options); if (build_status == BuildStatus::kError) { auto message = program.GetBuildInfo(device_); @@ -325,25 +335,30 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker // Prepares the kernel queue_.Finish(); - // Runs the kernel (this is the timed part) + // Multiple runs of the kernel to find the minimum execution time fprintf(stdout, "%s Running %s\n", kMessageRun.c_str(), kernel.name().c_str()); auto events = std::vector(num_runs_); + auto elapsed_time = 
std::numeric_limits::max(); for (auto t=size_t{0}; t::max(); - for (auto t=size_t{0}; t(cpu_timer).count(); + #ifdef VERBOSE + fprintf(stdout, "%s Completed kernel in %.2lf ms\n", kMessageVerbose.c_str(), cpu_timing); + #endif + elapsed_time = std::min(elapsed_time, cpu_timing); } + queue_.Finish(); // Prints diagnostic information fprintf(stdout, "%s Completed %s (%.1lf ms) - %zu out of %zu\n",