Merge pull request #33 from CNugteren/development
Added machine learning, new CLCudaAPI, CUDA, Catch, and MSVC support
CNugteren committed Nov 22, 2015
2 parents 32eb552 + 8bc6684 commit 9e401f4
Showing 208 changed files with 12,046 additions and 141,617 deletions.
11 changes: 11 additions & 0 deletions CHANGELOG
@@ -1,4 +1,15 @@

Version 2.0.0
- Added support for machine learning models. These models can be trained on a small fraction of the
tuning configurations and can be used to predict the remainder. Two models are supported:
* Linear regression
* A 3-layer neural network
- Now using version 4.0 of the CLCudaAPI header (previously known as Claduc)
- Added experimental support for CUDA kernels
- Added support for MSVC (Visual Studio) 2015
- Using Catch instead of GTest for unit-testing
- Various minor fixes

Version 1.7.1
- Added additional device properties to JSON-output

102 changes: 63 additions & 39 deletions CMakeLists.txt
@@ -2,7 +2,8 @@
# ==================================================================================================
# This file is part of the CLTune project.
#
# Author: cedric.nugteren@surfsara.nl (Cedric Nugteren)
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# -------------------------------------------------------------------------------------------------
#
@@ -25,14 +26,23 @@
# CMake project
cmake_minimum_required(VERSION 2.8.10)
project("cltune" CXX)
set(cltune_VERSION_MAJOR 1)
set(cltune_VERSION_MINOR 7)
set(cltune_VERSION_PATCH 1)
set(cltune_VERSION_MAJOR 2)
set(cltune_VERSION_MINOR 0)
set(cltune_VERSION_PATCH 0)

# Options
option(SAMPLES "Enable compilation of sample programs" ON)
option(TESTS "Enable compilation of the Google tests" OFF)

# Select between OpenCL and CUDA back-end
option(USE_OPENCL "Use OpenCL instead of CUDA" ON)
if(USE_OPENCL)
message("-- Building with OpenCL")
add_definitions(-DUSE_OPENCL)
else()
message("-- Building with CUDA")
endif()

# ==================================================================================================

# RPATH settings
@@ -67,17 +77,18 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
endif()

# C++ compiler settings
set(FLAGS "-O3 -std=c++11")
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(FLAGS "${FLAGS} -Wall -Wno-comment")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8.4)
set(FLAGS "${FLAGS} -Wno-attributes")
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
set(FLAGS "/Ox")
else ()
set(FLAGS "-O3 -std=c++11")
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(FLAGS "${FLAGS} -Wall -Wno-comment")
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8.4)
set(FLAGS "${FLAGS} -Wno-attributes")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(FLAGS "${FLAGS} -Wextra")
endif()
elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(FLAGS "${FLAGS} -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-weak-vtables")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-prototypes")
set(FLAGS "${FLAGS} -Wno-missing-noreturn -Wno-covered-switch-default")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")

@@ -86,13 +97,26 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
# Package scripts location
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")

# Requires OpenCL (FindOpenCL is included as part of this project)
find_package(OpenCL REQUIRED)
# Requires CUDA or OpenCL. The latter is found through the included "FindOpenCL.cmake".
if(USE_OPENCL)
find_package(OpenCL REQUIRED)
set(FRAMEWORK_INCLUDE_DIRS ${OPENCL_INCLUDE_DIRS})
set(FRAMEWORK_LIBRARY_DIRS )
set(FRAMEWORK_LIBRARIES ${OPENCL_LIBRARIES})
else()
find_package(CUDA REQUIRED)
set(FRAMEWORK_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS})
set(FRAMEWORK_LIBRARY_DIRS ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
set(FRAMEWORK_LIBRARIES cuda nvrtc)
endif()

# ==================================================================================================

# The includes
include_directories(${cltune_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
# Include directories: CLTune headers and OpenCL/CUDA includes
include_directories(${cltune_SOURCE_DIR}/include ${FRAMEWORK_INCLUDE_DIRS})

# Link directories: CUDA toolkit
link_directories(${FRAMEWORK_LIBRARY_DIRS})

# Gathers all source-files
set(TUNER
@@ -103,48 +127,48 @@ set(TUNER
src/searchers/full_search.cc
src/searchers/random_search.cc
src/searchers/annealing.cc
src/searchers/pso.cc)
src/searchers/pso.cc
src/ml_model.cc
src/ml_models/linear_regression.cc
src/ml_models/neural_network.cc)

# Creates and links the library
add_library(cltune SHARED ${TUNER})
target_link_libraries(cltune ${OPENCL_LIBRARIES})
target_link_libraries(cltune ${FRAMEWORK_LIBRARIES})

# Installs the library
install(TARGETS cltune DESTINATION lib)
install(FILES include/cltune.h DESTINATION include)

# ==================================================================================================

# Optional: Enables compilation of sample programs
if (SAMPLES)

# Adds sample programs
add_executable(sample_simple samples/simple/simple.cc)
add_executable(sample_gemm samples/gemm/gemm.cc)
add_executable(sample_conv samples/conv/conv.cc)
target_link_libraries(sample_simple cltune ${OPENCL_LIBRARIES} ${OpenMP_LIBRARY})
target_link_libraries(sample_gemm cltune ${OPENCL_LIBRARIES} ${OpenMP_LIBRARY})
target_link_libraries(sample_conv cltune ${OPENCL_LIBRARIES} ${OpenMP_LIBRARY})
target_link_libraries(sample_simple cltune ${FRAMEWORK_LIBRARIES} ${OpenMP_LIBRARY})
target_link_libraries(sample_gemm cltune ${FRAMEWORK_LIBRARIES} ${OpenMP_LIBRARY})
target_link_libraries(sample_conv cltune ${FRAMEWORK_LIBRARIES} ${OpenMP_LIBRARY})

# Note: these are not installed because they depend on their separate OpenCL kernel files

endif()
# ==================================================================================================
# Optional: Enables compilation of the Google tests
if (TESTS)

# The tests use specific flags to reduce the amount of warnings from GTest.
set(CMAKE_CXX_FLAGS "-O3 -std=c++11")
# ==================================================================================================

# Enables Google Test tests (source-code is shipped with the project)
add_subdirectory(external/gtest-1.7.0)
# Optional: Enable inclusion of the test-suite
if (TESTS)
enable_testing()
include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})

# Compiles the tests
add_executable(unit_tests test/tuner.cc test/kernel_info.cc)
target_link_libraries(unit_tests gtest gtest_main cltune ${OPENCL_LIBRARIES})

# Adds the tests
add_test(name unit_tests command unit_tests)
include_directories(${cltune_SOURCE_DIR}/test ${cltune_SOURCE_DIR}/include ${FRAMEWORK_INCLUDE_DIRS})
add_executable(unit_tests
test/main.cc
test/clcudaapi.cc
test/tuner.cc
test/kernel_info.cc)
target_link_libraries(unit_tests cltune ${FRAMEWORK_LIBRARIES})
add_test(unit_tests unit_tests)
endif()

# ==================================================================================================
60 changes: 42 additions & 18 deletions README.md
@@ -4,7 +4,7 @@ CLTune: Automatic OpenCL kernel tuning

[![Build Status](https://travis-ci.org/CNugteren/CLTune.svg?branch=master)](https://travis-ci.org/CNugteren/CLTune)

CLTune is a C++ library which can be used to automatically tune your OpenCL kernels. The only thing you'll need to provide is a tuneable kernel and a list of allowed parameters and values.
CLTune is a C++ library which can be used to automatically tune your OpenCL and CUDA kernels. The only thing you'll need to provide is a tuneable kernel and a list of allowed parameters and values.

For example, if you perform loop unrolling or local memory tiling through a pre-processor define, just remove the define from your kernel code, pass the kernel to CLTune, and tell it what your parameter(s) are named and which values you want to try. CLTune will take care of the rest: it will iterate over all possible permutations, test them, and report the best combination.
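
As a minimal illustration (this kernel is not one of the shipped samples), an unrolling factor can be exposed as a pre-processor parameter that is left undefined in the kernel source and supplied by the tuner instead:

    // Illustrative kernel only: UNROLL is deliberately not defined here, it is
    // passed by CLTune as a pre-processor parameter at compile time.
    __kernel void copy_vector(__global const float* input, __global float* output) {
      const int base = get_global_id(0) * UNROLL;
      for (int item = 0; item < UNROLL; ++item) {
        output[base + item] = input[base + item];
      }
    }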

@@ -20,9 +20,10 @@ CLTune can be compiled as a shared library using CMake. The pre-requisites are:
- Clang 3.3 or newer
- AppleClang 5.0 or newer
- ICC 14.0 or newer
- MSVC (Visual Studio) 2015 or newer
* An OpenCL library. CLTune has been tested with:
- Apple OpenCL
- NVIDIA CUDA SDK
- NVIDIA CUDA SDK (requires version 7.5 or newer for the CUDA back-end)
- AMD APP SDK

An example of an out-of-source build (starting from the root of the CLTune folder):
@@ -37,7 +38,7 @@ A custom installation folder can be specified when calling CMake:

cmake -DCMAKE_INSTALL_PREFIX=/path/to/install/directory ..

You can then link your own programs against the CLTune library. An example for a Linux-system:
You can then link your own programs against the CLTune library. An example for a Linux system with OpenCL:

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/libcltune.so
g++ example.cc -o example -L/path/to/libcltune.so -lcltune -lOpenCL
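
For the CUDA back-end the linking step should look similar; the exact library names below are an assumption based on the `cuda` and `nvrtc` libraries referenced in `CMakeLists.txt`:

    g++ example.cc -o example -L/path/to/libcltune.so -lcltune -lcuda -lnvrtc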
@@ -50,20 +51,20 @@ Before we start using the tuner, we'll have to create one. The constructor takes

cltune::Tuner my_tuner(0, 1); // Tuner on device 1 of OpenCL platform 0

Now that we have a tuner, we can add a tuning kernel. This is done by providing a list of paths to OpenCL kernel files (first argument), the name of the kernel (second argument), a list of global thread dimensions (third argument), and a list of local thread or workgroup dimensions (fourth argument). Here is an example:
For the CUDA back-end, use 0 as the platform ID. Now that we have a tuner, we can add a tuning kernel. This is done by providing a list of paths to kernel files (first argument), the name of the kernel (second argument), a list of global thread dimensions (third argument), and a list of local thread or workgroup dimensions (fourth argument). Note that the thread configuration can also be dynamic; see the included samples. Here is a basic example using a static configuration:

size_t id = my_tuner.AddKernel({"path/to/kernel.opencl"}, "my_kernel", {1024,512}, {16,8});

Notice that the AddKernel function returns an integer: it is the ID of the added kernel. We'll need this ID when we want to add tuning parameters to this kernel. Let's say that our kernel has two pre-processor parameters named `PARAM_1` and `PARAM_2`:
Notice that the AddKernel function returns an integer: it is the ID of the added kernel. We'll need this ID when we want to add tuning parameters to this kernel. Let's say that our kernel has two pre-processor parameters named `PARAM_1` (allowed values 16 and 24) and `PARAM_2` (allowed values 0 through 4):

my_tuner.AddParameter(id, "PARAM_1", {16, 24});
my_tuner.AddParameter(id, "PARAM_2", {0, 1, 2, 3, 4});

Now that we've added a kernel and its parameters, we can add another one if we wish. When we're done, there are a couple of things left to be done. Let's start with adding an reference kernel. This reference kernel can provide the tuner with the ground-truth and is optional - only when it is provided will the tuner perform verification checks to ensure correctness.
Now that we've added a kernel and its parameters, we can add another one if we wish. When we're satisfied with the kernels and their parameters, there are a couple of things left to be done. Let's start by adding a reference kernel. This reference kernel can provide the tuner with the ground-truth and is optional - the tuner will only perform verification checks to ensure correctness when it is provided.

my_tuner.SetReference({"path/to/reference.opencl"}, "my_reference", {8192}, {128});

The tuner also needs to know which arguments the kernels take. Scalar arguments can be provided as-is and are passed-by-value, whereas arrays have to be provided as C++ `std::vector`s. That's right, we won't have to create OpenCL buffers, CLTune will handle that for us! Here is an example:
The tuner also needs to know which arguments the kernels take. Scalar arguments can be provided as-is and are passed-by-value, whereas arrays have to be provided as C++ `std::vector`s. That's right, you don't have to create device buffers yourself, CLTune will handle that! Here is an example:

int my_variable = 900;
std::vector<float> input_vector(8192);
@@ -82,28 +83,51 @@ Now that we've configured the tuner, it is time to start it and ask it to report
Other examples
-------------

Examples are included as part of the CLTune distribution. They illustrate some more advanced features, such as modifying the thread dimensions based on the parameters and adding user-defined parameter constraints. The examples are compiled when providing `-ENABLE_SAMPLES=ON` to CMake (default option). The included examples are:
Several examples are included as part of the CLTune distribution. They illustrate some more advanced features, such as modifying the thread dimensions based on the parameters and adding user-defined parameter constraints. The examples are compiled when the `SAMPLES` option is set to `ON` in CMake (the default). The included examples are:

* `simple.cc` providing a basic example of matrix-vector multiplication
* `gemm.cc` providing an advanced and heavily tuned implementation of matrix-matrix
multiplication (GEMM)
* `conv.cc` providing an advanced and heavily tuned implementation of 2D convolution
* `gemm.cc` providing an advanced and heavily tunable implementation of matrix-matrix multiplication (GEMM)
* `conv.cc` providing an advanced and heavily tunable implementation of 2D convolution

The latter two take optionally command-line arguments. The first argument is an integer for the device to run on, the second argument is an integer to select a search strategy (0=random, 1=annealing, 2=PSO, 3=fullsearch), and the third an optional search-strategy parameter.
The latter two optionally take command-line arguments. The first argument is an integer for the device to run on, the second argument is an integer to select a search strategy (0=random, 1=annealing, 2=PSO, 3=fullsearch), and the third an optional search-strategy parameter.


Search strategies and machine-learning
-------------

The GEMM and 2D convolution examples are additionally configured to use one of the four supported search strategies. More details can be found in the corresponding CLTune paper (see below). These search strategies can be used for any example as follows:

tuner.UseFullSearch(); // Default
tuner.UseRandomSearch(double fraction);
tuner.UseAnnealing(double fraction, double max_temperature);
tuner.UsePSO(double fraction, size_t swarm_size, double influence_global, double influence_local, double influence_random);
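
For instance (the values below are arbitrary and only meant as an illustration), simulated annealing over a tenth of the search space can be selected as follows:

    // Arbitrary example values: explore 10% of the configurations using
    // simulated annealing with a maximum 'temperature' of 4.0
    tuner.UseAnnealing(0.1, 4.0);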

The 2D convolution example is additionally configured to use machine learning to predict the quality of parameters based on a limited set of 'training' data. The supported models are linear regression and a 3-layer neural network. These machine-learning models are still experimental, but can be used as follows:

// Trains a machine learning model based on the search space explored so far. Then, all the
// missing data-points are estimated based on this model. This is only useful if a fraction of
// the search space is explored, as is the case when doing random-search.
tuner.ModelPrediction(Model model_type, float validation_fraction, size_t test_top_x_configurations);
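
An example invocation (the values are illustrative only, and it is assumed here that the `Model` enumeration provides a `kNeuralNetwork` value) combining random search with the neural network model could look as follows:

    // Illustrative values: explore 10% of the space at random, train a neural
    // network on those results (keeping 20% aside for validation), and test the
    // predicted top-10 configurations on the actual device.
    tuner.UseRandomSearch(0.10);
    tuner.ModelPrediction(cltune::Model::kNeuralNetwork, 0.20f, 10);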


Experimental CUDA support
-------------

CLTune was originally developed for OpenCL kernels, but since it uses the high-level C++ API `CLCudaAPI`, it can also work with CUDA kernels. To compile CLTune with CUDA as a back-end, set the `USE_OPENCL` CMake flag to `OFF`, for example as follows:

cmake -DUSE_OPENCL=OFF ..

The samples ship with a basic header to convert the included OpenCL samples to CUDA (`cl_to_cuda.h`). This header file is automatically included when CLTune is built with CUDA as a back-end. It has been tested with the `simple` example, but doesn't work with the more advanced kernels. Nevertheless, CLTune should work with any proper CUDA kernel.
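
As a rough sketch of the idea behind such a header (an assumption about the kind of mappings it provides, not the actual contents of `cl_to_cuda.h`), OpenCL keywords and work-item functions can be mapped onto their CUDA counterparts:

    // Hypothetical sketch only; refer to the cl_to_cuda.h shipped with the samples.
    #define __kernel extern "C" __global__
    #define __global
    #define __local __shared__
    inline __device__ size_t get_global_id(const int dimension) {
      if (dimension == 0) { return blockIdx.x * blockDim.x + threadIdx.x; }
      if (dimension == 1) { return blockIdx.y * blockDim.y + threadIdx.y; }
      return blockIdx.z * blockDim.z + threadIdx.z;
    }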


Development and tests
-------------

The CLTune project follows the Google C++ styleguide (with some exceptions) and uses a tab-size of
two spaces and a max-width of 100 characters per line. It is furthermore based on practises from the
third edition of Effective C++ and the first edition of Effective Modern C++. The project is
licensed under the APACHE 2.0 license by SURFsara, (c) 2014. The contributing authors so far are:
The CLTune project follows the Google C++ styleguide (with some exceptions) and uses a tab-size of two spaces and a max-width of 100 characters per line. It is furthermore based on practices from the third edition of Effective C++ and the first edition of Effective Modern C++. The project is licensed under the Apache 2.0 license by SURFsara, (c) 2014. The contributing authors so far are:

* Cedric Nugteren

CLTune is packaged with Google Test 1.7.0 and a custom test suite. The tests will be compiled when
providing the `-TESTS=ON` option to CMake. Running the tests goes as follows:
CLTune is packaged with Catch 1.2.1 and a custom test suite; no external dependencies are needed. The tests are compiled when the `TESTS` option is set to `ON` in CMake. Running the tests goes as follows:

./unit_tests
