Merge pull request #38 from CNugteren/development

Update to version 2.3.0
CNugteren · May 22, 2016 · b887e1e · b887e1e
2 parents cba89a4 + ae12ebe
commit b887e1e
Show file tree

Hide file tree

Showing 7 changed files with 85 additions and 24 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,4 +1,9 @@
 
+Version 2.3.0
+- Added support for 'short' and 'cl_half' data-types as kernel buffer and scalar arguments
+- Fixed a bug where failed results would still show up in the tuning results
+- Made MSVC link the run-time libraries statically
+
 Version 2.2.0
 - Added two new simpler samples of using the tuner (vector-add and convolution)
 - Updated the general documentation

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -23,11 +23,16 @@
 #
 # ==================================================================================================
 
-# CMake project
 cmake_minimum_required(VERSION 2.8.10)
+
+# Registers the flag-override scripts which make MSVC link its run-time libraries statically. They
+# are set ahead of the project() command below, which is where CMake enables the language.
+set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX
+    "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_flag_overrides.cmake")
+set(CMAKE_USER_MAKE_RULES_OVERRIDE
+    "${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake")
+
+# CMake project details
 project("cltune" CXX)
 set(cltune_VERSION_MAJOR 2)
-set(cltune_VERSION_MINOR 2)
+set(cltune_VERSION_MINOR 3)
 set(cltune_VERSION_PATCH 0)
 
 # Options
@@ -54,40 +59,41 @@ set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically deter
 # ==================================================================================================
 
 # Compiler-version check (requires at least CMake 2.8.10)
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
+if(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
     message(FATAL_ERROR "GCC version must be at least 4.7")
   endif()
-elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3)
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL Clang)
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.3)
     message(FATAL_ERROR "Clang version must be at least 3.3")
   endif()
-elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
-    message(FATAL_ERROR "Clang version must be at least 5.0")
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang)
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0)
+    message(FATAL_ERROR "AppleClang version must be at least 5.0")
   endif()
-elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0)
+elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel)
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 14.0)
     message(FATAL_ERROR "ICC version must be at least 14.0")
   endif()
-elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0)
+elseif(MSVC)
+  if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 18.0)
     message(FATAL_ERROR "MS Visual Studio version must be at least 18.0")
   endif()
 endif()
 
 # C++ compiler settings
-if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-  set(FLAGS "/Ox /wd4715 /wd4996")
-else ()
+if(MSVC)
+  set(FLAGS "/Ox")
+  set(FLAGS "${FLAGS} /wd4715 /wd4996")
+else()
   set(FLAGS "-O3 -std=c++11")
-  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+  if(CMAKE_CXX_COMPILER_ID STREQUAL GNU)
     set(FLAGS "${FLAGS} -Wall -Wno-comment")
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8.4)
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8.4)
       set(FLAGS "${FLAGS} -Wno-attributes")
     endif()
-  elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-    set(FLAGS "${FLAGS} -Wextra")
+  elseif(CMAKE_CXX_COMPILER_ID MATCHES Clang)
+    set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic")
   endif()
 endif()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")

diff --git a/cmake/c_flag_overrides.cmake b/cmake/c_flag_overrides.cmake
@@ -0,0 +1,8 @@
+# Selects the static run-time libraries (/MT and /MTd) for C sources compiled with MSVC by
+# providing the initial per-configuration C flags.
+# See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F
+if(MSVC)
+  # Optimised configurations: release version of the static run-time (/MT)
+  set(CMAKE_C_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG")
+  set(CMAKE_C_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG")
+  set(CMAKE_C_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG")
+  # Debug configuration: debug version of the static run-time (/MTd)
+  set(CMAKE_C_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1")
+endif()
diff --git a/cmake/cxx_flag_overrides.cmake b/cmake/cxx_flag_overrides.cmake
@@ -0,0 +1,8 @@
+# Selects the static run-time libraries (/MT and /MTd) for C++ sources compiled with MSVC by
+# providing the initial per-configuration C++ flags.
+# See http://www.cmake.org/Wiki/CMake_FAQ#How_can_I_build_my_MSVC_application_with_a_static_runtime.3F
+if(MSVC)
+  # Optimised configurations: release version of the static run-time (/MT)
+  set(CMAKE_CXX_FLAGS_RELEASE_INIT "/MT /O2 /Ob2 /D NDEBUG")
+  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO_INIT "/MT /Zi /O2 /Ob1 /D NDEBUG")
+  set(CMAKE_CXX_FLAGS_MINSIZEREL_INIT "/MT /O1 /Ob1 /D NDEBUG")
+  # Debug configuration: debug version of the static run-time (/MTd)
+  set(CMAKE_CXX_FLAGS_DEBUG_INIT "/D_DEBUG /MTd /Zi /Ob0 /Od /RTC1")
+endif()
diff --git a/include/internal/tuner_impl.h b/include/internal/tuner_impl.h
@@ -50,6 +50,13 @@
 namespace cltune {
 // =================================================================================================
 
+// Storage type used on the host for half-precision (16-bit) floating-point values: the OpenCL
+// type when building with OpenCL, a plain unsigned short otherwise
+#if !USE_OPENCL
+  using half = unsigned short;
+#else
+  using half = cl_half;
+#endif
+
 // Shorthands for complex data-types
 using float2 = std::complex<float>; // cl_float2;
 using double2 = std::complex<double>; // cl_double2;
@@ -62,7 +69,7 @@ using double2 = std::complex<double>; // cl_double2;
 #endif
 
 // Enumeration of currently supported data-types by this class
-enum class MemType { kInt, kSizeT, kFloat, kDouble, kFloat2, kDouble2 };
+enum class MemType { kShort, kInt, kSizeT, kHalf, kFloat, kDouble, kFloat2, kDouble2 };
 
 // See comment at top of file for a description of the class
 class TunerImpl {

diff --git a/src/cltune.cc b/src/cltune.cc
@@ -170,8 +170,10 @@ void Tuner::AddArgumentInput(const std::vector<T> &source) {
 }
 
 // Compiles the function for various data-types
+template void PUBLIC_API Tuner::AddArgumentInput<short>(const std::vector<short>&);
 template void PUBLIC_API Tuner::AddArgumentInput<int>(const std::vector<int>&);
 template void PUBLIC_API Tuner::AddArgumentInput<size_t>(const std::vector<size_t>&);
+template void PUBLIC_API Tuner::AddArgumentInput<half>(const std::vector<half>&);
 template void PUBLIC_API Tuner::AddArgumentInput<float>(const std::vector<float>&);
 template void PUBLIC_API Tuner::AddArgumentInput<double>(const std::vector<double>&);
 template void PUBLIC_API Tuner::AddArgumentInput<float2>(const std::vector<float2>&);
@@ -188,8 +190,10 @@ void Tuner::AddArgumentOutput(const std::vector<T> &source) {
 }
 
 // Compiles the function for various data-types
+template void PUBLIC_API Tuner::AddArgumentOutput<short>(const std::vector<short>&);
 template void PUBLIC_API Tuner::AddArgumentOutput<int>(const std::vector<int>&);
 template void PUBLIC_API Tuner::AddArgumentOutput<size_t>(const std::vector<size_t>&);
+template void PUBLIC_API Tuner::AddArgumentOutput<half>(const std::vector<half>&);
 template void PUBLIC_API Tuner::AddArgumentOutput<float>(const std::vector<float>&);
 template void PUBLIC_API Tuner::AddArgumentOutput<double>(const std::vector<double>&);
 template void PUBLIC_API Tuner::AddArgumentOutput<float2>(const std::vector<float2>&);
@@ -198,12 +202,18 @@ template void PUBLIC_API Tuner::AddArgumentOutput<double2>(const std::vector<dou
 // Sets a scalar value as an argument to the kernel. Since a vector of scalars of any type doesn't
 // exist, there is no general implementation. Instead, each data-type has its specialised version in
 // which it stores to a specific vector.
+// Scalar kernel argument of type 'short': takes the next argument index (post-incrementing the
+// argument counter) and stores the index/value pair in the same list as the 'int' scalars.
+// NOTE(review): sharing the 'int' list presumably widens the value to 'int' - confirm that a
+// kernel declaring a 16-bit 'short' argument still receives an argument of the expected size.
+template <> void PUBLIC_API Tuner::AddArgumentScalar<short>(const short argument) {
+  pimpl->arguments_int_.push_back({pimpl->argument_counter_++, argument});
+}
 template <> void PUBLIC_API Tuner::AddArgumentScalar<int>(const int argument) {
   pimpl->arguments_int_.push_back({pimpl->argument_counter_++, argument});
 }
 template <> void PUBLIC_API Tuner::AddArgumentScalar<size_t>(const size_t argument) {
   pimpl->arguments_size_t_.push_back({pimpl->argument_counter_++, argument});
 }
+// Scalar kernel argument of type 'half': takes the next argument index (post-incrementing the
+// argument counter) and stores the index/value pair in the same list as the 'float' scalars.
+// NOTE(review): on the host 'half' is an integer storage type (cl_half or unsigned short), so
+// this presumably converts the raw 16-bit pattern numerically to 'float' instead of passing on
+// the half-precision value it encodes - TODO confirm this is intended.
+template <> void PUBLIC_API Tuner::AddArgumentScalar<half>(const half argument) {
+  pimpl->arguments_float_.push_back({pimpl->argument_counter_++, argument});
+}
 template <> void PUBLIC_API Tuner::AddArgumentScalar<float>(const float argument) {
   pimpl->arguments_float_.push_back({pimpl->argument_counter_++, argument});
 }
@@ -352,10 +362,18 @@ void Tuner::PrintJSON(const std::string &filename,
   fprintf(file, "  \"device_compute_units\": \"%zu\",\n", pimpl->device().ComputeUnits());
   fprintf(file, "  \"results\": [\n");
 
+  // Filters failed configurations: keeps only the results whose status is true and which do not
+  // carry the failure sentinel. TunerImpl::Tune() marks a failed run by setting its time to
+  // std::numeric_limits<float>::max(), so that very value has to be used here: the maximum of
+  // 'double' is a different number and would never compare equal to it.
+  const auto kFailedTime = std::numeric_limits<float>::max();
+  auto results = std::vector<TunerImpl::TunerResult>();
+  for (const auto &tuning_result: pimpl->tuning_results_) {
+    if (tuning_result.status && tuning_result.time != kFailedTime) {
+      results.push_back(tuning_result);
+    }
+  }
+
   // Loops over all the results
-  auto num_results = pimpl->tuning_results_.size();
+  auto num_results = results.size();
   for (auto r=size_t{0}; r<num_results; ++r) {
-    auto result = pimpl->tuning_results_[r];
+    auto result = results[r];
     fprintf(file, "    {\n");
     fprintf(file, "      \"kernel\": \"%s\",\n", result.kernel_name.c_str());
     fprintf(file, "      \"time\": %.3lf,\n", result.time);

diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc
@@ -206,15 +206,16 @@ void TunerImpl::Tune() {
 
         // Stores the parameters and the timing-result
         tuning_result.configuration = permutation;
-        tuning_results_.push_back(tuning_result);
         if (tuning_result.time == std::numeric_limits<float>::max()) {
           tuning_result.time = 0.0;
           PrintResult(stdout, tuning_result, kMessageFailure);
           tuning_result.time = std::numeric_limits<float>::max();
+          tuning_result.status = false;
         }
         else if (!tuning_result.status) {
           PrintResult(stdout, tuning_result, kMessageWarning);
         }
+        tuning_results_.push_back(tuning_result);
       }
 
       // Prints a log of the searching process. This is disabled per default, but can be enabled
@@ -265,8 +266,10 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker
     // Sets the output buffer(s) to zero
     for (auto &output: arguments_output_) {
       switch (output.type) {
+        case MemType::kShort: ResetMemArgument<short>(output); break;
         case MemType::kInt: ResetMemArgument<int>(output); break;
         case MemType::kSizeT: ResetMemArgument<size_t>(output); break;
+        case MemType::kHalf: ResetMemArgument<half>(output); break;
         case MemType::kFloat: ResetMemArgument<float>(output); break;
         case MemType::kDouble: ResetMemArgument<double>(output); break;
         case MemType::kFloat2: ResetMemArgument<float2>(output); break;
@@ -357,8 +360,10 @@ void TunerImpl::StoreReferenceOutput() {
   reference_outputs_.clear();
   for (auto &output_buffer: arguments_output_) {
     switch (output_buffer.type) {
+      case MemType::kShort: DownloadReference<short>(output_buffer); break;
       case MemType::kInt: DownloadReference<int>(output_buffer); break;
       case MemType::kSizeT: DownloadReference<size_t>(output_buffer); break;
+      case MemType::kHalf: DownloadReference<half>(output_buffer); break;
       case MemType::kFloat: DownloadReference<float>(output_buffer); break;
       case MemType::kDouble: DownloadReference<double>(output_buffer); break;
       case MemType::kFloat2: DownloadReference<float2>(output_buffer); break;
@@ -385,8 +390,10 @@ bool TunerImpl::VerifyOutput() {
     auto i = size_t{0};
     for (auto &output_buffer: arguments_output_) {
       switch (output_buffer.type) {
+        case MemType::kShort: status &= DownloadAndCompare<short>(output_buffer, i); break;
         case MemType::kInt: status &= DownloadAndCompare<int>(output_buffer, i); break;
         case MemType::kSizeT: status &= DownloadAndCompare<size_t>(output_buffer, i); break;
+        case MemType::kHalf: status &= DownloadAndCompare<half>(output_buffer, i); break;
         case MemType::kFloat: status &= DownloadAndCompare<float>(output_buffer, i); break;
         case MemType::kDouble: status &= DownloadAndCompare<double>(output_buffer, i); break;
         case MemType::kFloat2: status &= DownloadAndCompare<float2>(output_buffer, i); break;
@@ -612,8 +619,10 @@ void TunerImpl::PrintHeader(const std::string &header_name) const {
 // =================================================================================================
 
 // Get the MemType based on a template argument
+template <> MemType TunerImpl::GetType<short>() { return MemType::kShort; }
 template <> MemType TunerImpl::GetType<int>() { return MemType::kInt; }
 template <> MemType TunerImpl::GetType<size_t>() { return MemType::kSizeT; }
+template <> MemType TunerImpl::GetType<half>() { return MemType::kHalf; }
 template <> MemType TunerImpl::GetType<float>() { return MemType::kFloat; }
 template <> MemType TunerImpl::GetType<double>() { return MemType::kDouble; }
 template <> MemType TunerImpl::GetType<float2>() { return MemType::kFloat2; }