diff --git a/CHANGELOG b/CHANGELOG index 5fcb1a6..19c7709 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,10 @@ +Version 2.6.0 +- Changed timing measurements to now also include the (varying) kernel launch overhead +- It is now possible to set OpenCL compiler options through the env variable CLTUNE_BUILD_OPTIONS +- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0) +- Added an option to build a static version of the library + Version 2.5.0 - Updated to version 8.0 of the CLCudaAPI header - Made it possible to configure the number of times each kernel is run (to average results) diff --git a/CMakeLists.txt b/CMakeLists.txt index 14a106a..a3c3882 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ # # ================================================================================================== -cmake_minimum_required(VERSION 2.8.10) +cmake_minimum_required(VERSION 2.8.11) # Overrides for MSVC static runtime set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake) @@ -32,10 +32,11 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla # CMake project details project("cltune" CXX) set(cltune_VERSION_MAJOR 2) -set(cltune_VERSION_MINOR 5) +set(cltune_VERSION_MINOR 6) set(cltune_VERSION_PATCH 0) -# Options +# Options and their default values +option(BUILD_SHARED_LIBS "Build a shared (ON) or static library (OFF)" ON) option(SAMPLES "Enable compilation of sample programs" ON) option(TESTS "Enable compilation of the Google tests" OFF) @@ -85,6 +86,13 @@ elseif(MSVC) endif() endif() +# DLL Settings +if(MSVC) + if(BUILD_SHARED_LIBS) + add_definitions(" /DCLTUNE_DLL") + endif() +endif(MSVC) + # C++ compiler settings if(MSVC) set(FLAGS "/Ox") @@ -143,13 +151,32 @@ set(TUNER src/ml_models/neural_network.cc) # Creates and links the library -add_library(cltune SHARED ${TUNER}) +if(BUILD_SHARED_LIBS) + add_library(cltune SHARED ${TUNER}) +else(BUILD_SHARED_LIBS) + 
add_library(cltune STATIC ${TUNER}) +endif() target_link_libraries(cltune ${FRAMEWORK_LIBRARIES}) +# Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built +if(MSVC) + if(BUILD_SHARED_LIBS) + target_compile_definitions(cltune PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11 + endif() +endif() + # Installs the library install(TARGETS cltune DESTINATION lib) install(FILES include/cltune.h DESTINATION include) +# Install pkg-config file on Linux +if(UNIX) + configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cltune.pc.in" + "${CMAKE_CURRENT_BINARY_DIR}/cltune.pc" @ONLY IMMEDIATE) + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/cltune.pc + DESTINATION lib/pkgconfig) +endif() + # ================================================================================================== # Optional: Enables compilation of sample programs diff --git a/README.md b/README.md index 0445696..ea27d2d 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ CLTune can be compiled as a shared library using CMake. The pre-requisites are: - Clang 3.3 or newer - AppleClang 5.0 or newer - ICC 14.0 or newer - - MSVC (Visual Studio) 2015 or newer + - MSVC (Visual Studio) 2013 or newer * An OpenCL library. CLTune has been tested with: - Apple OpenCL - NVIDIA CUDA SDK (requires version 7.5 or newer for the CUDA back-end) @@ -48,6 +48,8 @@ You can then link your own programs against the CLTune library. An example for a export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/path/to/libcltune.so g++ example.cc -o example -L/path/to/libcltune.so -lcltune -lOpenCL +Furthermore, it is possible to optionally set an OS environmental variable `CLTUNE_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler at run-time. 
+ Example of using the tuner ------------- diff --git a/cltune.pc.in b/cltune.pc.in new file mode 100644 index 0000000..41dd746 --- /dev/null +++ b/cltune.pc.in @@ -0,0 +1,10 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +includedir=${prefix}/include +libdir=${exec_prefix}/lib + +Name: CLTune +Description: CLTune: An automatic OpenCL & CUDA kernel tuner +Version: @cltune_VERSION_MAJOR@.@cltune_VERSION_MINOR@.@cltune_VERSION_PATCH@ +Libs: -L${libdir} -lcltune +Cflags: -I${includedir} diff --git a/include/cltune.h b/include/cltune.h index 9837556..8af3dff 100644 --- a/include/cltune.h +++ b/include/cltune.h @@ -37,8 +37,12 @@ // Exports library functions under Windows when building a DLL. See also: // https://msdn.microsoft.com/en-us/library/a90k134d.aspx -#ifdef _WIN32 - #define PUBLIC_API __declspec(dllexport) +#if defined(_WIN32) && defined(CLTUNE_DLL) + #if defined(COMPILING_DLL) + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif #else #define PUBLIC_API #endif diff --git a/include/internal/kernel_info.h b/include/internal/kernel_info.h index 90c2fb2..27232f8 100644 --- a/include/internal/kernel_info.h +++ b/include/internal/kernel_info.h @@ -45,6 +45,7 @@ #endif #include "cltune.h" +#include "internal/msvc.h" namespace cltune { // ================================================================================================= @@ -100,7 +101,7 @@ class KernelInfo { }; // Initializes the class with a given name and a string of kernel source-code - explicit KernelInfo(const std::string name, const std::string source, const Device &device); + explicit PUBLIC_API KernelInfo(const std::string name, const std::string source, const Device &device); // Accessors (getters) std::string name() const { return name_; } @@ -117,36 +118,36 @@ class KernelInfo { void set_local_base(IntRange local) { local_base_ = local; local_ = local; } // Prepend to the source-code - void PrependSource(const std::string 
&extra_source); + void PUBLIC_API PrependSource(const std::string &extra_source); // Adds a new parameter with a name and a vector of possible values - void AddParameter(const std::string &name, const std::vector &values); + void PUBLIC_API AddParameter(const std::string &name, const std::vector &values); // Checks wheter a parameter exists, returns "true" if it does exist - bool ParameterExists(const std::string parameter_name); + bool PUBLIC_API ParameterExists(const std::string parameter_name); // Specifies a modifier in the form of a StringRange to the global/local thread-sizes. This // modifier has to contain (per-dimension) the name of a single parameter or an empty string. The // supported modifiers are given by the ThreadSizeModifierType enumeration. - void AddModifier(const StringRange range, const ThreadSizeModifierType type); + void PUBLIC_API AddModifier(const StringRange range, const ThreadSizeModifierType type); // Adds a new constraint to the set of parameters (e.g. must be equal or larger than). The // constraints come in the form of a function object which takes a number of tuning parameters, // given as a vector of strings (parameter names). Their names are later substituted by actual // values. - void AddConstraint(ConstraintFunction valid_if, const std::vector ¶meters); + void PUBLIC_API AddConstraint(ConstraintFunction valid_if, const std::vector ¶meters); // As above, but for local memory usage - void SetLocalMemoryUsage(LocalMemoryFunction amount, const std::vector ¶meters); + void PUBLIC_API SetLocalMemoryUsage(LocalMemoryFunction amount, const std::vector ¶meters); // Computes the global/local ranges (in NDRange-form) based on all global/local thread-sizes (in // StringRange-form) and a single permutation (i.e. a configuration) containing a list of all // parameter names and their current values. 
- void ComputeRanges(const Configuration &config); + void PUBLIC_API ComputeRanges(const Configuration &config); // Computes all permutations based on the parameters and their values (the configuration list). // The result is stored as a member variable. - void SetConfigurations(); + void PUBLIC_API SetConfigurations(); private: // Called recursively internally by SetConfigurations diff --git a/include/internal/msvc.h b/include/internal/msvc.h new file mode 100644 index 0000000..bf0fc81 --- /dev/null +++ b/include/internal/msvc.h @@ -0,0 +1,38 @@ + +// ================================================================================================= +// This file is part of the CLTune project, which loosely follows the Google C++ styleguide and uses +// a tab-size of two spaces and a max-width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file provides macros and definitions to make compilation work on Microsoft Visual Studio, +// in particular for versions older than 2015 with limited C++11 support. +// MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015) +// MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013) +// MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012) +// MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010) +// MSVC++ 9.0 _MSC_VER == 1500 (Visual Studio 2008) +// +// ================================================================================================= + +#ifndef CLTUNE_MSVC_H_ +#define CLTUNE_MSVC_H_ + +namespace cltune { +// ================================================================================================= +#ifdef _MSC_VER + +// No support for constexpr prior to 2015. Note that this only works with constants, not with +// constexpr functions (unused in this project).
+#if _MSC_VER < 1900 +#define constexpr const +#endif + +// _MSC_VER +#endif +// ================================================================================================= +} // namespace cltune + +// CLTUNE_MSVC_H_ +#endif diff --git a/include/internal/searchers/annealing.h b/include/internal/searchers/annealing.h index eb029c3..36f2808 100644 --- a/include/internal/searchers/annealing.h +++ b/include/internal/searchers/annealing.h @@ -42,10 +42,10 @@ class Annealing: public Searcher { // Maximum number of successive visits to already visited states. If this number is exceeded, the // algorithm ends - static constexpr auto kMaxAlreadyVisitedStates = size_t{10}; + static const size_t kMaxAlreadyVisitedStates; // Maximum number of differences to consider this still a neighbour - static constexpr auto kMaxDifferences = size_t{3}; + static const size_t kMaxDifferences; // Takes additionally a fraction of configurations to consider Annealing(const Configurations &configurations, diff --git a/include/internal/tuner_impl.h b/include/internal/tuner_impl.h index b471fd3..8b703a7 100644 --- a/include/internal/tuner_impl.h +++ b/include/internal/tuner_impl.h @@ -40,6 +40,7 @@ #endif #include "internal/kernel_info.h" +#include "internal/msvc.h" // Host data-type for half-precision floating-point (16-bit) #include "internal/half.h" @@ -73,7 +74,7 @@ class TunerImpl { public: // Parameters - static constexpr auto kMaxL2Norm = 1e-4; // This is the threshold for 'correctness' + static const double kMaxL2Norm; // This is the threshold for 'correctness' // Messages printed to stdout (in colours) static const std::string kMessageFull; diff --git a/samples/conv/conv.cc b/samples/conv/conv.cc index a774635..aedb93a 100644 --- a/samples/conv/conv.cc +++ b/samples/conv/conv.cc @@ -45,18 +45,18 @@ bool IsMultiple(size_t a, size_t b) { }; // Constants -constexpr auto kDefaultDevice = size_t{0}; -constexpr auto kDefaultPlatform = size_t{0}; -constexpr auto kDefaultSearchMethod = 
size_t{1}; -constexpr auto kDefaultSearchParameter1 = size_t{4}; +const auto kDefaultDevice = size_t{0}; +const auto kDefaultPlatform = size_t{0}; +const auto kDefaultSearchMethod = size_t{1}; +const auto kDefaultSearchParameter1 = size_t{4}; // Settings (synchronise these with "conv.cc", "conv.opencl" and "conv_reference.opencl") #define HFS (3) // Half filter size #define FS (HFS+HFS+1) // Filter size // Settings (sizes) -constexpr auto kSizeX = size_t{8192}; // Matrix dimension X -constexpr auto kSizeY = size_t{4096}; // Matrix dimension Y +const auto kSizeX = size_t{8192}; // Matrix dimension X +const auto kSizeY = size_t{4096}; // Matrix dimension Y // ================================================================================================= @@ -91,7 +91,7 @@ int main(int argc, char* argv[]) { } // Creates data structures - constexpr auto kExtraSize = size_t{FS*8}; + const auto kExtraSize = size_t{FS*8}; auto mat_a = std::vector((kExtraSize+kSizeX)*(kExtraSize+kSizeY)); auto mat_b = std::vector(kSizeX*kSizeY); auto coeff = std::vector(FS*FS); @@ -230,8 +230,8 @@ int main(int argc, char* argv[]) { tuner.PrintJSON("output.json", {{"sample","convolution"}}); // Also prints the performance of the best-case in terms of GB/s and GFLOPS - constexpr auto kMB = (sizeof(float)*2*kSizeX*kSizeY) * 1.0e-6; - constexpr auto kMFLOPS = ((1+2*FS*FS)*kSizeX*kSizeY) * 1.0e-6; + const auto kMB = (sizeof(float)*2*kSizeX*kSizeY) * 1.0e-6; + const auto kMFLOPS = ((1+2*FS*FS)*kSizeX*kSizeY) * 1.0e-6; if (time_ms != 0.0) { printf("[ -------> ] %.1lf ms or %.1lf GB/s or %1.lf GFLOPS\n", time_ms, kMB/time_ms, kMFLOPS/time_ms); diff --git a/samples/conv_simple/conv_simple.cc b/samples/conv_simple/conv_simple.cc index 4e3f19e..e6980ff 100644 --- a/samples/conv_simple/conv_simple.cc +++ b/samples/conv_simple/conv_simple.cc @@ -37,8 +37,8 @@ int main() { #endif // Input/output sizes - constexpr auto kSizeX = size_t{8192}; // Matrix dimension X - constexpr auto kSizeY = size_t{4096}; 
// Matrix dimension Y + const auto kSizeX = size_t{8192}; // Matrix dimension X + const auto kSizeY = size_t{4096}; // Matrix dimension Y // Creates the input/output matrices and fills them with some example data std::vector mat_a(kSizeX*kSizeY, 2.0f); diff --git a/samples/gemm/gemm.cc b/samples/gemm/gemm.cc index 5a2f111..f2c3c82 100644 --- a/samples/gemm/gemm.cc +++ b/samples/gemm/gemm.cc @@ -45,15 +45,15 @@ bool IsMultiple(size_t a, size_t b) { }; // Constants -constexpr auto kDefaultDevice = size_t{0}; -constexpr auto kDefaultPlatform = size_t{0}; -constexpr auto kDefaultSearchMethod = size_t{1}; -constexpr auto kDefaultSearchParameter1 = size_t{4}; +const auto kDefaultDevice = size_t{0}; +const auto kDefaultPlatform = size_t{0}; +const auto kDefaultSearchMethod = size_t{1}; +const auto kDefaultSearchParameter1 = size_t{4}; // Settings (sizes) -constexpr auto kSizeM = size_t{2048}; -constexpr auto kSizeN = size_t{2048}; -constexpr auto kSizeK = size_t{2048}; +const auto kSizeM = size_t{2048}; +const auto kSizeN = size_t{2048}; +const auto kSizeK = size_t{2048}; // ================================================================================================= @@ -203,7 +203,7 @@ int main(int argc, char* argv[]) { tuner.PrintFormatted(); // Also prints the performance of the best-case in terms of GFLOPS - constexpr auto kMGFLOP = (2*kSizeM*kSizeN*kSizeK) * 1.0e-6; + const auto kMGFLOP = (2*kSizeM*kSizeN*kSizeK) * 1.0e-6; if (time_ms != 0.0) { printf("[ -------> ] %.1lf ms or %.3lf GFLOPS\n", time_ms, kMGFLOP/time_ms); } diff --git a/samples/multiple_kernels/multiple_kernels.cc b/samples/multiple_kernels/multiple_kernels.cc index 6978e34..45730a3 100644 --- a/samples/multiple_kernels/multiple_kernels.cc +++ b/samples/multiple_kernels/multiple_kernels.cc @@ -51,8 +51,8 @@ int main() { #endif // Matrix size - constexpr auto kSizeM = size_t{2048}; - constexpr auto kSizeN = size_t{4096}; + const auto kSizeM = size_t{2048}; + const auto kSizeN = size_t{4096}; // 
Creates data structures std::vector mat_a(kSizeN*kSizeM); // Assumes matrix A is transposed diff --git a/samples/simple/simple.cc b/samples/simple/simple.cc index 0493746..5c957d6 100644 --- a/samples/simple/simple.cc +++ b/samples/simple/simple.cc @@ -28,7 +28,7 @@ int main() { #endif // Vector dimension - constexpr auto kVectorSize = size_t{16*1024*1024}; + const auto kVectorSize = size_t{16*1024*1024}; // Creates the vectors and fills them with some example data std::vector vec_a(kVectorSize, 1.0f); diff --git a/src/searchers/annealing.cc b/src/searchers/annealing.cc index 412cc99..cdbcad6 100644 --- a/src/searchers/annealing.cc +++ b/src/searchers/annealing.cc @@ -34,6 +34,13 @@ namespace cltune { // ================================================================================================= +// Maximum number of successive visits to already visited states. If this number is exceeded, the +// algorithm ends +const size_t Annealing::kMaxAlreadyVisitedStates = size_t{10}; + +// Maximum number of differences to consider this still a neighbour +const size_t Annealing::kMaxDifferences = size_t{3}; + // Initializes the simulated annealing searcher by specifying the fraction of the total search space // to consider and the maximum annealing 'temperature'. 
Annealing::Annealing(const Configurations &configurations, diff --git a/src/tuner_impl.cc b/src/tuner_impl.cc index 58bb171..37df3f7 100644 --- a/src/tuner_impl.cc +++ b/src/tuner_impl.cc @@ -44,10 +44,14 @@ #include // std::min #include // std::unique_ptr #include // std::tuple +#include // std::getenv namespace cltune { // ================================================================================================= +// This is the threshold for 'correctness' +const double TunerImpl::kMaxL2Norm = 1e-4; + // Messages printed to stdout (in colours) const std::string TunerImpl::kMessageFull = "\x1b[32m[==========]\x1b[0m"; const std::string TunerImpl::kMessageHead = "\x1b[32m[----------]\x1b[0m"; @@ -250,13 +254,19 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker // In case of an exception, skip this run try { - - // Compiles the kernel and prints the compiler errors/warnings #ifdef VERBOSE fprintf(stdout, "%s Starting compilation\n", kMessageVerbose.c_str()); #endif + + // Sets the build options from an environmental variable (if set) + auto options = std::vector(); + const auto environment_variable = std::getenv("CLTUNE_BUILD_OPTIONS"); + if (environment_variable != nullptr) { + options.push_back(std::string(environment_variable)); + } + + // Compiles the kernel and prints the compiler errors/warnings auto program = Program(context_, source); - auto options = std::vector{}; auto build_status = program.Build(device_, options); if (build_status == BuildStatus::kError) { auto message = program.GetBuildInfo(device_); @@ -325,25 +335,30 @@ TunerImpl::TunerResult TunerImpl::RunKernel(const std::string &source, const Ker // Prepares the kernel queue_.Finish(); - // Runs the kernel (this is the timed part) + // Multiple runs of the kernel to find the minimum execution time fprintf(stdout, "%s Running %s\n", kMessageRun.c_str(), kernel.name().c_str()); auto events = std::vector(num_runs_); + auto elapsed_time = 
std::numeric_limits::max(); for (auto t=size_t{0}; t::max(); - for (auto t=size_t{0}; t(cpu_timer).count(); + #ifdef VERBOSE + fprintf(stdout, "%s Completed kernel in %.2lf ms\n", kMessageVerbose.c_str(), cpu_timing); + #endif + elapsed_time = std::min(elapsed_time, cpu_timing); } + queue_.Finish(); // Prints diagnostic information fprintf(stdout, "%s Completed %s (%.1lf ms) - %zu out of %zu\n",