From a960b24380384457817a372314fa5b9032d2e144 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Fri, 19 May 2023 16:30:35 +0200
Subject: [PATCH 1/4] Added cl_khr_fp16 support for test_vloadstore from basic
 (issue #142, basic)

---
 test_conformance/basic/test_vloadstore.cpp | 806 +++++++++------------
 test_conformance/basic/utils.h             |  47 ++
 2 files changed, 400 insertions(+), 453 deletions(-)
 create mode 100644 test_conformance/basic/utils.h
diff --git a/test_conformance/basic/test_vloadstore.cpp b/test_conformance/basic/test_vloadstore.cpp
index e137f9e73..b6c2b93fb 100644
--- a/test_conformance/basic/test_vloadstore.cpp
+++ b/test_conformance/basic/test_vloadstore.cpp
@@ -13,52 +13,124 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/compat.h"
-
+#include <algorithm>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-
+#include <vector>
 
 #include "procs.h"
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
 #include "harness/errorHelpers.h"
 
+#include "utils.h"
+
 // Outputs debug information for stores
 #define DEBUG 0
 // Forces stores/loads to be done with offsets = tid
 #define LINEAR_OFFSETS 0
 #define NUM_LOADS    512
 
-static const char *doubleExtensionPragma = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+char pragma_str[128] = { 0 };
+char mem_type[64] = { 0 };
+char store_str[128] = { 0 };
+char load_str[128] = { 0 };
+
+// clang-format off
+static const char *store_pattern= "results[ tid ] = tmp;\n";
+static const char *store_patternV3 = "results[3*tid] = tmp.s0; results[3*tid+1] = tmp.s1; results[3*tid+2] = tmp.s2;\n";
+static const char *load_pattern = "sSharedStorage[ i ] = src[ i ];\n";
+static const char *load_patternV3 = "sSharedStorage[3*i] = src[ 3*i]; sSharedStorage[3*i+1] = src[3*i+1]; sSharedStorage[3*i+2] = src[3*i+2];\n";
+static const char *kernel_pattern[] = {
+pragma_str,
+"#define STYPE %s\n"
+"__kernel void test_fn( ", mem_type, " STYPE *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+"    int tid = get_global_id( 0 );\n"
+"    %s%d tmp = vload%d( offsets[ tid ], ( (", mem_type, " STYPE *) src ) + alignmentOffsets[ tid ] );\n"
+"    ", store_str,
+"}\n"
+};
+
+const char *pattern_local [] = {
+pragma_str,
+"__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+"    int tid = get_global_id( 0 );\n"
+"   int lid = get_local_id( 0 );\n"
+"\n"
+"    if( lid == 0 )\n"
+"    {\n"
+"        for( int i = 0; i < %d; i++ ) {\n"
+"           ", load_str,
+"        }\n"
+"    }\n"
+//  Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
+//  threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
+//  updated on all threads at that point
+"   barrier( CLK_LOCAL_MEM_FENCE );\n"
+"\n"
+"    %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
+"    ", store_str,
+"}\n" };
+
+const char *pattern_priv [] = {
+pragma_str,
+// Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
+// for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
+"#define PRIV_TYPE %s\n"
+"#define PRIV_SIZE %d\n"
+"__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
+"{\n"
+"    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
+"    int tid = get_global_id( 0 );\n"
+"\n"
+"    for( int i = 0; i < PRIV_SIZE; i++ )\n"
+"      sPrivateStorage[ i ] = src[ i ];\n"
+//    Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
+//  anybody else to sync up
+"\n"
+"    %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
+"    ", store_str,
+"}\n"};
+// clang-format on
 
 #pragma mark -------------------- vload harness --------------------------
 
-typedef void (*create_vload_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize );
+typedef void (*create_program_fn)(std::string &, size_t, ExplicitType, size_t,
+                                  size_t);
+typedef int (*test_fn)(cl_device_id, cl_context, cl_command_queue, ExplicitType,
+                       unsigned int, create_program_fn, size_t);
 
-int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
-               create_vload_program_fn createFn, size_t bufferSize, MTdata d )
+int test_vload(cl_device_id device, cl_context context, cl_command_queue queue,
+               ExplicitType type, unsigned int vecSize,
+               create_program_fn createFn, size_t bufferSize)
 {
-    int error;
-
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper streams[ 4 ];
+    MTdataHolder d(gRandomSeed);
     const size_t numLoads = (DEBUG) ? 16 : NUM_LOADS;
 
     if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128;
 
     size_t threads[ 1 ], localThreads[ 1 ];
     clProtectedArray inBuffer( bufferSize );
-    char programSrc[ 10240 ];
     cl_uint offsets[ numLoads ], alignmentOffsets[ numLoads ];
     size_t numElements, typeSize, i;
     unsigned int outVectorSize;
 
+    pragma_str[0] = '\0';
+    if (type == kDouble)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
+    else if (type == kHalf)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
 
     typeSize = get_explicit_type_size( type );
     numElements = bufferSize / ( typeSize * vecSize );
@@ -83,25 +155,19 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
     outVectorSize = vecSize;
 
     // Declare output buffers now
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char outBuffer[ numLoads * typeSize * outVectorSize ];
-    char referenceBuffer[ numLoads * typeSize * vecSize ];
-#else
-    char* outBuffer = (char*)_malloca(numLoads * typeSize * outVectorSize * sizeof(cl_char));
-    char* referenceBuffer = (char*)_malloca(numLoads * typeSize * vecSize * sizeof(cl_char));
-#endif
+    std::vector<char> outBuffer(numLoads * typeSize * outVectorSize);
+    std::vector<char> referenceBuffer(numLoads * typeSize * vecSize);
 
     // Create the program
-
-
+    std::string programSrc;
     createFn( programSrc, numElements, type, vecSize, outVectorSize);
 
     // Create our kernel
-    const char *ptr = programSrc;
-
-    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
+    const char *ptr = programSrc.c_str();
+    cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                               &ptr, "test_fn");
     test_error( error, "Unable to create testing kernel" );
-    if (DEBUG) log_info("Kernel: \n%s\n", programSrc);
+    if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());
 
     // Get the number of args to differentiate the kernels with local storage. (They have 5)
     cl_uint numArgs;
@@ -115,7 +181,9 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
     test_error( error, "Unable to create kernel stream" );
     streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(alignmentOffsets[0]), alignmentOffsets, &error );
     test_error( error, "Unable to create kernel stream" );
-    streams[ 3 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*typeSize*outVectorSize, (void *)outBuffer, &error );
+    streams[3] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                numLoads * typeSize * outVectorSize,
+                                (void *)outBuffer.data(), &error);
     test_error( error, "Unable to create kernel stream" );
 
     // Set parameters and run
@@ -145,28 +213,32 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
     test_error( error, "Unable to exec kernel" );
 
     // Get the results
-    error = clEnqueueReadBuffer( queue, streams[ 3 ], CL_TRUE, 0, numLoads * typeSize * outVectorSize * sizeof(cl_char), (void *)outBuffer, 0, NULL, NULL );
+    error = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0,
+                                numLoads * typeSize * outVectorSize
+                                    * sizeof(cl_char),
+                                (void *)outBuffer.data(), 0, NULL, NULL);
     test_error( error, "Unable to read results" );
 
-
     // Create the reference results
-    memset( referenceBuffer, 0, numLoads * typeSize * vecSize * sizeof(cl_char));
+    referenceBuffer.assign(0, numLoads * typeSize * vecSize);
     for( i = 0; i < numLoads; i++ )
     {
-        memcpy( referenceBuffer + i * typeSize * vecSize, ( (char *)(void *)inBuffer ) + ( ( offsets[ i ] * vecSize ) + alignmentOffsets[ i ] ) * typeSize,
-               typeSize * vecSize );
+        memcpy(&referenceBuffer[i * typeSize * vecSize],
+               ((char *)(void *)inBuffer)
+                   + ((offsets[i] * vecSize) + alignmentOffsets[i]) * typeSize,
+               typeSize * vecSize);
     }
 
     // Validate the results now
-    char *expected = referenceBuffer;
-    char *actual = outBuffer;
+    char *expected = referenceBuffer.data();
+    char *actual = outBuffer.data();
     char *in = (char *)(void *)inBuffer;
 
     if (DEBUG) {
         log_info("Memory contents:\n");
+        char inString[1024];
+        char expectedString[1024], actualString[1024];
         for (i=0; i<numElements; i++) {
-            char  inString[1024];
-            char expectedString[ 1024 ], actualString[ 1024 ];
             if (i < numLoads) {
                 log_info("buffer %3d: input: %s expected: %s got: %s (load offset %3d, alignment offset %3d)", (int)i, GetDataVectorString( &(in[i*typeSize*vecSize]), typeSize, vecSize, inString ),
                          GetDataVectorString( &(expected[i*typeSize*vecSize]), typeSize, vecSize, expectedString ),
@@ -197,35 +269,42 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue,
         expected += typeSize * vecSize;
         actual += typeSize * outVectorSize;
     }
-
     return 0;
 }
 
-int test_vloadset(cl_device_id device, cl_context context, cl_command_queue queue, create_vload_program_fn createFn, size_t bufferSize )
+template <test_fn test_func_ptr>
+int test_vset(cl_device_id device, cl_context context, cl_command_queue queue,
+              create_program_fn createFn, size_t bufferSize)
 {
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
+    std::vector<ExplicitType> vecType = { kChar,  kUChar, kShort, kUShort,
+                                          kInt,   kUInt,  kLong,  kULong,
+                                          kFloat, kHalf,  kDouble };
     unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
     const char *size_names[] = { "2", "3", "4", "8", "16"};
-    unsigned int typeIdx, sizeIdx;
     int error = 0;
-    MTdata mtData = init_genrand( gRandomSeed );
 
     log_info("Testing with buffer size of %d.\n", (int)bufferSize);
 
-    for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
-    {
+    bool hasDouble = is_extension_available(device, "cl_khr_fp64");
+    bool hasHalf = is_extension_available(device, "cl_khr_fp16");
 
-        if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
+    for (unsigned typeIdx = 0; typeIdx < vecType.size(); typeIdx++)
+    {
+        if (vecType[typeIdx] == kDouble && !hasDouble)
             continue;
-
-        if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
+        else if (vecType[typeIdx] == kHalf && !hasHalf)
+            continue;
+        else if ((vecType[typeIdx] == kLong || vecType[typeIdx] == kULong)
+                 && !gHasLong)
             continue;
 
-        for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
+        for (unsigned sizeIdx = 0; vecSizes[sizeIdx] != 0; sizeIdx++)
         {
             log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);
 
-            int error_this_type = test_vload( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, mtData );
+            int error_this_type =
+                test_func_ptr(device, context, queue, vecType[typeIdx],
+                              vecSizes[sizeIdx], createFn, bufferSize);
             if (error_this_type) {
                 error += error_this_type;
                 log_error("Failure; skipping further sizes for this type.");
@@ -233,125 +312,59 @@ int test_vloadset(cl_device_id device, cl_context context, cl_command_queue queu
             }
         }
     }
-
-    free_mtdata(mtData);
-
     return error;
 }
 
 #pragma mark -------------------- vload test cases --------------------------
 
-void create_global_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_global_load_code(std::string &destBuffer, size_t inBufferSize,
+                             ExplicitType type, size_t inVectorSize,
+                             size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid ] = tmp.s0;\n"
-    "   results[ 3*tid+1 ] = tmp.s1;\n"
-    "   results[ 3*tid+2 ] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(mem_type, sizeof(mem_type), "__global");
+    std::snprintf(store_str, sizeof(store_str), store_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName, typeName, typeName );
-    } else {
-        sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
-                (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        std::snprintf(store_str, sizeof(store_str), store_pattern);
     }
+
+    std::string kernel_src = concat_kernel(
+        kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
+    destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
+                             typeName, (int)inVectorSize, (int)inVectorSize);
 }
 
 int test_vload_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
 {
-    return test_vloadset( device, context, queue, create_global_load_code, 10240 );
+    return test_vset<test_vload>(device, context, queue,
+                                 create_global_load_code, 10240);
 }
 
-
-void create_local_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_local_load_code(std::string &destBuffer, size_t inBufferSize,
+                            ExplicitType type, size_t inVectorSize,
+                            size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    //"   __local %s%d sSharedStorage[ %d ];\n"
-    "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "   int lid = get_local_id( 0 );\n"
-    "\n"
-    "    if( lid == 0 )\n"
-    "    {\n"
-    "        for( int i = 0; i < %d; i++ )\n"
-    "           sSharedStorage[ i ] = src[ i ];\n"
-    "    }\n"
-    //  Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
-    //  threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
-    //  updated on all threads at that point
-    "   barrier( CLK_LOCAL_MEM_FENCE );\n"
-    "\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    //"   __local %s%d sSharedStorage[ %d ];\n"
-    "__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "   int lid = get_local_id( 0 );\n"
-    "\n"
-    "    if( lid == 0 )\n"
-    "    {\n"
-    "        for( int i = 0; i < %d; i++ ) {\n"
-    "           sSharedStorage[ 3*i   ] = src[ 3*i   ];\n"
-    "           sSharedStorage[ 3*i +1] = src[ 3*i +1];\n"
-    "           sSharedStorage[ 3*i +2] = src[ 3*i +2];\n"
-    "        }\n"
-    "    }\n"
-    //  Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all
-    //  threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be
-    //  updated on all threads at that point
-    "   barrier( CLK_LOCAL_MEM_FENCE );\n"
-    "\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid   ] = tmp.s0;\n"
-    "   results[ 3*tid +1] = tmp.s1;\n"
-    "   results[ 3*tid +2] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(store_str, sizeof(store_str), store_patternV3);
+    std::snprintf(load_str, sizeof(load_str), load_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble  ? doubleExtensionPragma : "",
-                "",
-                typeName, /*(int)inBufferSize,*/
-                typeName, typeName,
-                (int)inBufferSize,
-                typeName, typeName );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble  ? doubleExtensionPragma : "",
-                "",
-                typeName, (int)inVectorSize, /*(int)inBufferSize,*/
-                typeName, (int)inVectorSize, typeName, (int)outVectorSize,
-                (int)inBufferSize,
-                typeName, (int)inVectorSize, (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    std::string inTypeName = typeName;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
+        std::snprintf(store_str, sizeof(store_str), store_pattern);
+        std::snprintf(load_str, sizeof(load_str), load_pattern);
     }
+
+    std::string kernel_src = concat_kernel(
+        pattern_local, sizeof(pattern_local) / sizeof(pattern_local[0]));
+    destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), inTypeName.c_str(),
+                             outTypeName.c_str(), (int)inBufferSize, typeName,
+                             (int)inVectorSize, (int)inVectorSize, typeName);
 }
 
 int test_vload_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -360,53 +373,30 @@ int test_vload_local(cl_device_id device, cl_context context, cl_command_queue q
     cl_ulong localSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
     test_error( error, "Unable to get max size of local memory buffer" );
-    if( localSize > 10240 )
-        localSize = 10240;
-    if (localSize > 4096)
-        localSize -= 2048;
-    else
-        localSize /= 2;
+    localSize = std::min(localSize, (cl_ulong)2048);
 
-    return test_vloadset( device, context, queue, create_local_load_code, (size_t)localSize );
+    return test_vset<test_vload>(device, context, queue, create_local_load_code,
+                                 (size_t)localSize);
 }
 
-
-void create_constant_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_constant_load_code(std::string &destBuffer, size_t inBufferSize,
+                               ExplicitType type, size_t inVectorSize,
+                               size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    int tid = get_global_id( 0 );\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid   ] = tmp.s0;\n"
-    "   results[ 3*tid+1 ] = tmp.s1;\n"
-    "   results[ 3*tid+2 ] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(mem_type, sizeof(mem_type), "__constant");
+    std::snprintf(store_str, sizeof(store_str), store_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName,  typeName,
-                typeName );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize,
-                (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        std::snprintf(store_str, sizeof(store_str), store_pattern);
     }
+
+    std::string kernel_src = concat_kernel(
+        kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0]));
+    destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(),
+                             typeName, (int)inVectorSize, (int)inVectorSize);
 }
 
 int test_vload_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -415,109 +405,67 @@ int test_vload_constant(cl_device_id device, cl_context context, cl_command_queu
     cl_ulong maxSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL );
     test_error( error, "Unable to get max size of constant memory buffer" );
-    if( maxSize > 10240 )
-        maxSize = 10240;
-    if (maxSize > 4096)
-        maxSize -= 2048;
-    else
-        maxSize /= 2;
+    maxSize = std::min(maxSize, (cl_ulong)2048);
 
-    return test_vloadset( device, context, queue, create_constant_load_code, (size_t)maxSize );
+    return test_vset<test_vload>(device, context, queue,
+                                 create_constant_load_code, (size_t)maxSize);
 }
 
-
-void create_private_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize )
+void create_private_load_code(std::string &destBuffer, size_t inBufferSize,
+                              ExplicitType type, size_t inVectorSize,
+                              size_t outVectorSize)
 {
-    const char *pattern =
-    "%s%s"
-    // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
-    // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
-    "#define PRIV_TYPE %s%d\n"
-    "#define PRIV_SIZE %d\n"
-    "__kernel void test_fn( __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n"
-    "{\n"
-    "    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
-    "    int tid = get_global_id( 0 );\n"
-    "\n"
-    "    for( int i = 0; i < %d; i++ )\n"
-    "      sPrivateStorage[ i ] = src[ i ];\n"
-    //    Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
-    //  anybody else to sync up
-    "\n"
-    "    %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ tid ] = tmp;\n"
-    "}\n";
-
-    const char *patternV3 =
-    "%s%s"
-    // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
-    // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
-    "#define PRIV_TYPE %s\n"
-    "#define PRIV_SIZE %d\n"
-    "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n"
-    "{\n"
-    "    __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n"
-    "    int tid = get_global_id( 0 );\n"
-    "\n"
-    "    for( int i = 0; i < PRIV_SIZE; i++ )\n"
-    "    {\n"
-    "        sPrivateStorage[ i ] = src[ i ];\n"
-    "    }\n"
-    //    Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for
-    //  anybody else to sync up
-    "\n"
-    "    %s3 tmp = vload3( offsets[ tid ], ( sPrivateStorage ) + alignmentOffsets[ tid ] );\n"
-    "   results[ 3*tid   ] = tmp.s0;\n"
-    "   results[ 3*tid+1 ] = tmp.s1;\n"
-    "   results[ 3*tid+2 ] = tmp.s2;\n"
-    "}\n";
-
+    std::snprintf(store_str, sizeof(store_str), store_patternV3);
     const char *typeName = get_explicit_type_name(type);
-    if(inVectorSize ==3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, 3*((int)inBufferSize),
-                typeName, typeName,
-                typeName );
-        // log_info("Src is \"\n%s\n\"\n", destBuffer);
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                "",
-                typeName, (int)inVectorSize, (int)inBufferSize,
-                typeName, (int)inVectorSize, typeName, (int)outVectorSize,
-                (int)inBufferSize,
-                typeName, (int)inVectorSize, (int)inVectorSize, typeName );
+    std::string outTypeName = typeName;
+    std::string inTypeName = typeName;
+    int bufSize = (int)inBufferSize * 3;
+    if (inVectorSize != 3)
+    {
+        outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize);
+        inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize);
+        bufSize = (int)inBufferSize;
+        std::snprintf(store_str, sizeof(store_str), store_pattern);
     }
+
+    std::string kernel_src = concat_kernel(
+        pattern_priv, sizeof(pattern_priv) / sizeof(pattern_priv[0]));
+    destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), bufSize,
+                             inTypeName.c_str(), outTypeName.c_str(), typeName,
+                             (int)inVectorSize, (int)inVectorSize, typeName);
 }
 
 int test_vload_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
 {
     // We have no idea how much actual private storage is available, so just pick a reasonable value,
     // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
-    return test_vloadset( device, context, queue, create_private_load_code, 256 );
+    return test_vset<test_vload>(device, context, queue,
+                                 create_private_load_code, 256);
 }
 
-
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 #pragma mark -------------------- vstore harness --------------------------
 
-typedef void (*create_vstore_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize );
-
-int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize,
-                create_vstore_program_fn createFn, size_t bufferSize, MTdata d )
+int test_vstore(cl_device_id device, cl_context context, cl_command_queue queue,
+                ExplicitType type, unsigned int vecSize,
+                create_program_fn createFn, size_t bufferSize)
 {
-    int error;
-
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper streams[ 3 ];
+    MTdataHolder d(gRandomSeed);
 
     size_t threads[ 1 ], localThreads[ 1 ];
-
     size_t numElements, typeSize, numStores = (DEBUG) ? 16 : NUM_LOADS;
 
+    pragma_str[0] = '\0';
+    if (type == kDouble)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n");
+    else if (type == kHalf)
+        std::snprintf(pragma_str, sizeof(pragma_str),
+                      "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n");
+
     if (DEBUG)
         bufferSize = (bufferSize < 128) ? bufferSize : 128;
 
@@ -534,39 +482,22 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
     }
     if (DEBUG)
         log_info("Testing: numStores: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numStores, (int)typeSize, vecSize, (int)numElements, (int)bufferSize);
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    cl_uint offsets[ numStores ];
-#else
-    cl_uint* offsets = (cl_uint*)_malloca(numStores * sizeof(cl_uint));
-#endif
-    char programSrc[ 10240 ];
-    size_t i;
-
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char inBuffer[ numStores * typeSize * vecSize ];
-#else
-    char* inBuffer = (char*)_malloca( numStores * typeSize * vecSize * sizeof(cl_char));
-#endif
+
+    std::vector<cl_uint> offsets(numStores);
+    std::vector<char> inBuffer(numStores * typeSize * vecSize);
+
     clProtectedArray outBuffer( numElements * typeSize * vecSize );
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char referenceBuffer[ numElements * typeSize * vecSize ];
-#else
-    char* referenceBuffer = (char*)_malloca(numElements * typeSize * vecSize * sizeof(cl_char));
-#endif
+    std::vector<char> referenceBuffer(numElements * typeSize * vecSize);
 
     // Create some random input data and random offsets to load from
-    generate_random_data( type, numStores * vecSize, d, (void *)inBuffer );
+    generate_random_data(type, numStores * vecSize, d, (void *)inBuffer.data());
 
     // Note: make sure no two offsets are the same, otherwise the output would depend on
     // the order that threads ran in, and that would be next to impossible to verify
-#if !(defined(_WIN32) && defined(_MSC_VER))
-    char flags[ numElements ];
-#else
-    char* flags = (char*)_malloca( numElements * sizeof(char));
-#endif
-
-    memset( flags, 0, numElements * sizeof(char) );
-    for( i = 0; i < numStores; i++ )
+    std::vector<char> flags(numElements);
+    flags.assign(flags.size(), 0);
+
+    for (size_t i = 0; i < numStores; i++)
     {
         do
         {
@@ -579,13 +510,15 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
     if (LINEAR_OFFSETS)
         log_info("Offsets set to thread IDs to simplify output.\n");
 
-    createFn( programSrc, numElements, type, vecSize );
+    std::string programSrc;
+    createFn(programSrc, numElements, type, vecSize, vecSize);
 
     // Create our kernel
-    const char *ptr = programSrc;
-    error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" );
+    const char *ptr = programSrc.c_str();
+    cl_int error = create_single_kernel_helper(context, &program, &kernel, 1,
+                                               &ptr, "test_fn");
     test_error( error, "Unable to create testing kernel" );
-    if (DEBUG) log_info("Kernel: \n%s\n", programSrc);
+    if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str());
 
     // Get the number of args to differentiate the kernels with local storage. (They have 5)
     cl_uint numArgs;
@@ -593,9 +526,14 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
     test_error( error, "clGetKernelInfo failed");
 
     // Set up parameters
-    streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * typeSize * vecSize * sizeof(cl_char), (void *)inBuffer, &error );
+    streams[0] =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       numStores * typeSize * vecSize * sizeof(cl_char),
+                       (void *)inBuffer.data(), &error);
     test_error( error, "Unable to create kernel stream" );
-    streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * sizeof(cl_uint), offsets, &error );
+    streams[1] =
+        clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                       numStores * sizeof(cl_uint), offsets.data(), &error);
     test_error( error, "Unable to create kernel stream" );
     streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numElements * typeSize * vecSize, (void *)outBuffer, &error );
     test_error( error, "Unable to create kernel stream" );
@@ -606,7 +544,7 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
         // We need to set the size of the local storage
         error = clSetKernelArg(kernel, 0, bufferSize, NULL);
         test_error( error, "clSetKernelArg for buffer failed");
-        for( i = 0; i < 3; i++ )
+        for (size_t i = 0; i < 3; i++)
         {
             error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] );
             test_error( error, "Unable to set kernel argument" );
@@ -615,11 +553,10 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
     else
     {
         // No local storage
-        for( i = 0; i < 3; i++ )
+        for (size_t i = 0; i < 3; i++)
         {
             error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] );
-            if (error)
-                log_info("%s\n", programSrc);
+            if (error) log_info("%s\n", programSrc.c_str());
             test_error( error, "Unable to set kernel argument" );
         }
     }
@@ -654,25 +591,26 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
         error = clEnqueueReadBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL );
         test_error( error, "Unable to read results" );
 
-
         // Create the reference results
-        memset( referenceBuffer, 0, numElements * typeSize * vecSize * sizeof(cl_char) );
-        for( i = 0; i < numStores; i++ )
+        referenceBuffer.assign(referenceBuffer.size(), 0);
+        for (size_t i = 0; i < numStores; i++)
         {
-            memcpy( referenceBuffer + ( ( offsets[ i ] * vecSize ) + addressOffset ) * typeSize, inBuffer + i * typeSize * vecSize, typeSize * vecSize );
+            memcpy(&referenceBuffer[((offsets[i] * vecSize) + addressOffset)
+                                    * typeSize],
+                   &inBuffer[i * typeSize * vecSize], typeSize * vecSize);
         }
 
         // Validate the results now
-        char *expected = referenceBuffer;
+        char *expected = referenceBuffer.data();
         char *actual = (char *)(void *)outBuffer;
 
         if (DEBUG)
         {
             log_info("Memory contents:\n");
-            for (i=0; i<numElements; i++)
+            char inString[1024];
+            char expectedString[1024], actualString[1024];
+            for (size_t i = 0; i < numElements; i++)
             {
-                char  inString[1024];
-                char expectedString[ 1024 ], actualString[ 1024 ];
                 if (i < numStores)
                 {
                     log_info("buffer %3d: input: %s expected: %s got: %s (store offset %3d)", (int)i, GetDataVectorString( &(inBuffer[i*typeSize*vecSize]), typeSize, vecSize, inString ),
@@ -693,7 +631,7 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
             }
         }
 
-        for( i = 0; i < numElements; i++ )
+        for (size_t i = 0; i < numElements; i++)
         {
             if( memcmp( expected, actual, typeSize * vecSize ) != 0 )
             {
@@ -719,62 +657,26 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue
             actual += typeSize * vecSize;
         }
     }
-
     return 0;
 }
 
-int test_vstoreset(cl_device_id device, cl_context context, cl_command_queue queue, create_vstore_program_fn createFn, size_t bufferSize )
-{
-    ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes };
-    unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 };
-    const char *size_names[] = { "2", "3", "4", "8", "16"};
-    unsigned int typeIdx, sizeIdx;
-    int error = 0;
-    MTdata d = init_genrand( gRandomSeed );
-
-    log_info("Testing with buffer size of %d.\n", (int)bufferSize);
-
-    for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ )
-    {
-        if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) )
-            continue;
-
-        if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong )
-            continue;
-
-        for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ )
-        {
-            log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]);
-
-            int error_this_type = test_vstore( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, d );
-            if (error_this_type)
-            {
-                log_error("Failure; skipping further sizes for this type.\n");
-                error += error_this_type;
-                break;
-            }
-        }
-    }
-
-    free_mtdata(d);
-    return error;
-}
-
-
 #pragma mark -------------------- vstore test cases --------------------------
 
-void create_global_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_global_store_code(std::string &destBuffer, size_t inBufferSize,
+                              ExplicitType type, size_t inVectorSize,
+                              size_t /*unused*/)
 {
-    const char *pattern =
-    "%s"
+    // clang-format off
+    const char *pattern [] = {
+    pragma_str,
     "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
     "    vstore%d( srcValues[ tid ], offsets[ tid ], destBuffer + alignmentOffset );\n"
-    "}\n";
+    "}\n" };
 
-    const char *patternV3 =
-    "%s"
+    const char *patternV3 [] = {
+    pragma_str,
     "__kernel void test_fn( __global %s3 *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
@@ -783,45 +685,48 @@ void create_global_store_code( char *destBuffer, size_t inBufferSize, ExplicitTy
     "    } else {\n"
     "      vstore3( vload3(tid, (__global %s *)srcValues), offsets[ tid ], destBuffer + alignmentOffset );\n"
     "    }\n"
-    "}\n";
+    "}\n" };
+    // clang-format on
 
     const char *typeName = get_explicit_type_name(type);
-
     if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, typeName, typeName);
-
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, (int)inVectorSize, typeName, (int)inVectorSize );
+        std::string kernel_src =
+            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+        destBuffer = str_sprintf(kernel_src, typeName, typeName, typeName);
+    }
+    else
+    {
+        std::string kernel_src =
+            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+        destBuffer = str_sprintf(kernel_src, typeName, (int)inVectorSize,
+                                 typeName, (int)inVectorSize);
     }
-    // if(inVectorSize == 3 || inVectorSize == 4) {
-    //     log_info("\n----\n%s\n----\n", destBuffer);
-    // }
 }
 
 int test_vstore_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
 {
-    return test_vstoreset( device, context, queue, create_global_store_code, 10240 );
+    return test_vset<test_vstore>(device, context, queue,
+                                  create_global_store_code, 10240);
 }
 
-
-void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_local_store_code(std::string &destBuffer, size_t inBufferSize,
+                             ExplicitType type, size_t inVectorSize,
+                             size_t /*unused*/)
 {
-    const char *pattern =
-    "%s"
-    "\n"
-    "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
+    // clang-format off
+    const char *pattern[] = {
+    pragma_str,
+    "#define LOC_TYPE %s\n"
+    "#define LOC_VTYPE %s%d\n"
+    "__kernel void test_fn(__local LOC_VTYPE *sSharedStorage, __global LOC_VTYPE *srcValues, __global uint *offsets, __global LOC_VTYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    " sSharedStorage[ offsets[tid] ] = (%s%d)(%s)0;\n"
+    " sSharedStorage[ offsets[tid] ] = (LOC_VTYPE)(LOC_TYPE)0;\n"
     " sSharedStorage[ offsets[tid] +1 ] =  sSharedStorage[ offsets[tid] ];\n"
     "   barrier( CLK_LOCAL_MEM_FENCE );\n"
     "\n"
-    "    vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local %s *)sSharedStorage ) + alignmentOffset );\n"
+    "    vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local LOC_TYPE *)sSharedStorage ) + alignmentOffset );\n"
     "\n"
     // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output
     // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy
@@ -830,20 +735,20 @@ void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitTyp
     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
     // otherwise, local threads would be overwriting results from other local threads
     "  int i;\n"
-    "  __local %s *sp = (__local %s*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
-    "  __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
+    "  __local LOC_TYPE *sp = (__local LOC_TYPE*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n"
+    "  __global LOC_TYPE *dp = (__global LOC_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
     "  for( i = 0; (size_t)i < sizeof( sSharedStorage[0]) / sizeof( *sp ); i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
+    "}\n" };
 
-    const char *patternV3 =
-    "%s"
-    "\n"
-    "__kernel void test_fn(__local %s *sSharedStorage, __global %s *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n"
+    const char *patternV3 [] = {
+    pragma_str,
+    "#define LOC_TYPE %s\n"
+    "__kernel void test_fn(__local LOC_TYPE *sSharedStorage, __global LOC_TYPE *srcValues, __global uint *offsets, __global LOC_TYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
     "    int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    "    sSharedStorage[ 3*offsets[tid]   ] = (%s)0;\n"
+    "    sSharedStorage[ 3*offsets[tid]   ] = (LOC_TYPE)0;\n"
     "    sSharedStorage[ 3*offsets[tid] +1 ] =  \n"
     "        sSharedStorage[ 3*offsets[tid] ];\n"
     "    sSharedStorage[ 3*offsets[tid] +2 ] =  \n"
@@ -865,30 +770,26 @@ void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitTyp
     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
     // otherwise, local threads would be overwriting results from other local threads
     "  int i;\n"
-    "  __local %s *sp =  (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
-    "  __global %s *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
+    "  __local LOC_TYPE *sp =  (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n"
+    "  __global LOC_TYPE *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n"
     "  for( i = 0; i < 3; i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
+    "}\n" };
+    // clang-format on
 
     const char *typeName = get_explicit_type_name(type);
     if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName,
-                typeName,
-                typeName,  typeName,
-                typeName, typeName, typeName  );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, (int)inVectorSize,
-                typeName, (int)inVectorSize, typeName, (int)inVectorSize,
-                typeName, (int)inVectorSize, typeName,
-                (int)inVectorSize, typeName, typeName,
-                typeName, typeName, typeName  );
+        std::string kernel_src =
+            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+        destBuffer = str_sprintf(kernel_src, typeName);
+    }
+    else
+    {
+        std::string kernel_src =
+            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+        destBuffer = str_sprintf(kernel_src, typeName, typeName,
+                                 (int)inVectorSize, (int)inVectorSize);
     }
-    // log_info(destBuffer);
 }
 
 int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems )
@@ -897,81 +798,79 @@ int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue
     cl_ulong localSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
     test_error( error, "Unable to get max size of local memory buffer" );
-    if( localSize > 10240 )
-        localSize = 10240;
-    if (localSize > 4096)
-        localSize -= 2048;
-    else
-        localSize /= 2;
-    return test_vstoreset( device, context, queue, create_local_store_code, (size_t)localSize );
-}
+    localSize = std::min(localSize, (cl_ulong)2048);
 
+    return test_vset<test_vstore>(device, context, queue,
+                                  create_local_store_code, (size_t)localSize);
+}
 
-void create_private_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize )
+void create_private_store_code(std::string &destBuffer, size_t inBufferSize,
+                               ExplicitType type, size_t inVectorSize,
+                               size_t /*unused*/)
 {
-    const char *pattern =
-    "%s"
+    // clang-format off
+    const char *pattern [] = {
+    pragma_str,
+    "#define PRIV_TYPE %s\n"
+    "#define PRIV_VTYPE %s%d\n"
     // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
     // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
     "\n"
-    "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n"
+    "__kernel void test_fn( __global PRIV_VTYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
-    "    __private %s%d sPrivateStorage[ %d ];\n"
-    "    int tid = get_global_id( 0 );\n"
+    "  __private PRIV_VTYPE sPrivateStorage[ %d ];\n"
+    "  int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    " sPrivateStorage[tid] = (%s%d)(%s)0;\n"
+    " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
     "\n"
-    "   vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
+    "  vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
     "\n"
     // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
     // otherwise, local threads would be overwriting results from other local threads
     "  uint i;\n"
-    "  __private %s *sp = (__private %s*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
-    "  __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
+    "  __private PRIV_TYPE *sp = (__private PRIV_TYPE*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n"
+    "  __global PRIV_TYPE *dp = (__global PRIV_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n"
     "  for( i = 0; i < sizeof( sPrivateStorage[0]) / sizeof( *sp ); i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
-
+    "}\n"};
 
-    const char *patternV3 =
-    "%s"
+    const char *patternV3  [] = {
+    pragma_str,
+    "#define PRIV_TYPE %s\n"
+    "#define PRIV_VTYPE %s3\n"
     // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means
     // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test
     "\n"
-    "__kernel void test_fn( __global %s *srcValues, __global uint *offsets, __global %s3 *destBuffer, uint alignmentOffset )\n"
+    "__kernel void test_fn( __global PRIV_TYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n"
     "{\n"
-    "    __private %s3 sPrivateStorage[ %d ];\n" // keep this %d
-    "    int tid = get_global_id( 0 );\n"
+    "  __private PRIV_VTYPE sPrivateStorage[ %d ];\n" // keep this %d
+    "  int tid = get_global_id( 0 );\n"
     // We need to zero the shared storage since any locations we don't write to will have garbage otherwise.
-    " sPrivateStorage[tid] = (%s3)(%s)0;\n"
+    " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n"
     "\n"
-
-    "   vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n"
-    "\n"
-    // Note: we only copy the relevant portion of our local storage over to the dest buffer, because
-    // otherwise, local threads would be overwriting results from other local threads
+    "  vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n"
     "  uint i;\n"
-    "  __private %s *sp = ((__private %s*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
-    "  __global %s *dp = ((__global %s*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
+    "  __private PRIV_TYPE *sp = ((__private PRIV_TYPE*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n"
+    "  __global PRIV_TYPE *dp = ((__global PRIV_TYPE*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n"
     "  for( i = 0; i < 3; i++ ) \n"
     "       dp[i] = sp[i];\n"
-    "}\n";
+    "}\n"};
+    // clang-format on
 
     const char *typeName = get_explicit_type_name(type);
     if(inVectorSize == 3) {
-        sprintf( destBuffer, patternV3,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName,  typeName,
-                typeName, (int)inBufferSize,
-                typeName, typeName,
-                typeName, typeName, typeName, typeName, typeName );
-    } else {
-        sprintf( destBuffer, pattern,
-                type == kDouble ? doubleExtensionPragma : "",
-                typeName, (int)inVectorSize, typeName, (int)inVectorSize,
-                typeName, (int)inVectorSize, (int)inBufferSize,
-                typeName, (int)inVectorSize, typeName,
-                (int)inVectorSize, typeName, typeName, typeName, typeName, typeName );
+        std::string kernel_src =
+            concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0]));
+        destBuffer =
+            str_sprintf(kernel_src, typeName, typeName, (int)inBufferSize);
+    }
+    else
+    {
+        std::string kernel_src =
+            concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0]));
+        destBuffer =
+            str_sprintf(kernel_src, typeName, typeName, (int)inVectorSize,
+                        (int)inBufferSize, (int)inVectorSize);
     }
 }
 
@@ -979,7 +878,8 @@ int test_vstore_private(cl_device_id device, cl_context context, cl_command_queu
 {
     // We have no idea how much actual private storage is available, so just pick a reasonable value,
     // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes
-    return test_vstoreset( device, context, queue, create_private_store_code, 256 );
+    return test_vset<test_vstore>(device, context, queue,
+                                  create_private_store_code, 256);
 }
 
 
diff --git a/test_conformance/basic/utils.h b/test_conformance/basic/utils.h
new file mode 100644
index 000000000..d7b30a09b
--- /dev/null
+++ b/test_conformance/basic/utils.h
@@ -0,0 +1,47 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef BASIC_UTILS_H
+#define BASIC_UTILS_H
+
+#include <memory>
+#include <string>
+#include <CL/cl_half.h>
+
+extern cl_half_rounding_mode halfRoundingMode;
+
+inline std::string concat_kernel(const char *sstr[], int num)
+{
+    std::string res;
+    for (int i = 0; i < num; i++) res += std::string(sstr[i]);
+    return res;
+}
+
+template <typename... Args>
+inline std::string str_sprintf(const std::string &str, Args... args)
+{
+    int str_size = std::snprintf(nullptr, 0, str.c_str(), args...) + 1;
+    if (str_size <= 0) throw std::runtime_error("Formatting error.");
+    size_t s = static_cast<size_t>(str_size);
+    std::unique_ptr<char[]> buffer(new char[s]);
+    std::snprintf(buffer.get(), s, str.c_str(), args...);
+    return std::string(buffer.get(), buffer.get() + s - 1);
+}
+
+#define HFF(num) cl_half_from_float(num, halfRoundingMode)
+#define HTF(num) cl_half_to_float(num)
+
+#endif // BASIC_UTIL_H

From 310d2ed1d19cbb7962a65f7b941ad56b46407811 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Mon, 12 Jun 2023 09:38:51 +0200
Subject: [PATCH 2/4] Moved string helper procedures due to request from
 test_commonfns PR #1695

---
 .../utils.h => test_common/harness/stringHelpers.h    |  7 -------
 test_conformance/basic/test_astype.cpp                |  7 +++----
 test_conformance/basic/test_vloadstore.cpp            | 11 ++++++++---
 3 files changed, 11 insertions(+), 14 deletions(-)
 rename test_conformance/basic/utils.h => test_common/harness/stringHelpers.h (88%)

diff --git a/test_conformance/basic/utils.h b/test_common/harness/stringHelpers.h
similarity index 88%
rename from test_conformance/basic/utils.h
rename to test_common/harness/stringHelpers.h
index 6409efcde..3f6bf64db 100644
--- a/test_conformance/basic/utils.h
+++ b/test_common/harness/stringHelpers.h
@@ -20,10 +20,6 @@
 #include <memory>
 #include <string>
 
-#include <CL/cl_half.h>
-
-extern cl_half_rounding_mode halfRoundingMode;
-
 inline std::string concat_kernel(const char *sstr[], int num)
 {
     std::string res;
@@ -42,7 +38,4 @@ inline std::string str_sprintf(const std::string &str, Args... args)
     return std::string(buffer.get(), buffer.get() + s - 1);
 }
 
-#define HFF(num) cl_half_from_float(num, halfRoundingMode)
-#define HTF(num) cl_half_to_float(num)
-
 #endif // BASIC_UTIL_H
diff --git a/test_conformance/basic/test_astype.cpp b/test_conformance/basic/test_astype.cpp
index 08a4cb85a..45669a7cb 100644
--- a/test_conformance/basic/test_astype.cpp
+++ b/test_conformance/basic/test_astype.cpp
@@ -14,6 +14,9 @@
 // limitations under the License.
 //
 #include "harness/compat.h"
+#include "harness/conversions.h"
+#include "harness/stringHelpers.h"
+#include "harness/typeWrappers.h"
 
 #include <limits.h>
 #include <stdio.h>
@@ -22,11 +25,7 @@
 #include <sys/stat.h>
 #include <vector>
 
-#include "harness/conversions.h"
-#include "harness/typeWrappers.h"
-
 #include "procs.h"
-#include "utils.h"
 
 // clang-format off
 
diff --git a/test_conformance/basic/test_vloadstore.cpp b/test_conformance/basic/test_vloadstore.cpp
index b6c2b93fb..c2f7399e1 100644
--- a/test_conformance/basic/test_vloadstore.cpp
+++ b/test_conformance/basic/test_vloadstore.cpp
@@ -22,24 +22,29 @@
 #include <sys/stat.h>
 #include <vector>
 
+#include <CL/cl_half.h>
+
 #include "procs.h"
 #include "harness/conversions.h"
-#include "harness/typeWrappers.h"
 #include "harness/errorHelpers.h"
-
-#include "utils.h"
+#include "harness/stringHelpers.h"
+#include "harness/typeWrappers.h"
 
 // Outputs debug information for stores
 #define DEBUG 0
 // Forces stores/loads to be done with offsets = tid
 #define LINEAR_OFFSETS 0
 #define NUM_LOADS    512
+#define HFF(num) cl_half_from_float(num, halfRoundingMode)
+#define HTF(num) cl_half_to_float(num)
 
 char pragma_str[128] = { 0 };
 char mem_type[64] = { 0 };
 char store_str[128] = { 0 };
 char load_str[128] = { 0 };
 
+extern cl_half_rounding_mode halfRoundingMode;
+
 // clang-format off
 static const char *store_pattern= "results[ tid ] = tmp;\n";
 static const char *store_patternV3 = "results[3*tid] = tmp.s0; results[3*tid+1] = tmp.s1; results[3*tid+2] = tmp.s2;\n";

From 5ecafc82098a6c724ba24bcaf13bd13dbbfde612 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Thu, 15 Jun 2023 08:48:14 +0200
Subject: [PATCH 3/4] restored original test sizes

---
 test_conformance/basic/test_vloadstore.cpp | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/test_conformance/basic/test_vloadstore.cpp b/test_conformance/basic/test_vloadstore.cpp
index c2f7399e1..e4a36b776 100644
--- a/test_conformance/basic/test_vloadstore.cpp
+++ b/test_conformance/basic/test_vloadstore.cpp
@@ -378,7 +378,11 @@ int test_vload_local(cl_device_id device, cl_context context, cl_command_queue q
     cl_ulong localSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
     test_error( error, "Unable to get max size of local memory buffer" );
-    localSize = std::min(localSize, (cl_ulong)2048);
+    if (localSize > 10240) localSize = 10240;
+    if (localSize > 4096)
+        localSize -= 2048;
+    else
+        localSize /= 2;
 
     return test_vset<test_vload>(device, context, queue, create_local_load_code,
                                  (size_t)localSize);
@@ -410,7 +414,11 @@ int test_vload_constant(cl_device_id device, cl_context context, cl_command_queu
     cl_ulong maxSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL );
     test_error( error, "Unable to get max size of constant memory buffer" );
-    maxSize = std::min(maxSize, (cl_ulong)2048);
+    if (maxSize > 10240) maxSize = 10240;
+    if (maxSize > 4096)
+        maxSize -= 2048;
+    else
+        maxSize /= 2;
 
     return test_vset<test_vload>(device, context, queue,
                                  create_constant_load_code, (size_t)maxSize);
@@ -803,8 +811,11 @@ int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue
     cl_ulong localSize;
     int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL );
     test_error( error, "Unable to get max size of local memory buffer" );
-    localSize = std::min(localSize, (cl_ulong)2048);
-
+    if (localSize > 10240) localSize = 10240;
+    if (localSize > 4096)
+        localSize -= 2048;
+    else
+        localSize /= 2;
     return test_vset<test_vstore>(device, context, queue,
                                   create_local_store_code, (size_t)localSize);
 }

From c3f337bc5093d1bc18244aff1a138904a770d45f Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 20 Jun 2023 15:55:29 +0200
Subject: [PATCH 4/4] Corrected invalid initialization of reference buffer

---
 test_conformance/basic/test_vloadstore.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_conformance/basic/test_vloadstore.cpp b/test_conformance/basic/test_vloadstore.cpp
index e4a36b776..d34ecbf90 100644
--- a/test_conformance/basic/test_vloadstore.cpp
+++ b/test_conformance/basic/test_vloadstore.cpp
@@ -225,7 +225,7 @@ int test_vload(cl_device_id device, cl_context context, cl_command_queue queue,
     test_error( error, "Unable to read results" );
 
     // Create the reference results
-    referenceBuffer.assign(0, numLoads * typeSize * vecSize);
+    referenceBuffer.assign(numLoads * typeSize * vecSize, 0);
     for( i = 0; i < numLoads; i++ )
     {
         memcpy(&referenceBuffer[i * typeSize * vecSize],