From a960b24380384457817a372314fa5b9032d2e144 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Fri, 19 May 2023 16:30:35 +0200 Subject: [PATCH 1/4] Added cl_khr_fp16 support for test_vloadstore from basic (issue #142, basic) --- test_conformance/basic/test_vloadstore.cpp | 806 +++++++++------------ test_conformance/basic/utils.h | 47 ++ 2 files changed, 400 insertions(+), 453 deletions(-) create mode 100644 test_conformance/basic/utils.h diff --git a/test_conformance/basic/test_vloadstore.cpp b/test_conformance/basic/test_vloadstore.cpp index e137f9e73..b6c2b93fb 100644 --- a/test_conformance/basic/test_vloadstore.cpp +++ b/test_conformance/basic/test_vloadstore.cpp @@ -13,52 +13,124 @@ // See the License for the specific language governing permissions and // limitations under the License. // -#include "harness/compat.h" - +#include #include #include #include #include #include #include - +#include #include "procs.h" #include "harness/conversions.h" #include "harness/typeWrappers.h" #include "harness/errorHelpers.h" +#include "utils.h" + // Outputs debug information for stores #define DEBUG 0 // Forces stores/loads to be done with offsets = tid #define LINEAR_OFFSETS 0 #define NUM_LOADS 512 -static const char *doubleExtensionPragma = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; +char pragma_str[128] = { 0 }; +char mem_type[64] = { 0 }; +char store_str[128] = { 0 }; +char load_str[128] = { 0 }; + +// clang-format off +static const char *store_pattern= "results[ tid ] = tmp;\n"; +static const char *store_patternV3 = "results[3*tid] = tmp.s0; results[3*tid+1] = tmp.s1; results[3*tid+2] = tmp.s2;\n"; +static const char *load_pattern = "sSharedStorage[ i ] = src[ i ];\n"; +static const char *load_patternV3 = "sSharedStorage[3*i] = src[ 3*i]; sSharedStorage[3*i+1] = src[3*i+1]; sSharedStorage[3*i+2] = src[3*i+2];\n"; +static const char *kernel_pattern[] = { +pragma_str, +"#define STYPE %s\n" +"__kernel void test_fn( ", mem_type, " STYPE *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n" +"{\n" +" int tid = get_global_id( 0 );\n" +" %s%d tmp = vload%d( offsets[ tid ], ( (", mem_type, " STYPE *) src ) + alignmentOffsets[ tid ] );\n" +" ", store_str, +"}\n" +}; + +const char *pattern_local [] = { +pragma_str, +"__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n" +"{\n" +" int tid = get_global_id( 0 );\n" +" int lid = get_local_id( 0 );\n" +"\n" +" if( lid == 0 )\n" +" {\n" +" for( int i = 0; i < %d; i++ ) {\n" +" ", load_str, +" }\n" +" }\n" +// Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all +// threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be +// updated on all threads at that point +" barrier( CLK_LOCAL_MEM_FENCE );\n" +"\n" +" %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n" +" ", store_str, +"}\n" }; + +const char *pattern_priv [] = { +pragma_str, +// Private memory is unique per thread, unlike local storage which is unique per local work group. Which means +// for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test +"#define PRIV_TYPE %s\n" +"#define PRIV_SIZE %d\n" +"__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n" +"{\n" +" __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n" +" int tid = get_global_id( 0 );\n" +"\n" +" for( int i = 0; i < PRIV_SIZE; i++ )\n" +" sPrivateStorage[ i ] = src[ i ];\n" +// Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for +// anybody else to sync up +"\n" +" %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n" +" ", store_str, +"}\n"}; +// clang-format on #pragma mark -------------------- vload harness -------------------------- -typedef void (*create_vload_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize ); +typedef void (*create_program_fn)(std::string &, size_t, ExplicitType, size_t, + size_t); +typedef int (*test_fn)(cl_device_id, cl_context, cl_command_queue, ExplicitType, + unsigned int, create_program_fn, size_t); -int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize, - create_vload_program_fn createFn, size_t bufferSize, MTdata d ) +int test_vload(cl_device_id device, cl_context context, cl_command_queue queue, + ExplicitType type, unsigned int vecSize, + create_program_fn createFn, size_t bufferSize) { - int error; - clProgramWrapper program; clKernelWrapper kernel; clMemWrapper streams[ 4 ]; + MTdataHolder d(gRandomSeed); const size_t numLoads = (DEBUG) ? 16 : NUM_LOADS; if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128; size_t threads[ 1 ], localThreads[ 1 ]; clProtectedArray inBuffer( bufferSize ); - char programSrc[ 10240 ]; cl_uint offsets[ numLoads ], alignmentOffsets[ numLoads ]; size_t numElements, typeSize, i; unsigned int outVectorSize; + pragma_str[0] = '\0'; + if (type == kDouble) + std::snprintf(pragma_str, sizeof(pragma_str), + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"); + else if (type == kHalf) + std::snprintf(pragma_str, sizeof(pragma_str), + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"); typeSize = get_explicit_type_size( type ); numElements = bufferSize / ( typeSize * vecSize ); @@ -83,25 +155,19 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, outVectorSize = vecSize; // Declare output buffers now -#if !(defined(_WIN32) && defined(_MSC_VER)) - char outBuffer[ numLoads * typeSize * outVectorSize ]; - char referenceBuffer[ numLoads * typeSize * vecSize ]; -#else - char* outBuffer = (char*)_malloca(numLoads * typeSize * outVectorSize * sizeof(cl_char)); - char* referenceBuffer = (char*)_malloca(numLoads * typeSize * vecSize * sizeof(cl_char)); -#endif + std::vector outBuffer(numLoads * typeSize * outVectorSize); + std::vector referenceBuffer(numLoads * typeSize * vecSize); // Create the program - - + std::string programSrc; createFn( programSrc, numElements, type, vecSize, outVectorSize); // Create our kernel - const char *ptr = programSrc; - - error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" ); + const char *ptr = programSrc.c_str(); + cl_int error = create_single_kernel_helper(context, &program, &kernel, 1, + &ptr, "test_fn"); test_error( error, "Unable to create testing kernel" ); - if (DEBUG) log_info("Kernel: \n%s\n", programSrc); + if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str()); // Get the number of args to differentiate the kernels with local storage. (They have 5) cl_uint numArgs; @@ -115,7 +181,9 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, test_error( error, "Unable to create kernel stream" ); streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(alignmentOffsets[0]), alignmentOffsets, &error ); test_error( error, "Unable to create kernel stream" ); - streams[ 3 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*typeSize*outVectorSize, (void *)outBuffer, &error ); + streams[3] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + numLoads * typeSize * outVectorSize, + (void *)outBuffer.data(), &error); test_error( error, "Unable to create kernel stream" ); // Set parameters and run @@ -145,28 +213,32 @@ int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, test_error( error, "Unable to exec kernel" ); // Get the results - error = clEnqueueReadBuffer( queue, streams[ 3 ], CL_TRUE, 0, numLoads * typeSize * outVectorSize * sizeof(cl_char), (void *)outBuffer, 0, NULL, NULL ); + error = clEnqueueReadBuffer(queue, streams[3], CL_TRUE, 0, + numLoads * typeSize * outVectorSize + * sizeof(cl_char), + (void *)outBuffer.data(), 0, NULL, NULL); test_error( error, "Unable to read results" ); - // Create the reference results - memset( referenceBuffer, 0, numLoads * typeSize * vecSize * sizeof(cl_char)); + referenceBuffer.assign(0, numLoads * typeSize * vecSize); for( i = 0; i < numLoads; i++ ) { - memcpy( referenceBuffer + i * typeSize * vecSize, ( (char *)(void *)inBuffer ) + ( ( offsets[ i ] * vecSize ) + alignmentOffsets[ i ] ) * typeSize, - typeSize * vecSize ); + memcpy(&referenceBuffer[i * typeSize * vecSize], + ((char *)(void *)inBuffer) + + ((offsets[i] * vecSize) + alignmentOffsets[i]) * typeSize, + typeSize * vecSize); } // Validate the results now - char *expected = referenceBuffer; - char *actual = outBuffer; + char *expected = referenceBuffer.data(); + char *actual = outBuffer.data(); char *in = (char *)(void *)inBuffer; if (DEBUG) { log_info("Memory contents:\n"); + char inString[1024]; + char expectedString[1024], actualString[1024]; for (i=0; i +int test_vset(cl_device_id device, cl_context context, cl_command_queue queue, + create_program_fn createFn, size_t bufferSize) { - ExplicitType vecType[] = { kChar, kUChar, kShort, kUShort, kInt, kUInt, kLong, kULong, kFloat, kDouble, kNumExplicitTypes }; + std::vector vecType = { kChar, kUChar, kShort, kUShort, + kInt, kUInt, kLong, kULong, + kFloat, kHalf, kDouble }; unsigned int vecSizes[] = { 2, 3, 4, 8, 16, 0 }; const char *size_names[] = { "2", "3", "4", "8", "16"}; - unsigned int typeIdx, sizeIdx; int error = 0; - MTdata mtData = init_genrand( gRandomSeed ); log_info("Testing with buffer size of %d.\n", (int)bufferSize); - for( typeIdx = 0; vecType[ typeIdx ] != kNumExplicitTypes; typeIdx++ ) - { + bool hasDouble = is_extension_available(device, "cl_khr_fp64"); + bool hasHalf = is_extension_available(device, "cl_khr_fp16"); - if( vecType[ typeIdx ] == kDouble && !is_extension_available( device, "cl_khr_fp64" ) ) + for (unsigned typeIdx = 0; typeIdx < vecType.size(); typeIdx++) + { + if (vecType[typeIdx] == kDouble && !hasDouble) continue; - - if(( vecType[ typeIdx ] == kLong || vecType[ typeIdx ] == kULong ) && !gHasLong ) + else if (vecType[typeIdx] == kHalf && !hasHalf) + continue; + else if ((vecType[typeIdx] == kLong || vecType[typeIdx] == kULong) + && !gHasLong) continue; - for( sizeIdx = 0; vecSizes[ sizeIdx ] != 0; sizeIdx++ ) + for (unsigned sizeIdx = 0; vecSizes[sizeIdx] != 0; sizeIdx++) { log_info("Testing %s%s...\n", get_explicit_type_name(vecType[typeIdx]), size_names[sizeIdx]); - int error_this_type = test_vload( device, context, queue, vecType[ typeIdx ], vecSizes[ sizeIdx ], createFn, bufferSize, mtData ); + int error_this_type = + test_func_ptr(device, context, queue, vecType[typeIdx], + vecSizes[sizeIdx], createFn, bufferSize); if (error_this_type) { error += error_this_type; log_error("Failure; skipping further sizes for this type."); @@ -233,125 +312,59 @@ int test_vloadset(cl_device_id device, cl_context context, cl_command_queue queu } } } - - free_mtdata(mtData); - return error; } #pragma mark -------------------- vload test cases -------------------------- -void create_global_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize ) +void create_global_load_code(std::string &destBuffer, size_t inBufferSize, + ExplicitType type, size_t inVectorSize, + size_t outVectorSize) { - const char *pattern = - "%s%s" - "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n" - "{\n" - " int tid = get_global_id( 0 );\n" - " %s%d tmp = vload%d( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n" - " results[ tid ] = tmp;\n" - "}\n"; - - const char *patternV3 = - "%s%s" - "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n" - "{\n" - " int tid = get_global_id( 0 );\n" - " %s3 tmp = vload3( offsets[ tid ], ( (__global %s *) src ) + alignmentOffsets[ tid ] );\n" - " results[ 3*tid ] = tmp.s0;\n" - " results[ 3*tid+1 ] = tmp.s1;\n" - " results[ 3*tid+2 ] = tmp.s2;\n" - "}\n"; - + std::snprintf(mem_type, sizeof(mem_type), "__global"); + std::snprintf(store_str, sizeof(store_str), store_patternV3); const char *typeName = get_explicit_type_name(type); - if(inVectorSize == 3) { - sprintf( destBuffer, patternV3, - type == kDouble ? doubleExtensionPragma : "", - "", - typeName, typeName, typeName, typeName ); - } else { - sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "", - "", - typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize, - (int)inVectorSize, typeName ); + std::string outTypeName = typeName; + if (inVectorSize != 3) + { + outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize); + std::snprintf(store_str, sizeof(store_str), store_pattern); } + + std::string kernel_src = concat_kernel( + kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0])); + destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(), + typeName, (int)inVectorSize, (int)inVectorSize); } int test_vload_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems ) { - return test_vloadset( device, context, queue, create_global_load_code, 10240 ); + return test_vset(device, context, queue, + create_global_load_code, 10240); } - -void create_local_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize ) +void create_local_load_code(std::string &destBuffer, size_t inBufferSize, + ExplicitType type, size_t inVectorSize, + size_t outVectorSize) { - const char *pattern = - "%s%s" - //" __local %s%d sSharedStorage[ %d ];\n" - "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n" - "{\n" - " int tid = get_global_id( 0 );\n" - " int lid = get_local_id( 0 );\n" - "\n" - " if( lid == 0 )\n" - " {\n" - " for( int i = 0; i < %d; i++ )\n" - " sSharedStorage[ i ] = src[ i ];\n" - " }\n" - // Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all - // threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be - // updated on all threads at that point - " barrier( CLK_LOCAL_MEM_FENCE );\n" - "\n" - " %s%d tmp = vload%d( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n" - " results[ tid ] = tmp;\n" - "}\n"; - - const char *patternV3 = - "%s%s" - //" __local %s%d sSharedStorage[ %d ];\n" - "__kernel void test_fn(__local %s *sSharedStorage, __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n" - "{\n" - " int tid = get_global_id( 0 );\n" - " int lid = get_local_id( 0 );\n" - "\n" - " if( lid == 0 )\n" - " {\n" - " for( int i = 0; i < %d; i++ ) {\n" - " sSharedStorage[ 3*i ] = src[ 3*i ];\n" - " sSharedStorage[ 3*i +1] = src[ 3*i +1];\n" - " sSharedStorage[ 3*i +2] = src[ 3*i +2];\n" - " }\n" - " }\n" - // Note: the above loop will only run on the first thread of each local group, but this barrier should ensure that all - // threads are caught up (including the first one with the copy) before any proceed, i.e. the shared storage should be - // updated on all threads at that point - " barrier( CLK_LOCAL_MEM_FENCE );\n" - "\n" - " %s3 tmp = vload3( offsets[ tid ], ( (__local %s *) sSharedStorage ) + alignmentOffsets[ tid ] );\n" - " results[ 3*tid ] = tmp.s0;\n" - " results[ 3*tid +1] = tmp.s1;\n" - " results[ 3*tid +2] = tmp.s2;\n" - "}\n"; - + std::snprintf(store_str, sizeof(store_str), store_patternV3); + std::snprintf(load_str, sizeof(load_str), load_patternV3); const char *typeName = get_explicit_type_name(type); - if(inVectorSize == 3) { - sprintf( destBuffer, patternV3, - type == kDouble ? doubleExtensionPragma : "", - "", - typeName, /*(int)inBufferSize,*/ - typeName, typeName, - (int)inBufferSize, - typeName, typeName ); - } else { - sprintf( destBuffer, pattern, - type == kDouble ? doubleExtensionPragma : "", - "", - typeName, (int)inVectorSize, /*(int)inBufferSize,*/ - typeName, (int)inVectorSize, typeName, (int)outVectorSize, - (int)inBufferSize, - typeName, (int)inVectorSize, (int)inVectorSize, typeName ); + std::string outTypeName = typeName; + std::string inTypeName = typeName; + if (inVectorSize != 3) + { + outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize); + inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize); + std::snprintf(store_str, sizeof(store_str), store_pattern); + std::snprintf(load_str, sizeof(load_str), load_pattern); } + + std::string kernel_src = concat_kernel( + pattern_local, sizeof(pattern_local) / sizeof(pattern_local[0])); + destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), inTypeName.c_str(), + outTypeName.c_str(), (int)inBufferSize, typeName, + (int)inVectorSize, (int)inVectorSize, typeName); } int test_vload_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems ) @@ -360,53 +373,30 @@ int test_vload_local(cl_device_id device, cl_context context, cl_command_queue q cl_ulong localSize; int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL ); test_error( error, "Unable to get max size of local memory buffer" ); - if( localSize > 10240 ) - localSize = 10240; - if (localSize > 4096) - localSize -= 2048; - else - localSize /= 2; + localSize = std::min(localSize, (cl_ulong)2048); - return test_vloadset( device, context, queue, create_local_load_code, (size_t)localSize ); + return test_vset(device, context, queue, create_local_load_code, + (size_t)localSize); } - -void create_constant_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize ) +void create_constant_load_code(std::string &destBuffer, size_t inBufferSize, + ExplicitType type, size_t inVectorSize, + size_t outVectorSize) { - const char *pattern = - "%s%s" - "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n" - "{\n" - " int tid = get_global_id( 0 );\n" - " %s%d tmp = vload%d( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n" - " results[ tid ] = tmp;\n" - "}\n"; - - const char *patternV3 = - "%s%s" - "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n" - "{\n" - " int tid = get_global_id( 0 );\n" - " %s3 tmp = vload3( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n" - " results[ 3*tid ] = tmp.s0;\n" - " results[ 3*tid+1 ] = tmp.s1;\n" - " results[ 3*tid+2 ] = tmp.s2;\n" - "}\n"; - + std::snprintf(mem_type, sizeof(mem_type), "__constant"); + std::snprintf(store_str, sizeof(store_str), store_patternV3); const char *typeName = get_explicit_type_name(type); - if(inVectorSize == 3) { - sprintf( destBuffer, patternV3, - type == kDouble ? doubleExtensionPragma : "", - "", - typeName, typeName, typeName, - typeName ); - } else { - sprintf( destBuffer, pattern, - type == kDouble ? doubleExtensionPragma : "", - "", - typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize, - (int)inVectorSize, typeName ); + std::string outTypeName = typeName; + if (inVectorSize != 3) + { + outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize); + std::snprintf(store_str, sizeof(store_str), store_pattern); } + + std::string kernel_src = concat_kernel( + kernel_pattern, sizeof(kernel_pattern) / sizeof(kernel_pattern[0])); + destBuffer = str_sprintf(kernel_src, typeName, outTypeName.c_str(), + typeName, (int)inVectorSize, (int)inVectorSize); } int test_vload_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems ) @@ -415,109 +405,67 @@ int test_vload_constant(cl_device_id device, cl_context context, cl_command_queu cl_ulong maxSize; int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL ); test_error( error, "Unable to get max size of constant memory buffer" ); - if( maxSize > 10240 ) - maxSize = 10240; - if (maxSize > 4096) - maxSize -= 2048; - else - maxSize /= 2; + maxSize = std::min(maxSize, (cl_ulong)2048); - return test_vloadset( device, context, queue, create_constant_load_code, (size_t)maxSize ); + return test_vset(device, context, queue, + create_constant_load_code, (size_t)maxSize); } - -void create_private_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize ) +void create_private_load_code(std::string &destBuffer, size_t inBufferSize, + ExplicitType type, size_t inVectorSize, + size_t outVectorSize) { - const char *pattern = - "%s%s" - // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means - // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test - "#define PRIV_TYPE %s%d\n" - "#define PRIV_SIZE %d\n" - "__kernel void test_fn( __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n" - "{\n" - " __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n" - " int tid = get_global_id( 0 );\n" - "\n" - " for( int i = 0; i < %d; i++ )\n" - " sPrivateStorage[ i ] = src[ i ];\n" - // Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for - // anybody else to sync up - "\n" - " %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n" - " results[ tid ] = tmp;\n" - "}\n"; - - const char *patternV3 = - "%s%s" - // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means - // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test - "#define PRIV_TYPE %s\n" - "#define PRIV_SIZE %d\n" - "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n" - "{\n" - " __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n" - " int tid = get_global_id( 0 );\n" - "\n" - " for( int i = 0; i < PRIV_SIZE; i++ )\n" - " {\n" - " sPrivateStorage[ i ] = src[ i ];\n" - " }\n" - // Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for - // anybody else to sync up - "\n" - " %s3 tmp = vload3( offsets[ tid ], ( sPrivateStorage ) + alignmentOffsets[ tid ] );\n" - " results[ 3*tid ] = tmp.s0;\n" - " results[ 3*tid+1 ] = tmp.s1;\n" - " results[ 3*tid+2 ] = tmp.s2;\n" - "}\n"; - + std::snprintf(store_str, sizeof(store_str), store_patternV3); const char *typeName = get_explicit_type_name(type); - if(inVectorSize ==3) { - sprintf( destBuffer, patternV3, - type == kDouble ? doubleExtensionPragma : "", - "", - typeName, 3*((int)inBufferSize), - typeName, typeName, - typeName ); - // log_info("Src is \"\n%s\n\"\n", destBuffer); - } else { - sprintf( destBuffer, pattern, - type == kDouble ? doubleExtensionPragma : "", - "", - typeName, (int)inVectorSize, (int)inBufferSize, - typeName, (int)inVectorSize, typeName, (int)outVectorSize, - (int)inBufferSize, - typeName, (int)inVectorSize, (int)inVectorSize, typeName ); + std::string outTypeName = typeName; + std::string inTypeName = typeName; + int bufSize = (int)inBufferSize * 3; + if (inVectorSize != 3) + { + outTypeName = str_sprintf("%s%d", typeName, (int)outVectorSize); + inTypeName = str_sprintf("%s%d", typeName, (int)inVectorSize); + bufSize = (int)inBufferSize; + std::snprintf(store_str, sizeof(store_str), store_pattern); } + + std::string kernel_src = concat_kernel( + pattern_priv, sizeof(pattern_priv) / sizeof(pattern_priv[0])); + destBuffer = str_sprintf(kernel_src, inTypeName.c_str(), bufSize, + inTypeName.c_str(), outTypeName.c_str(), typeName, + (int)inVectorSize, (int)inVectorSize, typeName); } int test_vload_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems ) { // We have no idea how much actual private storage is available, so just pick a reasonable value, // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes - return test_vloadset( device, context, queue, create_private_load_code, 256 ); + return test_vset(device, context, queue, + create_private_load_code, 256); } - /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #pragma mark -------------------- vstore harness -------------------------- -typedef void (*create_vstore_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize ); - -int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize, - create_vstore_program_fn createFn, size_t bufferSize, MTdata d ) +int test_vstore(cl_device_id device, cl_context context, cl_command_queue queue, + ExplicitType type, unsigned int vecSize, + create_program_fn createFn, size_t bufferSize) { - int error; - clProgramWrapper program; clKernelWrapper kernel; clMemWrapper streams[ 3 ]; + MTdataHolder d(gRandomSeed); size_t threads[ 1 ], localThreads[ 1 ]; - size_t numElements, typeSize, numStores = (DEBUG) ? 16 : NUM_LOADS; + pragma_str[0] = '\0'; + if (type == kDouble) + std::snprintf(pragma_str, sizeof(pragma_str), + "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"); + else if (type == kHalf) + std::snprintf(pragma_str, sizeof(pragma_str), + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"); + if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128; @@ -534,39 +482,22 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue } if (DEBUG) log_info("Testing: numStores: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numStores, (int)typeSize, vecSize, (int)numElements, (int)bufferSize); -#if !(defined(_WIN32) && defined(_MSC_VER)) - cl_uint offsets[ numStores ]; -#else - cl_uint* offsets = (cl_uint*)_malloca(numStores * sizeof(cl_uint)); -#endif - char programSrc[ 10240 ]; - size_t i; - -#if !(defined(_WIN32) && defined(_MSC_VER)) - char inBuffer[ numStores * typeSize * vecSize ]; -#else - char* inBuffer = (char*)_malloca( numStores * typeSize * vecSize * sizeof(cl_char)); -#endif + + std::vector offsets(numStores); + std::vector inBuffer(numStores * typeSize * vecSize); + clProtectedArray outBuffer( numElements * typeSize * vecSize ); -#if !(defined(_WIN32) && defined(_MSC_VER)) - char referenceBuffer[ numElements * typeSize * vecSize ]; -#else - char* referenceBuffer = (char*)_malloca(numElements * typeSize * vecSize * sizeof(cl_char)); -#endif + std::vector referenceBuffer(numElements * typeSize * vecSize); // Create some random input data and random offsets to load from - generate_random_data( type, numStores * vecSize, d, (void *)inBuffer ); + generate_random_data(type, numStores * vecSize, d, (void *)inBuffer.data()); // Note: make sure no two offsets are the same, otherwise the output would depend on // the order that threads ran in, and that would be next to impossible to verify -#if !(defined(_WIN32) && defined(_MSC_VER)) - char flags[ numElements ]; -#else - char* flags = (char*)_malloca( numElements * sizeof(char)); -#endif - - memset( flags, 0, numElements * sizeof(char) ); - for( i = 0; i < numStores; i++ ) + std::vector flags(numElements); + flags.assign(flags.size(), 0); + + for (size_t i = 0; i < numStores; i++) { do { @@ -579,13 +510,15 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue if (LINEAR_OFFSETS) log_info("Offsets set to thread IDs to simplify output.\n"); - createFn( programSrc, numElements, type, vecSize ); + std::string programSrc; + createFn(programSrc, numElements, type, vecSize, vecSize); // Create our kernel - const char *ptr = programSrc; - error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" ); + const char *ptr = programSrc.c_str(); + cl_int error = create_single_kernel_helper(context, &program, &kernel, 1, + &ptr, "test_fn"); test_error( error, "Unable to create testing kernel" ); - if (DEBUG) log_info("Kernel: \n%s\n", programSrc); + if (DEBUG) log_info("Kernel: \n%s\n", programSrc.c_str()); // Get the number of args to differentiate the kernels with local storage. (They have 5) cl_uint numArgs; @@ -593,9 +526,14 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue test_error( error, "clGetKernelInfo failed"); // Set up parameters - streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * typeSize * vecSize * sizeof(cl_char), (void *)inBuffer, &error ); + streams[0] = + clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + numStores * typeSize * vecSize * sizeof(cl_char), + (void *)inBuffer.data(), &error); test_error( error, "Unable to create kernel stream" ); - streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * sizeof(cl_uint), offsets, &error ); + streams[1] = + clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, + numStores * sizeof(cl_uint), offsets.data(), &error); test_error( error, "Unable to create kernel stream" ); streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numElements * typeSize * vecSize, (void *)outBuffer, &error ); test_error( error, "Unable to create kernel stream" ); @@ -606,7 +544,7 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue // We need to set the size of the local storage error = clSetKernelArg(kernel, 0, bufferSize, NULL); test_error( error, "clSetKernelArg for buffer failed"); - for( i = 0; i < 3; i++ ) + for (size_t i = 0; i < 3; i++) { error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] ); test_error( error, "Unable to set kernel argument" ); @@ -615,11 +553,10 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue else { // No local storage - for( i = 0; i < 3; i++ ) + for (size_t i = 0; i < 3; i++) { error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] ); - if (error) - log_info("%s\n", programSrc); + if (error) log_info("%s\n", programSrc.c_str()); test_error( error, "Unable to set kernel argument" ); } } @@ -654,25 +591,26 @@ int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue error = clEnqueueReadBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL ); test_error( error, "Unable to read results" ); - // Create the reference results - memset( referenceBuffer, 0, numElements * typeSize * vecSize * sizeof(cl_char) ); - for( i = 0; i < numStores; i++ ) + referenceBuffer.assign(referenceBuffer.size(), 0); + for (size_t i = 0; i < numStores; i++) { - memcpy( referenceBuffer + ( ( offsets[ i ] * vecSize ) + addressOffset ) * typeSize, inBuffer + i * typeSize * vecSize, typeSize * vecSize ); + memcpy(&referenceBuffer[((offsets[i] * vecSize) + addressOffset) + * typeSize], + &inBuffer[i * typeSize * vecSize], typeSize * vecSize); } // Validate the results now - char *expected = referenceBuffer; + char *expected = referenceBuffer.data(); char *actual = (char *)(void *)outBuffer; if (DEBUG) { log_info("Memory contents:\n"); - for (i=0; i(device, context, queue, + create_global_store_code, 10240); } - -void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize ) +void create_local_store_code(std::string &destBuffer, size_t inBufferSize, + ExplicitType type, size_t inVectorSize, + size_t /*unused*/) { - const char *pattern = - "%s" - "\n" - "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n" + // clang-format off + const char *pattern[] = { + pragma_str, + "#define LOC_TYPE %s\n" + "#define LOC_VTYPE %s%d\n" + "__kernel void test_fn(__local LOC_VTYPE *sSharedStorage, __global LOC_VTYPE *srcValues, __global uint *offsets, __global LOC_VTYPE *destBuffer, uint alignmentOffset )\n" "{\n" " int tid = get_global_id( 0 );\n" // We need to zero the shared storage since any locations we don't write to will have garbage otherwise. - " sSharedStorage[ offsets[tid] ] = (%s%d)(%s)0;\n" + " sSharedStorage[ offsets[tid] ] = (LOC_VTYPE)(LOC_TYPE)0;\n" " sSharedStorage[ offsets[tid] +1 ] = sSharedStorage[ offsets[tid] ];\n" " barrier( CLK_LOCAL_MEM_FENCE );\n" "\n" - " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local %s *)sSharedStorage ) + alignmentOffset );\n" + " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local LOC_TYPE *)sSharedStorage ) + alignmentOffset );\n" "\n" // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy @@ -830,20 +735,20 @@ void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitTyp // Note: we only copy the relevant portion of our local storage over to the dest buffer, because // otherwise, local threads would be overwriting results from other local threads " int i;\n" - " __local %s *sp = (__local %s*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n" - " __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n" + " __local LOC_TYPE *sp = (__local LOC_TYPE*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n" + " __global LOC_TYPE *dp = (__global LOC_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n" " for( i = 0; (size_t)i < sizeof( sSharedStorage[0]) / sizeof( *sp ); i++ ) \n" " dp[i] = sp[i];\n" - "}\n"; + "}\n" }; - const char *patternV3 = - "%s" - "\n" - "__kernel void test_fn(__local %s *sSharedStorage, __global %s *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n" + const char *patternV3 [] = { + pragma_str, + "#define LOC_TYPE %s\n" + "__kernel void test_fn(__local LOC_TYPE *sSharedStorage, __global LOC_TYPE *srcValues, __global uint *offsets, __global LOC_TYPE *destBuffer, uint alignmentOffset )\n" "{\n" " int tid = get_global_id( 0 );\n" // We need to zero the shared storage since any locations we don't write to will have garbage otherwise. - " sSharedStorage[ 3*offsets[tid] ] = (%s)0;\n" + " sSharedStorage[ 3*offsets[tid] ] = (LOC_TYPE)0;\n" " sSharedStorage[ 3*offsets[tid] +1 ] = \n" " sSharedStorage[ 3*offsets[tid] ];\n" " sSharedStorage[ 3*offsets[tid] +2 ] = \n" @@ -865,30 +770,26 @@ void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitTyp // Note: we only copy the relevant portion of our local storage over to the dest buffer, because // otherwise, local threads would be overwriting results from other local threads " int i;\n" - " __local %s *sp = (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n" - " __global %s *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n" + " __local LOC_TYPE *sp = (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n" + " __global LOC_TYPE *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n" " for( i = 0; i < 3; i++ ) \n" " dp[i] = sp[i];\n" - "}\n"; + "}\n" }; + // clang-format on const char *typeName = get_explicit_type_name(type); if(inVectorSize == 3) { - sprintf( destBuffer, patternV3, - type == kDouble ? doubleExtensionPragma : "", - typeName, - typeName, - typeName, typeName, - typeName, typeName, typeName ); - } else { - sprintf( destBuffer, pattern, - type == kDouble ? doubleExtensionPragma : "", - typeName, (int)inVectorSize, - typeName, (int)inVectorSize, typeName, (int)inVectorSize, - typeName, (int)inVectorSize, typeName, - (int)inVectorSize, typeName, typeName, - typeName, typeName, typeName ); + std::string kernel_src = + concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0])); + destBuffer = str_sprintf(kernel_src, typeName); + } + else + { + std::string kernel_src = + concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0])); + destBuffer = str_sprintf(kernel_src, typeName, typeName, + (int)inVectorSize, (int)inVectorSize); } - // log_info(destBuffer); } int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems ) @@ -897,81 +798,79 @@ int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue cl_ulong localSize; int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL ); test_error( error, "Unable to get max size of local memory buffer" ); - if( localSize > 10240 ) - localSize = 10240; - if (localSize > 4096) - localSize -= 2048; - else - localSize /= 2; - return test_vstoreset( device, context, queue, create_local_store_code, (size_t)localSize ); -} + localSize = std::min(localSize, (cl_ulong)2048); + return test_vset(device, context, queue, + create_local_store_code, (size_t)localSize); +} -void create_private_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize ) +void create_private_store_code(std::string &destBuffer, size_t inBufferSize, + ExplicitType type, size_t inVectorSize, + size_t /*unused*/) { - const char *pattern = - "%s" + // clang-format off + const char *pattern [] = { + pragma_str, + "#define PRIV_TYPE %s\n" + "#define PRIV_VTYPE %s%d\n" // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test "\n" - "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n" + "__kernel void test_fn( __global PRIV_VTYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n" "{\n" - " __private %s%d sPrivateStorage[ %d ];\n" - " int tid = get_global_id( 0 );\n" + " __private PRIV_VTYPE sPrivateStorage[ %d ];\n" + " int tid = get_global_id( 0 );\n" // We need to zero the shared storage since any locations we don't write to will have garbage otherwise. - " sPrivateStorage[tid] = (%s%d)(%s)0;\n" + " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n" "\n" - " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n" + " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n" "\n" // Note: we only copy the relevant portion of our local storage over to the dest buffer, because // otherwise, local threads would be overwriting results from other local threads " uint i;\n" - " __private %s *sp = (__private %s*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n" - " __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n" + " __private PRIV_TYPE *sp = (__private PRIV_TYPE*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n" + " __global PRIV_TYPE *dp = (__global PRIV_TYPE*) (destBuffer + offsets[tid]) + alignmentOffset;\n" " for( i = 0; i < sizeof( sPrivateStorage[0]) / sizeof( *sp ); i++ ) \n" " dp[i] = sp[i];\n" - "}\n"; - + "}\n"}; - const char *patternV3 = - "%s" + const char *patternV3 [] = { + pragma_str, + "#define PRIV_TYPE %s\n" + "#define PRIV_VTYPE %s3\n" // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test "\n" - "__kernel void test_fn( __global %s *srcValues, __global uint *offsets, __global %s3 *destBuffer, uint alignmentOffset )\n" + "__kernel void test_fn( __global PRIV_TYPE *srcValues, __global uint *offsets, __global PRIV_VTYPE *destBuffer, uint alignmentOffset )\n" "{\n" - " __private %s3 sPrivateStorage[ %d ];\n" // keep this %d - " int tid = get_global_id( 0 );\n" + " __private PRIV_VTYPE sPrivateStorage[ %d ];\n" // keep this %d + " int tid = get_global_id( 0 );\n" // We need to zero the shared storage since any locations we don't write to will have garbage otherwise. - " sPrivateStorage[tid] = (%s3)(%s)0;\n" + " sPrivateStorage[tid] = (PRIV_VTYPE)(PRIV_TYPE)0;\n" "\n" - - " vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n" - "\n" - // Note: we only copy the relevant portion of our local storage over to the dest buffer, because - // otherwise, local threads would be overwriting results from other local threads + " vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private PRIV_TYPE *)sPrivateStorage ) + alignmentOffset );\n" " uint i;\n" - " __private %s *sp = ((__private %s*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n" - " __global %s *dp = ((__global %s*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n" + " __private PRIV_TYPE *sp = ((__private PRIV_TYPE*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n" + " __global PRIV_TYPE *dp = ((__global PRIV_TYPE*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n" " for( i = 0; i < 3; i++ ) \n" " dp[i] = sp[i];\n" - "}\n"; + "}\n"}; + // clang-format on const char *typeName = get_explicit_type_name(type); if(inVectorSize == 3) { - sprintf( destBuffer, patternV3, - type == kDouble ? doubleExtensionPragma : "", - typeName, typeName, - typeName, (int)inBufferSize, - typeName, typeName, - typeName, typeName, typeName, typeName, typeName ); - } else { - sprintf( destBuffer, pattern, - type == kDouble ? doubleExtensionPragma : "", - typeName, (int)inVectorSize, typeName, (int)inVectorSize, - typeName, (int)inVectorSize, (int)inBufferSize, - typeName, (int)inVectorSize, typeName, - (int)inVectorSize, typeName, typeName, typeName, typeName, typeName ); + std::string kernel_src = + concat_kernel(patternV3, sizeof(patternV3) / sizeof(patternV3[0])); + destBuffer = + str_sprintf(kernel_src, typeName, typeName, (int)inBufferSize); + } + else + { + std::string kernel_src = + concat_kernel(pattern, sizeof(pattern) / sizeof(pattern[0])); + destBuffer = + str_sprintf(kernel_src, typeName, typeName, (int)inVectorSize, + (int)inBufferSize, (int)inVectorSize); } } @@ -979,7 +878,8 @@ int test_vstore_private(cl_device_id device, cl_context context, cl_command_queu { // We have no idea how much actual private storage is available, so just pick a reasonable value, // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes - return test_vstoreset( device, context, queue, create_private_store_code, 256 ); + return test_vset(device, context, queue, + create_private_store_code, 256); } diff --git a/test_conformance/basic/utils.h b/test_conformance/basic/utils.h new file mode 100644 index 000000000..d7b30a09b --- /dev/null +++ b/test_conformance/basic/utils.h @@ -0,0 +1,47 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifndef BASIC_UTILS_H +#define BASIC_UTILS_H + +#include +#include +#include + +extern cl_half_rounding_mode halfRoundingMode; + +inline std::string concat_kernel(const char *sstr[], int num) +{ + std::string res; + for (int i = 0; i < num; i++) res += std::string(sstr[i]); + return res; +} + +template +inline std::string str_sprintf(const std::string &str, Args... args) +{ + int str_size = std::snprintf(nullptr, 0, str.c_str(), args...) + 1; + if (str_size <= 0) throw std::runtime_error("Formatting error."); + size_t s = static_cast(str_size); + std::unique_ptr buffer(new char[s]); + std::snprintf(buffer.get(), s, str.c_str(), args...); + return std::string(buffer.get(), buffer.get() + s - 1); +} + +#define HFF(num) cl_half_from_float(num, halfRoundingMode) +#define HTF(num) cl_half_to_float(num) + +#endif // BASIC_UTIL_H From 310d2ed1d19cbb7962a65f7b941ad56b46407811 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Mon, 12 Jun 2023 09:38:51 +0200 Subject: [PATCH 2/4] Moved string helper procedures due to request from test_commonfns PR #1695 --- .../utils.h => test_common/harness/stringHelpers.h | 7 ------- test_conformance/basic/test_astype.cpp | 7 +++---- test_conformance/basic/test_vloadstore.cpp | 11 ++++++++--- 3 files changed, 11 insertions(+), 14 deletions(-) rename test_conformance/basic/utils.h => test_common/harness/stringHelpers.h (88%) diff --git a/test_conformance/basic/utils.h b/test_common/harness/stringHelpers.h similarity index 88% rename from test_conformance/basic/utils.h rename to test_common/harness/stringHelpers.h index 6409efcde..3f6bf64db 100644 --- a/test_conformance/basic/utils.h +++ b/test_common/harness/stringHelpers.h @@ -20,10 +20,6 @@ #include #include -#include - -extern cl_half_rounding_mode halfRoundingMode; - inline std::string concat_kernel(const char *sstr[], int num) { std::string res; @@ -42,7 +38,4 @@ inline std::string str_sprintf(const std::string &str, Args... args) return std::string(buffer.get(), buffer.get() + s - 1); } -#define HFF(num) cl_half_from_float(num, halfRoundingMode) -#define HTF(num) cl_half_to_float(num) - #endif // BASIC_UTIL_H diff --git a/test_conformance/basic/test_astype.cpp b/test_conformance/basic/test_astype.cpp index 08a4cb85a..45669a7cb 100644 --- a/test_conformance/basic/test_astype.cpp +++ b/test_conformance/basic/test_astype.cpp @@ -14,6 +14,9 @@ // limitations under the License. // #include "harness/compat.h" +#include "harness/conversions.h" +#include "harness/stringHelpers.h" +#include "harness/typeWrappers.h" #include #include @@ -22,11 +25,7 @@ #include #include -#include "harness/conversions.h" -#include "harness/typeWrappers.h" - #include "procs.h" -#include "utils.h" // clang-format off diff --git a/test_conformance/basic/test_vloadstore.cpp b/test_conformance/basic/test_vloadstore.cpp index b6c2b93fb..c2f7399e1 100644 --- a/test_conformance/basic/test_vloadstore.cpp +++ b/test_conformance/basic/test_vloadstore.cpp @@ -22,24 +22,29 @@ #include #include +#include + #include "procs.h" #include "harness/conversions.h" -#include "harness/typeWrappers.h" #include "harness/errorHelpers.h" - -#include "utils.h" +#include "harness/stringHelpers.h" +#include "harness/typeWrappers.h" // Outputs debug information for stores #define DEBUG 0 // Forces stores/loads to be done with offsets = tid #define LINEAR_OFFSETS 0 #define NUM_LOADS 512 +#define HFF(num) cl_half_from_float(num, halfRoundingMode) +#define HTF(num) cl_half_to_float(num) char pragma_str[128] = { 0 }; char mem_type[64] = { 0 }; char store_str[128] = { 0 }; char load_str[128] = { 0 }; +extern cl_half_rounding_mode halfRoundingMode; + // clang-format off static const char *store_pattern= "results[ tid ] = tmp;\n"; static const char *store_patternV3 = "results[3*tid] = tmp.s0; results[3*tid+1] = tmp.s1; results[3*tid+2] = tmp.s2;\n"; From 5ecafc82098a6c724ba24bcaf13bd13dbbfde612 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Thu, 15 Jun 2023 08:48:14 +0200 Subject: [PATCH 3/4] restored original test sizes --- test_conformance/basic/test_vloadstore.cpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/test_conformance/basic/test_vloadstore.cpp b/test_conformance/basic/test_vloadstore.cpp index c2f7399e1..e4a36b776 100644 --- a/test_conformance/basic/test_vloadstore.cpp +++ b/test_conformance/basic/test_vloadstore.cpp @@ -378,7 +378,11 @@ int test_vload_local(cl_device_id device, cl_context context, cl_command_queue q cl_ulong localSize; int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL ); test_error( error, "Unable to get max size of local memory buffer" ); - localSize = std::min(localSize, (cl_ulong)2048); + if (localSize > 10240) localSize = 10240; + if (localSize > 4096) + localSize -= 2048; + else + localSize /= 2; return test_vset(device, context, queue, create_local_load_code, (size_t)localSize); @@ -410,7 +414,11 @@ int test_vload_constant(cl_device_id device, cl_context context, cl_command_queu cl_ulong maxSize; int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL ); test_error( error, "Unable to get max size of constant memory buffer" ); - maxSize = std::min(maxSize, (cl_ulong)2048); + if (maxSize > 10240) maxSize = 10240; + if (maxSize > 4096) + maxSize -= 2048; + else + maxSize /= 2; return test_vset(device, context, queue, create_constant_load_code, (size_t)maxSize); @@ -803,8 +811,11 @@ int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue cl_ulong localSize; int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL ); test_error( error, "Unable to get max size of local memory buffer" ); - localSize = std::min(localSize, (cl_ulong)2048); - + if (localSize > 10240) localSize = 10240; + if (localSize > 4096) + localSize -= 2048; + else + localSize /= 2; return test_vset(device, context, queue, create_local_store_code, (size_t)localSize); } From c3f337bc5093d1bc18244aff1a138904a770d45f Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Tue, 20 Jun 2023 15:55:29 +0200 Subject: [PATCH 4/4] Corrected invalid initialization of reference buffer --- test_conformance/basic/test_vloadstore.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_conformance/basic/test_vloadstore.cpp b/test_conformance/basic/test_vloadstore.cpp index e4a36b776..d34ecbf90 100644 --- a/test_conformance/basic/test_vloadstore.cpp +++ b/test_conformance/basic/test_vloadstore.cpp @@ -225,7 +225,7 @@ int test_vload(cl_device_id device, cl_context context, cl_command_queue queue, test_error( error, "Unable to read results" ); // Create the reference results - referenceBuffer.assign(0, numLoads * typeSize * vecSize); + referenceBuffer.assign(numLoads * typeSize * vecSize, 0); for( i = 0; i < numLoads; i++ ) { memcpy(&referenceBuffer[i * typeSize * vecSize],