From d25b828df75278c34e9c1e325cb7da34416dd3d8 Mon Sep 17 00:00:00 2001 From: "Wawiorko, Grzegorz" Date: Wed, 8 Jun 2022 10:16:56 +0200 Subject: [PATCH 01/24] Enable fp16 in math bruteforce --- .../math_brute_force/CMakeLists.txt | 8 + .../math_brute_force/binary_half.cpp | 879 ++++++++++++++++++ .../math_brute_force/binary_i_double.cpp | 6 +- .../math_brute_force/binary_i_float.cpp | 7 +- .../math_brute_force/binary_i_half.cpp | 669 +++++++++++++ .../binary_operator_double.cpp | 3 +- .../binary_operator_float.cpp | 3 +- .../math_brute_force/function_list.cpp | 91 +- .../math_brute_force/function_list.h | 4 + .../math_brute_force/i_unary_half.cpp | 306 ++++++ .../math_brute_force/macro_binary_double.cpp | 3 +- .../math_brute_force/macro_binary_float.cpp | 3 +- .../math_brute_force/macro_binary_half.cpp | 652 +++++++++++++ .../math_brute_force/macro_unary_half.cpp | 543 +++++++++++ .../math_brute_force/mad_half.cpp | 295 ++++++ test_conformance/math_brute_force/main.cpp | 65 +- .../math_brute_force/reference_math.cpp | 34 + .../math_brute_force/reference_math.h | 4 + .../math_brute_force/ternary_double.cpp | 3 +- .../math_brute_force/ternary_float.cpp | 3 +- .../math_brute_force/test_functions.h | 36 +- .../math_brute_force/unary_half.cpp | 600 ++++++++++++ .../math_brute_force/unary_u_half.cpp | 334 +++++++ test_conformance/math_brute_force/utility.h | 4 + 24 files changed, 4500 insertions(+), 55 deletions(-) create mode 100644 test_conformance/math_brute_force/binary_half.cpp create mode 100644 test_conformance/math_brute_force/binary_i_half.cpp create mode 100644 test_conformance/math_brute_force/i_unary_half.cpp create mode 100644 test_conformance/math_brute_force/macro_binary_half.cpp create mode 100644 test_conformance/math_brute_force/macro_unary_half.cpp create mode 100644 test_conformance/math_brute_force/mad_half.cpp create mode 100644 test_conformance/math_brute_force/unary_half.cpp create mode 100644 test_conformance/math_brute_force/unary_u_half.cpp diff 
--git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt index 28d2716f8..1b9c28f85 100644 --- a/test_conformance/math_brute_force/CMakeLists.txt +++ b/test_conformance/math_brute_force/CMakeLists.txt @@ -3,8 +3,10 @@ set(MODULE_NAME BRUTEFORCE) set(${MODULE_NAME}_SOURCES binary_double.cpp binary_float.cpp + binary_half.cpp binary_i_double.cpp binary_i_float.cpp + binary_i_half.cpp binary_operator_double.cpp binary_operator_float.cpp binary_two_results_i_double.cpp @@ -14,12 +16,16 @@ set(${MODULE_NAME}_SOURCES function_list.h i_unary_double.cpp i_unary_float.cpp + i_unary_half.cpp macro_binary_double.cpp macro_binary_float.cpp + macro_binary_half.cpp macro_unary_double.cpp macro_unary_float.cpp + macro_unary_half.cpp mad_double.cpp mad_float.cpp + mad_half.cpp main.cpp reference_math.cpp reference_math.h @@ -30,12 +36,14 @@ set(${MODULE_NAME}_SOURCES test_functions.h unary_double.cpp unary_float.cpp + unary_half.cpp unary_two_results_double.cpp unary_two_results_float.cpp unary_two_results_i_double.cpp unary_two_results_i_float.cpp unary_u_double.cpp unary_u_float.cpp + unary_u_half.cpp utility.cpp utility.h ) diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp new file mode 100644 index 000000000..770472c5c --- /dev/null +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -0,0 +1,879 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" +#include "reference_math.h" +#include <cstring> + + +static int BuildKernelHalf(const char *name, int vectorSize, + cl_uint kernel_count, cl_kernel *k, cl_program *p, + bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global half", + sizeNames[vectorSize], + "* out, __global half", + sizeNames[vectorSize], + "* in1, __global half", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global half* out, __global half* in, __global half* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " half3 d0 = vload3( 0, in + 3 * i );\n" + " half3 d1 = vload3( 0, in2 + 3 * i );\n" + " d0 = ", + name, + "( d0, d1 );\n" + " vstore3( d0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " half3 d0, d1;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (half3)( in[3*i], NAN, NAN ); \n" + " d1 = (half3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" + " d1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " d0 = ", + name, + "( d0, d1 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, + relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_uint kernel_count; // passed through to MakeKernels as kernel_count + cl_kernel **kernels; // kernels[i] is handed to BuildKernelHalf for vector size index i + cl_program *programs; // programs + i is handed to BuildKernelHalf for vector size index i + const char *nameInCode; // function name spliced into the kernel source + bool relaxedMode; // passed through to MakeKernels +} BuildKernelInfo; + + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelHalf(info->nameInCode, i, info->kernel_count, + info->kernels[i], info->programs + i, + info->relaxedMode); +} +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + cl_mem inBuf; // first input buffer for the thread + cl_mem inBuf2; // second input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. + double maxErrorValue2; // position of the max error value (param 2). Init + // to 0. 
+ MTdata d; + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; + +typedef struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + int isFDim; + int skipNanInf; + int isNextafter; +} TestInfo; + +// A table of more difficult cases to get right +static const cl_half specialValuesHalf[] = { + 0xffff, + 0x0000, + 0x0001, + 0x7c00 /*INFINITY*/, + 0xfc00 /*-INFINITY*/, + 0x8000 /*-0*/, + 0x7bff /*HALF_MAX*/, + 0x0400 /*HALF_MIN*/ +}; + +static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); + +int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter, + bool relaxedMode) +{ + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // Init test_info + memset(&test_info, 0, sizeof(test_info)); + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) 
+ { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); + test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; + test_info.isNextafter = isNextafter; + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) + { + vlog_error("Error: Unable to allocate storage for kernels!\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.k[i], 0, array_size); + } + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) + { + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + 
"region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + goto exit; + } + test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + } + + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, + test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + + vlog("\n"); + + +exit: + // Release + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + clReleaseProgram(test_info.programs[i]); + if (test_info.k[i]) + { + for (j = 0; j < test_info.threadCount; j++) + clReleaseKernel(test_info.k[i][j]); + + free(test_info.k[i]); + } + } + if (test_info.tinfo) + { + for (i = 0; i < test_info.threadCount; i++) + { + 
free_mtdata(test_info.tinfo[i].d); + clReleaseMemObject(test_info.tinfo[i].inBuf); + clReleaseMemObject(test_info.tinfo[i].inBuf2); + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + clReleaseMemObject(test_info.tinfo[i].outBuf[j]); + clReleaseCommandQueue(test_info.tinfo[i].tQueue); + } + + free(test_info.tinfo); + } + + return error; +} + +static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + float ulps = job->ulps; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + + int isFDim = job->isFDim; + int skipNanInf = job->skipNanInf; + int isNextafter = job->isNextafter; + cl_ushort *t; + cl_half *r; + float *s = 0, *s2 = 0; + + RoundingMode oldRoundMode; + cl_int copysign_test = 0; + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_ushort *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + // Init input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + cl_ushort *p2 = (cl_ushort *)gIn2 + thread_id * buffer_elements; + j = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesHalfCount; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)indx) + { // test edge cases + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; j < buffer_elements; j++) + { + p[j] = specialValuesHalf[x]; + p2[j] = specialValuesHalf[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesHalfCount) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = (cl_ushort)genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + goto exit; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); + goto exit; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint16_t pattern = 0xdead; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + goto exit; + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) + { + return CL_SUCCESS; + } + + FPU_mode_type oldMode; + oldRoundMode = kRoundToNearestEven; + if (isFDim) + { + // Calculate the correctly rounded reference result + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); + + // Set the rounding mode to match the device + if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat); + } + + if (!strcmp(name, "copysign")) copysign_test = 1; + +#define ref_func(s, s2) (copysign_test ? 
func.f_ff_f(s, s2) : func.f_ff(s, s2)) + + // Calculate the correctly rounded reference result + r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + t = (cl_ushort *)r; + s = (float *)malloc(buffer_elements * sizeof(float)); + s2 = (float *)malloc(buffer_elements * sizeof(float)); + for (j = 0; j < buffer_elements; j++) + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + s2[j] = cl_half_to_float(p2[j]); + if (isNextafter) + r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]), + CL_HALF_RTE); + else + r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE); + } + + if (isFDim && ftz) RestoreFPState(&oldMode); + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + goto exit; + } + } + + // Wait for the last buffer + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, + 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + goto exit; + } + + // Verify data + + for (j = 0; j < buffer_elements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = out[k]; + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + double correct; + if (isNextafter) + correct = reference_nextafterh(s[j], s2[j]); + else + correct = ref_func(s[j], s2[j]); + + float test = cl_half_to_float(q[j]); + + // Per section 10 paragraph 6, accept any result if an input or + // output is a infinity or NaN or overflow + if (skipNanInf) + { + // Note: no double rounding here. 
Reference functions + // calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j]) + || IsFloatInfinity(s[j]) || IsFloatNaN(s[j])) + continue; + } + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsHalfSubnormal( + cl_half_from_float(correct, CL_HALF_RTE))) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + if (IsHalfSubnormal(p[j])) + { + double correct2, correct3; + float err2, err3; + if (isNextafter) + correct2 = reference_nextafterh(0.0, s2[j]); + else + correct2 = ref_func(0.0, s2[j]); + if (isNextafter) + correct3 = reference_nextafterh(-0.0, s2[j]); + else + correct3 = ref_func(-0.0, s2[j]); + if (skipNanInf) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfSubnormal( + cl_half_from_float(correct2, CL_HALF_RTE)) + || IsHalfSubnormal( + cl_half_from_float(correct3, CL_HALF_RTE))) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // allow to omit denorm values for platforms with no + // denorm support for nextafter + if (fail && (isNextafter) + && (correct <= cl_half_to_float(0x3FF)) + && (correct >= cl_half_to_float(0x83FF))) + { + fail = fail && (q[j] != p[j]); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsHalfSubnormal(p2[j])) + { + double correct4, correct5; + float err4, err5; + + if (isNextafter) + correct2 = reference_nextafterh(0.0, 0.0); + else + correct2 = 
ref_func(0.0, 0.0); + if (isNextafter) + correct3 = reference_nextafterh(-0.0, 0.0); + else + correct3 = ref_func(-0.0, 0.0); + if (isNextafter) + correct4 = reference_nextafterh(0.0, -0.0); + else + correct4 = ref_func(0.0, -0.0); + if (isNextafter) + correct5 = reference_nextafterh(-0.0, -0.0); + else + correct5 = ref_func(-0.0, -0.0); + + // Per section 10 paragraph 6, accept any result if + // an input or output is a infinity or NaN or + // overflow + if (skipNanInf) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) + continue; + } + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + err4 = Ulp_Error_Half(q[j], correct4); + err5 = Ulp_Error_Half(q[j], correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsHalfSubnormal( + cl_half_from_float(correct2, CL_HALF_RTE)) + || IsHalfSubnormal( + cl_half_from_float(correct3, CL_HALF_RTE)) + || IsHalfSubnormal( + cl_half_from_float(correct4, CL_HALF_RTE)) + || IsHalfSubnormal( + cl_half_from_float(correct5, CL_HALF_RTE))) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // allow to omit denorm values for platforms with no + // denorm support for nextafter + if (fail && (isNextafter) + && (correct <= cl_half_to_float(0x3FF)) + && (correct >= cl_half_to_float(0x83FF))) + { + fail = fail && (q[j] != p2[j]); + if (!fail) err = 0.0f; + } + } + } + else if (IsHalfSubnormal(p2[j])) + { + double 
correct2, correct3; + float err2, err3; + + if (isNextafter) + correct2 = reference_nextafterh(s[j], 0.0); + else + correct2 = ref_func(s[j], 0.0); + if (isNextafter) + correct3 = reference_nextafterh(s[j], -0.0); + else + correct3 = ref_func(s[j], -0.0); + if (skipNanInf) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(correct2) + || IsFloatNaN(correct2)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfSubnormal( + cl_half_from_float(correct2, CL_HALF_RTE)) + || IsHalfSubnormal( + cl_half_from_float(correct3, CL_HALF_RTE))) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // allow to omit denorm values for platforms with no + // denorm support for nextafter + if (fail && (isNextafter) + && (correct <= cl_half_to_float(0x3FF)) + && (correct >= cl_half_to_float(0x83FF))) + { + fail = fail && (q[j] != p2[j]); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), " + "%a (0x%0.4x)}\nExpected: %a (half 0x%0.4x) " + "\nActual: %a (half 0x%0.4x) at index: %d\n", + name, sizeNames[k], err, s[j], p[j], s2[j], + p2[j], cl_half_to_float(r[j]), r[j], test, q[j], + j); + error = -1; + goto exit; + } + } + } + } + + if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: 
clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } +exit: + if (s) free(s); + if (s2) free(s2); + return error; +} + + +int TestFunc_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + return TestFunc_Half_Half_Half_common(f, d, 0, relaxedMode); +} + +int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata d, bool relaxedMode) +{ + return TestFunc_Half_Half_Half_common(f, d, 1, relaxedMode); +} \ No newline at end of file diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp index f15c21ede..8c83b9bf5 100644 --- a/test_conformance/math_brute_force/binary_i_double.cpp +++ b/test_conformance/math_brute_force/binary_i_double.cpp @@ -279,16 +279,14 @@ const double specialValues[] = { +0.0, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); const int specialValuesInt[] = { 0, 1, 2, 3, 1022, 1023, 1024, INT_MIN, INT_MAX, -1, -2, -3, -1022, -1023, -11024, -INT_MAX, }; -constexpr size_t specialValuesIntCount = - sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); +constexpr size_t specialValuesIntCount = ARRAY_SIZE(specialValuesInt); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp index 9e27b0073..527861c12 100644 --- a/test_conformance/math_brute_force/binary_i_float.cpp +++ b/test_conformance/math_brute_force/binary_i_float.cpp @@ -269,8 +269,7 @@ 
const float specialValues[] = { +0.0f, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); const int specialValuesInt[] = { 0, 1, 2, 3, 126, 127, @@ -279,9 +278,7 @@ const int specialValuesInt[] = { -0x04000001, -1465264071, -1488522147, }; -constexpr size_t specialValuesIntCount = - sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); - +constexpr size_t specialValuesIntCount = ARRAY_SIZE(specialValuesInt); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp new file mode 100644 index 000000000..63196f324 --- /dev/null +++ b/test_conformance/math_brute_force/binary_i_half.cpp @@ -0,0 +1,669 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include +static int BuildKernelHalf(const char *name, int vectorSize, + cl_uint kernel_count, cl_kernel *k, cl_program *p, + bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global half", + sizeNames[vectorSize], + "* out, __global half", + sizeNames[vectorSize], + "* in1, __global int", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global half* out, __global half* in, __global int* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " half3 d0 = vload3( 0, in + 3 * i );\n" + " int3 i0 = vload3( 0, in2 + 3 * i );\n" + " d0 = ", + name, + "( d0, i0 );\n" + " vstore3( d0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " half3 d0;\n" + " int3 i0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (half3)( in[3*i], NAN, NAN ); \n" + " i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n" + " break;\n" + " case 0:\n" + " d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" + " i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n" + " break;\n" + " }\n" + " d0 = ", + name, + "( d0, i0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, + relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; +} BuildKernelInfo; + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelHalf(info->nameInCode, i, info->kernel_count, + info->kernels[i], info->programs + i, + info->relaxedMode); +} + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + cl_mem inBuf; // input buffer for the thread + cl_mem inBuf2; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. + cl_int maxErrorValue2; // position of the max error value (param 2). Init + // to 0. 
+ MTdata d; + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; + +typedef struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + // no special values +} TestInfo; + + +// A table of more difficult cases to get right +static const cl_half specialValuesHalf[] = { + 0xffff, + 0x0000, + 0x0001, + 0x7c00 /*INFINITY*/, + 0xfc00 /*-INFINITY*/, + 0x8000 /*-0*/, + 0x7bff /*HALF_MAX*/, + 0x0400 /*HALF_MIN*/ +}; + +static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +static const int specialValuesInt3[] = { 0, 1, 2, 3, + 1022, 1023, 1024, INT_MIN, + INT_MAX, -1, -2, -3, + -1022, -1023, -11024, -INT_MAX }; +static size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3); + +static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); + +int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + cl_int maxErrorVal2 = 0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info, 0, sizeof(test_info)); + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = 
getTestScale(sizeof(cl_half)); + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) + { + vlog_error("Error: Unable to allocate storage for kernels!\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.k[i], 0, array_size); + } + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) + { + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + cl_buffer_region region2 = { i * test_info.subBufferSize + * sizeof(cl_int), + test_info.subBufferSize * sizeof(cl_int) }; + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, 
®ion2, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + } + + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, + test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + if (error) goto exit; + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); + } + + + vlog("\n"); + + +exit: + // Release + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + clReleaseProgram(test_info.programs[i]); + if (test_info.k[i]) + { + for (j = 0; j < 
test_info.threadCount; j++) + clReleaseKernel(test_info.k[i][j]); + + free(test_info.k[i]); + } + } + if (test_info.tinfo) + { + for (i = 0; i < test_info.threadCount; i++) + { + free_mtdata(test_info.tinfo[i].d); + clReleaseMemObject(test_info.tinfo[i].inBuf); + clReleaseMemObject(test_info.tinfo[i].inBuf2); + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + clReleaseMemObject(test_info.tinfo[i].outBuf[j]); + clReleaseCommandQueue(test_info.tinfo[i].tQueue); + } + + free(test_info.tinfo); + } + + return error; +} + +static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + float ulps = job->ulps; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_ushort *t; + cl_half *r; + float *s = 0; + cl_int *s2; + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_ushort *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_elements * sizeof(cl_ushort), 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + // Init input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + cl_int *p2 = (cl_int *)gIn2 + thread_id * buffer_elements; + j = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesInt3Count; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + if (job_id <= (cl_uint)indx) + { // test edge cases + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; j < buffer_elements; j++) + { + p[j] = specialValuesHalf[x]; + p2[j] = specialValuesInt3[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesInt3Count) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_elements * sizeof(cl_half), p, 0, + NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_elements * sizeof(cl_int), p2, 0, + NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + goto exit; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); + goto exit; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint16_t pattern = 0xdead; + memset_pattern4(out[j], &pattern, buffer_elements * sizeof(cl_half)); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + goto exit; + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + t = (cl_ushort *)r; + s = (float *)malloc(buffer_elements * sizeof(float)); + s2 = (cl_int *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + r[j] = cl_half_from_float(func.f_fi(s[j], s2[j]), CL_HALF_RTE); + } + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. 
+ for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_elements * sizeof(cl_ushort), 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + goto exit; + } + } + + // Wait for the last buffer + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, + buffer_elements * sizeof(cl_ushort), 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + goto exit; + } + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = out[k]; + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + float test = cl_half_to_float(q[j]); + double correct = func.f_fi(s[j], s2[j]); + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsHalfSubnormal( + cl_half_from_float(correct, CL_HALF_RTE))) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + double correct2, correct3; + float err2, err3; + correct2 = func.f_fi(0.0, s2[j]); + correct3 = func.f_fi(-0.0, s2[j]); + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfSubnormal( + cl_half_from_float(correct2, CL_HALF_RTE)) + || IsHalfSubnormal( + cl_half_from_float(correct3, CL_HALF_RTE))) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) 
+ { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), " + "%d}\nExpected: %a (half 0x%0.4x) \nActual: %a " + "(half 0x%0.4x) at index: %d\n", + name, sizeNames[k], err, s[j], p[j], s2[j], + cl_half_to_float(r[j]), r[j], test, q[j], + (cl_uint)j); + error = -1; + goto exit; + } + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + goto exit; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + +exit: + if (s) free(s); + return error; +} \ No newline at end of file diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp index c407fdaaf..6bcec98bc 100644 --- a/test_conformance/math_brute_force/binary_operator_double.cpp +++ b/test_conformance/math_brute_force/binary_operator_double.cpp @@ -281,8 +281,7 @@ const double specialValues[] = { +0.0, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp index 7fbb07c28..3b1be2902 100644 --- a/test_conformance/math_brute_force/binary_operator_float.cpp +++ 
b/test_conformance/math_brute_force/binary_operator_float.cpp @@ -271,8 +271,7 @@ const float specialValues[] = { +0.0f, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp index 917362852..1dcd4d900 100644 --- a/test_conformance/math_brute_force/function_list.cpp +++ b/test_conformance/math_brute_force/function_list.cpp @@ -32,33 +32,37 @@ #define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ { \ STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ - _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ + _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ RELAXED_OFF, _type \ } #define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type, \ _relaxed_embedded_ulp) \ { \ STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ - _ulp, _ulp, _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, \ - _rmode, RELAXED_ON, _type \ + _ulp, _ulp, _ulp, _embedded_ulp, _relaxed_ulp, \ + _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type \ } #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ { \ "half_" STRINGIFY(_name), "half_" STRINGIFY(_name), { NULL }, \ - { NULL }, { NULL }, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, \ - _rmode, RELAXED_OFF, _type \ + { NULL }, { NULL }, _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, \ + INFINITY, _rmode, RELAXED_OFF, _type \ } #define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type) \ { \ STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \ - _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ + _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ + _type \ } #define unaryF NULL +#define unaryOF NULL #define i_unaryF NULL 
#define unaryF_u NULL #define macro_unaryF NULL #define binaryF NULL +#define binaryOF NULL +#define binaryF_nextafter NULL #define binaryOperatorF NULL #define binaryF_i NULL #define macro_binaryF NULL @@ -80,7 +84,7 @@ { \ STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ { (void*)reference_##_name##l }, { (void*)reference_##_name }, \ - _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ + _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ RELAXED_OFF, _type \ } #define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type, \ @@ -88,19 +92,21 @@ { \ STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ { (void*)reference_##_name##l }, \ - { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _embedded_ulp, \ - _relaxed_ulp, _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type \ + { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _ulp, \ + _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode, \ + RELAXED_ON, _type \ } #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ { \ "half_" STRINGIFY(_name), "half_" STRINGIFY(_name), \ { (void*)reference_##_name }, { NULL }, { NULL }, _ulp, _ulp, \ - _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ + _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ + _type \ } #define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type) \ { \ STRINGIFY(_name), _operator, { (void*)reference_##_name }, \ - { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, \ + { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _ulp, \ _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ } @@ -108,85 +114,114 @@ static constexpr vtbl _unary = { "unary", TestFunc_Float_Float, TestFunc_Double_Double, + TestFunc_Half_Half, }; +static constexpr vtbl _unaryof = { "unaryof", TestFunc_Float_Float, NULL, + NULL }; + static constexpr vtbl _i_unary = { "i_unary", TestFunc_Int_Float, TestFunc_Int_Double, + TestFunc_Int_Half, }; 
static constexpr vtbl _unary_u = { "unary_u", TestFunc_Float_UInt, TestFunc_Double_ULong, + TestFunc_Half_UShort, }; static constexpr vtbl _macro_unary = { "macro_unary", TestMacro_Int_Float, TestMacro_Int_Double, + TestMacro_Int_Half, }; static constexpr vtbl _binary = { "binary", TestFunc_Float_Float_Float, TestFunc_Double_Double_Double, + TestFunc_Half_Half_Half, }; +static constexpr vtbl _binary_nextafter = { + "binary", + TestFunc_Float_Float_Float, + TestFunc_Double_Double_Double, + TestFunc_Half_Half_Half_nextafter, +}; + +static constexpr vtbl _binaryof = { "binaryof", TestFunc_Float_Float_Float, + NULL, NULL }; + static constexpr vtbl _binary_operator = { "binaryOperator", TestFunc_Float_Float_Float_Operator, TestFunc_Double_Double_Double_Operator, + NULL, }; static constexpr vtbl _binary_i = { "binary_i", TestFunc_Float_Float_Int, TestFunc_Double_Double_Int, + TestFunc_Half_Half_Int, }; static constexpr vtbl _macro_binary = { "macro_binary", TestMacro_Int_Float_Float, TestMacro_Int_Double_Double, + TestMacro_Int_Half_Half, }; static constexpr vtbl _ternary = { "ternary", TestFunc_Float_Float_Float_Float, TestFunc_Double_Double_Double_Double, + NULL, }; static constexpr vtbl _unary_two_results = { "unary_two_results", TestFunc_Float2_Float, TestFunc_Double2_Double, + NULL, }; static constexpr vtbl _unary_two_results_i = { "unary_two_results_i", TestFunc_FloatI_Float, TestFunc_DoubleI_Double, + NULL, }; static constexpr vtbl _binary_two_results_i = { "binary_two_results_i", TestFunc_FloatI_Float_Float, TestFunc_DoubleI_Double_Double, + NULL, }; static constexpr vtbl _mad_tbl = { "ternary", TestFunc_mad_Float, TestFunc_mad_Double, + TestFunc_mad_Half, }; #define unaryF &_unary +#define unaryOF &_unaryof #define i_unaryF &_i_unary #define unaryF_u &_unary_u #define macro_unaryF &_macro_unary #define binaryF &_binary +#define binaryF_nextafter &_binary_nextafter +#define binaryOF &_binaryof #define binaryOperatorF &_binary_operator #define binaryF_i &_binary_i 
#define macro_binaryF &_macro_binary @@ -278,7 +313,7 @@ const Func functionList[] = { ENTRY(minmag, 0.0f, 0.0f, FTZ_OFF, binaryF), ENTRY(modf, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), ENTRY(nan, 0.0f, 0.0f, FTZ_OFF, unaryF_u), - ENTRY(nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter), ENTRY_EXT(pow, 16.0f, 16.0f, 8192.0f, FTZ_OFF, binaryF, 8192.0f), // in derived mode the ulp error is calculated as // exp2(y*log2(x)) and in non-derived it is the same as @@ -308,6 +343,7 @@ const Func functionList[] = { { NULL }, 3.0f, 0.0f, + 0.0f, 4.0f, INFINITY, INFINITY, @@ -322,6 +358,7 @@ const Func functionList[] = { 0.0f, 0.0f, 0.0f, + 0.0f, INFINITY, INFINITY, FTZ_OFF, @@ -339,20 +376,20 @@ const Func functionList[] = { // sure this requirement is realistic ENTRY(trunc, 0.0f, 0.0f, FTZ_OFF, unaryF), - HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryF), - HALF_ENTRY(exp, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(exp2, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(exp10, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(log, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(log2, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(log10, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(powr, 8192.0f, 8192.0f, FTZ_ON, binaryF), - HALF_ENTRY(recip, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(sin, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryF), + HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryOF), + HALF_ENTRY(exp, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(exp2, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(exp10, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(log, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(log2, 8192.0f, 8192.0f, FTZ_ON, unaryOF), 
+ HALF_ENTRY(log10, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(powr, 8192.0f, 8192.0f, FTZ_ON, binaryOF), + HALF_ENTRY(recip, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(sin, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryOF), // basic operations OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), @@ -364,6 +401,7 @@ const Func functionList[] = { { (void*)reference_relaxed_divide }, 2.5f, 0.0f, + 0.0f, 3.0f, 2.5f, INFINITY, @@ -378,6 +416,7 @@ const Func functionList[] = { 0.0f, 0.0f, 0.0f, + 0.0f, 0.f, INFINITY, FTZ_OFF, diff --git a/test_conformance/math_brute_force/function_list.h b/test_conformance/math_brute_force/function_list.h index 95a294593..6ea0fa9e2 100644 --- a/test_conformance/math_brute_force/function_list.h +++ b/test_conformance/math_brute_force/function_list.h @@ -70,6 +70,9 @@ struct vtbl int (*DoubleTestFunc)( const struct Func *, MTdata, bool); // may be NULL if function is single precision only + int (*HalfTestFunc)( + const struct Func *, MTdata, + bool); // may be NULL if function has no half precision test }; struct Func @@ -82,6 +85,7 @@ struct Func fptr rfunc; float float_ulps; float double_ulps; + float half_ulps; float float_embedded_ulps; float relaxed_error; float relaxed_embedded_error; diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp new file mode 100644 index 000000000..245528e10 --- /dev/null +++ b/test_conformance/math_brute_force/i_unary_half.cpp @@ -0,0 +1,306 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int", + sizeNames[vectorSize], + "* out, __global half", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global int* out, __global half* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " half3 f0 = vload3( 0, in + 3 * i );\n" + " int3 i0 = ", + name, + "( f0 );\n" + " vstore3( i0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(half)). 
Assume power of two " + "buffer size \n" + " half3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (half3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " int3 i0 = ", + name, + "( f0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; +} BuildKernelInfo; + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelHalf(info->nameInCode, i, info->kernels + i, + info->programs + i, info->relaxedMode); +} + +int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) +{ + uint64_t i; + uint32_t j, k; + int error; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; + int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ; + size_t bufferSize = BUFFER_SIZE; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + uint64_t bufferElements = bufferSize / sizeof(cl_int); + float *s = 0; + + int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1); + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // This test is not using ThreadPool so we need to disable FTZ here + // for reference computations + 
FPU_mode_type oldMode; + DisableFTZ(&oldMode); + + // Init the kernels + BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, + f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + { + return error; + } + s = (float *)malloc(bufferElements * sizeof(float)); + + for (i = 0; i < (1ULL << 16); i += step) + { + // Init input array + cl_ushort *p = (cl_ushort *)gIn; + if (gWimpyMode) + { + for (j = 0; j < bufferElements; j++) + p[j] = (cl_ushort)i + j * scale; + } + else + { + for (j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // write garbage into output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xffffdead; + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + goto exit; + } + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_int); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // 
Calculate the correctly rounded reference result + int *r = (int *)gOut_Ref; + for (j = 0; j < bufferElements; j++) + { + s[j] = cl_half_to_float(p[j]); + r[j] = f->func.i_f(s[j]); + } + // Read the data back + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + goto exit; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint32_t *t = (uint32_t *)gOut_Ref; + for (j = 0; j < bufferElements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint32_t *q = (uint32_t *)(gOut[k]); + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + if (ftz && IsHalfSubnormal(p[j])) + { + unsigned int correct0 = f->func.i_f(0.0); + unsigned int correct1 = f->func.i_f(-0.0); + if (q[j] == correct0 || q[j] == correct1) continue; + } + + uint32_t err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%0.4x): " + "*%d vs. 
%d\n", + f->name, sizeNames[k], err, s[j], p[j], t[j], + q[j]); + error = -1; + goto exit; + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + + +exit: + if (s) free(s); + RestoreFPState(&oldMode); + // Release + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + clReleaseKernel(kernels[k]); + clReleaseProgram(programs[k]); + } + + return error; +} \ No newline at end of file diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp index d3e8071fb..42a813160 100644 --- a/test_conformance/math_brute_force/macro_binary_double.cpp +++ b/test_conformance/math_brute_force/macro_binary_double.cpp @@ -270,8 +270,7 @@ const double specialValues[] = { +0.0, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp index 6c7c8c05e..c7f3538e6 100644 --- a/test_conformance/math_brute_force/macro_binary_float.cpp +++ b/test_conformance/math_brute_force/macro_binary_float.cpp @@ -260,8 +260,7 @@ const float specialValues[] = { +0.0f, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp new file mode 100644 index 000000000..72abb1057 --- 
/dev/null +++ b/test_conformance/math_brute_force/macro_binary_half.cpp @@ -0,0 +1,652 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + + +static int BuildKernelHalf(const char *name, int vectorSize, + cl_uint kernel_count, cl_kernel *k, cl_program *p, + bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global short", + sizeNames[vectorSize], + "* out, __global half", + sizeNames[vectorSize], + "* in1, __global half", + sizeNames[vectorSize], + "* in2 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i] );\n" + "}\n" }; + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global short* out, __global half* in, __global half* in2)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " half3 f0 = vload3( 0, in + 3 * i );\n" + " half3 f1 = vload3( 0, in2 + 3 * i );\n" + " short3 i0 = ", + name, + "( f0, f1 );\n" + " vstore3( i0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " half3 f0, f1;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (half3)( in[3*i], NAN, NAN ); \n" + " f1 = (half3)( in2[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" + " f1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " short3 i0 = ", + name, + "( f0, f1 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, + relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; +} BuildKernelInfo; + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelHalf(info->nameInCode, i, info->kernel_count, + info->kernels[i], info->programs + i, + info->relaxedMode); +} + +typedef struct ThreadInfo +{ + cl_mem inBuf; // input buffer for the thread + cl_mem inBuf2; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + MTdata d; + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; + +typedef struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector 
sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + int ftz; // non-zero if running in flush to zero mode + +} TestInfo; + +// A table of more difficult cases to get right +static const cl_half specialValuesHalf[] = { + 0xffff, + 0x0000, + 0x0001, + 0x7c00 /*INFINITY*/, + 0xfc00 /*-INFINITY*/, + 0x8000 /*-0*/, + 0x7bff /*HALF_MAX*/, + 0x0400 /*HALF_MIN*/ +}; + +static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + + +static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); + +int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info, 0, sizeof(test_info)); + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel 
*)malloc(array_size); + if (NULL == test_info.k[i]) + { + vlog_error("Error: Unable to allocate storage for kernels!\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.k[i], 0, array_size); + } + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) + { + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); + goto exit; + } + + test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + } + + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, + test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + +exit: + // Release + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + clReleaseProgram(test_info.programs[i]); + if (test_info.k[i]) + { + for (j = 0; j < test_info.threadCount; j++) + clReleaseKernel(test_info.k[i][j]); + + free(test_info.k[i]); + } + } + if (test_info.tinfo) + { + for (i = 0; i < test_info.threadCount; i++) + { + free_mtdata(test_info.tinfo[i].d); + clReleaseMemObject(test_info.tinfo[i].inBuf); + clReleaseMemObject(test_info.tinfo[i].inBuf2); + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + clReleaseMemObject(test_info.tinfo[i].outBuf[j]); + clReleaseCommandQueue(test_info.tinfo[i].tQueue); + } + + free(test_info.tinfo); + } + + return error; +} + +static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_short *t, *r; + float *s = 0, *s2 = 0; + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_short *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + 
out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + // Init input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + cl_ushort *p2 = (cl_ushort *)gIn2 + thread_id * buffer_elements; + j = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesHalfCount; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)indx) + { // test edge cases + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; j < buffer_elements; j++) + { + p[j] = specialValuesHalf[x]; + p2[j] = specialValuesHalf[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesHalfCount) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = (cl_ushort)genrand_int32(d); + } + + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + goto exit; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); + goto exit; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint16_t pattern = 0xdead; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + goto exit; + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + r = (cl_short *)gOut_Ref + thread_id * buffer_elements; + t = (cl_short *)r; + s = (float *)malloc(buffer_elements * sizeof(float)); + s2 = (float *)malloc(buffer_elements * sizeof(float)); + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + s2[j] = cl_half_to_float(p2[j]); + r[j] = (short)func.i_ff(s[j], s2[j]); + } + + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. 
+ for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + goto exit; + } + } + + // Wait for the last buffer + out[j] = (cl_short *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + goto exit; + } + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + cl_short *q = (cl_short *)out[0]; + + // If we aren't getting the correctly rounded result + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) + { + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + if (IsHalfSubnormal(p2[j])) + { + short correct = (short)func.i_ff(0.0f, 0.0f); + short correct2 = (short)func.i_ff(0.0f, -0.0f); + short correct3 = (short)func.i_ff(-0.0f, 0.0f); + short correct4 = (short)func.i_ff(-0.0f, -0.0f); + + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) + continue; + } + else + { + short correct = (short)func.i_ff(0.0f, s2[j]); + short correct2 = (short)func.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + else if (IsHalfSubnormal(p2[j])) + { + short correct = (short)func.i_ff(s[j], 0.0f); + short correct2 = (short)func.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + short err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error( + "\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a " + "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x (index: %d)\n", + name, err, s[j], p[j], s2[j], p2[j], t[j], q[j], j); + error = -1; + goto exit; + } + + + for (k = std::max(1U, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; + k++) + { + q = out[k]; + // If 
we aren't getting the correctly rounded result + if (-t[j] != q[j]) + { + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + if (IsHalfSubnormal(p2[j])) + { + short correct = (short)-func.i_ff(0.0f, 0.0f); + short correct2 = (short)-func.i_ff(0.0f, -0.0f); + short correct3 = (short)-func.i_ff(-0.0f, 0.0f); + short correct4 = (short)-func.i_ff(-0.0f, -0.0f); + + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) + continue; + } + else + { + short correct = (short)-func.i_ff(0.0f, s2[j]); + short correct2 = (short)-func.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + else if (IsHalfSubnormal(p2[j])) + { + short correct = (short)-func.i_ff(s[j], 0.0f); + short correct2 = (short)-func.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + cl_ushort err = -t[j] - q[j]; + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error("\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a " + "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x " + "(index: %d)\n", + name, err, s[j], p[j], s2[j], p2[j], -t[j], q[j], j); + error = -1; + goto exit; + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n", + j, error); + goto exit; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + +exit: + if (s) free(s); + if (s2) free(s2); + return error; +} \ No newline at end of file diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp new file mode 100644 index 000000000..31c7d65ce --- /dev/null +++ b/test_conformance/math_brute_force/macro_unary_half.cpp @@ -0,0 +1,543 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +static int BuildKernelHalf(const char *name, int vectorSize, + cl_uint kernel_count, cl_kernel *k, cl_program *p, + bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global short", + sizeNames[vectorSize], + "* out, __global half", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global short* out, __global half* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " half3 f0 = vload3( 0, in + 3 * i );\n" + " short3 i0 = ", + name, + "( f0 );\n" + " vstore3( i0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " short3 i0;\n" + " half3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (half3)( in[3*i], 0xdead, 0xdead ); \n" + " break;\n" + " case 0:\n" + " f0 = (half3)( in[3*i], in[3*i+1], 0xdead ); \n" + " break;\n" + " }\n" + " i0 = ", + name, + "( f0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = i0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = i0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, + relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; +} BuildKernelInfo; + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelHalf(info->nameInCode, i, info->kernel_count, + info->kernels[i], info->programs + i, + info->relaxedMode); +} + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + cl_mem inBuf; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; + +typedef struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each 
+ // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + int ftz; // non-zero if running in flush to zero mode + +} TestInfo; + +static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); + +int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // Init test_info + memset(&test_info, 0, sizeof(test_info)); + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = f->ftz || gForceFTZ; + + // cl_kernels aren't thread safe, so we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) + { + vlog_error("Error: Unable to allocate storage for kernels!\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.k[i], 0, array_size); + } + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) + { + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); + error = CL_OUT_OF_HOST_MEMORY; + 
goto exit; + } + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); + goto exit; + } + } + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, + test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + +exit: + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + clReleaseProgram(test_info.programs[i]); + if (test_info.k[i]) + { + for (j = 0; j < test_info.threadCount; j++) + clReleaseKernel(test_info.k[i][j]); + + free(test_info.k[i]); + } + } + if (test_info.tinfo) + { + for (i = 0; i < test_info.threadCount; i++) + { + clReleaseMemObject(test_info.tinfo[i].inBuf); + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + clReleaseMemObject(test_info.tinfo[i].outBuf[j]); + clReleaseCommandQueue(test_info.tinfo[i].tQueue); + } + + free(test_info.tinfo); + } + + return error; +} + +static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint scale = job->scale; + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + fptr func = job->f->func; + int ftz = job->ftz; + cl_uint j, k; + cl_int error = CL_SUCCESS; + const char *name = job->f->name; + float *s = 0; + + int signbit_test = 0; + if (!strcmp(name, "signbit")) signbit_test = 1; + +#define ref_func(s) (signbit_test ? 
func.i_f_f(s) : func.i_f(s)) + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_short *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + // Write the new values to the input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale; + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint16_t pattern = 0xdead; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); + return error; + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + cl_short *r = (cl_short *)gOut_Ref + thread_id * buffer_elements; + cl_short *t = (cl_short *)r; + s = (float *)malloc(buffer_elements * sizeof(float)); + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + if (!strcmp(name, "isnormal")) + { + if ((IsHalfSubnormal(p[j]) == 0) && !((p[j] & 0x7fffU) >= 0x7c00U) + && ((p[j] & 0x7fffU) != 0x0000U)) + r[j] = 1; + else + r[j] = 0; + } + else + r[j] = (short)ref_func(s[j]); + } + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); + goto exit; + } + } + // Wait for the last buffer + out[j] = (cl_short *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + goto exit; + } + + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + cl_short *q = out[0]; + + + // If we aren't getting the correctly rounded result + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) + { + // If we aren't getting the correctly rounded result + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + short correct = (short)ref_func(+0.0f); + short correct2 = (short)ref_func(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + short err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s: %d ulp error at %a (0x%0.4x)\nExpected: " + "%d vs. %d\n", + name, err, s[j], p[j], t[j], q[j]); + error = -1; + goto exit; + } + + + for (k = std::max(1U, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; + k++) + { + q = out[k]; + // If we aren't getting the correctly rounded result + if (-t[j] != q[j]) + { + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + short correct = (short)-ref_func(+0.0f); + short correct2 = (short)-ref_func(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + short err = -t[j] - q[j]; + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error("\nERROR: %s%s: %d ulp error at %a " + "(0x%0.4x)\nExpected: %d \nActual: %d\n", + name, sizeNames[k], err, s[j], p[j], -t[j], q[j]); + error = -1; + goto exit; + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n", + j, error); + goto exit; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } +exit: + if (s) free(s); + return error; +} \ No newline at end of file diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp new file mode 100644 index 000000000..a36e8d665 --- /dev/null +++ b/test_conformance/math_brute_force/mad_half.cpp @@ -0,0 +1,295 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include <cstring> + + +static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global half", + sizeNames[vectorSize], + "* out, __global half", + sizeNames[vectorSize], + "* in1, __global half", + sizeNames[vectorSize], + "* in2, __global half", + sizeNames[vectorSize], + "* in3 )\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in1[i], in2[i], in3[i] );\n" + "}\n" }; + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global half* out, __global half* in, __global half* in2, __global " + "half* in3)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " half3 d0 = vload3( 0, in + 3 * i );\n" + " half3 d1 = vload3( 0, in2 + 3 * i );\n" + " half3 d2 = vload3( 0, in3 + 3 * i );\n" + " d0 = ", + name, + "( d0, d1, d2 );\n" + " vstore3( d0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " half3 d0, d1, d2;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " d0 = (half3)( in[3*i], NAN, NAN ); \n" + " d1 = (half3)( in2[3*i], NAN, NAN ); \n" + " d2 = (half3)( in3[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" + " d1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n" + " d2 = (half3)( in3[3*i], in3[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " d0 = ", + name, + "( d0, d1, d2 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = d0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = d0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; +} BuildKernelInfo; + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelHalf(info->nameInCode, i, info->kernels + i, + info->programs + i, info->relaxedMode); +} + +int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) +{ + uint64_t i; + uint32_t j, k; + int error; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; + float maxError = 0.0f; + // int ftz = f->ftz || gForceFTZ; + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + float maxErrorVal3 = 0.0f; + size_t bufferSize = BUFFER_SIZE; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + uint64_t step = bufferSize / 
sizeof(cl_half); + if (gWimpyMode) + { + step = (1ULL << 32) * gWimpyReductionFactor / (512); + } + // Init the kernels + BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, + f->nameInCode, relaxedMode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + { + return error; + } + for (i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_ushort *p = (cl_ushort *)gIn; + cl_ushort *p2 = (cl_ushort *)gIn2; + cl_ushort *p3 = (cl_ushort *)gIn3; + for (j = 0; j < bufferSize / sizeof(cl_ushort); j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = (cl_ushort)genrand_int32(d); + p3[j] = (cl_ushort)genrand_int32(d); + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + goto exit; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + goto exit; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); + goto exit; + } + + // write garbage into output arrays (memset_pattern4 reads 4 bytes) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xdeaddead; + memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + BUFFER_SIZE, gOut[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + goto exit; + } + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_half) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j], 
0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), + &gInBuffer2))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), + &gInBuffer3))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Read the data back + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + goto exit; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data - no verification possible. MAD is a random number + // generator. 
+ + if (0 == (i & 0x0fffffff)) + { + vlog("."); + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("pass"); + + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + } + vlog("\n"); + +exit: + // Release + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + clReleaseKernel(kernels[k]); + clReleaseProgram(programs[k]); + } + + return error; +} \ No newline at end of file diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp index 1a6e0c4e1..556870162 100644 --- a/test_conformance/math_brute_force/main.cpp +++ b/test_conformance/math_brute_force/main.cpp @@ -49,6 +49,8 @@ #include "harness/testHarness.h" #define kPageSize 4096 +#define HALF_REQUIRED_FEATURES_1 (CL_FP_ROUND_TO_ZERO) +#define HALF_REQUIRED_FEATURES_2 (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN) #define DOUBLE_REQUIRED_FEATURES \ (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO \ | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM) @@ -80,6 +82,8 @@ static int gTestFastRelaxed = 1; */ int gFastRelaxedDerived = 1; static int gToggleCorrectlyRoundedDivideSqrt = 0; +int gHasHalf = 0; +cl_device_fp_config gHalfCapabilities = 0; int gDeviceILogb0 = 1; int gDeviceILogbNaN = 1; int gCheckTininessBeforeRounding = 1; @@ -166,7 +170,6 @@ static int doTest(const char *name) return 0; } } - { if (0 == strcmp("ilogb", func_data->name)) { @@ -235,6 +238,23 @@ static int doTest(const char *name) } } } + + if (gHasHalf && NULL != func_data->vtbl_ptr->HalfTestFunc) + { + gTestCount++; + vlog("%3d: ", gTestCount); + if (func_data->vtbl_ptr->HalfTestFunc(func_data, gMTdata, + false /* relaxed mode*/)) + { + gFailCount++; + error++; + if (gStopOnError) + { + gSkipRestOfTests = true; + return error; + } + } + } } return error; @@ -407,6 +427,8 @@ static int ParseArgs(int argc, const char **argv) case 'm': singleThreaded ^= 1; break; + case 'g': gHasHalf ^= 1; 
break; + case 'r': gTestFastRelaxed ^= 1; break; case 's': gStopOnError ^= 1; break; @@ -539,6 +561,8 @@ static void PrintUsage(void) vlog("\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 " "on)\n"); vlog("\t\t-f\tToggle float precision testing. (Default: on)\n"); + vlog("\t\t-g\tToggle half precision testing. (Default: on if khr_fp_16 " + "on)\n"); vlog("\t\t-r\tToggle fast relaxed math precision testing. (Default: on)\n"); vlog("\t\t-e\tToggle test as derived implementations for fast relaxed math " "precision. (Default: on)\n"); @@ -638,6 +662,44 @@ test_status InitCL(cl_device_id device) #endif } + gFloatToHalfRoundingMode = kRoundToNearestEven; + if (is_extension_available(gDevice, "cl_khr_fp16")) + { + gHasHalf ^= 1; +#if defined(CL_DEVICE_HALF_FP_CONFIG) + if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_HALF_FP_CONFIG, + sizeof(gHalfCapabilities), + &gHalfCapabilities, NULL))) + { + vlog_error( + "ERROR: Unable to get device CL_DEVICE_HALF_FP_CONFIG. (%d)\n", + error); + return TEST_FAIL; + } + if (HALF_REQUIRED_FEATURES_1 + != (gHalfCapabilities & HALF_REQUIRED_FEATURES_1) + && HALF_REQUIRED_FEATURES_2 + != (gHalfCapabilities & HALF_REQUIRED_FEATURES_2)) + { + char list[300] = ""; + if (0 == (gHalfCapabilities & CL_FP_ROUND_TO_NEAREST)) + strncat(list, "CL_FP_ROUND_TO_NEAREST, ", sizeof(list) - 1); + if (0 == (gHalfCapabilities & CL_FP_ROUND_TO_ZERO)) + strncat(list, "CL_FP_ROUND_TO_ZERO, ", sizeof(list) - 1); + if (0 == (gHalfCapabilities & CL_FP_INF_NAN)) + strncat(list, "CL_FP_INF_NAN, ", sizeof(list) - 1); + vlog_error("ERROR: required half features are missing: %s\n", list); + + return TEST_FAIL; + } +#else + vlog_error("FAIL: device says it supports cl_khr_fp16 but " + "CL_DEVICE_HALF_FP_CONFIG is not in the headers!\n"); + return TEST_FAIL; +#endif + } + + uint32_t deviceFrequency = 0; size_t configSize = sizeof(deviceFrequency); if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, @@ -826,6 +888,7 @@ test_status 
InitCL(cl_device_id device) "Bruteforce_Ulp_Error_Double() for more details.\n\n"); } + vlog("\tTesting half precision? %s\n", no_yes[0 != gHasHalf]); vlog("\tIs Embedded? %s\n", no_yes[0 != gIsEmbedded]); if (gIsEmbedded) vlog("\tRunning in RTZ mode? %s\n", no_yes[0 != gIsInRTZMode]); diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp index 16db3d672..5c9015b05 100644 --- a/test_conformance/math_brute_force/reference_math.cpp +++ b/test_conformance/math_brute_force/reference_math.cpp @@ -4698,6 +4698,40 @@ double reference_nextafter(double xx, double yy) return a.f; } +cl_half reference_nanh(cl_ushort x) +{ + cl_ushort u; + cl_half h; + u = x | 0x7e00U; + memcpy(&h, &u, sizeof(cl_half)); + return h; +} + +float reference_nextafterh(float xx, float yy) +{ + cl_half tmp_a = cl_half_from_float(xx, CL_HALF_RTE); + cl_half tmp_b = cl_half_from_float(yy, CL_HALF_RTE); + float x = cl_half_to_float(tmp_a); + float y = cl_half_to_float(tmp_b); + + // take care of nans + if (x != x) return x; + + if (y != y) return y; + + if (x == y) return y; + + short a_h = cl_half_from_float(x, CL_HALF_RTE); + short b_h = cl_half_from_float(y, CL_HALF_RTE); + + if (a_h & 0x8000) a_h = 0x8000 - a_h; + if (b_h & 0x8000) b_h = 0x8000 - b_h; + + a_h += (a_h < b_h) ? 1 : -1; + a_h = (a_h < 0) ? 
(cl_short)0x8000 - a_h : a_h; + + return cl_half_to_float(a_h); +} long double reference_nextafterl(long double xx, long double yy) { diff --git a/test_conformance/math_brute_force/reference_math.h b/test_conformance/math_brute_force/reference_math.h index 78b245105..b9b2e4695 100644 --- a/test_conformance/math_brute_force/reference_math.h +++ b/test_conformance/math_brute_force/reference_math.h @@ -18,8 +18,10 @@ #if defined(__APPLE__) #include <OpenCL/opencl.h> + #else #include <CL/cl.h> +#include "CL/cl_half.h" #endif // -- for testing float -- @@ -160,6 +162,8 @@ long double reference_fractl(long double, long double*); long double reference_fmal(long double, long double, long double); long double reference_madl(long double, long double, long double); long double reference_nextafterl(long double, long double); +float reference_nextafterh(float, float); +cl_half reference_nanh(cl_ushort); long double reference_recipl(long double); long double reference_rootnl(long double, int); long double reference_rsqrtl(long double); diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp index 8af136ac2..10cca4c10 100644 --- a/test_conformance/math_brute_force/ternary_double.cpp +++ b/test_conformance/math_brute_force/ternary_double.cpp @@ -204,8 +204,7 @@ const double specialValues[] = { +0.0, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); } // anonymous namespace diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp index c69083ada..cbcb0e2ef 100644 --- a/test_conformance/math_brute_force/ternary_float.cpp +++ b/test_conformance/math_brute_force/ternary_float.cpp @@ -212,8 +212,7 @@ const float specialValues[] = { +0.0f, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = 
ARRAY_SIZE(specialValues); } // anonymous namespace diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h index 78aef9c9a..91cca1633 100644 --- a/test_conformance/math_brute_force/test_functions.h +++ b/test_conformance/math_brute_force/test_functions.h @@ -24,6 +24,9 @@ int TestFunc_Float_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double) int TestFunc_Double_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half) +int TestFunc_Half_Half(const Func *f, MTdata, bool relaxedMode); + // int foo(float) int TestFunc_Int_Float(const Func *f, MTdata, bool relaxedMode); @@ -36,6 +39,9 @@ int TestFunc_Float_UInt(const Func *f, MTdata, bool relaxedMode); // double foo(ulong) int TestFunc_Double_ULong(const Func *f, MTdata, bool relaxedMode); +// half (Ushort) +int TestFunc_Half_UShort(const Func *f, MTdata, bool relaxedMode); + // Returns {0, 1} for scalar and {0, -1} for vector. // int foo(float) int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode); @@ -44,21 +50,34 @@ int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode); // int foo(double) int TestMacro_Int_Double(const Func *f, MTdata, bool relaxedMode); +// int foo(half,half) +int TestMacro_Int_Half_Half(const Func *f, MTdata, bool relaxedMode); + +// int foo(half) +int TestMacro_Int_Half(const Func *f, MTdata, bool relaxedMode); + +// int foo(half) +int TestFunc_Int_Half(const Func *f, MTdata, bool relaxedMode); + // float foo(float, float) int TestFunc_Float_Float_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, double) int TestFunc_Double_Double_Double(const Func *f, MTdata, bool relaxedMode); +// Half foo(half, half) +int TestFunc_Half_Half_Half(const Func *f, MTdata, bool relaxedMode); // Special handling for nextafter. 
-// float foo(float, float) -int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata, - bool relaxedMode); +// Half foo(Half, Half) +int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata, bool relaxedMode); + +// Half foo(Half, Half) +int TestFunc_Half_Half_Half_common(const Func *f, MTdata, int isNextafter, + bool relaxedMode); + +// Half foo(Half, int) +int TestFunc_Half_Half_Int(const Func *f, MTdata, bool relaxedMode); -// Special handling for nextafter. -// double foo(double, double) -int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata, - bool relaxedMode); // float op float int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata, @@ -115,4 +134,7 @@ int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode); // double mad(double, double, double) int TestFunc_mad_Double(const Func *f, MTdata, bool relaxedMode); +// half mad(half, half, half) +int TestFunc_mad_Half(const Func *f, MTdata, bool relaxedMode); + #endif diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp new file mode 100644 index 000000000..f60823ffa --- /dev/null +++ b/test_conformance/math_brute_force/unary_half.cpp @@ -0,0 +1,600 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include <cstring> + +static int BuildKernelHalf(const char *name, int vectorSize, + cl_uint kernel_count, cl_kernel *k, cl_program *p, + bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global half", + sizeNames[vectorSize], + "* out, __global half", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global half* out, __global half* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " half3 f0 = vload3( 0, in + 3 * i );\n" + " f0 = ", + name, + "( f0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " half3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " f0 = (half3)( in[3*i], NAN, NAN ); \n" + " break;\n" + " case 0:\n" + " f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( f0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, + relaxedMode); +} + + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_uint kernel_count; + cl_kernel **kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; +} BuildKernelInfo; + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelHalf(info->nameInCode, i, info->kernel_count, + info->kernels[i], info->programs + i, + info->relaxedMode); +} + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + cl_mem inBuf; // input buffer for the thread + cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double maxErrorValue; // position of the max error value. Init to 0. 
+ cl_command_queue tQueue; // per thread command queue to improve performance +} ThreadInfo; + +typedef struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes + cl_kernel + *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each + // worker thread: k[vector_size][thread_id] + ThreadInfo * + tinfo; // An array of thread specific information for each worker thread + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + int isRangeLimited; // 1 if the function is only to be evaluated over a + // range + float half_sin_cos_tan_limit; +} TestInfo; + + +static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); + +int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info, 0, sizeof(test_info)); + test_info.threadCount = GetThreadCount(); + + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + // cl_kernels aren't thread safe, so 
we make one for each vector size for + // every thread + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + size_t array_size = test_info.threadCount * sizeof(cl_kernel); + test_info.k[i] = (cl_kernel *)malloc(array_size); + if (NULL == test_info.k[i]) + { + vlog_error("Error: Unable to allocate storage for kernels!\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.k[i], 0, array_size); + } + test_info.tinfo = + (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); + if (NULL == test_info.tinfo) + { + vlog_error( + "Error: Unable to allocate storage for thread specific data.\n"); + error = CL_OUT_OF_HOST_MEMORY; + goto exit; + } + memset(test_info.tinfo, 0, + test_info.threadCount * sizeof(*test_info.tinfo)); + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + goto exit; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); + goto exit; + } + } + + // Check for special cases for unary half + test_info.isRangeLimited = 0; + test_info.half_sin_cos_tan_limit = 0; + if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = 1.0f + + test_info.ulps + * (FLT_EPSILON / 2.0f); // out of range results from finite + // inputs must be in [-1,1] + } + else if (0 == strcmp(f->name, "half_tan")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = + INFINITY; // out of range result from finite inputs must be numeric + } + + // Init the kernels + { + BuildKernelInfo build_info = { gMinVectorSizeIndex, + test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode, relaxedMode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + goto exit; + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + } + } + + if (error) goto exit; + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); + vlog("\n"); + +exit: + for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) + { + clReleaseProgram(test_info.programs[i]); + if (test_info.k[i]) + { + for (j = 0; j < test_info.threadCount; j++) + clReleaseKernel(test_info.k[i][j]); + + free(test_info.k[i]); + } + } + if (test_info.tinfo) + { + for (i = 0; i < test_info.threadCount; i++) + { + clReleaseMemObject(test_info.tinfo[i].inBuf); + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + clReleaseMemObject(test_info.tinfo[i].outBuf[j]); + clReleaseCommandQueue(test_info.tinfo[i].tQueue); + } + + 
free(test_info.tinfo); + } + + return error; +} + +static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + const TestInfo *job = (const TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint scale = job->scale; + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = job->tinfo + thread_id; + float ulps = job->ulps; + fptr func = job->f->func; + cl_uint j, k; + cl_int error = CL_SUCCESS; + + int isRangeLimited = job->isRangeLimited; + float half_sin_cos_tan_limit = job->half_sin_cos_tan_limit; + int ftz = job->ftz; + + float *s = 0; + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_ushort *out[VECTOR_SIZE_COUNT]; + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (uint16_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + // Write the new values to the input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + { + p[j] = base + j * scale; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! 
err: %d\n", error); + return error; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint16_t pattern = 0xdead; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + return error; + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + cl_ushort *t = (cl_ushort *)r; + s = (float *)malloc(buffer_elements * sizeof(float)); + for (j = 0; j < buffer_elements; j++) + { + s[j] = (float)cl_half_to_float(p[j]); + r[j] = cl_half_from_float(func.f_f(s[j]), CL_HALF_RTE); + } + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. 
+ for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (uint16_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + goto exit; + } + } + // Wait for the last buffer + out[j] = (uint16_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + goto exit; + } + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = out[k]; + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + float test = cl_half_to_float(q[j]); + double correct = func.f_f(s[j]); + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= ulps); + + // half_sin/cos/tan are only valid between +-2**16, Inf, NaN + if (isRangeLimited + && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) + && fabsf(s[j]) < INFINITY) + { + if (fabsf(test) <= half_sin_cos_tan_limit) + { + err = 0; + fail = 0; + } + } + + if (fail) + { + if (ftz) + { + // retry per section 6.5.3.2 + if (IsHalfSubnormal( + cl_half_from_float(correct, CL_HALF_RTE))) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + double correct2 = func.f_f(0.0); + double correct3 = func.f_f(-0.0); + float err2 = Ulp_Error_Half(q[j], correct2); + float err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfSubnormal( + cl_half_from_float(correct2, CL_HALF_RTE)) + || 
IsHalfSubnormal( + cl_half_from_float(correct3, CL_HALF_RTE))) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + } + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at %a " + "(0x%0.4x)\nExpected: %a (half 0x%0.4x) " + "\nActual: %a (half 0x%0.4x)\n", + job->f->name, sizeNames[k], err, s[j], p[j], + t[j], cl_half_to_float(r[j]), test, q[j]); + error = -1; + goto exit; + } + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + goto exit; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } +exit: + if (s) free(s); + return error; +} \ No newline at end of file diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp new file mode 100644 index 000000000..f8e8b6231 --- /dev/null +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -0,0 +1,334 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" +#include "reference_math.h" + +#include + + +static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k, + cl_program *p, bool relaxedMode) +{ + const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global half", + sizeNames[vectorSize], + "* out, __global ushort", + sizeNames[vectorSize], + "* in)\n" + "{\n" + " int i = get_global_id(0);\n" + " out[i] = ", + name, + "( in[i] );\n" + "}\n" }; + + const char *c3[] = { + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", + "__kernel void math_kernel", + sizeNames[vectorSize], + "( __global half* out, __global ushort* in)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " if( i + 1 < get_global_size(0) )\n" + " {\n" + " ushort3 u0 = vload3( 0, in + 3 * i );\n" + " half3 f0 = ", + name, + "( u0 );\n" + " vstore3( f0, 0, out + 3*i );\n" + " }\n" + " else\n" + " {\n" + " size_t parity = i & 1; // Figure out how many elements are " + "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " + "buffer size \n" + " ushort3 u0;\n" + " half3 f0;\n" + " switch( parity )\n" + " {\n" + " case 1:\n" + " u0 = (ushort3)( in[3*i], 0xdead, 0xdead ); \n" + " break;\n" + " case 0:\n" + " u0 = (ushort3)( in[3*i], in[3*i+1], 0xdead ); \n" + " break;\n" + " }\n" + " f0 = ", + name, + "( u0 );\n" + " switch( parity )\n" + " {\n" + " case 0:\n" + " out[3*i+1] = f0.y; \n" + " // fall through\n" + " case 1:\n" + " out[3*i] = f0.x; \n" + " break;\n" + " }\n" + " }\n" + "}\n" + }; + + const char **kern = c; + size_t kernSize = sizeof(c) / sizeof(c[0]); + + if (sizeValues[vectorSize] == 3) + { + kern = c3; + kernSize = sizeof(c3) / sizeof(c3[0]); + } + + char testName[32]; + snprintf(testName, sizeof(testName) - 1, "math_kernel%s", + sizeNames[vectorSize]); + + return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); +} + +typedef struct BuildKernelInfo +{ + cl_uint offset; // the first vector size to build + cl_kernel *kernels; + cl_program *programs; + const char *nameInCode; + bool relaxedMode; +} BuildKernelInfo; + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo *info = (BuildKernelInfo *)p; + cl_uint i = info->offset + job_id; + return BuildKernelHalf(info->nameInCode, i, info->kernels + i, + info->programs + i, info->relaxedMode); +} + + +int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) +{ + uint64_t i; + uint32_t j, k; + int error; + cl_program programs[VECTOR_SIZE_COUNT]; + cl_kernel kernels[VECTOR_SIZE_COUNT]; + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + float maxErrorVal = 0.0f; + size_t bufferSize = BUFFER_SIZE; + size_t bufferElements = bufferSize / sizeof(cl_half); + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + int scale = (int)((1ULL << 32) / (16 * bufferElements) + 1); + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + const char *name = f->name; + float half_ulps 
= f->half_ulps; + if (gWimpyMode) + { + step = (1ULL << 32) * gWimpyReductionFactor / (512); + } + + // Init the kernels + BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, + f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + { + return error; + } + + for (i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_ushort *p = (cl_ushort *)gIn; + if (gWimpyMode) + { + for (j = 0; j < bufferElements; j++) p[j] = i + j * scale; + } + else + { + for (j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // write garbage into output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint16_t pattern = 0xdead; + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = + clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + goto exit; + } + } + + // Run the kernels + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_half); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), + &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + goto exit; + } + if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), + &gInBuffer))) + { + LogBuildError(programs[j]); + goto exit; + } + + if ((error = + clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, + &localCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILURE -- could not execute kernel\n"); + goto exit; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the 
correctly rounded reference result + cl_half *r = (cl_half *)gOut_Ref; + for (j = 0; j < bufferElements; j++) + { + if (!strcmp(name, "nan")) + r[j] = reference_nanh(p[j]); + else + r[j] = cl_half_from_float(f->func.f_u(p[j]), CL_HALF_RTE); + } + // Read the data back + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + goto exit; + } + } + + if (gSkipCorrectnessTesting) break; + + + // Verify data + cl_ushort *t = (cl_ushort *)gOut_Ref; + for (j = 0; j < bufferElements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = (cl_ushort *)(gOut[k]); + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + double test = cl_half_to_float(q[j]); + double correct; + if (!strcmp(name, "nan")) + correct = cl_half_to_float(reference_nanh(p[j])); + else + correct = f->func.f_u(p[j]); + + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= half_ulps); + + if (fail) + { + if (ftz) + { + // retry per section 6.5.3.2 + if (IsHalfSubnormal( + cl_half_from_float(correct, CL_HALF_RTE))) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = p[j]; + } + if (fail) + { + vlog_error( + "\n%s%s: %f ulp error at 0x%0.4x \nExpected: %a " + "(0x%0.4x) \nActual: %a (0x%0.4x)\n", + f->name, sizeNames[k], err, p[j], + cl_half_to_float(r[j]), r[j], test, q[j]); + error = -1; + goto exit; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, + bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (!gSkipCorrectnessTesting) 
vlog("\t%8.2f @ %a", maxError, maxErrorVal); + vlog("\n"); + +exit: + // Release + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + clReleaseKernel(kernels[k]); + clReleaseProgram(programs[k]); + } + + return error; +} \ No newline at end of file diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h index b4a59edb5..d4d857150 100644 --- a/test_conformance/math_brute_force/utility.h +++ b/test_conformance/math_brute_force/utility.h @@ -22,6 +22,7 @@ #include "harness/testHarness.h" #include "harness/ThreadPool.h" #include "harness/conversions.h" +#include "CL/cl_half.h" #define BUFFER_SIZE (1024 * 1024 * 2) #define EMBEDDED_REDUCTION_FACTOR (64) @@ -60,12 +61,15 @@ extern int gForceFTZ; extern int gFastRelaxedDerived; extern int gWimpyMode; extern int gIsInRTZMode; +extern int gHasHalf; extern int gInfNanSupport; extern int gIsEmbedded; extern int gVerboseBruteForce; extern uint32_t gMaxVectorSizeIndex; extern uint32_t gMinVectorSizeIndex; extern cl_device_fp_config gFloatCapabilities; +extern cl_device_fp_config gHalfCapabilities; +extern RoundingMode gFloatToHalfRoundingMode; #define LOWER_IS_BETTER 0 #define HIGHER_IS_BETTER 1 From 30467ce0031d9dcd85ff903ac4b3d31d3819cf11 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Thu, 23 Mar 2023 12:32:44 +0100 Subject: [PATCH 02/24] Added modernization of remaining half tests for consistency (issue #142, bruteforce) --- .../math_brute_force/i_unary_half.cpp | 97 +++++++++++-------- .../math_brute_force/mad_half.cpp | 87 ++++++++++------- .../math_brute_force/unary_u_half.cpp | 77 +++++++++------ 3 files changed, 156 insertions(+), 105 deletions(-) diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp index 245528e10..c274f3e91 100644 --- a/test_conformance/math_brute_force/i_unary_half.cpp +++ b/test_conformance/math_brute_force/i_unary_half.cpp @@ -14,12 +14,16 @@ // limitations under the 
License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include +#include +#include +#if 0 static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { @@ -118,19 +122,33 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, return BuildKernelHalf(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); } +#else + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Int, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +#endif int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) { - uint64_t i; - uint32_t j, k; int error; - cl_program programs[VECTOR_SIZE_COUNT]; - cl_kernel kernels[VECTOR_SIZE_COUNT]; + Programs programs; + KernelMatrix kernels; + const unsigned thread_id = 0; // Test is currently not multithreaded. 
int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ; size_t bufferSize = BUFFER_SIZE; uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); uint64_t bufferElements = bufferSize / sizeof(cl_int); - float *s = 0; + std::vector s; int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1); @@ -139,30 +157,31 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) // for reference computations FPU_mode_type oldMode; DisableFTZ(&oldMode); + std::shared_ptr at_scope_exit( + nullptr, [&oldMode](int *) { RestoreFPState(&oldMode); }); // Init the kernels - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode }; - if ((error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) { - return error; + BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; } - s = (float *)malloc(bufferElements * sizeof(float)); + s.resize(bufferElements); - for (i = 0; i < (1ULL << 16); i += step) + for (uint64_t i = 0; i < (1ULL << 16); i += step) { // Init input array cl_ushort *p = (cl_ushort *)gIn; if (gWimpyMode) { - for (j = 0; j < bufferElements; j++) + for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j * scale; } else { - for (j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j; + for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j; } if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL))) @@ -172,7 +191,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) } // write garbage into output arrays - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint32_t pattern = 0xffffdead; memset_pattern4(gOut[j], &pattern, bufferSize); @@ -182,34 +201,34 @@ int 
TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) { vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j); - goto exit; + return error; } } // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_int); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) { LogBuildError(programs[j]); - goto exit; + return error; } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), - &gInBuffer))) + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) { LogBuildError(programs[j]); - goto exit; + return error; } - if ((error = - clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, NULL))) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) { vlog_error("FAILED -- could not execute kernel\n"); - goto exit; + return error; } } @@ -218,20 +237,20 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) // Calculate the correctly rounded reference result int *r = (int *)gOut_Ref; - for (j = 0; j < bufferElements; j++) + for (size_t j = 0; j < bufferElements; j++) { s[j] = cl_half_to_float(p[j]); r[j] = f->func.i_f(s[j]); } // Read the data back - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL))) { vlog_error("ReadArray failed %d\n", error); - goto exit; + return error; } } @@ -239,9 +258,9 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) // Verify data uint32_t *t = 
(uint32_t *)gOut_Ref; - for (j = 0; j < bufferElements; j++) + for (size_t j = 0; j < bufferElements; j++) { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { uint32_t *q = (uint32_t *)(gOut[k]); // If we aren't getting the correctly rounded result @@ -260,8 +279,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) "*%d vs. %d\n", f->name, sizeNames[k], err, s[j], p[j], t[j], q[j]); - error = -1; - goto exit; + return -1; } } } @@ -270,8 +288,9 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - bufferSize); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10zd \n", + i, step, bufferSize); } else { @@ -292,6 +311,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) vlog("\n"); +#if 0 exit: if (s) free(s); RestoreFPState(&oldMode); @@ -301,6 +321,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); } +#endif return error; -} \ No newline at end of file +} diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp index a36e8d665..56115b10c 100644 --- a/test_conformance/math_brute_force/mad_half.cpp +++ b/test_conformance/math_brute_force/mad_half.cpp @@ -14,13 +14,14 @@ // limitations under the License. 
// +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include - +#if 0 static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { @@ -129,13 +130,28 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, info->programs + i, info->relaxedMode); } +#else + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetTernaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +#endif + int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) { - uint64_t i; - uint32_t j, k; int error; - cl_program programs[VECTOR_SIZE_COUNT]; - cl_kernel kernels[VECTOR_SIZE_COUNT]; + Programs programs; + KernelMatrix kernels; + const unsigned thread_id = 0; // Test is currently not multithreaded. 
float maxError = 0.0f; // int ftz = f->ftz || gForceFTZ; float maxErrorVal = 0.0f; @@ -150,21 +166,20 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) step = (1ULL << 32) * gWimpyReductionFactor / (512); } // Init the kernels - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode }; - if ((error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info))) { - return error; + BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; } - for (i = 0; i < (1ULL << 32); i += step) + for (uint64_t i = 0; i < (1ULL << 32); i += step) { // Init input array cl_ushort *p = (cl_ushort *)gIn; cl_ushort *p2 = (cl_ushort *)gIn2; cl_ushort *p3 = (cl_ushort *)gIn3; - for (j = 0; j < bufferSize / sizeof(cl_ushort); j++) + for (size_t j = 0; j < bufferSize / sizeof(cl_ushort); j++) { p[j] = (cl_ushort)genrand_int32(d); p2[j] = (cl_ushort)genrand_int32(d); @@ -190,7 +205,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) } // write garbage into output arrays - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint16_t pattern = 0xdead; memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); @@ -200,47 +215,47 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) { vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j); - goto exit; + return error; } } // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeof(cl_half) * sizeValues[j]; size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; // bufferSize / vectorSize rounded up - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - &gOutBuffer[j]))) 
+ if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) { LogBuildError(programs[j]); - goto exit; + return error; } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), - &gInBuffer))) + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) { LogBuildError(programs[j]); - goto exit; + return error; } - if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2), - &gInBuffer2))) + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer2), &gInBuffer2))) { LogBuildError(programs[j]); - goto exit; + return error; } - if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3), - &gInBuffer3))) + if ((error = clSetKernelArg(kernels[j][thread_id], 3, + sizeof(gInBuffer3), &gInBuffer3))) { LogBuildError(programs[j]); - goto exit; + return error; } - if ((error = - clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, NULL))) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) { vlog_error("FAILED -- could not execute kernel\n"); - goto exit; + return error; } } @@ -248,14 +263,14 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); // Read the data back - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL))) { vlog_error("ReadArray failed %d\n", error); - goto exit; + return error; } } @@ -283,6 +298,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) } vlog("\n"); +#if 0 exit: // Release for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) @@ -290,6 +306,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); } +#endif 
return error; -} \ No newline at end of file +} diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp index f8e8b6231..c063de7fa 100644 --- a/test_conformance/math_brute_force/unary_u_half.cpp +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -14,6 +14,7 @@ // limitations under the License. // +#include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" @@ -21,7 +22,7 @@ #include - +#if 0 static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k, cl_program *p, bool relaxedMode) { @@ -120,15 +121,28 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, return BuildKernelHalf(info->nameInCode, i, info->kernels + i, info->programs + i, info->relaxedMode); } +#else + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::UInt, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} +#endif int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) { - uint64_t i; - uint32_t j, k; int error; - cl_program programs[VECTOR_SIZE_COUNT]; - cl_kernel kernels[VECTOR_SIZE_COUNT]; + Programs programs; + KernelMatrix kernels; + const unsigned thread_id = 0; // Test is currently not multithreaded. 
float maxError = 0.0f; int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); float maxErrorVal = 0.0f; @@ -145,8 +159,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) } // Init the kernels - BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs, - f->nameInCode }; + BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode }; if ((error = ThreadPool_Do(BuildKernel_HalfFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -154,17 +167,17 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) return error; } - for (i = 0; i < (1ULL << 32); i += step) + for (uint64_t i = 0; i < (1ULL << 32); i += step) { // Init input array cl_ushort *p = (cl_ushort *)gIn; if (gWimpyMode) { - for (j = 0; j < bufferElements; j++) p[j] = i + j * scale; + for (size_t j = 0; j < bufferElements; j++) p[j] = i + j * scale; } else { - for (j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j; + for (size_t j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j; } if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, @@ -175,7 +188,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) } // write garbage into output arrays - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { uint16_t pattern = 0xdead; memset_pattern4(gOut[j], &pattern, bufferSize); @@ -185,34 +198,34 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) { vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", error, j); - goto exit; + return error; } } // Run the kernels - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_half); size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; - if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]), - 
&gOutBuffer[j]))) + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) { LogBuildError(programs[j]); - goto exit; + return error; } - if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer), - &gInBuffer))) + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) { LogBuildError(programs[j]); - goto exit; + return error; } - if ((error = - clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL, - &localCount, NULL, 0, NULL, NULL))) + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) { vlog_error("FAILURE -- could not execute kernel\n"); - goto exit; + return error; } } @@ -221,7 +234,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) // Calculate the correctly rounded reference result cl_half *r = (cl_half *)gOut_Ref; - for (j = 0; j < bufferElements; j++) + for (size_t j = 0; j < bufferElements; j++) { if (!strcmp(name, "nan")) r[j] = reference_nanh(p[j]); @@ -229,25 +242,24 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) r[j] = cl_half_from_float(f->func.f_u(p[j]), CL_HALF_RTE); } // Read the data back - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, bufferSize, gOut[j], 0, NULL, NULL))) { vlog_error("ReadArray failed %d\n", error); - goto exit; + return error; } } if (gSkipCorrectnessTesting) break; - // Verify data cl_ushort *t = (cl_ushort *)gOut_Ref; - for (j = 0; j < bufferElements; j++) + for (size_t j = 0; j < bufferElements; j++) { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { cl_ushort *q = (cl_ushort *)(gOut[k]); @@ -289,8 +301,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) "(0x%0.4x) \nActual: %a 
(0x%0.4x)\n", f->name, sizeNames[k], err, p[j], cl_half_to_float(r[j]), r[j], test, q[j]); - error = -1; - goto exit; + return -1; } } } @@ -322,6 +333,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); vlog("\n"); +#if 0 exit: // Release for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) @@ -329,6 +341,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) clReleaseKernel(kernels[k]); clReleaseProgram(programs[k]); } +#endif return error; -} \ No newline at end of file +} From 5f3f2ec3279083cffc991005ee60f5e881345774 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Thu, 23 Mar 2023 15:32:10 +0100 Subject: [PATCH 03/24] Added kernel types related corrections --- test_conformance/math_brute_force/binary_half.cpp | 12 +++++++----- test_conformance/math_brute_force/binary_i_half.cpp | 12 +++++++----- test_conformance/math_brute_force/common.cpp | 7 +++++++ test_conformance/math_brute_force/common.h | 3 +++ .../math_brute_force/macro_binary_half.cpp | 12 +++++++----- .../math_brute_force/macro_unary_half.cpp | 13 ++++++++----- test_conformance/math_brute_force/unary_half.cpp | 11 +++++++---- test_conformance/math_brute_force/unary_u_half.cpp | 8 +++++--- 8 files changed, 51 insertions(+), 27 deletions(-) diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp index 4200b07d9..e40e7727d 100644 --- a/test_conformance/math_brute_force/binary_half.cpp +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -132,10 +132,11 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, info->kernels[i], info->programs + i, info->relaxedMode); } -#else +#endif -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = 
*(BuildKernelInfo *)p; auto generator = [](const std::string &kernel_name, const char *builtin, @@ -147,8 +148,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, return BuildKernels(info, job_id, generator); } -#endif - +//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread typedef struct ThreadInfo @@ -201,6 +201,8 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; +} + //////////////////////////////////////////////////////////////////////////////// // A table of more difficult cases to get right static const cl_half specialValuesHalf[] = { diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp index 1e772799c..c4c7288af 100644 --- a/test_conformance/math_brute_force/binary_i_half.cpp +++ b/test_conformance/math_brute_force/binary_i_half.cpp @@ -130,10 +130,11 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, info->kernels[i], info->programs + i, info->relaxedMode); } -#else +#endif -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; auto generator = [](const std::string &kernel_name, const char *builtin, @@ -145,8 +146,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, return BuildKernels(info, job_id, generator); } - -#endif +//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread typedef struct ThreadInfo @@ -197,6 +197,8 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; +} + //////////////////////////////////////////////////////////////////////////////// // A table of more difficult cases to get right diff --git a/test_conformance/math_brute_force/common.cpp 
b/test_conformance/math_brute_force/common.cpp index 580bbacd9..3bfde6483 100644 --- a/test_conformance/math_brute_force/common.cpp +++ b/test_conformance/math_brute_force/common.cpp @@ -30,6 +30,8 @@ const char *GetTypeName(ParameterType type) case ParameterType::Half: return "half"; case ParameterType::Float: return "float"; case ParameterType::Double: return "double"; + case ParameterType::Short: return "short"; + case ParameterType::UShort: return "ushort"; case ParameterType::Int: return "int"; case ParameterType::UInt: return "uint"; case ParameterType::Long: return "long"; @@ -46,6 +48,9 @@ const char *GetUndefValue(ParameterType type) case ParameterType::Float: case ParameterType::Double: return "NAN"; + case ParameterType::Short: + case ParameterType::UShort: return "0x5678"; + case ParameterType::Int: case ParameterType::UInt: return "0x12345678"; @@ -81,6 +86,8 @@ void EmitEnableExtension(std::ostringstream &kernel, ParameterType type) break; case ParameterType::Float: + case ParameterType::Short: + case ParameterType::UShort: case ParameterType::Int: case ParameterType::UInt: case ParameterType::Long: diff --git a/test_conformance/math_brute_force/common.h b/test_conformance/math_brute_force/common.h index f70e7d232..793a00fe9 100644 --- a/test_conformance/math_brute_force/common.h +++ b/test_conformance/math_brute_force/common.h @@ -39,6 +39,8 @@ enum class ParameterType Half, Float, Double, + Short, + UShort, Int, UInt, Long, @@ -92,4 +94,5 @@ using SourceGenerator = std::string (*)(const std::string &kernel_name, cl_int BuildKernels(BuildKernelInfo &info, cl_uint job_id, SourceGenerator generator); + #endif /* COMMON_H */ diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp index 4060a8b5a..5810380e4 100644 --- a/test_conformance/math_brute_force/macro_binary_half.cpp +++ b/test_conformance/math_brute_force/macro_binary_half.cpp @@ -123,23 +123,23 @@ static cl_int 
BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, info->kernels[i], info->programs + i, info->relaxedMode); } -#else +#endif +namespace { -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; auto generator = [](const std::string &kernel_name, const char *builtin, cl_uint vector_size_index) { - return GetBinaryKernel(kernel_name, builtin, ParameterType::Int, + return GetBinaryKernel(kernel_name, builtin, ParameterType::Short, ParameterType::Half, ParameterType::Half, vector_size_index); }; return BuildKernels(info, job_id, generator); } -#endif +//////////////////////////////////////////////////////////////////////////////// typedef struct ThreadInfo { @@ -181,6 +181,8 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; +} + //////////////////////////////////////////////////////////////////////////////// // A table of more difficult cases to get right diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp index d2ba1b36a..0e93f5324 100644 --- a/test_conformance/math_brute_force/macro_unary_half.cpp +++ b/test_conformance/math_brute_force/macro_unary_half.cpp @@ -125,21 +125,22 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, info->kernels[i], info->programs + i, info->relaxedMode); } -#else +#endif -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; auto generator = [](const std::string &kernel_name, const char *builtin, cl_uint vector_size_index) { - return GetUnaryKernel(kernel_name, builtin, ParameterType::Int, + return GetUnaryKernel(kernel_name, builtin, ParameterType::Short, ParameterType::Half, 
vector_size_index); }; return BuildKernels(info, job_id, generator); } -#endif +//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread typedef struct ThreadInfo @@ -179,6 +180,8 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; +} + //////////////////////////////////////////////////////////////////////////////// static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp index 320ad12c4..d88cd7297 100644 --- a/test_conformance/math_brute_force/unary_half.cpp +++ b/test_conformance/math_brute_force/unary_half.cpp @@ -126,10 +126,11 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, info->relaxedMode); } -#else +#endif -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; auto generator = [](const std::string &kernel_name, const char *builtin, @@ -140,7 +141,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, return BuildKernels(info, job_id, generator); } -#endif +//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread typedef struct ThreadInfo @@ -187,6 +188,8 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; +} + //////////////////////////////////////////////////////////////////////////////// static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp index c063de7fa..94fb880dd 100644 --- a/test_conformance/math_brute_force/unary_u_half.cpp +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -21,6 +21,7 @@ #include "reference_math.h" 
#include +#include #if 0 static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k, @@ -130,7 +131,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, auto generator = [](const std::string &kernel_name, const char *builtin, cl_uint vector_size_index) { return GetUnaryKernel(kernel_name, builtin, ParameterType::Half, - ParameterType::UInt, vector_size_index); + ParameterType::UShort, vector_size_index); }; return BuildKernels(info, job_id, generator); } @@ -311,8 +312,9 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10zu bufferSize:%10zd \n", i, step, - bufferSize); + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10zd \n", + i, step, bufferSize); } else { From cf97168626e15b98a69dc79e83b217f2cbdc2ab1 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Thu, 23 Mar 2023 17:04:07 +0100 Subject: [PATCH 04/24] Added more fixes and general cleanup --- .../math_brute_force/binary_half.cpp | 241 +++--------------- .../math_brute_force/binary_i_half.cpp | 217 ++-------------- .../math_brute_force/i_unary_half.cpp | 122 +-------- .../math_brute_force/macro_binary_half.cpp | 219 ++-------------- .../math_brute_force/macro_unary_half.cpp | 200 ++------------- .../math_brute_force/mad_half.cpp | 127 +-------- .../math_brute_force/unary_half.cpp | 197 ++------------ .../math_brute_force/unary_u_half.cpp | 117 +-------- 8 files changed, 123 insertions(+), 1317 deletions(-) diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp index e40e7727d..1aeb36aff 100644 --- a/test_conformance/math_brute_force/binary_half.cpp +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2023 The Khronos Group Inc. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,128 +14,20 @@ // limitations under the License. // +#include "harness/errorHelpers.h" + #include "common.h" #include "function_list.h" #include "test_functions.h" #include "utility.h" #include "reference_math.h" -#include - -#include "harness/errorHelpers.h" - -#if 0 - -static int BuildKernelHalf(const char *name, int vectorSize, - cl_uint kernel_count, cl_kernel *k, cl_program *p, - bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global half", - sizeNames[vectorSize], - "* out, __global half", - sizeNames[vectorSize], - "* in1, __global half", - sizeNames[vectorSize], - "* in2 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global half* out, __global half* in, __global half* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " half3 d0 = vload3( 0, in + 3 * i );\n" - " half3 d1 = vload3( 0, in2 + 3 * i );\n" - " d0 = ", - name, - "( d0, d1 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " half3 d0, d1;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (half3)( in[3*i], NAN, NAN ); \n" - " d1 = (half3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " d0 = ", - name, - "( d0, d1 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, - relaxedMode); -} -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; -} BuildKernelInfo; - - -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelHalf(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, - info->relaxedMode); -} -#endif +#include +#include namespace { +//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -149,24 +41,24 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) } //////////////////////////////////////////////////////////////////////////////// - // Thread specific data for a worker thread -typedef struct ThreadInfo +struct 
ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper inBuf2; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread float maxError; // max error value. Init to 0. double maxErrorValue; // position of the max error value (param 1). Init to 0. double maxErrorValue2; // position of the max error value (param 2). Init // to 0. - MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; + MTdataHolder d; -//////////////////////////////////////////////////////////////////////////////// + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +}; +//////////////////////////////////////////////////////////////////////////////// struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -185,7 +77,6 @@ struct TestInfoBase }; //////////////////////////////////////////////////////////////////////////////// - struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -216,10 +107,11 @@ static const cl_half specialValuesHalf[] = { 0x0400 /*HALF_MIN*/ }; +//////////////////////////////////////////////////////////////////////////////// static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); - static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); +//////////////////////////////////////////////////////////////////////////////// int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter, bool relaxedMode) { @@ -260,40 +152,8 @@ int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter, test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; test_info.isNextafter = isNextafter; -#if 0 - // cl_kernels aren't thread safe, so we make one for 
each vector size for - // every thread - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); - -#else - test_info.tinfo.resize(test_info.threadCount); -#endif - for (i = 0; i < test_info.threadCount; i++) { cl_buffer_region region = { i * test_info.subBufferSize @@ -340,7 +200,7 @@ int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter, vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); return error; } - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); } // Init the kernels @@ -379,39 +239,10 @@ int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter, vlog("\n"); -#if 0 -exit: - // Release - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } -#endif - return error; } +//////////////////////////////////////////////////////////////////////////////// static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; @@ -432,7 +263,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) int isNextafter = job->isNextafter; cl_ushort *t; cl_half *r; - float *s = 0, *s2 = 0; + std::vector s(0), s2(0); RoundingMode oldRoundMode; cl_int copysign_test = 0; @@ -495,14 +326,14 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) buffer_size, p, 0, NULL, NULL))) { vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - goto exit; + return error; } if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL))) { vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); - goto exit; + return error; } for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) @@ -511,12 +342,12 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clWaitForEvents(1, e + j))) { vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - goto exit; + return error; } if ((error = clReleaseEvent(e[j]))) { vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - goto exit; + return error; } // Fill the result buffer with garbage, so that old results don't carry @@ -527,7 +358,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) out[j], 0, NULL, NULL))) { vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); - goto exit; + return error; } // run the kernel @@ -560,7 +391,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) &vectorCount, NULL, 0, NULL, NULL))) { vlog_error("FAILED -- could not execute kernel\n"); - goto exit; + return error; } } @@ -591,8 +422,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Calculate the correctly rounded reference result r = (cl_half *)gOut_Ref + thread_id * buffer_elements; t = (cl_ushort *)r; - s = (float *)malloc(buffer_elements * sizeof(float)); - s2 = (float *)malloc(buffer_elements * sizeof(float)); + s.resize(buffer_elements); + s2.resize(buffer_elements); for (j = 0; j < buffer_elements; j++) for (j = 0; j < buffer_elements; j++) { @@ -617,7 +448,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - goto exit; + return error; } } @@ -628,7 +459,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (error || NULL == out[j]) { vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); - goto exit; + return error; } // Verify data @@ -872,7 +703,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) p2[j], cl_half_to_float(r[j]), r[j], test, q[j], j); error = -1; - goto exit; + return error; } } } @@ -893,12 +724,11 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if (0 == (base & 0x0fffffff)) { if (gVerboseBruteForce) { - vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f " + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " "ThreadCount:%2u\n", base, job->step, job->scale, buffer_elements, job->ulps, job->threadCount); @@ -909,18 +739,17 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } fflush(stdout); } -exit: - if (s) free(s); - if (s2) free(s2); + return error; } - +//////////////////////////////////////////////////////////////////////////////// int TestFunc_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) { return TestFunc_Half_Half_Half_common(f, d, 0, relaxedMode); } +//////////////////////////////////////////////////////////////////////////////// int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata d, bool relaxedMode) { return TestFunc_Half_Half_Half_common(f, d, 1, relaxedMode); diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp index c4c7288af..571683e5d 100644 --- a/test_conformance/math_brute_force/binary_i_half.cpp +++ b/test_conformance/math_brute_force/binary_i_half.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2023 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -22,118 +22,9 @@ #include #include -#if 0 -static int BuildKernelHalf(const char *name, int vectorSize, - cl_uint kernel_count, cl_kernel *k, cl_program *p, - bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global half", - sizeNames[vectorSize], - "* out, __global half", - sizeNames[vectorSize], - "* in1, __global int", - sizeNames[vectorSize], - "* in2 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global half* out, __global half* in, __global int* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " half3 d0 = vload3( 0, in + 3 * i );\n" - " int3 i0 = vload3( 0, in2 + 3 * i );\n" - " d0 = ", - name, - "( d0, i0 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " half3 d0;\n" - " int3 i0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (half3)( in[3*i], NAN, NAN ); \n" - " i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n" - " break;\n" - " case 0:\n" - " d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" - " i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n" - " break;\n" - " }\n" - " d0 = ", - name, - "( d0, i0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, - relaxedMode); -} - -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; -} BuildKernelInfo; - -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelHalf(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, - info->relaxedMode); -} -#endif - namespace { +//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -147,24 +38,23 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) } //////////////////////////////////////////////////////////////////////////////// - // Thread specific data for a worker thread typedef struct ThreadInfo { - cl_mem 
inBuf; // input buffer for the thread - cl_mem inBuf2; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper inBuf2; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread float maxError; // max error value. Init to 0. double maxErrorValue; // position of the max error value (param 1). Init to 0. cl_int maxErrorValue2; // position of the max error value (param 2). Init // to 0. MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance } ThreadInfo; //////////////////////////////////////////////////////////////////////////////// - struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -176,12 +66,9 @@ struct TestInfoBase cl_uint scale; // stride between individual test values float ulps; // max_allowed ulps int ftz; // non-zero if running in flush to zero mode - - // no special values }; //////////////////////////////////////////////////////////////////////////////// - struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -258,38 +145,8 @@ int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode) test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); -#if 0 - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo 
*)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); -#else - test_info.tinfo.resize(test_info.threadCount); -#endif - for (i = 0; i < test_info.threadCount; i++) { cl_buffer_region region = { i * test_info.subBufferSize @@ -382,42 +239,12 @@ int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode) vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); } - vlog("\n"); -#if 0 -exit: - // Release - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } -#endif - return error; } +//////////////////////////////////////////////////////////////////////////////// static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; @@ -433,7 +260,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) const char *name = job->f->name; cl_ushort *t; cl_half *r; - float *s = 0; + std::vector s; cl_int *s2; // start the map of the output arrays @@ -494,7 +321,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) NULL, NULL))) { vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); - goto exit; + return error; } if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, @@ -502,7 +329,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) NULL, NULL))) { vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - goto exit; + return error; } for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) @@ -511,12 +338,12 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clWaitForEvents(1, e + j))) { vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - goto exit; + return error; } if ((error = clReleaseEvent(e[j]))) { vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - goto exit; + return error; } // Fill the result buffer with garbage, so that old results don't carry @@ -527,7 +354,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) out[j], 0, NULL, NULL))) { vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); - goto exit; + return error; } // run the kernel @@ -560,7 +387,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) &vectorCount, NULL, 0, NULL, NULL))) { vlog_error("FAILED -- could not execute kernel\n"); - goto exit; + return error; } } @@ -572,7 +399,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Calculate the correctly rounded reference result r = (cl_half *)gOut_Ref + thread_id * buffer_elements; t = (cl_ushort *)r; - s = (float *)malloc(buffer_elements * sizeof(float)); + s.resize(buffer_elements); s2 = (cl_int *)gIn2 + thread_id * buffer_elements; for (j = 0; j < buffer_elements; j++) { @@ -591,7 +418,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); - goto exit; + return error; } } @@ -602,7 +429,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (error || NULL == out[j]) { vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - goto exit; + return error; } // Verify data @@ -672,7 +499,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) cl_half_to_float(r[j]), r[j], test, q[j], (cl_uint)j); error = -1; - goto exit; + return error; } } } @@ -685,13 +512,12 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error); - goto exit; + return error; } } if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if (0 == (base & 0x0fffffff)) { if (gVerboseBruteForce) @@ -707,8 +533,5 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } fflush(stdout); } - -exit: - if (s) free(s); return error; } diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp index c274f3e91..c78c03a49 100644 --- a/test_conformance/math_brute_force/i_unary_half.cpp +++ b/test_conformance/math_brute_force/i_unary_half.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2023 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -23,107 +23,7 @@ #include #include -#if 0 -static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global int", - sizeNames[vectorSize], - "* out, __global half", - sizeNames[vectorSize], - "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", - name, - "( in[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global int* out, __global half* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " half3 f0 = vload3( 0, in + 3 * i );\n" - " int3 i0 = ", - name, - "( f0 );\n" - " vstore3( i0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(half)). 
Assume power of two " - "buffer size \n" - " half3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (half3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " int3 i0 = ", - name, - "( f0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); -} - -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_kernel *kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; -} BuildKernelInfo; - -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelHalf(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); -} -#else - +//////////////////////////////////////////////////////////////////////////////// static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { @@ -136,8 +36,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, return BuildKernels(info, job_id, generator); } -#endif - +//////////////////////////////////////////////////////////////////////////////// int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) { int error; @@ -148,7 +47,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) size_t bufferSize = BUFFER_SIZE; uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); uint64_t 
bufferElements = bufferSize / sizeof(cl_int); - std::vector s; + std::vector s(0); int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1); @@ -310,18 +209,5 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) vlog("\n"); - -#if 0 -exit: - if (s) free(s); - RestoreFPState(&oldMode); - // Release - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); - } -#endif - return error; } diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp index 5810380e4..8af034c43 100644 --- a/test_conformance/math_brute_force/macro_binary_half.cpp +++ b/test_conformance/math_brute_force/macro_binary_half.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2023 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -21,112 +21,10 @@ #include -#if 0 -static int BuildKernelHalf(const char *name, int vectorSize, - cl_uint kernel_count, cl_kernel *k, cl_program *p, - bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global short", - sizeNames[vectorSize], - "* out, __global half", - sizeNames[vectorSize], - "* in1, __global half", - sizeNames[vectorSize], - "* in2 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i] );\n" - "}\n" }; - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global short* out, __global half* in, __global half* in2)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " half3 f0 = vload3( 0, in + 3 * i );\n" - " half3 f1 = vload3( 0, in2 + 3 * i );\n" - " short3 i0 = ", - name, - "( f0, f1 );\n" - " vstore3( i0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " half3 f0, f1;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (half3)( in[3*i], NAN, NAN ); \n" - " f1 = (half3)( in2[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" - " f1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " short3 i0 = ", - name, - "( f0, f1 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, - relaxedMode); -} - -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; -} BuildKernelInfo; - -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelHalf(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, - info->relaxedMode); -} -#endif namespace { +//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -140,18 +38,17 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) } //////////////////////////////////////////////////////////////////////////////// - -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem 
inBuf2; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper inBuf2; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread MTdata d; - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +}; //////////////////////////////////////////////////////////////////////////////// - struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -165,7 +62,6 @@ struct TestInfoBase }; //////////////////////////////////////////////////////////////////////////////// - struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -184,7 +80,6 @@ struct TestInfo : public TestInfoBase } //////////////////////////////////////////////////////////////////////////////// - // A table of more difficult cases to get right static const cl_half specialValuesHalf[] = { 0xffff, @@ -197,11 +92,11 @@ static const cl_half specialValuesHalf[] = { 0x0400 /*HALF_MIN*/ }; +//////////////////////////////////////////////////////////////////////////////// static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); - - static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); +//////////////////////////////////////////////////////////////////////////////// int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) { TestInfoBase test_info_base; @@ -234,37 +129,8 @@ int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); -#if 0 - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * 
sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); -#else - test_info.tinfo.resize(test_info.threadCount); -#endif for (i = 0; i < test_info.threadCount; i++) { cl_buffer_region region = { i * test_info.subBufferSize @@ -315,7 +181,6 @@ int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) test_info.tinfo[i].d = init_genrand(genrand_int32(d)); } - // Init the kernels { BuildKernelInfo build_info = { test_info.threadCount, test_info.k, @@ -340,39 +205,10 @@ int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) vlog("\n"); -#if 0 -exit: - // Release - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (i = 0; i < test_info.threadCount; i++) - { - free_mtdata(test_info.tinfo[i].d); - clReleaseMemObject(test_info.tinfo[i].inBuf); - clReleaseMemObject(test_info.tinfo[i].inBuf2); - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } -#endif - return error; } +//////////////////////////////////////////////////////////////////////////////// static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo 
*job = (TestInfo *)data; @@ -387,7 +223,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) cl_int error; const char *name = job->f->name; cl_short *t, *r; - float *s = 0, *s2 = 0; + std::vector s(0), s2(0); // start the map of the output arrays cl_event e[VECTOR_SIZE_COUNT]; @@ -448,14 +284,14 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) buffer_size, p, 0, NULL, NULL))) { vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - goto exit; + return error; } if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, buffer_size, p2, 0, NULL, NULL))) { vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); - goto exit; + return error; } for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) @@ -464,12 +300,12 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if ((error = clWaitForEvents(1, e + j))) { vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - goto exit; + return error; } if ((error = clReleaseEvent(e[j]))) { vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - goto exit; + return error; } // Fill the result buffer with garbage, so that old results don't carry @@ -480,7 +316,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) out[j], 0, NULL, NULL))) { vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); - goto exit; + return error; } // run the kernel @@ -513,7 +349,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) &vectorCount, NULL, 0, NULL, NULL))) { vlog_error("FAILED -- could not execute kernel\n"); - goto exit; + return error; } } @@ -525,8 +361,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Calculate the correctly rounded reference result r = (cl_short *)gOut_Ref + thread_id * buffer_elements; t = (cl_short *)r; - s = (float *)malloc(buffer_elements * sizeof(float)); - s2 = (float *)malloc(buffer_elements * sizeof(float)); + s.resize(buffer_elements); + s2.resize(buffer_elements); for (j = 0; j < buffer_elements; j++) { s[j] = cl_half_to_float(p[j]); @@ -546,7 +382,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - goto exit; + return error; } } @@ -557,7 +393,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (error || NULL == out[j]) { vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - goto exit; + return error; } // Verify data @@ -605,7 +441,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x (index: %d)\n", name, err, s[j], p[j], s2[j], p2[j], t[j], q[j], j); error = -1; - goto exit; + return error; } @@ -653,7 +489,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) "(index: %d)\n", name, err, s[j], p[j], s2[j], p2[j], -t[j], q[j], j); error = -1; - goto exit; + return error; } } } @@ -665,7 +501,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! 
err: %d\n", j, error); - goto exit; + return error; } } @@ -688,8 +524,5 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) fflush(stdout); } -exit: - if (s) free(s); - if (s2) free(s2); return error; } diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp index 0e93f5324..36d3996ef 100644 --- a/test_conformance/math_brute_force/macro_unary_half.cpp +++ b/test_conformance/math_brute_force/macro_unary_half.cpp @@ -21,114 +21,9 @@ #include -#if 0 -static int BuildKernelHalf(const char *name, int vectorSize, - cl_uint kernel_count, cl_kernel *k, cl_program *p, - bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global short", - sizeNames[vectorSize], - "* out, __global half", - sizeNames[vectorSize], - "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", - name, - "( in[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global short* out, __global half* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " half3 f0 = vload3( 0, in + 3 * i );\n" - " short3 i0 = ", - name, - "( f0 );\n" - " vstore3( i0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " short3 i0;\n" - " half3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (half3)( in[3*i], 0xdead, 0xdead ); \n" - " break;\n" - " case 0:\n" - " f0 = (half3)( in[3*i], in[3*i+1], 0xdead ); \n" - " break;\n" - " }\n" - " i0 = ", - name, - "( f0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = i0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = i0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, - relaxedMode); -} - -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; -} BuildKernelInfo; - -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelHalf(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, - info->relaxedMode); -} -#endif - namespace { +//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -141,17 +36,16 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) } //////////////////////////////////////////////////////////////////////////////// - // Thread specific data for a worker thread -typedef struct ThreadInfo +struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // 
output buffers for the thread - cl_command_queue tQueue; // per thread command queue to improve performance -} ThreadInfo; + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +}; //////////////////////////////////////////////////////////////////////////////// - struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -164,7 +58,6 @@ struct TestInfoBase }; //////////////////////////////////////////////////////////////////////////////// - struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -183,9 +76,9 @@ struct TestInfo : public TestInfoBase } //////////////////////////////////////////////////////////////////////////////// - static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); +//////////////////////////////////////////////////////////////////////////////// int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) { TestInfoBase test_info_base; @@ -216,38 +109,8 @@ int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) test_info.f = f; test_info.ftz = f->ftz || gForceFTZ; -#if 0 - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - 
memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); -#else - test_info.tinfo.resize(test_info.threadCount); -#endif - for (i = 0; i < test_info.threadCount; i++) { cl_buffer_region region = { i * test_info.subBufferSize @@ -310,36 +173,10 @@ int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) vlog("\n"); -#if 0 -exit: - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (i = 0; i < test_info.threadCount; i++) - { - clReleaseMemObject(test_info.tinfo[i].inBuf); - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } -#endif - return error; } +//////////////////////////////////////////////////////////////////////////////// static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; @@ -353,7 +190,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) cl_uint j, k; cl_int error = CL_SUCCESS; const char *name = job->f->name; - float *s = 0; + std::vector s(0); int signbit_test = 0; if (!strcmp(name, "signbit")) signbit_test = 1; @@ -452,7 +289,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Calculate the correctly rounded reference result cl_short *r = (cl_short *)gOut_Ref + thread_id * buffer_elements; cl_short *t = (cl_short *)r; - s = (float *)malloc(buffer_elements * sizeof(float)); + s.resize(buffer_elements); for (j = 0; j < buffer_elements; j++) { s[j] = cl_half_to_float(p[j]); @@ -479,7 +316,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, error); - goto exit; + return error; } } // Wait for the last buffer @@ -489,16 +326,14 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (error || NULL == out[j]) { vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - goto exit; + return error; } - // Verify data for (j = 0; j < buffer_elements; j++) { cl_short *q = out[0]; - // If we aren't getting the correctly rounded result if (gMinVectorSizeIndex == 0 && t[j] != q[j]) { @@ -519,7 +354,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) "%d vs. %d\n", name, err, s[j], p[j], t[j], q[j]); error = -1; - goto exit; + return error; } @@ -546,7 +381,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) "(0x%0.4x)\nExpected: %d \nActual: %d\n", name, sizeNames[k], err, s[j], p[j], -t[j], q[j]); error = -1; - goto exit; + return error; } } } @@ -558,13 +393,12 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error); - goto exit; + return error; } } if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); - if (0 == (base & 0x0fffffff)) { if (gVerboseBruteForce) @@ -580,7 +414,5 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } fflush(stdout); } -exit: - if (s) free(s); return error; } diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp index 56115b10c..ef6f2b776 100644 --- a/test_conformance/math_brute_force/mad_half.cpp +++ b/test_conformance/math_brute_force/mad_half.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2023 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -21,117 +21,7 @@ #include -#if 0 -static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global half", - sizeNames[vectorSize], - "* out, __global half", - sizeNames[vectorSize], - "* in1, __global half", - sizeNames[vectorSize], - "* in2, __global half", - sizeNames[vectorSize], - "* in3 )\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", - name, - "( in1[i], in2[i], in3[i] );\n" - "}\n" }; - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global half* out, __global half* in, __global half* in2, __global " - "half* in3)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " half3 d0 = vload3( 0, in + 3 * i );\n" - " half3 d1 = vload3( 0, in2 + 3 * i );\n" - " half3 d2 = vload3( 0, in3 + 3 * i );\n" - " d0 = ", - name, - "( d0, d1, d2 );\n" - " vstore3( d0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " half3 d0, d1, d2;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " d0 = (half3)( in[3*i], NAN, NAN ); \n" - " d1 = (half3)( in2[3*i], NAN, NAN ); \n" - " d2 = (half3)( in3[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" - " d1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n" - " d2 = (half3)( in3[3*i], in3[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " d0 = ", - name, - "( d0, d1, d2 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = d0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = d0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); -} - -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_kernel *kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; -} BuildKernelInfo; - -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelHalf(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); -} - -#else - +//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -144,8 +34,7 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -#endif - +//////////////////////////////////////////////////////////////////////////////// int 
TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) { int error; @@ -298,15 +187,5 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) } vlog("\n"); -#if 0 -exit: - // Release - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); - } -#endif - return error; } diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp index d88cd7297..f5de28d0d 100644 --- a/test_conformance/math_brute_force/unary_half.cpp +++ b/test_conformance/math_brute_force/unary_half.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2023 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -21,115 +21,9 @@ #include -#if 0 -static int BuildKernelHalf(const char *name, int vectorSize, - cl_uint kernel_count, cl_kernel *k, cl_program *p, - bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global half", - sizeNames[vectorSize], - "* out, __global half", - sizeNames[vectorSize], - "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", - name, - "( in[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global half* out, __global half* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " half3 f0 = vload3( 0, in + 3 * i );\n" - " f0 = ", - name, - "( f0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " half3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " f0 = (half3)( in[3*i], NAN, NAN ); \n" - " break;\n" - " case 0:\n" - " f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n" - " break;\n" - " }\n" - " f0 = ", - name, - "( f0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p, - relaxedMode); -} - - -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_uint kernel_count; - cl_kernel **kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; -} BuildKernelInfo; - -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelHalf(info->nameInCode, i, info->kernel_count, - info->kernels[i], info->programs + i, - info->relaxedMode); -} - -#endif - namespace { +//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -142,19 +36,18 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) } //////////////////////////////////////////////////////////////////////////////// - // Thread specific data for a worker thread typedef struct ThreadInfo { - cl_mem inBuf; // input buffer for the thread - cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + 
clMemWrapper inBuf; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread float maxError; // max error value. Init to 0. double maxErrorValue; // position of the max error value. Init to 0. - cl_command_queue tQueue; // per thread command queue to improve performance + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance } ThreadInfo; //////////////////////////////////////////////////////////////////////////////// - struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -172,7 +65,6 @@ struct TestInfoBase }; //////////////////////////////////////////////////////////////////////////////// - struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -191,9 +83,9 @@ struct TestInfo : public TestInfoBase } //////////////////////////////////////////////////////////////////////////////// - static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); +//////////////////////////////////////////////////////////////////////////////// int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode) { TestInfoBase test_info_base; @@ -229,38 +121,8 @@ int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode) test_info.ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); -#if 0 - // cl_kernels aren't thread safe, so we make one for each vector size for - // every thread - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - size_t array_size = test_info.threadCount * sizeof(cl_kernel); - test_info.k[i] = (cl_kernel *)malloc(array_size); - if (NULL == test_info.k[i]) - { - vlog_error("Error: Unable to allocate storage for kernels!\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.k[i], 0, array_size); - } - test_info.tinfo = - (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo)); - if (NULL == test_info.tinfo) - { - vlog_error( - "Error: 
Unable to allocate storage for thread specific data.\n"); - error = CL_OUT_OF_HOST_MEMORY; - goto exit; - } - memset(test_info.tinfo, 0, - test_info.threadCount * sizeof(*test_info.tinfo)); -#else - test_info.tinfo.resize(test_info.threadCount); -#endif - for (i = 0; i < test_info.threadCount; i++) { cl_buffer_region region = { i * test_info.subBufferSize @@ -352,36 +214,10 @@ int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode) if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); vlog("\n"); -#if 0 -exit: - for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++) - { - clReleaseProgram(test_info.programs[i]); - if (test_info.k[i]) - { - for (j = 0; j < test_info.threadCount; j++) - clReleaseKernel(test_info.k[i][j]); - - free(test_info.k[i]); - } - } - if (test_info.tinfo) - { - for (i = 0; i < test_info.threadCount; i++) - { - clReleaseMemObject(test_info.tinfo[i].inBuf); - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - clReleaseMemObject(test_info.tinfo[i].outBuf[j]); - clReleaseCommandQueue(test_info.tinfo[i].tQueue); - } - - free(test_info.tinfo); - } -#endif - return error; } +//////////////////////////////////////////////////////////////////////////////// static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; @@ -399,7 +235,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) float half_sin_cos_tan_limit = job->half_sin_cos_tan_limit; int ftz = job->ftz; - float *s = 0; + std::vector s(0); // start the map of the output arrays cl_event e[VECTOR_SIZE_COUNT]; @@ -496,7 +332,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Calculate the correctly rounded reference result cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements; cl_ushort *t = (cl_ushort *)r; - s = (float *)malloc(buffer_elements * sizeof(float)); + s.resize(buffer_elements); for (j = 0; j < buffer_elements; j++) { s[j] = 
(float)cl_half_to_float(p[j]); @@ -514,7 +350,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - goto exit; + return error; } } // Wait for the last buffer @@ -524,7 +360,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (error || NULL == out[j]) { vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - goto exit; + return error; } // Verify data @@ -602,9 +438,9 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) "(0x%0.4x)\nExpected: %a (half 0x%0.4x) " "\nActual: %a (half 0x%0.4x)\n", job->f->name, sizeNames[k], err, s[j], p[j], - t[j], cl_half_to_float(r[j]), test, q[j]); + cl_half_to_float(r[j]), t[j], test, q[j]); error = -1; - goto exit; + return error; } } } @@ -617,7 +453,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", j, error); - goto exit; + return error; } } @@ -639,7 +475,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } fflush(stdout); } -exit: - if (s) free(s); + return error; } diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp index 94fb880dd..842e85a9b 100644 --- a/test_conformance/math_brute_force/unary_u_half.cpp +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2023 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -23,107 +23,7 @@ #include #include -#if 0 -static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k, - cl_program *p, bool relaxedMode) -{ - const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global half", - sizeNames[vectorSize], - "* out, __global ushort", - sizeNames[vectorSize], - "* in)\n" - "{\n" - " int i = get_global_id(0);\n" - " out[i] = ", - name, - "( in[i] );\n" - "}\n" }; - - const char *c3[] = { - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n", - "__kernel void math_kernel", - sizeNames[vectorSize], - "( __global half* out, __global ushort* in)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " if( i + 1 < get_global_size(0) )\n" - " {\n" - " ushort3 u0 = vload3( 0, in + 3 * i );\n" - " half3 f0 = ", - name, - "( u0 );\n" - " vstore3( f0, 0, out + 3*i );\n" - " }\n" - " else\n" - " {\n" - " size_t parity = i & 1; // Figure out how many elements are " - "left over after BUFFER_SIZE % (3*sizeof(float)). 
Assume power of two " - "buffer size \n" - " ushort3 u0;\n" - " half3 f0;\n" - " switch( parity )\n" - " {\n" - " case 1:\n" - " u0 = (ushort3)( in[3*i], 0xdead, 0xdead ); \n" - " break;\n" - " case 0:\n" - " u0 = (ushort3)( in[3*i], in[3*i+1], 0xdead ); \n" - " break;\n" - " }\n" - " f0 = ", - name, - "( u0 );\n" - " switch( parity )\n" - " {\n" - " case 0:\n" - " out[3*i+1] = f0.y; \n" - " // fall through\n" - " case 1:\n" - " out[3*i] = f0.x; \n" - " break;\n" - " }\n" - " }\n" - "}\n" - }; - - const char **kern = c; - size_t kernSize = sizeof(c) / sizeof(c[0]); - - if (sizeValues[vectorSize] == 3) - { - kern = c3; - kernSize = sizeof(c3) / sizeof(c3[0]); - } - - char testName[32]; - snprintf(testName, sizeof(testName) - 1, "math_kernel%s", - sizeNames[vectorSize]); - - return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode); -} - -typedef struct BuildKernelInfo -{ - cl_uint offset; // the first vector size to build - cl_kernel *kernels; - cl_program *programs; - const char *nameInCode; - bool relaxedMode; -} BuildKernelInfo; - -static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, - void *p) -{ - BuildKernelInfo *info = (BuildKernelInfo *)p; - cl_uint i = info->offset + job_id; - return BuildKernelHalf(info->nameInCode, i, info->kernels + i, - info->programs + i, info->relaxedMode); -} -#else - +//////////////////////////////////////////////////////////////////////////////// static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { @@ -136,8 +36,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, return BuildKernels(info, job_id, generator); } -#endif - +//////////////////////////////////////////////////////////////////////////////// int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) { int error; @@ -335,15 +234,5 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, 
maxErrorVal); vlog("\n"); -#if 0 -exit: - // Release - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) - { - clReleaseKernel(kernels[k]); - clReleaseProgram(programs[k]); - } -#endif - return error; } From cc8db3b4098c18c04a217f5967de98020a7ff70c Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Thu, 23 Mar 2023 17:52:17 +0100 Subject: [PATCH 05/24] Corrected ULP values for half tests (issue #142, bruteforce) --- .../math_brute_force/function_list.cpp | 234 +++++++++--------- 1 file changed, 117 insertions(+), 117 deletions(-) diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp index 1dcd4d900..cb5cf1131 100644 --- a/test_conformance/math_brute_force/function_list.cpp +++ b/test_conformance/math_brute_force/function_list.cpp @@ -29,18 +29,18 @@ // Only use ulps information in spir test #ifdef FUNCTION_LIST_ULPS_ONLY -#define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ - { \ - STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ - _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ - RELAXED_OFF, _type \ +#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ + { \ + STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ + _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ + RELAXED_OFF, _type \ } -#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type, \ - _relaxed_embedded_ulp) \ - { \ - STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ - _ulp, _ulp, _ulp, _embedded_ulp, _relaxed_ulp, \ - _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type \ +#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, _type, \ + _relaxed_embedded_ulp) \ + { \ + STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ + _ulp, _ulp, _half_ulp, _embedded_ulp, _relaxed_ulp, \ + _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type \ } #define HALF_ENTRY(_name, _ulp, 
_embedded_ulp, _rmode, _type) \ { \ @@ -48,11 +48,11 @@ { NULL }, { NULL }, _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, \ INFINITY, _rmode, RELAXED_OFF, _type \ } -#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type) \ - { \ - STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \ - _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ - _type \ +#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ + { \ + STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \ + _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ + _type \ } #define unaryF NULL @@ -80,21 +80,21 @@ #else // FUNCTION_LIST_ULPS_ONLY -#define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ - { \ - STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ - { (void*)reference_##_name##l }, { (void*)reference_##_name }, \ - _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ - RELAXED_OFF, _type \ +#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ + { \ + STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ + { (void*)reference_##_name##l }, { (void*)reference_##_name }, \ + _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ + RELAXED_OFF, _type \ } -#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type, \ - _relaxed_embedded_ulp) \ - { \ - STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ - { (void*)reference_##_name##l }, \ - { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _ulp, \ - _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode, \ - RELAXED_ON, _type \ +#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, _type, \ + _relaxed_embedded_ulp) \ + { \ + STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ + { (void*)reference_##_name##l }, \ + { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _half_ulp, \ + 
_embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode, \ + RELAXED_ON, _type \ } #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ { \ @@ -103,11 +103,11 @@ _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ _type \ } -#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type) \ - { \ - STRINGIFY(_name), _operator, { (void*)reference_##_name }, \ - { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _ulp, \ - _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ +#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ + { \ + STRINGIFY(_name), _operator, { (void*)reference_##_name }, \ + { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _half_ulp, \ + _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ } static constexpr vtbl _unary = { @@ -234,24 +234,24 @@ static constexpr vtbl _mad_tbl = { #endif // FUNCTION_LIST_ULPS_ONLY const Func functionList[] = { - ENTRY_EXT(acos, 4.0f, 4.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), - ENTRY(acosh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(acospi, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY_EXT(asin, 4.0f, 4.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), - ENTRY(asinh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(asinpi, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY_EXT(atan, 5.0f, 5.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), - ENTRY(atanh, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY(atanpi, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY(atan2, 6.0f, 6.0f, FTZ_OFF, binaryF), - ENTRY(atan2pi, 6.0f, 6.0f, FTZ_OFF, binaryF), - ENTRY(cbrt, 2.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(ceil, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(copysign, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY_EXT(cos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY_EXT(acos, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), + ENTRY(acosh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(acospi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY_EXT(asin, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), + 
ENTRY(asinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(asinpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY_EXT(atan, 5.0f, 5.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), + ENTRY(atanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(atanpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(atan2, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF), + ENTRY(atan2pi, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF), + ENTRY(cbrt, 2.0f, 4.0f, 2.f, FTZ_OFF, unaryF), + ENTRY(ceil, 0.0f, 0.0f, 0.f, FTZ_OFF, unaryF), + ENTRY(copysign, 0.0f, 0.0f, 0.f, FTZ_OFF, binaryF), + ENTRY_EXT(cos, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 - ENTRY(cosh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY_EXT(cospi, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY(cosh, 4.0f, 4.0f, 2.f, FTZ_OFF, unaryF), + ENTRY_EXT(cospi, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 // ENTRY( erfc, 16.0f, // 16.0f, FTZ_OFF, unaryF), @@ -260,81 +260,81 @@ const Func functionList[] = { // 16.0f, 16.0f, FTZ_OFF, // unaryF), //disabled for 1.0 due to lack // of reference implementation - ENTRY_EXT(exp, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF, + ENTRY_EXT(exp, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF, 4.0f), // relaxed error is actually overwritten in unary.c as it // is 3+floor(fabs(2*x)) - ENTRY_EXT(exp2, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF, + ENTRY_EXT(exp2, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF, 4.0f), // relaxed error is actually overwritten in unary.c as it // is 3+floor(fabs(2*x)) - ENTRY_EXT(exp10, 3.0f, 4.0f, 8192.0f, FTZ_OFF, unaryF, + ENTRY_EXT(exp10, 3.0f, 4.0f, 2.f, 8192.0f, FTZ_OFF, unaryF, 8192.0f), // relaxed error is actually overwritten in unary.c as // it is 3+floor(fabs(2*x)) in derived mode, // in non-derived mode it uses the ulp error for half_exp10. 
- ENTRY(expm1, 3.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(fabs, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(fdim, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(floor, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(fma, 0.0f, 0.0f, FTZ_OFF, ternaryF), - ENTRY(fmax, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(fmin, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(fmod, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(fract, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), - ENTRY(frexp, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i), - ENTRY(hypot, 4.0f, 4.0f, FTZ_OFF, binaryF), - ENTRY(ilogb, 0.0f, 0.0f, FTZ_OFF, i_unaryF), - ENTRY(isequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isfinite, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isgreaterequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isinf, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isless, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(islessequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(islessgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isnan, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isnormal, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isnotequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isunordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(ldexp, 0.0f, 0.0f, FTZ_OFF, binaryF_i), - ENTRY(lgamma, INFINITY, INFINITY, FTZ_OFF, unaryF), - ENTRY(lgamma_r, INFINITY, INFINITY, FTZ_OFF, unaryF_two_results_i), - ENTRY_EXT(log, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, + ENTRY(expm1, 3.0f, 4.0f, 2.f, FTZ_OFF, unaryF), + ENTRY(fabs, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(fdim, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(floor, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(fma, 0.0f, 0.0f, 0.0f, FTZ_OFF, ternaryF), + ENTRY(fmax, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(fmin, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(fmod, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(fract, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), 
+ ENTRY(frexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i), + ENTRY(hypot, 4.0f, 4.0f, 2.0f, FTZ_OFF, binaryF), + ENTRY(ilogb, 0.0f, 0.0f, 0.0f, FTZ_OFF, i_unaryF), + ENTRY(isequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isfinite, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isgreaterequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isinf, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isless, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(islessequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(islessgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isnan, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isnormal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isnotequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isunordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(ldexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_i), + ENTRY(lgamma, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF), + ENTRY(lgamma_r, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF_two_results_i), + ENTRY_EXT(log, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, 4.76837158203125e-7f), // relaxed ulp 2^-21 - ENTRY_EXT(log2, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, + ENTRY_EXT(log2, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, 4.76837158203125e-7f), // relaxed ulp 2^-21 - ENTRY_EXT(log10, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, + ENTRY_EXT(log10, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, 4.76837158203125e-7f), // relaxed ulp 2^-21 - ENTRY(log1p, 2.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(logb, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, FTZ_OFF, mad_function, + ENTRY(log1p, 2.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(logb, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF, 
mad_function, INFINITY), // in fast-relaxed-math mode it has to be either // exactly rounded fma or exactly rounded a*b+c - ENTRY(maxmag, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(minmag, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(modf, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), - ENTRY(nan, 0.0f, 0.0f, FTZ_OFF, unaryF_u), - ENTRY(nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter), - ENTRY_EXT(pow, 16.0f, 16.0f, 8192.0f, FTZ_OFF, binaryF, + ENTRY(maxmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(minmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(modf, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), + ENTRY(nan, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_u), + ENTRY(nextafter, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter), + ENTRY_EXT(pow, 16.0f, 16.0f, 4.0f, 8192.0f, FTZ_OFF, binaryF, 8192.0f), // in derived mode the ulp error is calculated as // exp2(y*log2(x)) and in non-derived it is the same as // half_pow - ENTRY(pown, 16.0f, 16.0f, FTZ_OFF, binaryF_i), - ENTRY(powr, 16.0f, 16.0f, FTZ_OFF, binaryF), + ENTRY(pown, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i), + ENTRY(powr, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF), // ENTRY( reciprocal, 1.0f, // 1.0f, FTZ_OFF, unaryF), - ENTRY(remainder, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(remquo, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i), - ENTRY(rint, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(rootn, 16.0f, 16.0f, FTZ_OFF, binaryF_i), - ENTRY(round, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(rsqrt, 2.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(signbit, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY_EXT(sin, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY(remainder, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(remquo, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i), + ENTRY(rint, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(rootn, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i), + ENTRY(round, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(rsqrt, 2.0f, 4.0f, 1.0f, FTZ_OFF, unaryF), + ENTRY(signbit, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY_EXT(sin, 4.0f, 4.0f, 
2.0f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 - ENTRY_EXT(sincos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF_two_results, + ENTRY_EXT(sincos, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF_two_results, 0.00048828125f), // relaxed ulp 2^-11 - ENTRY(sinh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY_EXT(sinpi, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY(sinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY_EXT(sinpi, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 { "sqrt", "sqrt", @@ -365,16 +365,16 @@ const Func functionList[] = { RELAXED_OFF, unaryF }, ENTRY_EXT( - tan, 5.0f, 5.0f, 8192.0f, FTZ_OFF, unaryF, + tan, 5.0f, 5.0f, 2.0f, 8192.0f, FTZ_OFF, unaryF, 8192.0f), // in derived mode it the ulp error is calculated as sin/cos // and in non-derived mode it is the same as half_tan. - ENTRY(tanh, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY(tanpi, 6.0f, 6.0f, FTZ_OFF, unaryF), + ENTRY(tanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(tanpi, 6.0f, 6.0f, 2.0f, FTZ_OFF, unaryF), // ENTRY( tgamma, 16.0f, // 16.0f, FTZ_OFF, unaryF), // // Commented this out until we can be // sure this requirement is realistic - ENTRY(trunc, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(trunc, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryOF), HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryOF), @@ -392,8 +392,8 @@ const Func functionList[] = { HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryOF), // basic operations - OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), - OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), + OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), + OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), { "divide", "/", { (void*)reference_divide }, @@ -422,10 +422,10 @@ const Func functionList[] = { FTZ_OFF, RELAXED_OFF, binaryOperatorF }, - OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, FTZ_OFF, 
binaryOperatorF), - OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, FTZ_OFF, + OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), + OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), // A simple copy operation - OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), }; const size_t functionListCount = sizeof(functionList) / sizeof(functionList[0]); From dd42b073830b7588774d2834d611ca212972f65f Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Thu, 23 Mar 2023 18:57:04 +0100 Subject: [PATCH 06/24] Corrected presubmit check for clang format --- .../math_brute_force/function_list.cpp | 81 ++++++++++--------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp index cb5cf1131..bfe3ff8ae 100644 --- a/test_conformance/math_brute_force/function_list.cpp +++ b/test_conformance/math_brute_force/function_list.cpp @@ -29,18 +29,18 @@ // Only use ulps information in spir test #ifdef FUNCTION_LIST_ULPS_ONLY -#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ - { \ - STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ - _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ - RELAXED_OFF, _type \ +#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ + { \ + STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ + _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ + RELAXED_OFF, _type \ } -#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, _type, \ - _relaxed_embedded_ulp) \ - { \ - STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ - _ulp, _ulp, _half_ulp, _embedded_ulp, _relaxed_ulp, \ - _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type \ +#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, 
\ + _type, _relaxed_embedded_ulp) \ + { \ + STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ + _ulp, _ulp, _half_ulp, _embedded_ulp, _relaxed_ulp, \ + _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type \ } #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ { \ @@ -48,11 +48,12 @@ { NULL }, { NULL }, _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, \ INFINITY, _rmode, RELAXED_OFF, _type \ } -#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ - { \ - STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \ - _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ - _type \ +#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, \ + _rmode, _type) \ + { \ + STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \ + _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ + _type \ } #define unaryF NULL @@ -80,21 +81,21 @@ #else // FUNCTION_LIST_ULPS_ONLY -#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ - { \ - STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ - { (void*)reference_##_name##l }, { (void*)reference_##_name }, \ - _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ - RELAXED_OFF, _type \ +#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ + { \ + STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ + { (void*)reference_##_name##l }, { (void*)reference_##_name }, \ + _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ + RELAXED_OFF, _type \ } -#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, _type, \ - _relaxed_embedded_ulp) \ - { \ - STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ - { (void*)reference_##_name##l }, \ - { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _half_ulp, \ - _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode, \ - RELAXED_ON, _type \ 
+#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \ + _type, _relaxed_embedded_ulp) \ + { \ + STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ + { (void*)reference_##_name##l }, \ + { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _half_ulp, \ + _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode, \ + RELAXED_ON, _type \ } #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ { \ @@ -103,11 +104,12 @@ _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ _type \ } -#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ - { \ - STRINGIFY(_name), _operator, { (void*)reference_##_name }, \ - { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _half_ulp, \ - _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ +#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, \ + _rmode, _type) \ + { \ + STRINGIFY(_name), _operator, { (void*)reference_##_name }, \ + { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _half_ulp, \ + _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ } static constexpr vtbl _unary = { @@ -297,7 +299,8 @@ const Func functionList[] = { ENTRY(isunordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), ENTRY(ldexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_i), ENTRY(lgamma, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF), - ENTRY(lgamma_r, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF_two_results_i), + ENTRY(lgamma_r, INFINITY, INFINITY, INFINITY, FTZ_OFF, + unaryF_two_results_i), ENTRY_EXT(log, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, 4.76837158203125e-7f), // relaxed ulp 2^-21 ENTRY_EXT(log2, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, @@ -306,7 +309,8 @@ const Func functionList[] = { 4.76837158203125e-7f), // relaxed ulp 2^-21 ENTRY(log1p, 2.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), ENTRY(logb, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, 
INFINITY, FTZ_OFF, mad_function, + ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF, + mad_function, INFINITY), // in fast-relaxed-math mode it has to be either // exactly rounded fma or exactly rounded a*b+c ENTRY(maxmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), @@ -331,7 +335,8 @@ const Func functionList[] = { ENTRY(signbit, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), ENTRY_EXT(sin, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 - ENTRY_EXT(sincos, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF_two_results, + ENTRY_EXT(sincos, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, + unaryF_two_results, 0.00048828125f), // relaxed ulp 2^-11 ENTRY(sinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), ENTRY_EXT(sinpi, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF, From 04033bf8bd958c9c053e753363c4588f7dc75ccf Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Tue, 18 Apr 2023 22:10:51 +0200 Subject: [PATCH 07/24] Added support for ternary, unary_two_result and unary_two_result_i tests for cl_half (issue #142, bruteforce) --- .../math_brute_force/CMakeLists.txt | 3 + .../math_brute_force/function_list.cpp | 6 +- test_conformance/math_brute_force/main.cpp | 12 + .../math_brute_force/ternary_half.cpp | 774 ++++++++++++++++++ .../math_brute_force/test_functions.h | 9 + .../unary_two_results_half.cpp | 527 ++++++++++++ .../unary_two_results_i_half.cpp | 368 +++++++++ test_conformance/math_brute_force/utility.h | 25 + 8 files changed, 1721 insertions(+), 3 deletions(-) create mode 100644 test_conformance/math_brute_force/ternary_half.cpp create mode 100644 test_conformance/math_brute_force/unary_two_results_half.cpp create mode 100644 test_conformance/math_brute_force/unary_two_results_i_half.cpp diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt index ec5b3dae6..50f1fd00f 100644 --- a/test_conformance/math_brute_force/CMakeLists.txt +++ 
b/test_conformance/math_brute_force/CMakeLists.txt @@ -34,14 +34,17 @@ set(${MODULE_NAME}_SOURCES sleep.h ternary_double.cpp ternary_float.cpp + ternary_half.cpp test_functions.h unary_double.cpp unary_float.cpp unary_half.cpp unary_two_results_double.cpp unary_two_results_float.cpp + unary_two_results_half.cpp unary_two_results_i_double.cpp unary_two_results_i_float.cpp + unary_two_results_i_half.cpp unary_u_double.cpp unary_u_float.cpp unary_u_half.cpp diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp index bfe3ff8ae..67ed0d8ac 100644 --- a/test_conformance/math_brute_force/function_list.cpp +++ b/test_conformance/math_brute_force/function_list.cpp @@ -185,21 +185,21 @@ static constexpr vtbl _ternary = { "ternary", TestFunc_Float_Float_Float_Float, TestFunc_Double_Double_Double_Double, - NULL, + TestFunc_Half_Half_Half_Half, }; static constexpr vtbl _unary_two_results = { "unary_two_results", TestFunc_Float2_Float, TestFunc_Double2_Double, - NULL, + TestFunc_Half2_Half, }; static constexpr vtbl _unary_two_results_i = { "unary_two_results_i", TestFunc_FloatI_Float, TestFunc_DoubleI_Double, - NULL, + TestFunc_HalfI_Half, }; static constexpr vtbl _binary_two_results_i = { diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp index b31d1e5ef..e5d3545ca 100644 --- a/test_conformance/math_brute_force/main.cpp +++ b/test_conformance/math_brute_force/main.cpp @@ -108,6 +108,8 @@ cl_device_fp_config gFloatCapabilities = 0; int gWimpyReductionFactor = 32; int gVerboseBruteForce = 0; +cl_half_rounding_mode gHalfRoundingMode = CL_HALF_RTE; + static int ParseArgs(int argc, const char **argv); static void PrintUsage(void); static void PrintFunctions(void); @@ -694,6 +696,16 @@ test_status InitCL(cl_device_id device) return TEST_FAIL; } + + if ((gHalfCapabilities & CL_FP_ROUND_TO_NEAREST) != 0) + { + gHalfRoundingMode = CL_HALF_RTE; + } + else // due to above 
condition it must be RTZ + { + gHalfRoundingMode = CL_HALF_RTZ; + } + #else vlog_error("FAIL: device says it supports cl_khr_fp16 but " "CL_DEVICE_HALF_FP_CONFIG is not in the headers!\n"); diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp new file mode 100644 index 000000000..0d8bb8cf2 --- /dev/null +++ b/test_conformance/math_brute_force/ternary_half.cpp @@ -0,0 +1,774 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include + +#define CORRECTLY_ROUNDED 0 +#define FLUSHED 1 + +namespace { + +cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetTernaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// A table of more difficult cases to get right +static const cl_half specialValuesHalf[] = { + 0xffff, + 0x0000, + 0x0001, + 0x7c00 /*INFINITY*/, + 0xfc00 /*-INFINITY*/, + 0x8000 /*-0*/, + 0x7bff /*HALF_MAX*/, + 0x0400 /*HALF_MIN*/ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +} // anonymous namespace + +int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + + Programs programs; + const unsigned thread_id = 0; // Test is currently not multithreaded. 
+ KernelMatrix kernels; + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + float maxErrorVal3 = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + + constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(cl_half); + + cl_uchar overflow[half_buffer_size]; + float half_ulps = f->half_ulps; + int skipNanInf = (0 == strcmp("fma", f->nameInCode)); + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init the kernels + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode, + relaxedMode }; + if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + + for (uint64_t i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_half *hp0 = (cl_half *)gIn; + cl_half *hp1 = (cl_half *)gIn2; + cl_half *hp2 = (cl_half *)gIn3; + size_t idx = 0; + + if (i == 0) + { // test edge cases + uint32_t x, y, z; + x = y = z = 0; + for (; idx < half_buffer_size; idx++) + { + hp0[idx] = specialValuesHalf[x]; + hp1[idx] = specialValuesHalf[y]; + hp2[idx] = specialValuesHalf[z]; + + if (++x >= specialValuesHalfCount) + { + x = 0; + if (++y >= specialValuesHalfCount) + { + y = 0; + if (++z >= specialValuesHalfCount) break; + } + } + } + if (idx == half_buffer_size) + vlog_error("Test Error: not all special cases tested!\n"); + } + + auto any_value = [&d]() { + float t = (float)((double)genrand_int32(d) / (double)0xFFFFFFFF); + return HFF((1.0f - t) * CL_HALF_MIN + t * CL_HALF_MAX); + }; + + for (; idx < half_buffer_size; idx++) + { + hp0[idx] = any_value(); + hp1[idx] = any_value(); + hp2[idx] = any_value(); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, 
CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + BUFFER_SIZE, gIn3, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); + return error; + } + + // Write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xffffdead; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], + &pattern, sizeof(pattern), 0, + BUFFER_SIZE, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n", + error); + return error; + } + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_half) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 3, + sizeof(gInBuffer3), &gInBuffer3))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + 
vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) + { + vlog("clFlush failed\n"); + return error; + } + + // Calculate the correctly rounded reference result + cl_half *res = (cl_half *)gOut_Ref; + if (skipNanInf) + { + for (size_t j = 0; j < half_buffer_size; j++) + { + feclearexcept(FE_OVERFLOW); + res[j] = HFF((float)f->func.f_fma( + HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED)); + overflow[j] = + FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); + } + } + else + { + for (size_t j = 0; j < half_buffer_size; j++) + res[j] = HFF((float)f->func.f_fma( + HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED)); + } + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + BUFFER_SIZE, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data one cl_half at a time (uint32_t indexing overran) + cl_half *t = (cl_half *)gOut_Ref; + for (size_t j = 0; j < half_buffer_size; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *q = (cl_half *)(gOut[k]); + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + int fail; + cl_half test = q[j]; + float ref1 = f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), + HTF(hp2[j]), CORRECTLY_ROUNDED); + cl_half correct = HFF(ref1); + + // Per section 10 paragraph 6, accept any result if an input + // or output is a infinity or NaN or overflow + if (skipNanInf) + { + if (overflow[j] || IsHalfInfinity(correct) + || IsHalfNaN(correct) || IsHalfInfinity(hp0[j]) + || IsHalfNaN(hp0[j]) || IsHalfInfinity(hp1[j]) + || IsHalfNaN(hp1[j]) || IsHalfInfinity(hp2[j]) + || IsHalfNaN(hp2[j])) + continue; + } + + float err = + test != correct ? 
Ulp_Error_Half(test, ref1) : 0.f; + fail = !(fabsf(err) <= half_ulps); + + if (fail && (ftz || relaxedMode)) + { + // retry per section 6.5.3.2 with flushing on + if (0.0f == test + && 0.0f + == f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), + HTF(hp2[j]), FLUSHED)) + { + fail = 0; + err = 0.0f; + } + + // retry per section 6.5.3.3 + if (fail && IsHalfSubnormal(hp0[j])) + { // look at me, + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + float ref2 = + f->func.f_fma(0.0f, HTF(hp1[j]), HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct2 = HFF(ref2); + float ref3 = + f->func.f_fma(-0.0f, HTF(hp1[j]), HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct3 = HFF(ref3); + + if (skipNanInf) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3)) + continue; + } + + float err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + float err3 = test != correct3 + ? 
Ulp_Error_Half(test, ref3) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, HTF(hp1[j]), + HTF(hp2[j]), FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, HTF(hp1[j]), + HTF(hp2[j]), FLUSHED))) + { + fail = 0; + err = 0.0f; + } + + // try with first two args as zero + if (IsHalfSubnormal(hp1[j])) + { // its fun to have fun, + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + ref2 = f->func.f_fma(0.0f, 0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + correct2 = HFF(ref2); + ref3 = f->func.f_fma(-0.0f, 0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + correct3 = HFF(ref3); + float ref4 = + f->func.f_fma(0.0f, -0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct4 = HFF(ref4); + float ref5 = + f->func.f_fma(-0.0f, -0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct5 = HFF(ref5); + + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3) + || IsHalfInfinity(correct4) + || IsHalfNaN(correct4) + || IsHalfInfinity(correct5) + || IsHalfNaN(correct5)) + continue; + } + + err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + float err4 = test != correct4 + ? Ulp_Error_Half(test, ref4) + : 0.f; + float err5 = test != correct5 + ? 
Ulp_Error_Half(test, ref5) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps)) + && (!(fabsf(err4) <= half_ulps)) + && (!(fabsf(err5) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, 0.0f, + HTF(hp2[j]), + FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, 0.0f, + HTF(hp2[j]), + FLUSHED) + || 0.0f + == f->func.f_fma(0.0f, -0.0f, + HTF(hp2[j]), + FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, -0.0f, + HTF(hp2[j]), + FLUSHED))) + { + fail = 0; + err = 0.0f; + } + + if (IsHalfSubnormal(hp2[j])) + { + if (test == 0.0f) // 0*0+0 is 0 + { + fail = 0; + err = 0.0f; + } + } + } + else if (IsHalfSubnormal(hp2[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + ref2 = f->func.f_fma(0.0f, HTF(hp1[j]), 0.0f, + CORRECTLY_ROUNDED); + correct2 = HFF(ref2); + ref3 = f->func.f_fma(-0.0f, HTF(hp1[j]), 0.0f, + CORRECTLY_ROUNDED); + correct3 = HFF(ref3); + float ref4 = + f->func.f_fma(0.0f, HTF(hp1[j]), -0.0f, + CORRECTLY_ROUNDED); + cl_half correct4 = HFF(ref4); + float ref5 = + f->func.f_fma(-0.0f, HTF(hp1[j]), -0.0f, + CORRECTLY_ROUNDED); + cl_half correct5 = HFF(ref5); + + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3) + || IsHalfInfinity(correct4) + || IsHalfNaN(correct4) + || IsHalfInfinity(correct5) + || IsHalfNaN(correct5)) + continue; + } + + err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + err3 = test != correct3 + ? 
Ulp_Error_Half(test, ref3) + : 0.f; + float err4 = test != correct4 + ? Ulp_Error_Half(test, ref4) + : 0.f; + float err5 = test != correct5 + ? Ulp_Error_Half(test, ref5) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps)) + && (!(fabsf(err4) <= half_ulps)) + && (!(fabsf(err5) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, HTF(hp1[j]), + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, HTF(hp1[j]), + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(0.0f, HTF(hp1[j]), + -0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, HTF(hp1[j]), + -0.0f, FLUSHED))) + { + fail = 0; + err = 0.0f; + } + } + } + else if (fail && IsHalfSubnormal(hp1[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + float ref2 = + f->func.f_fma(HTF(hp0[j]), 0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct2 = HFF(ref2); + float ref3 = + f->func.f_fma(HTF(hp0[j]), -0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct3 = HFF(ref3); + + if (skipNanInf) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3)) + continue; + } + + float err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + float err3 = test != correct3 + ? 
Ulp_Error_Half(test, ref3) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(HTF(hp0[j]), 0.0f, + HTF(hp2[j]), FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), -0.0f, + HTF(hp2[j]), FLUSHED))) + { + fail = 0; + err = 0.0f; + } + + // try with second two args as zero + if (IsHalfSubnormal(hp2[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + ref2 = f->func.f_fma(HTF(hp0[j]), 0.0f, 0.0f, + CORRECTLY_ROUNDED); + correct2 = HFF(ref2); + ref3 = f->func.f_fma(HTF(hp0[j]), -0.0f, 0.0f, + CORRECTLY_ROUNDED); + correct3 = HFF(ref3); + float ref4 = + f->func.f_fma(HTF(hp0[j]), 0.0f, -0.0f, + CORRECTLY_ROUNDED); + cl_half correct4 = HFF(ref4); + float ref5 = + f->func.f_fma(HTF(hp0[j]), -0.0f, -0.0f, + CORRECTLY_ROUNDED); + cl_half correct5 = HFF(ref5); + + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3) + || IsHalfInfinity(correct4) + || IsHalfNaN(correct4) + || IsHalfInfinity(correct5) + || IsHalfNaN(correct5)) + continue; + } + + err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + float err4 = test != correct4 + ? Ulp_Error_Half(test, ref4) + : 0.f; + float err5 = test != correct5 + ? 
Ulp_Error_Half(test, ref5) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps)) + && (!(fabsf(err4) <= half_ulps)) + && (!(fabsf(err5) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(HTF(hp0[j]), 0.0f, + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), -0.0f, + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), 0.0f, + -0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), -0.0f, + -0.0f, FLUSHED))) + { + fail = 0; + err = 0.0f; + } + } + } + else if (fail && IsHalfSubnormal(hp2[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + float ref2 = f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), + 0.0f, CORRECTLY_ROUNDED); + cl_half correct2 = HFF(ref2); + float ref3 = + f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), -0.0f, + CORRECTLY_ROUNDED); + cl_half correct3 = HFF(ref3); + + if (skipNanInf) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3)) + continue; + } + + float err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + float err3 = test != correct3 + ? 
Ulp_Error_Half(test, ref3) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(HTF(hp0[j]), + HTF(hp1[j]), 0.0f, + FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), + HTF(hp1[j]), -0.0f, + FLUSHED))) + { + fail = 0; + err = 0.0f; + } + } + } + + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = HTF(hp0[j]); + maxErrorVal2 = HTF(hp1[j]); + maxErrorVal3 = HTF(hp2[j]); + } + + if (fail) + { + vlog_error( + "\nERROR: %s%s: %f ulp error at {%a, %a, %a} " + "({0x%4.4x, 0x%4.4x, 0x%4.4x}): *%a vs. %a\n", + f->name, sizeNames[k], err, HTF(hp0[j]), + HTF(hp1[j]), HTF(hp2[j]), hp0[j], hp1[j], hp2[j], + HTF(res[j]), HTF(test)); + return -1; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 " bufferSize:%10d \n", + i, step, BUFFER_SIZE); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + } + + vlog("\n"); + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h index 91cca1633..16f57013c 100644 --- a/test_conformance/math_brute_force/test_functions.h +++ b/test_conformance/math_brute_force/test_functions.h @@ -108,18 +108,27 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata, bool relaxedMode); int TestFunc_Double_Double_Double_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, half, half) +int TestFunc_Half_Half_Half_Half(const Func *f, MTdata, bool relaxedMode); + // float foo(float, float*) int TestFunc_Float2_Float(const Func *f, MTdata, bool 
relaxedMode); // double foo(double, double*) int TestFunc_Double2_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, half*) +int TestFunc_Half2_Half(const Func *f, MTdata, bool relaxedMode); + // float foo(float, int*) int TestFunc_FloatI_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, int*) int TestFunc_DoubleI_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, int*) +int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode); + // float foo(float, float, int*) int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode); diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp new file mode 100644 index 000000000..3f8d71168 --- /dev/null +++ b/test_conformance/math_brute_force/unary_two_results_half.cpp @@ -0,0 +1,527 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cinttypes>
+#include <cstring>
+
+namespace {
+
+/// Thread-pool job that builds the kernels for one job index.
+///
+/// @param job_id    Job index, forwarded unchanged to BuildKernels.
+/// @param thread_id Unused.
+/// @param p         Pointer to the BuildKernelInfo describing the build.
+/// @return The value returned by BuildKernels.
+cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    BuildKernelInfo &info = *(BuildKernelInfo *)p;
+    // Kernel source generator: half input, two half outputs.
+    auto generator = [](const std::string &kernel_name, const char *builtin,
+                        cl_uint vector_size_index) {
+        return GetUnaryKernel(kernel_name, builtin, ParameterType::Half,
+                              ParameterType::Half, ParameterType::Half,
+                              vector_size_index);
+    };
+    return BuildKernels(info, job_id, generator);
+}
+
+} // anonymous namespace
+
+/// Brute-force test of a half builtin that returns two half results for one
+/// half input.
+///
+/// Each pass fills gIn with a repeating half bit pattern, runs one kernel per
+/// enabled vector size, computes the host reference through f->func.f_fpf
+/// (f->rfunc.f_fpf in relaxed mode) and compares both device outputs against
+/// f->half_ulps. When ftz or relaxed mode is active the flush-to-zero retries
+/// (sections 6.5.3.2 - 6.5.3.4) are applied before a mismatch is reported.
+///
+/// @param f           Function-table entry (name, reference functions, ulps).
+/// @param d           Not used by this function.
+/// @param relaxedMode Selects f->rfunc and absolute-error checking.
+/// @return CL_SUCCESS on success, the error code of a failing build or OpenCL
+///         call, or -1 on the first result that is out of tolerance.
+int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+    Programs programs;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    KernelMatrix kernels;
+    float maxError0 = 0.0f;
+    float maxError1 = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    float maxErrorVal0 = 0.0f;
+    float maxErrorVal1 = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+
+    constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(cl_half);
+
+    // Per-element overflow flag recorded while computing the reference;
+    // only written and read when skipNanInf is set.
+    cl_uchar overflow[half_buffer_size];
+    int isFract = 0 == strcmp("fract", f->nameInCode);
+    int skipNanInf = isFract;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    float half_ulps = f->half_ulps;
+
+    // Init the kernels
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
+                                relaxedMode };
+    if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+        return error;
+
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_half *pIn = (cl_half *)gIn;
+        {
+            const unsigned m_size = 0x1ff;
+            const unsigned e_size = 0xf;
+            const unsigned s_size = 0x2;
+            const unsigned sclamp = 0xffff;
+
+            for (size_t j = 0; j < half_buffer_size; j++)
+            {
+                // Compose sign (bit 15), exponent (bits 10+) and mantissa
+                // fields from the element index; the pattern repeats every
+                // s_size * e_size * m_size elements.
+                unsigned ind = j % (s_size * e_size * m_size);
+                unsigned val = (((ind / (e_size * m_size)) << 15)
+                                | (((ind / m_size) % e_size + 1) << 10)
+                                | (ind % m_size + 1))
+                    & sclamp;
+                pIn[j] = val;
+
+                if (relaxedMode && strcmp(f->name, "sincos") == 0)
+                {
+                    float pj = HTF(pIn[j]);
+                    if (fabs(pj) > M_PI) pIn[j] = 0x7e00; // HALF_NAN
+                }
+            }
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        // Write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+
+                memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut2[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                // First output buffer.
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+
+                // Second output buffer (must be gOutBuffer2, mirroring the
+                // host-fill branch above).
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_half);
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error =
+                     clSetKernelArg(kernels[j][thread_id], 1,
+                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue)))
+        {
+            vlog_error("clFlush failed\n");
+            return error;
+        }
+
+        FPU_mode_type oldMode;
+        RoundingMode oldRoundMode = kRoundToNearestEven;
+        if (isFract)
+        {
+            // Calculate the correctly rounded reference result
+            memset(&oldMode, 0, sizeof(oldMode));
+            if (ftz || relaxedMode) ForceFTZ(&oldMode);
+
+            // Set the rounding mode to match the device
+            if (gIsInRTZMode)
+                oldRoundMode = set_round(kRoundTowardZero, kfloat);
+        }
+
+        // Calculate the correctly rounded reference result
+        cl_half *ref1 = (cl_half *)gOut_Ref;
+        cl_half *ref2 = (cl_half *)gOut_Ref2;
+
+        if (skipNanInf)
+        {
+            for (size_t j = 0; j < half_buffer_size; j++)
+            {
+                double dd;
+                feclearexcept(FE_OVERFLOW);
+
+                if (relaxedMode)
+                    ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd));
+                else
+                    ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
+
+                ref2[j] = HFF((float)dd);
+                overflow[j] =
+                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+            }
+        }
+        else
+        {
+            for (size_t j = 0; j < half_buffer_size; j++)
+            {
+                double dd;
+                if (relaxedMode)
+                    ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd));
+                else
+                    ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
+
+                ref2[j] = HFF((float)dd);
+            }
+        }
+
+        // NOTE(review): ForceFTZ above runs for (ftz || relaxedMode) but the
+        // state is only restored when ftz is set - confirm this is intended.
+        if (isFract && ftz) RestoreFPState(&oldMode);
+
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting)
+        {
+            if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
+            break;
+        }
+
+        // Verify data
+        for (size_t j = 0; j < half_buffer_size; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                cl_half *test1 = (cl_half *)gOut[k];
+                cl_half *test2 = (cl_half *)gOut2[k];
+
+                // If we aren't getting the correctly rounded result
+                if (ref1[j] != test1[j] || ref2[j] != test2[j])
+                {
+                    double fp_correct1 = 0, fp_correct2 = 0;
+                    float err = 0, err2 = 0;
+
+                    if (relaxedMode)
+                        fp_correct1 = f->rfunc.f_fpf(HTF(pIn[j]), &fp_correct2);
+                    else
+                        fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2);
+
+                    cl_half correct1 = HFF(fp_correct1);
+                    cl_half correct2 = HFF(fp_correct2);
+
+                    // Per section 10 paragraph 6, accept any result if an input
+                    // or output is a infinity or NaN or overflow
+                    if (relaxedMode || skipNanInf)
+                    {
+                        if (skipNanInf && overflow[j]) continue;
+                        // Note: no double rounding here. Reference functions
+                        // calculate in single precision.
+                        if (IsHalfInfinity(correct1) || IsHalfNaN(correct1)
+                            || IsHalfInfinity(correct2) || IsHalfNaN(correct2)
+                            || IsHalfInfinity(pIn[j]) || IsHalfNaN(pIn[j]))
+                            continue;
+                    }
+
+                    // If we are in fast relaxed math, we
+                    // have a different calculation for the
+                    // subnormal threshold.
+                    typedef int (*CheckForSubnormal)(double, float);
+                    CheckForSubnormal isFloatResultSubnormalPtr;
+                    if (relaxedMode)
+                    {
+                        err = Abs_Error(HTF(test1[j]), fp_correct1);
+                        err2 = Abs_Error(HTF(test2[j]), fp_correct2);
+                        isFloatResultSubnormalPtr =
+                            &IsFloatResultSubnormalAbsError;
+                    }
+                    else
+                    {
+                        err = Ulp_Error_Half(test1[j], fp_correct1);
+                        err2 = Ulp_Error_Half(test2[j], fp_correct2);
+                        isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
+                    }
+                    int fail =
+                        !(fabsf(err) <= half_ulps && fabsf(err2) <= half_ulps);
+
+                    if (ftz || relaxedMode)
+                    {
+                        // retry per section 6.5.3.2
+                        if ((*isFloatResultSubnormalPtr)(fp_correct1,
+                                                         half_ulps))
+                        {
+                            if ((*isFloatResultSubnormalPtr)(fp_correct2,
+                                                             half_ulps))
+                            {
+                                fail = fail
+                                    && !(HTF(test1[j]) == 0.0f
+                                         && HTF(test2[j]) == 0.0f);
+                                if (!fail)
+                                {
+                                    err = 0.0f;
+                                    err2 = 0.0f;
+                                }
+                            }
+                            else
+                            {
+                                fail = fail
+                                    && !(HTF(test1[j]) == 0.0f
+                                         && fabsf(err2) <= half_ulps);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                        else if ((*isFloatResultSubnormalPtr)(fp_correct2,
+                                                              half_ulps))
+                        {
+                            fail = fail
+                                && !(HTF(test2[j]) == 0.0f
+                                     && fabsf(err) <= half_ulps);
+                            if (!fail) err2 = 0.0f;
+                        }
+
+
+                        // retry per section 6.5.3.3
+                        if (IsHalfSubnormal(pIn[j]))
+                        {
+                            double fp_correctp, fp_correctn;
+                            double fp_correct2p, fp_correct2n;
+                            float errp, err2p, errn, err2n;
+
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
+                            if (relaxedMode)
+                            {
+                                fp_correctp =
+                                    f->rfunc.f_fpf(0.0, &fp_correct2p);
+                                fp_correctn =
+                                    f->rfunc.f_fpf(-0.0, &fp_correct2n);
+                            }
+                            else
+                            {
+                                fp_correctp = f->func.f_fpf(0.0, &fp_correct2p);
+                                fp_correctn =
+                                    f->func.f_fpf(-0.0, &fp_correct2n);
+                            }
+
+                            cl_half correctp = HFF(fp_correctp);
+                            cl_half correctn = HFF(fp_correctn);
+                            cl_half correct2p = HFF(fp_correct2p);
+                            cl_half correct2n = HFF(fp_correct2n);
+
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is a infinity or NaN or
+                            // overflow
+                            if (skipNanInf)
+                            {
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here. Reference
+                                // functions calculate in single precision.
+                                if (IsHalfInfinity(correctp)
+                                    || IsHalfNaN(correctp)
+                                    || IsHalfInfinity(correctn)
+                                    || IsHalfNaN(correctn)
+                                    || IsHalfInfinity(correct2p)
+                                    || IsHalfNaN(correct2p)
+                                    || IsHalfInfinity(correct2n)
+                                    || IsHalfNaN(correct2n))
+                                    continue;
+                            }
+
+                            // errp/errn measure the first result, err2p/err2n
+                            // the second result, against the +0 / -0
+                            // references.
+                            if (relaxedMode)
+                            {
+                                errp = Abs_Error(HTF(test1[j]), fp_correctp);
+                                err2p = Abs_Error(HTF(test2[j]), fp_correct2p);
+                                errn = Abs_Error(HTF(test1[j]), fp_correctn);
+                                err2n = Abs_Error(HTF(test2[j]), fp_correct2n);
+                            }
+                            else
+                            {
+                                errp = Ulp_Error_Half(test1[j], fp_correctp);
+                                err2p = Ulp_Error_Half(test2[j], fp_correct2p);
+                                errn = Ulp_Error_Half(test1[j], fp_correctn);
+                                err2n = Ulp_Error_Half(test2[j], fp_correct2n);
+                            }
+
+                            fail = fail
+                                && ((!(fabsf(errp) <= half_ulps))
+                                    && (!(fabsf(err2p) <= half_ulps))
+                                    && ((!(fabsf(errn) <= half_ulps))
+                                        && (!(fabsf(err2n) <= half_ulps))));
+                            if (fabsf(errp) < fabsf(err)) err = errp;
+                            if (fabsf(errn) < fabsf(err)) err = errn;
+                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
+                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
+
+                            // retry per section 6.5.3.4
+                            if ((*isFloatResultSubnormalPtr)(fp_correctp,
+                                                             half_ulps)
+                                || (*isFloatResultSubnormalPtr)(fp_correctn,
+                                                                half_ulps))
+                            {
+                                if ((*isFloatResultSubnormalPtr)(fp_correct2p,
+                                                                 half_ulps)
+                                    || (*isFloatResultSubnormalPtr)(
+                                        fp_correct2n, half_ulps))
+                                {
+                                    fail = fail
+                                        && !(HTF(test1[j]) == 0.0f
+                                             && HTF(test2[j]) == 0.0f);
+                                    if (!fail) err = err2 = 0.0f;
+                                }
+                                else
+                                {
+                                    fail = fail
+                                        && !(HTF(test1[j]) == 0.0f
+                                             && fabsf(err2) <= half_ulps);
+                                    if (!fail) err = 0.0f;
+                                }
+                            }
+                            else if ((*isFloatResultSubnormalPtr)(fp_correct2p,
+                                                                  half_ulps)
+                                     || (*isFloatResultSubnormalPtr)(
+                                         fp_correct2n, half_ulps))
+                            {
+                                fail = fail
+                                    && !(HTF(test2[j]) == 0.0f
+                                         && (fabsf(err) <= half_ulps));
+                                if (!fail) err2 = 0.0f;
+                            }
+                        }
+                    }
+                    // Track the worst error seen for each of the two results.
+                    if (fabsf(err) > maxError0)
+                    {
+                        maxError0 = fabsf(err);
+                        maxErrorVal0 = HTF(pIn[j]);
+                    }
+                    if (fabsf(err2) > maxError1)
+                    {
+                        maxError1 = fabsf(err2);
+                        maxErrorVal1 = HTF(pIn[j]);
+                    }
+                    if (fail)
+                    {
+                        vlog_error("\nERROR: %s%s: {%f, %f} ulp error at %a: "
+                                   "*{%a, %a} vs. {%a, %a}\n",
+                                   f->name, sizeNames[k], err, err2,
+                                   HTF(pIn[j]), HTF(ref1[j]), HTF(ref2[j]),
+                                   HTF(test1[j]), HTF(test2[j]));
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     " bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
+             maxErrorVal1);
+    }
+
+    vlog("\n");
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
new file mode 100644
index 000000000..241377dda
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
@@ -0,0 +1,368 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cinttypes>
+#include <climits>
+#include <cstring>
+
+namespace {
+
+/// Thread-pool job that builds the kernels for one job index.
+///
+/// @param job_id    Job index, forwarded unchanged to BuildKernels.
+/// @param thread_id Unused.
+/// @param p         Pointer to the BuildKernelInfo describing the build.
+/// @return The value returned by BuildKernels.
+cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    BuildKernelInfo &info = *(BuildKernelInfo *)p;
+    // Kernel source generator: half input, one half and one int output.
+    auto generator = [](const std::string &kernel_name, const char *builtin,
+                        cl_uint vector_size_index) {
+        return GetUnaryKernel(kernel_name, builtin, ParameterType::Half,
+                              ParameterType::Int, ParameterType::Half,
+                              vector_size_index);
+    };
+    return BuildKernels(info, job_id, generator);
+}
+
+/// Absolute value of a cl_long as a cl_ulong.
+///
+/// The negation is done in unsigned arithmetic so that it is well defined
+/// for every input, including the most negative value.
+cl_ulong abs_cl_long(cl_long i)
+{
+    const cl_ulong magnitude = (cl_ulong)i;
+    return i < 0 ? (cl_ulong)0 - magnitude : magnitude;
+}
+
+} // anonymous namespace
+
+/// Brute-force test of a half builtin that returns a half result plus an
+/// int result for one half input.
+///
+/// Each pass fills gIn with a repeating half bit pattern, runs one kernel per
+/// enabled vector size, computes the host reference through f->func.f_fpI and
+/// compares the half output against f->half_ulps and the int output against
+/// maxiError. When ftz or relaxed mode is active the flush-to-zero retries
+/// (sections 6.5.3.2 - 6.5.3.4) are applied before a mismatch is reported.
+///
+/// @param f           Function-table entry (name, reference function, ulps).
+/// @param d           Not used by this function.
+/// @param relaxedMode Passed to the kernel build and enables the retries.
+/// @return CL_SUCCESS on success, the error code of a failing build or OpenCL
+///         call, or -1 on the first result that is out of tolerance.
+int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+    Programs programs;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    KernelMatrix kernels;
+    float maxError = 0.0f;
+    int64_t maxError2 = 0;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+
+    // sizeof(cl_half) < sizeof (int32_t)
+    // to prevent overflowing gOut_Ref2 it is necessary to use
+    // bigger type as denominator for buffer size calculation
+    constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(int32_t);
+
+    cl_ulong maxiError = 0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    float half_ulps = f->half_ulps;
+
+    // An infinite ulp tolerance also lifts the limit on the integer result.
+    maxiError = half_ulps == INFINITY ? CL_ULONG_MAX : 0;
+
+    // Init the kernels
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
+                                relaxedMode };
+    if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+        return error;
+
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_half *pIn = (cl_half *)gIn;
+
+        {
+            const unsigned m_size = 0x1ff;
+            const unsigned e_size = 0xf;
+            const unsigned s_size = 0x2;
+            const unsigned sclamp = 0xffff;
+
+            for (size_t j = 0; j < half_buffer_size; j++)
+            {
+                // Compose sign (bit 15), exponent (bits 10+) and mantissa
+                // fields from the element index; the pattern repeats every
+                // s_size * e_size * m_size elements.
+                unsigned ind = j % (s_size * e_size * m_size);
+                unsigned val = (((ind / (e_size * m_size)) << 15)
+                                | (((ind / m_size) % e_size + 1) << 10)
+                                | (ind % m_size + 1))
+                    & sclamp;
+                pIn[j] = val;
+            }
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        // Write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+
+                memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut2[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            // sizeof(cl_half) < sizeof (int32_t)
+            // to prevent overflowing gOut_Ref2 it is necessary to use
+            // bigger type as denominator for buffer size calculation
+            size_t vectorSize = sizeValues[j] * sizeof(int32_t);
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error =
+                     clSetKernelArg(kernels[j][thread_id], 1,
+                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue)))
+        {
+            vlog_error("clFlush failed\n");
+            return error;
+        }
+
+        // Calculate the correctly rounded reference result
+        cl_half *ref1 = (cl_half *)gOut_Ref;
+        int32_t *ref2 = (int32_t *)gOut_Ref2;
+        for (size_t j = 0; j < half_buffer_size; j++)
+            ref1[j] = HFF((float)f->func.f_fpI(HTF(pIn[j]), ref2 + j));
+
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        for (size_t j = 0; j < half_buffer_size; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                cl_half *test1 = (cl_half *)(gOut[k]);
+                int32_t *test2 = (int32_t *)(gOut2[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (ref1[j] != test1[j] || ref2[j] != test2[j])
+                {
+                    cl_half test = ((cl_half *)test1)[j];
+                    int correct2 = INT_MIN;
+                    float fp_correct =
+                        (float)f->func.f_fpI(HTF(pIn[j]), &correct2);
+                    cl_half correct = HFF(fp_correct);
+                    float err = correct != test
+                        ? Ulp_Error_Half(test, fp_correct)
+                        : 0.f;
+                    cl_long iErr = (int64_t)test2[j] - (int64_t)correct2;
+                    int fail = !(fabsf(err) <= half_ulps
+                                 && abs_cl_long(iErr) <= maxiError);
+                    if (ftz || relaxedMode)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsFloatResultSubnormal(fp_correct, half_ulps))
+                        {
+                            // `test` holds raw half bits: convert before
+                            // comparing so that -0 is accepted as zero too.
+                            fail = fail && !(HTF(test) == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsHalfSubnormal(pIn[j]))
+                        {
+                            int correct5, correct6;
+                            double fp_correct3 = f->func.f_fpI(0.0, &correct5);
+                            double fp_correct4 = f->func.f_fpI(-0.0, &correct6);
+
+                            float err2 = Ulp_Error_Half(test, fp_correct3);
+                            float err3 = Ulp_Error_Half(test, fp_correct4);
+
+                            cl_long iErr2 =
+                                (long long)test2[j] - (long long)correct5;
+                            cl_long iErr3 =
+                                (long long)test2[j] - (long long)correct6;
+
+                            // Did +0 work?
+                            if (fabsf(err2) <= half_ulps
+                                && abs_cl_long(iErr2) <= maxiError)
+                            {
+                                err = err2;
+                                iErr = iErr2;
+                                fail = 0;
+                            }
+                            // Did -0 work?
+                            else if (fabsf(err3) <= half_ulps
+                                     && abs_cl_long(iErr3) <= maxiError)
+                            {
+                                err = err3;
+                                iErr = iErr3;
+                                fail = 0;
+                            }
+
+                            // retry per section 6.5.3.4
+                            // The subnormal check applies to the references
+                            // computed for +0 and -0 (not the int result).
+                            if (fail
+                                && (IsFloatResultSubnormal(fp_correct3,
+                                                           half_ulps)
+                                    || IsFloatResultSubnormal(fp_correct4,
+                                                              half_ulps)))
+                            {
+                                fail = fail
+                                    && !(HTF(test) == 0.0f
+                                         && (abs_cl_long(iErr2) <= maxiError
+                                             || abs_cl_long(iErr3)
+                                                 <= maxiError));
+                                if (!fail)
+                                {
+                                    err = 0.0f;
+                                    iErr = 0;
+                                }
+                            }
+                        }
+                    }
+                    // Track the worst errors; the offending input is stored
+                    // as a float value because it is printed with %a below.
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = HTF(pIn[j]);
+                    }
+                    if (llabs(iErr) > maxError2)
+                    {
+                        maxError2 = llabs(iErr);
+                        maxErrorVal2 = HTF(pIn[j]);
+                    }
+
+                    if (fail)
+                    {
+                        vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: "
+                                   "*{%a, %d} vs. {%a, %d}\n",
+                                   f->name, sizeNames[k], err, (int)iErr,
+                                   HTF(pIn[j]), HTF(ref1[j]),
+                                   ((int *)gOut_Ref2)[j], HTF(test), test2[j]);
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     " bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2,
+             maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index 2d04eb4a7..c21f88c4c 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -72,6 +72,11 @@ extern cl_device_fp_config gFloatCapabilities;
 extern cl_device_fp_config gHalfCapabilities;
 extern RoundingMode gFloatToHalfRoundingMode;
 
+extern cl_half_rounding_mode gHalfRoundingMode;
+
+#define HFF(num) cl_half_from_float(num, gHalfRoundingMode)
+#define HTF(num) cl_half_to_float(num)
+
 #define LOWER_IS_BETTER 0
 #define HIGHER_IS_BETTER 1
 
@@ -166,6 +171,26 @@ inline int IsFloatNaN(double x)
return ((u.u & 0x7fffffffU) > 0x7F800000U); } +inline bool IsHalfNaN(const cl_half v) +{ + // Extract FP16 exponent and mantissa + uint16_t h_exp = (((cl_half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F; + uint16_t h_mant = ((cl_half)v) & 0x3FF; + + // NaN test + return (h_exp == 0x1F && h_mant != 0); +} + +inline bool IsHalfInfinity(const cl_half v) +{ + // Extract FP16 exponent and mantissa + uint16_t h_exp = (((cl_half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F; + uint16_t h_mant = ((cl_half)v) & 0x3FF; + + // Inf test + return (h_exp == 0x1F && h_mant == 0); +} + cl_uint RoundUpToNextPowerOfTwo(cl_uint x); // Windows (since long double got deprecated) sets the x87 to 53-bit precision From 70cef0c88120f52a9477b20b0f91bcc9bbaebbeb Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Tue, 9 May 2023 12:50:01 +0200 Subject: [PATCH 08/24] Added missing condition due to vendor's review --- test_conformance/math_brute_force/macro_unary_half.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp index 36d3996ef..3d5938020 100644 --- a/test_conformance/math_brute_force/macro_unary_half.cpp +++ b/test_conformance/math_brute_force/macro_unary_half.cpp @@ -107,7 +107,8 @@ int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) } test_info.f = f; - test_info.ftz = f->ftz || gForceFTZ; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); test_info.tinfo.resize(test_info.threadCount); From 4e9938e2e2767555dc84b2b1189724a0eaef1653 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Wed, 10 May 2023 14:15:55 +0200 Subject: [PATCH 09/24] code format correction --- test_conformance/math_brute_force/common.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test_conformance/math_brute_force/common.cpp b/test_conformance/math_brute_force/common.cpp index 39b4c950a..cd2efc747 100644 --- 
a/test_conformance/math_brute_force/common.cpp +++ b/test_conformance/math_brute_force/common.cpp @@ -100,7 +100,6 @@ void EmitEnableExtension(std::ostringstream &kernel, if (needsFp64) kernel << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; if (needsFp16) kernel << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; - } std::string GetBuildOptions(bool relaxed_mode) From b9ae99b0a514caae20f9de1de4e821e94e299904 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Fri, 2 Jun 2023 13:32:33 +0200 Subject: [PATCH 10/24] Added check for lack of support for denormals in binary_half scenario --- .../math_brute_force/binary_half.cpp | 131 ++++++++++++------ .../math_brute_force/reference_math.cpp | 15 +- .../math_brute_force/reference_math.h | 2 +- 3 files changed, 106 insertions(+), 42 deletions(-) diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp index 1aeb36aff..bf165542b 100644 --- a/test_conformance/math_brute_force/binary_half.cpp +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -501,6 +501,13 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (IsHalfSubnormal( cl_half_from_float(correct, CL_HALF_RTE))) { + if (isNextafter) + { + correct = reference_nextafterh(s[j], s2[j], false); + err = Ulp_Error_Half(q[j], correct); + fail = !(fabsf(err) <= ulps); + } + fail = fail && (test != 0.0f); if (!fail) err = 0.0f; } @@ -510,13 +517,15 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) double correct2, correct3; float err2, err3; if (isNextafter) + { correct2 = reference_nextafterh(0.0, s2[j]); - else - correct2 = ref_func(0.0, s2[j]); - if (isNextafter) correct3 = reference_nextafterh(-0.0, s2[j]); + } else + { + correct2 = ref_func(0.0, s2[j]); correct3 = ref_func(-0.0, s2[j]); + } if (skipNanInf) { // Note: no double rounding here. 
Reference @@ -528,11 +537,14 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) continue; } - err2 = Ulp_Error_Half(q[j], correct2); - err3 = Ulp_Error_Half(q[j], correct3); - fail = fail - && ((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps))); + auto check_error = [&]() { + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + }; + check_error(); if (fabsf(err2) < fabsf(err)) err = err2; if (fabsf(err3) < fabsf(err)) err = err3; @@ -542,6 +554,15 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) || IsHalfSubnormal( cl_half_from_float(correct3, CL_HALF_RTE))) { + if (fail && isNextafter) + { + correct2 = + reference_nextafterh(0.0, s2[j], false); + correct3 = + reference_nextafterh(-0.0, s2[j], false); + check_error(); + } + fail = fail && (test != 0.0f); if (!fail) err = 0.0f; } @@ -563,21 +584,19 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) float err4, err5; if (isNextafter) + { correct2 = reference_nextafterh(0.0, 0.0); - else - correct2 = ref_func(0.0, 0.0); - if (isNextafter) correct3 = reference_nextafterh(-0.0, 0.0); - else - correct3 = ref_func(-0.0, 0.0); - if (isNextafter) correct4 = reference_nextafterh(0.0, -0.0); - else - correct4 = ref_func(0.0, -0.0); - if (isNextafter) correct5 = reference_nextafterh(-0.0, -0.0); + } else + { + correct2 = ref_func(0.0, 0.0); + correct3 = ref_func(-0.0, 0.0); + correct4 = ref_func(0.0, -0.0); correct5 = ref_func(-0.0, -0.0); + } // Per section 10 paragraph 6, accept any result if // an input or output is a infinity or NaN or @@ -596,19 +615,23 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) || IsFloatNaN(correct5)) continue; } - err2 = Ulp_Error_Half(q[j], correct2); - err3 = Ulp_Error_Half(q[j], correct3); - err4 = Ulp_Error_Half(q[j], correct4); - err5 = Ulp_Error_Half(q[j], correct5); - fail = fail - && 
((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps)) - && (!(fabsf(err4) <= ulps)) - && (!(fabsf(err5) <= ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (fabsf(err4) < fabsf(err)) err = err4; - if (fabsf(err5) < fabsf(err)) err = err5; + + auto check_error4 = [&]() { + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + err4 = Ulp_Error_Half(q[j], correct4); + err5 = Ulp_Error_Half(q[j], correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + }; + check_error4(); // retry per section 6.5.3.4 if (IsHalfSubnormal( @@ -620,6 +643,19 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) || IsHalfSubnormal( cl_half_from_float(correct5, CL_HALF_RTE))) { + if (fail && isNextafter) + { + correct2 = + reference_nextafterh(0.0, 0.0, false); + correct3 = + reference_nextafterh(-0.0, 0.0, false); + correct4 = + reference_nextafterh(0.0, -0.0, false); + correct5 = + reference_nextafterh(-0.0, -0.0, false); + check_error4(); + } + fail = fail && (test != 0.0f); if (!fail) err = 0.0f; } @@ -641,13 +677,16 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) float err2, err3; if (isNextafter) + { correct2 = reference_nextafterh(s[j], 0.0); - else - correct2 = ref_func(s[j], 0.0); - if (isNextafter) correct3 = reference_nextafterh(s[j], -0.0); + } else + { + correct2 = ref_func(s[j], 0.0); correct3 = ref_func(s[j], -0.0); + } + if (skipNanInf) { // Note: no double rounding here. 
Reference @@ -658,13 +697,16 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) continue; } - err2 = Ulp_Error_Half(q[j], correct2); - err3 = Ulp_Error_Half(q[j], correct3); - fail = fail - && ((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; + auto check_error = [&]() { + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + }; + check_error(); // retry per section 6.5.3.4 if (IsHalfSubnormal( @@ -672,6 +714,15 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) || IsHalfSubnormal( cl_half_from_float(correct3, CL_HALF_RTE))) { + if (fail && isNextafter) + { + correct2 = + reference_nextafterh(s[j], 0.0, false); + correct3 = + reference_nextafterh(s[j], -0.0, false); + check_error(); + } + fail = fail && (test != 0.0f); if (!fail) err = 0.0f; } diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp index 5ba8bfb85..7fa0c54a3 100644 --- a/test_conformance/math_brute_force/reference_math.cpp +++ b/test_conformance/math_brute_force/reference_math.cpp @@ -4708,7 +4708,7 @@ cl_half reference_nanh(cl_ushort x) return h; } -float reference_nextafterh(float xx, float yy) +float reference_nextafterh(float xx, float yy, bool allow_denorms) { cl_half tmp_a = cl_half_from_float(xx, CL_HALF_RTE); cl_half tmp_b = cl_half_from_float(yy, CL_HALF_RTE); @@ -4731,6 +4731,19 @@ float reference_nextafterh(float xx, float yy) a_h += (a_h < b_h) ? 1 : -1; a_h = (a_h < 0) ? 
(cl_short)0x8000 - a_h : a_h; + + if (!allow_denorms && IsHalfSubnormal(a_h)) + { + auto sgn = [](float val) { return (0.f < val) - (val < 0.f); }; + + bool signs = sgn(xx) == sgn(yy); + bool zeros = (fabs(yy) == 0.f) && (fabs(xx) == 0.f); + if ((fabs(yy) > fabs(xx) && signs) || (zeros && !signs)) + a_h = (a_h & 0x8000) ? 0x8400 : 0x0400; + else + a_h = 0; + } + return cl_half_to_float(a_h); } diff --git a/test_conformance/math_brute_force/reference_math.h b/test_conformance/math_brute_force/reference_math.h index b9b2e4695..175ee7312 100644 --- a/test_conformance/math_brute_force/reference_math.h +++ b/test_conformance/math_brute_force/reference_math.h @@ -162,7 +162,7 @@ long double reference_fractl(long double, long double*); long double reference_fmal(long double, long double, long double); long double reference_madl(long double, long double, long double); long double reference_nextafterl(long double, long double); -float reference_nextafterh(float, float); +float reference_nextafterh(float, float, bool allow_denormals = true); cl_half reference_nanh(cl_ushort); long double reference_recipl(long double); long double reference_rootnl(long double, int); From 5b313bd94ce316d02e9d885a5729d7c69549b84d Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Mon, 5 Jun 2023 14:13:41 +0200 Subject: [PATCH 11/24] Corrected procedure to compute nextafter cl_half for flush-to-zero mode --- test_conformance/math_brute_force/reference_math.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp index 7fa0c54a3..c31221e3a 100644 --- a/test_conformance/math_brute_force/reference_math.cpp +++ b/test_conformance/math_brute_force/reference_math.cpp @@ -4724,6 +4724,7 @@ float reference_nextafterh(float xx, float yy, bool allow_denorms) short a_h = cl_half_from_float(x, CL_HALF_RTE); short b_h = cl_half_from_float(y, CL_HALF_RTE); + short oa_h = a_h; if 
(a_h & 0x8000) a_h = 0x8000 - a_h; if (b_h & 0x8000) b_h = 0x8000 - b_h; @@ -4731,14 +4732,9 @@ float reference_nextafterh(float xx, float yy, bool allow_denorms) a_h += (a_h < b_h) ? 1 : -1; a_h = (a_h < 0) ? (cl_short)0x8000 - a_h : a_h; - if (!allow_denorms && IsHalfSubnormal(a_h)) { - auto sgn = [](float val) { return (0.f < val) - (val < 0.f); }; - - bool signs = sgn(xx) == sgn(yy); - bool zeros = (fabs(yy) == 0.f) && (fabs(xx) == 0.f); - if ((fabs(yy) > fabs(xx) && signs) || (zeros && !signs)) + if (cl_half_to_float(0x7fff & oa_h) < cl_half_to_float(0x7fff & a_h)) a_h = (a_h & 0x8000) ? 0x8400 : 0x0400; else a_h = 0; From 0c937a9050b1b80d9631b734026f87819b3f7cdd Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Mon, 5 Jun 2023 14:25:30 +0200 Subject: [PATCH 12/24] Added correction for external check of reference value for nextafter test --- .../math_brute_force/binary_half.cpp | 42 ++++++------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp index bf165542b..dffd7d095 100644 --- a/test_conformance/math_brute_force/binary_half.cpp +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -616,22 +616,19 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) continue; } - auto check_error4 = [&]() { - err2 = Ulp_Error_Half(q[j], correct2); - err3 = Ulp_Error_Half(q[j], correct3); - err4 = Ulp_Error_Half(q[j], correct4); - err5 = Ulp_Error_Half(q[j], correct5); - fail = fail - && ((!(fabsf(err2) <= ulps)) - && (!(fabsf(err3) <= ulps)) - && (!(fabsf(err4) <= ulps)) - && (!(fabsf(err5) <= ulps))); - if (fabsf(err2) < fabsf(err)) err = err2; - if (fabsf(err3) < fabsf(err)) err = err3; - if (fabsf(err4) < fabsf(err)) err = err4; - if (fabsf(err5) < fabsf(err)) err = err5; - }; - check_error4(); + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + err4 = Ulp_Error_Half(q[j], correct4); + err5 = 
Ulp_Error_Half(q[j], correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 if (IsHalfSubnormal( @@ -643,19 +640,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) || IsHalfSubnormal( cl_half_from_float(correct5, CL_HALF_RTE))) { - if (fail && isNextafter) - { - correct2 = - reference_nextafterh(0.0, 0.0, false); - correct3 = - reference_nextafterh(-0.0, 0.0, false); - correct4 = - reference_nextafterh(0.0, -0.0, false); - correct5 = - reference_nextafterh(-0.0, -0.0, false); - check_error4(); - } - fail = fail && (test != 0.0f); if (!fail) err = 0.0f; } From 867df9f12ffc2235e580ae8f31add4a7dd2237bb Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Tue, 13 Jun 2023 09:14:06 +0200 Subject: [PATCH 13/24] Added correction due to code review request --- test_conformance/math_brute_force/ternary_half.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp index 0d8bb8cf2..3739199ac 100644 --- a/test_conformance/math_brute_force/ternary_half.cpp +++ b/test_conformance/math_brute_force/ternary_half.cpp @@ -262,12 +262,12 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) if (gSkipCorrectnessTesting) break; // Verify data - uint32_t *t = (uint32_t *)gOut_Ref; + uint16_t *t = (uint16_t *)gOut_Ref; for (size_t j = 0; j < half_buffer_size; j++) { for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { - uint32_t *q = (uint32_t *)(gOut[k]); + uint16_t *q = (uint16_t *)(gOut[k]); // If we aren't getting the correctly rounded result if (t[j] != q[j]) From 015e3b6b960ef8e540d95db2e42adc1e04c0a505 Mon Sep 
17 00:00:00 2001 From: Marcin Hajder Date: Thu, 22 Jun 2023 11:42:48 +0200 Subject: [PATCH 14/24] Changed quantity of tests performed for half in unary and macro_unary procedures from basic --- test_conformance/math_brute_force/macro_unary_half.cpp | 4 +++- test_conformance/math_brute_force/unary_half.cpp | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp index 3d5938020..755b772cd 100644 --- a/test_conformance/math_brute_force/macro_unary_half.cpp +++ b/test_conformance/math_brute_force/macro_unary_half.cpp @@ -103,7 +103,9 @@ int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) } else { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + test_info.jobCount = + std::max((cl_uint)1, + (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); } test_info.f = f; diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp index f5de28d0d..5b0eab4c6 100644 --- a/test_conformance/math_brute_force/unary_half.cpp +++ b/test_conformance/math_brute_force/unary_half.cpp @@ -113,7 +113,9 @@ int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode) } else { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + test_info.jobCount = + std::max((cl_uint)1, + (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); } test_info.f = f; From 1122f310ed32ce102947e89398b75de9751b9e72 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Wed, 8 Nov 2023 23:11:43 +0100 Subject: [PATCH 15/24] Added corrections related to code review: -added binary_operator_half.cpp and binary_two_results_i_half.cpp -address sanitizer errors fixed -extending list of special half values -removed unnecessary relaxed math references in half tests -corrected conditions to verify ulp narrowing of computation results -several refactoring and cosmetics corrections --- 
.../math_brute_force/CMakeLists.txt | 2 + .../math_brute_force/binary_half.cpp | 376 +++++----- .../math_brute_force/binary_i_half.cpp | 334 +++++---- .../math_brute_force/binary_operator_half.cpp | 663 ++++++++++++++++++ .../binary_two_results_i_half.cpp | 485 +++++++++++++ .../math_brute_force/function_list.cpp | 4 +- .../math_brute_force/i_unary_half.cpp | 8 +- .../math_brute_force/macro_binary_half.cpp | 272 ++++--- .../math_brute_force/macro_unary_half.cpp | 221 +++--- .../math_brute_force/mad_half.cpp | 10 +- .../math_brute_force/ternary_half.cpp | 24 +- .../math_brute_force/test_functions.h | 6 + .../math_brute_force/unary_half.cpp | 303 ++++---- .../unary_two_results_half.cpp | 124 +--- .../unary_two_results_i_half.cpp | 29 +- .../math_brute_force/unary_u_half.cpp | 17 +- test_conformance/math_brute_force/utility.h | 6 + 17 files changed, 1962 insertions(+), 922 deletions(-) create mode 100644 test_conformance/math_brute_force/binary_operator_half.cpp create mode 100644 test_conformance/math_brute_force/binary_two_results_i_half.cpp diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt index f0fca7b4f..d53911e43 100644 --- a/test_conformance/math_brute_force/CMakeLists.txt +++ b/test_conformance/math_brute_force/CMakeLists.txt @@ -9,8 +9,10 @@ set(${MODULE_NAME}_SOURCES binary_i_half.cpp binary_operator_double.cpp binary_operator_float.cpp + binary_operator_half.cpp binary_two_results_i_double.cpp binary_two_results_i_float.cpp + binary_two_results_i_half.cpp common.cpp common.h function_list.cpp diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp index dffd7d095..4b495c953 100644 --- a/test_conformance/math_brute_force/binary_half.cpp +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -27,7 +27,6 @@ namespace { -//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint 
job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -40,7 +39,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread struct ThreadInfo { @@ -58,7 +56,6 @@ struct ThreadInfo tQueue; // per thread command queue to improve performance }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -76,7 +73,6 @@ struct TestInfoBase int isNextafter; }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -92,158 +88,24 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; -} - -//////////////////////////////////////////////////////////////////////////////// // A table of more difficult cases to get right -static const cl_half specialValuesHalf[] = { - 0xffff, - 0x0000, - 0x0001, - 0x7c00 /*INFINITY*/, - 0xfc00 /*-INFINITY*/, - 0x8000 /*-0*/, - 0x7bff /*HALF_MAX*/, - 0x0400 /*HALF_MIN*/ +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ }; -//////////////////////////////////////////////////////////////////////////////// -static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); - -//////////////////////////////////////////////////////////////////////////////// -int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int 
isNextafter, - bool relaxedMode) -{ - TestInfoBase test_info_base; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - double maxErrorVal2 = 0.0; - - logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); - // Init test_info - memset(&test_info_base, 0, sizeof(test_info_base)); - TestInfo test_info(test_info_base); - - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_half)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = f->half_ulps; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); - - test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); - test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; - test_info.isNextafter = isNextafter; - - test_info.tinfo.resize(test_info.threadCount); - - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { i * test_info.subBufferSize - * sizeof(cl_half), - test_info.subBufferSize * sizeof(cl_half) }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - 
return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gOutBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - return error; - } - test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); - } +size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); - // Init the kernels - { - BuildKernelInfo build_info = { test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode }; - error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info); - test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); - } - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - test_error(error, "ThreadPool_Do: TestHalf failed\n"); - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); - } - - vlog("\n"); - - return error; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; size_t buffer_elements = 
job->subBufferSize; @@ -254,7 +116,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) fptr func = job->f->func; int ftz = job->ftz; MTdata d = tinfo->d; - cl_uint j, k; cl_int error; const char *name = job->f->name; @@ -264,6 +125,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) cl_ushort *t; cl_half *r; std::vector s(0), s2(0); + cl_uint j = 0; RoundingMode oldRoundMode; cl_int copysign_test = 0; @@ -352,12 +214,13 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Fill the result buffer with garbage, so that old results don't carry // over - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(out[j], &pattern, buffer_size); if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) { - vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error); + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); return error; } @@ -425,24 +288,24 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) s.resize(buffer_elements); s2.resize(buffer_elements); for (j = 0; j < buffer_elements; j++) - for (j = 0; j < buffer_elements; j++) - { - s[j] = cl_half_to_float(p[j]); - s2[j] = cl_half_to_float(p2[j]); - if (isNextafter) - r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]), - CL_HALF_RTE); - else - r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE); - } + { + s[j] = cl_half_to_float(p[j]); + s2[j] = cl_half_to_float(p2[j]); + if (isNextafter) + r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]), + CL_HALF_RTE); + else + r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE); + } if (isFDim && ftz) RestoreFPState(&oldMode); // Read the data back -- no need to wait for the first N-1 buffers. This is // an in order queue. 
- for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { + cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; out[j] = (cl_ushort *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0, buffer_size, 0, NULL, NULL, &error); if (error || NULL == out[j]) { @@ -452,21 +315,11 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } } - // Wait for the last buffer - out[j] = (cl_ushort *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size, - 0, NULL, NULL, &error); - if (error || NULL == out[j]) - { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); - return error; - } - // Verify data for (j = 0; j < buffer_elements; j++) { - for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { cl_ushort *q = out[k]; @@ -498,8 +351,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fail && ftz) { // retry per section 6.5.3.2 - if (IsHalfSubnormal( - cl_half_from_float(correct, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct, ulps)) { if (isNextafter) { @@ -549,10 +401,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if (IsHalfSubnormal( - cl_half_from_float(correct2, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct3, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) { if (fail && isNextafter) { @@ -631,14 +481,10 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fabsf(err5) < fabsf(err)) err = err5; // retry per section 6.5.3.4 - if (IsHalfSubnormal( - cl_half_from_float(correct2, CL_HALF_RTE)) - || IsHalfSubnormal( - 
cl_half_from_float(correct3, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct4, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct5, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps) + || IsHalfResultSubnormal(correct4, ulps) + || IsHalfResultSubnormal(correct5, ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -693,10 +539,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) check_error(); // retry per section 6.5.3.4 - if (IsHalfSubnormal( - cl_half_from_float(correct2, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct3, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) { if (fail && isNextafter) { @@ -731,9 +575,9 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } if (fail) { - vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), " - "%a (0x%0.4x)}\nExpected: %a (half 0x%0.4x) " - "\nActual: %a (half 0x%0.4x) at index: %d\n", + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%a (0x%04x)}\nExpected: %a (half 0x%04x) " + "\nActual: %a (half 0x%04x) at index: %zu\n", name, sizeNames[k], err, s[j], p[j], s2[j], p2[j], cl_half_to_float(r[j]), r[j], test, q[j], j); @@ -778,13 +622,143 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) return error; } -//////////////////////////////////////////////////////////////////////////////// +} // anonymous namespace + +int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter, + bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + 
test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); + test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; + test_info.isNextafter = isNextafter; + + test_info.tinfo.resize(test_info.threadCount); + + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error( + "Error: Unable to create sub-buffer of gOutBuffer[%d] " + "for region {%zd, %zd}\n", + (int)j, 
region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} + int TestFunc_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) { return TestFunc_Half_Half_Half_common(f, d, 0, relaxedMode); } -//////////////////////////////////////////////////////////////////////////////// int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata d, bool relaxedMode) { return TestFunc_Half_Half_Half_common(f, d, 1, relaxedMode); diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp index 571683e5d..dcfd28551 100644 --- a/test_conformance/math_brute_force/binary_i_half.cpp +++ b/test_conformance/math_brute_force/binary_i_half.cpp @@ -24,7 +24,6 @@ namespace { 
-//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -37,7 +36,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread typedef struct ThreadInfo { @@ -54,7 +52,6 @@ typedef struct ThreadInfo tQueue; // per thread command queue to improve performance } ThreadInfo; -//////////////////////////////////////////////////////////////////////////////// struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -68,7 +65,6 @@ struct TestInfoBase int ftz; // non-zero if running in flush to zero mode }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -84,168 +80,29 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; -} - -//////////////////////////////////////////////////////////////////////////////// - // A table of more difficult cases to get right -static const cl_half specialValuesHalf[] = { - 0xffff, - 0x0000, - 0x0001, - 0x7c00 /*INFINITY*/, - 0xfc00 /*-INFINITY*/, - 0x8000 /*-0*/, - 0x7bff /*HALF_MAX*/, - 0x0400 /*HALF_MIN*/ +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ }; -static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); +size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); -static const int specialValuesInt3[] = { 0, 1, 2, 3, - 
1022, 1023, 1024, INT_MIN, - INT_MAX, -1, -2, -3, - -1022, -1023, -11024, -INT_MAX }; -static size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3); +const int specialValuesInt3[] = { 0, 1, 2, 3, 1022, 1023, + 1024, INT_MIN, INT_MAX, -1, -2, -3, + -1022, -1023, -11024, -INT_MAX }; +size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3); -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); - -int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfoBase test_info_base; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - cl_int maxErrorVal2 = 0; - - logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); - - // Init test_info - memset(&test_info_base, 0, sizeof(test_info_base)); - TestInfo test_info(test_info_base); - - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_half)); - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ulps = f->half_ulps; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); - - test_info.tinfo.resize(test_info.threadCount); - - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { i * test_info.subBufferSize - * sizeof(cl_half), - test_info.subBufferSize * sizeof(cl_half) }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - 
cl_buffer_region region2 = { i * test_info.subBufferSize - * sizeof(cl_int), - test_info.subBufferSize * sizeof(cl_int) }; - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gOutBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); - return error; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - - // Init the kernels - { - BuildKernelInfo build_info = { test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode }; - error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info); - test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); - } - - // Run the kernels - if (!gSkipCorrectnessTesting) - error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); - - - // Accumulate the arithmetic errors - for (i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; - } - } - - test_error(error, "ThreadPool_Do: TestHalf failed\n"); - - if (!gSkipCorrectnessTesting) - { - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - - vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); - } - - vlog("\n"); - - return error; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; @@ -348,7 +205,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Fill the result buffer with garbage, so that old results don't carry // over - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(out[j], &pattern, buffer_elements * sizeof(cl_half)); if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) @@ -404,7 +261,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) for (j = 0; j < buffer_elements; j++) { s[j] = cl_half_to_float(p[j]); - r[j] = 
cl_half_from_float(func.f_fi(s[j], s2[j]), CL_HALF_RTE); + r[j] = HFF(func.f_fi(s[j], s2[j])); } // Read the data back -- no need to wait for the first N-1 buffers. This is @@ -450,8 +307,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fail && ftz) { // retry per section 6.5.3.2 - if (IsHalfSubnormal( - cl_half_from_float(correct, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct, ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -473,10 +329,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if (IsHalfSubnormal( - cl_half_from_float(correct2, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct3, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -492,9 +346,9 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } if (fail) { - vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), " - "%d}\nExpected: %a (half 0x%0.4x) \nActual: %a " - "(half 0x%0.4x) at index: %d\n", + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%d}\nExpected: %a (half 0x%04x) \nActual: %a " + "(half 0x%04x) at index: %d\n", name, sizeNames[k], err, s[j], p[j], s2[j], cl_half_to_float(r[j]), r[j], test, q[j], (cl_uint)j); @@ -535,3 +389,139 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } return error; } + +} // anonymous namespace + +int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + cl_int maxErrorVal2 = 0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + 
test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + cl_buffer_region region2 = { i * test_info.subBufferSize + * sizeof(cl_int), + test_info.subBufferSize * sizeof(cl_int) }; + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + 
test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + + test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + } + + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp new file mode 100644 index 000000000..2d3196474 --- /dev/null +++ b/test_conformance/math_brute_force/binary_operator_half.cpp @@ -0,0 +1,663 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +struct ThreadInfo +{ + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + + // max error value. Init to 0. + float maxError; + // position of the max error value (param 1). Init to 0. + double maxErrorValue; + // position of the max error value (param 2). Init to 0. + double maxErrorValue2; + MTdataHolder d; + + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; + +struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + + // Programs for various vector sizes. 
+ Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector tinfo; + + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + // no special fields +}; + +// A table of more difficult cases to get right +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + float ulps = job->ulps; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_int error; + + const char *name = job->f->name; + cl_half *r = 0; + std::vector s(0), s2(0); + RoundingMode oldRoundMode; + + cl_event e[VECTOR_SIZE_COUNT]; + cl_half *out[VECTOR_SIZE_COUNT]; + + // start the map of the output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + + bool divide = strcmp(name, "divide") == 0; + + // Init input array + cl_half *p = (cl_half *)gIn + thread_id * buffer_elements; + cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements; + cl_uint idx = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesHalfCount; + int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)lastSpecialJobIndex) + { + // Insert special values + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; idx < buffer_elements; idx++) + { + p[idx] = specialValuesHalf[x]; + p2[idx] = specialValuesHalf[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesHalfCount) break; + } + + if (divide) + { + cl_half pj = p[idx] & 0x7fff; + cl_half p2j = p2[idx] & 0x7fff; + // Replace values outside [2^-7, 2^7] with QNaN + if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN + if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00; + } + } + } + + // Init any remaining values + for (; idx < buffer_elements; idx++) + { + p[idx] = (cl_half)genrand_int32(d); + p2[idx] = (cl_half)genrand_int32(d); + + if (divide) + { + cl_half pj = p[idx] & 0x7fff; + cl_half p2j = p2[idx] & 0x7fff; + // Replace values outside [2^-7, 2^7] with QNaN + if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN + if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00; + } + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! 
err: %d\n", error); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xACDCACDC; + memset_pattern4(out[j], &pattern, buffer_size); + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", + error); + return error; + } + + // Run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) + { + return CL_SUCCESS; + } + + // Calculate the correctly rounded reference result + FPU_mode_type oldMode; + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); + + // Set the rounding mode to match the device + oldRoundMode = 
kRoundToNearestEven; + if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat); + + // Calculate the correctly rounded reference result + r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + s.resize(buffer_elements); + s2.resize(buffer_elements); + + for (size_t j = 0; j < buffer_elements; j++) + { + s[j] = HTF(p[j]); + s2[j] = HTF(p2[j]); + r[j] = HFF(func.f_ff(s[j], s2[j])); + } + + if (ftz) RestoreFPState(&oldMode); + + // Read the data back -- no need to wait for the first N-1 buffers but wait + // for the last buffer. This is an in order queue. + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Verify data + + for (size_t j = 0; j < buffer_elements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *q = out[k]; + + // If we aren't getting the correctly rounded result + if (r[j] != q[j]) + { + float test = HTF(q[j]); + float correct = func.f_ff(s[j], s2[j]); + + // Per section 10 paragraph 6, accept any result if an input or + // output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference functions + // calculate in single precision. 
+ if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j]) + || IsFloatInfinity(s[j]) || IsFloatNaN(s[j])) + continue; + } + + float err = Ulp_Error_Half(q[j], correct); + + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + double correct2, correct3; + float err2, err3; + + correct2 = HTF(func.f_ff(0.0, s2[j])); + correct3 = HTF(func.f_ff(-0.0, s2[j])); + + // Per section 10 paragraph 6, accept any result if an + // input or output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsHalfSubnormal(p2[j])) + { + double correct4, correct5; + float err4, err5; + + correct2 = HTF(func.f_ff(0.0, 0.0)); + correct3 = HTF(func.f_ff(-0.0, 0.0)); + correct4 = HTF(func.f_ff(0.0, -0.0)); + correct5 = HTF(func.f_ff(-0.0, -0.0)); + + // Per section 10 paragraph 6, accept any result if + // an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. 
+ if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + err4 = Ulp_Error_Half(q[j], correct4); + err5 = Ulp_Error_Half(q[j], correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps) + || IsHalfResultSubnormal(correct4, ulps) + || IsHalfResultSubnormal(correct5, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + else if (IsHalfSubnormal(p2[j])) + { + double correct2, correct3; + float err2, err3; + + correct2 = HTF(func.f_ff(s[j], 0.0)); + correct3 = HTF(func.f_ff(s[j], -0.0)); + + // Per section 10 paragraph 6, accept any result if an + // input or output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. 
+ if (IsFloatInfinity(correct2) || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%a (0x%04x)}\nExpected: %a (half 0x%04x) " + "\nActual: %a (half 0x%04x) at index: %zu\n", + name, sizeNames[k], err, s[j], p[j], s2[j], + p2[j], HTF(r[j]), r[j], test, q[j], j); + return -1; + } + } + } + } + + if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed! 
err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return CL_SUCCESS; +} + +} // anonymous namespace + +int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + 
clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} diff --git 
a/test_conformance/math_brute_force/binary_two_results_i_half.cpp b/test_conformance/math_brute_force/binary_two_results_i_half.cpp new file mode 100644 index 000000000..3900e62d5 --- /dev/null +++ b/test_conformance/math_brute_force/binary_two_results_i_half.cpp @@ -0,0 +1,485 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include +#include + +namespace { + +cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Int, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +struct ComputeReferenceInfoF +{ + const cl_half *x; + const cl_half *y; + cl_half *r; + int32_t *i; + double (*f_ffpI)(double, double, int *); + cl_uint lim; + cl_uint count; +}; + +cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) +{ + ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo; + cl_uint lim = cri->lim; + cl_uint count = cri->count; + cl_uint off = jid * count; + const cl_half *x = cri->x + off; + const cl_half *y = cri->y + off; + cl_half *r = cri->r + off; 
+ int32_t *i = cri->i + off; + double (*f)(double, double, int *) = cri->f_ffpI; + + if (off + count > lim) count = lim - off; + + for (cl_uint j = 0; j < count; ++j) + r[j] = HFF((float)f((double)HTF(x[j]), (double)HTF(y[j]), i + j)); + + return CL_SUCCESS; +} + +} // anonymous namespace + +int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + Programs programs; + const unsigned thread_id = 0; // Test is currently not multithreaded. + KernelMatrix kernels; + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + int64_t maxError2 = 0; + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + + // use larger type of output data to prevent overflowing buffer size + constexpr size_t buffer_size = BUFFER_SIZE / sizeof(int32_t); + + cl_uint threadCount = GetThreadCount(); + + float half_ulps = f->half_ulps; + + int testingRemquo = !strcmp(f->name, "remquo"); + + // Init the kernels + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + + for (uint64_t i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_half *p = (cl_half *)gIn; + cl_half *p2 = (cl_half *)gIn2; + for (size_t j = 0; j < buffer_size; j++) + { + p[j] = (cl_half)genrand_int32(d); + p2[j] = (cl_half)genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + buffer_size * sizeof(cl_half), gIn, 0, + NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + buffer_size * sizeof(cl_half), gIn2, + 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return 
error; + } + + // Write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + + memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut2[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + return error; + } + } + else + { + if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], + &pattern, sizeof(pattern), 0, + BUFFER_SIZE, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n", + error); + return error; + } + + if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], + &pattern, sizeof(pattern), 0, + BUFFER_SIZE, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueFillBuffer 2 failed! 
err: %d\n", + error); + return error; + } + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // align working group size with the bigger output type + size_t vectorSize = sizeValues[j] * sizeof(int32_t); + size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = + clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gOutBuffer2[j]), &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 3, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + if (threadCount > 1) + { + ComputeReferenceInfoF cri; + cri.x = p; + cri.y = p2; + cri.r = (cl_half *)gOut_Ref; + cri.i = (int32_t *)gOut_Ref2; + cri.f_ffpI = f->func.f_ffpI; + cri.lim = buffer_size; + cri.count = (cri.lim + threadCount - 1) / threadCount; + ThreadPool_Do(ReferenceF, threadCount, &cri); + } + else + { + cl_half *r = (cl_half *)gOut_Ref; + int32_t *r2 = (int32_t *)gOut_Ref2; + for (size_t j = 0; j < buffer_size; j++) + r[j] = + HFF((float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), r2 + j)); + } + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = + (j + 1 < gMaxVectorSizeIndex) ? 
CL_FALSE : CL_TRUE; + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0, + BUFFER_SIZE, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0, + BUFFER_SIZE, gOut2[j], 0, NULL, NULL))) + { + vlog_error("ReadArray2 failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + cl_half *t = (cl_half *)gOut_Ref; + int32_t *t2 = (int32_t *)gOut_Ref2; + for (size_t j = 0; j < buffer_size; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *q = (cl_half *)(gOut[k]); + int32_t *q2 = (int32_t *)gOut2[k]; + + // Check for exact match to correctly rounded result + if (t[j] == q[j] && t2[j] == q2[j]) continue; + + // Check for paired NaNs + if (IsHalfNaN(t[j]) && IsHalfNaN(q[j]) && t2[j] == q2[j]) + continue; + + cl_half test = ((cl_half *)q)[j]; + int correct2 = INT_MIN; + float correct = + (float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), &correct2); + float err = Ulp_Error_Half(test, correct); + int64_t iErr; + + // in case of remquo, we only care about the sign and last + // seven bits of integer as per the spec. + if (testingRemquo) + iErr = (long long)(q2[j] & 0x0000007f) + - (long long)(correct2 & 0x0000007f); + else + iErr = (long long)q2[j] - (long long)correct2; + + // For remquo, if y = 0, x is infinite, or either is NaN + // then the standard either neglects to say what is returned + // in iptr or leaves it undefined or implementation defined. 
+ int iptrUndefined = IsHalfInfinity(p[j]) || (HTF(p2[j]) == 0.0f) + || IsHalfNaN(p2[j]) || IsHalfNaN(p[j]); + if (iptrUndefined) iErr = 0; + + int fail = !(fabsf(err) <= half_ulps && iErr == 0); + if (ftz && fail) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, half_ulps)) + { + fail = fail && !(HTF(test) == 0.0f && iErr == 0); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + int correct3i, correct4i; + float correct3 = + (float)f->func.f_ffpI(0.0, HTF(p2[j]), &correct3i); + float correct4 = + (float)f->func.f_ffpI(-0.0, HTF(p2[j]), &correct4i); + float err2 = Ulp_Error_Half(test, correct3); + float err3 = Ulp_Error_Half(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct3, half_ulps) + || IsHalfResultSubnormal(correct4, half_ulps)) + { + fail = fail + && !(HTF(test) == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsHalfSubnormal(p2[j])) + { + int correct7i, correct8i; + correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i); + correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i); + double correct7 = + f->func.f_ffpI(0.0, -0.0, &correct7i); + double correct8 = + f->func.f_ffpI(-0.0, -0.0, &correct8i); + err2 = Ulp_Error_Half(test, correct3); + err3 = Ulp_Error_Half(test, correct4); + float err4 = Ulp_Error_Half(test, correct7); + float err5 = Ulp_Error_Half(test, correct8); + iErr3 = (long long)q2[j] - (long long)correct3i; + iErr4 = (long long)q2[j] - (long long)correct4i; + int64_t iErr7 = + (long long)q2[j] 
- (long long)correct7i; + int64_t iErr8 = + (long long)q2[j] - (long long)correct8i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps + && iErr4 == 0)) + && (!(fabsf(err4) <= half_ulps + && iErr7 == 0)) + && (!(fabsf(err5) <= half_ulps + && iErr8 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + if (llabs(iErr7) < llabs(iErr)) iErr = iErr7; + if (llabs(iErr8) < llabs(iErr)) iErr = iErr8; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct3, half_ulps) + || IsHalfResultSubnormal(correct4, half_ulps) + || IsHalfResultSubnormal(correct7, half_ulps) + || IsHalfResultSubnormal(correct8, half_ulps)) + { + fail = fail + && !(HTF(test) == 0.0f + && (iErr3 == 0 || iErr4 == 0 + || iErr7 == 0 || iErr8 == 0)); + if (!fail) err = 0.0f; + } + } + } + else if (IsHalfSubnormal(p2[j])) + { + int correct3i, correct4i; + double correct3 = + f->func.f_ffpI(HTF(p[j]), 0.0, &correct3i); + double correct4 = + f->func.f_ffpI(HTF(p[j]), -0.0, &correct4i); + float err2 = Ulp_Error_Half(test, correct3); + float err3 = Ulp_Error_Half(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct3, half_ulps) + || IsHalfResultSubnormal(correct4, half_ulps)) + { + fail = fail + && !(HTF(test) == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + } + } + if 
(fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = HTF(p[j]); + } + if (llabs(iErr) > maxError2) + { + maxError2 = llabs(iErr); + maxErrorVal2 = HTF(p[j]); + } + + if (fail) + { + vlog_error("\nERROR: %s%s: {%f, %" PRId64 + "} ulp error at {%a, %a} " + "({0x%04x, 0x%04x}): *{%a, %d} ({0x%04x, " + "0x%8.8x}) vs. {%a, %d} ({0x%04x, 0x%8.8x})\n", + f->name, sizeNames[k], err, iErr, HTF(p[j]), + HTF(p2[j]), p[j], p2[j], HTF(t[j]), t2[j], t[j], + t2[j], HTF(test), q2[j], test, q2[j]); + return -1; + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2, + maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp index 67ed0d8ac..b2f3de82e 100644 --- a/test_conformance/math_brute_force/function_list.cpp +++ b/test_conformance/math_brute_force/function_list.cpp @@ -164,7 +164,7 @@ static constexpr vtbl _binary_operator = { "binaryOperator", TestFunc_Float_Float_Float_Operator, TestFunc_Double_Double_Double_Operator, - NULL, + TestFunc_Half_Half_Half_Operator, }; static constexpr vtbl _binary_i = { @@ -206,7 +206,7 @@ static constexpr vtbl _binary_two_results_i = { "binary_two_results_i", TestFunc_FloatI_Float_Float, TestFunc_DoubleI_Double_Double, - NULL, + TestFunc_HalfI_Half_Half, }; static constexpr vtbl _mad_tbl = { diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp index c78c03a49..ada2aa89a 100644 --- a/test_conformance/math_brute_force/i_unary_half.cpp +++ b/test_conformance/math_brute_force/i_unary_half.cpp @@ 
-23,7 +23,8 @@ #include #include -//////////////////////////////////////////////////////////////////////////////// +namespace { + static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { @@ -36,7 +37,8 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// +} // anonymous namespace + int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) { int error; @@ -174,7 +176,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) uint32_t err = t[j] - q[j]; if (q[j] > t[j]) err = q[j] - t[j]; - vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%0.4x): " + vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%04x): " "*%d vs. %d\n", f->name, sizeNames[k], err, s[j], p[j], t[j], q[j]); diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp index 8af034c43..6157a9ebb 100644 --- a/test_conformance/math_brute_force/macro_binary_half.cpp +++ b/test_conformance/math_brute_force/macro_binary_half.cpp @@ -21,10 +21,8 @@ #include - namespace { -//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -37,7 +35,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// struct ThreadInfo { clMemWrapper inBuf; // input buffer for the thread @@ -48,7 +45,6 @@ struct ThreadInfo tQueue; // per thread command queue to improve performance }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -61,7 +57,6 @@ 
struct TestInfoBase int ftz; // non-zero if running in flush to zero mode }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -77,139 +72,24 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; -} - -//////////////////////////////////////////////////////////////////////////////// // A table of more difficult cases to get right -static const cl_half specialValuesHalf[] = { - 0xffff, - 0x0000, - 0x0001, - 0x7c00 /*INFINITY*/, - 0xfc00 /*-INFINITY*/, - 0x8000 /*-0*/, - 0x7bff /*HALF_MAX*/, - 0x0400 /*HALF_MIN*/ +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ }; -//////////////////////////////////////////////////////////////////////////////// -static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); - -//////////////////////////////////////////////////////////////////////////////// -int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfoBase test_info_base; - cl_int error; - size_t i, j; - - logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); - - // Init test_info - memset(&test_info_base, 0, sizeof(test_info_base)); - TestInfo test_info(test_info_base); - - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_half)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - 
test_info.jobCount = 1; - } - else - { - test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); - } - - test_info.f = f; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); - - test_info.tinfo.resize(test_info.threadCount); - - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { i * test_info.subBufferSize - * sizeof(cl_half), - test_info.subBufferSize * sizeof(cl_half) }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - test_info.tinfo[i].inBuf2 = - clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gOutBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); - return error; - } - - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); - } - - // Init the kernels - { - BuildKernelInfo build_info = { test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode }; - error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info); - test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); - } - - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); - - test_error(error, "ThreadPool_Do: TestHalf failed\n"); - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - vlog("\n"); - - return error; -} +size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; @@ -310,7 +190,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Fill the result buffer with garbage, so that old results don't carry // over - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(out[j], &pattern, buffer_size); if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) @@ -370,7 +250,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) r[j] = (short)func.i_ff(s[j], s2[j]); } - // Read the data back -- no need to wait for the first N-1 buffers. This is // an in order queue. 
for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) @@ -437,8 +316,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) short err = t[j] - q[j]; if (q[j] > t[j]) err = q[j] - t[j]; vlog_error( - "\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a " - "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x (index: %d)\n", + "\nERROR: %s: %d ulp error at {%a (0x%04x), %a " + "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x (index: %d)\n", name, err, s[j], p[j], s2[j], p2[j], t[j], q[j], j); error = -1; return error; @@ -484,8 +363,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) cl_ushort err = -t[j] - q[j]; if (q[j] > -t[j]) err = q[j] + t[j]; - vlog_error("\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a " - "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x " + vlog_error("\nERROR: %s: %d ulp error at {%a (0x%04x), %a " + "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x " "(index: %d)\n", name, err, s[j], p[j], s2[j], p2[j], -t[j], q[j], j); error = -1; @@ -526,3 +405,116 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) return error; } + +} // anonymous namespace + +int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = + f->ftz || 
gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. 
(%d)\n", error); + return error; + } + + test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp index 755b772cd..ae359b3e5 100644 --- a/test_conformance/math_brute_force/macro_unary_half.cpp +++ b/test_conformance/math_brute_force/macro_unary_half.cpp @@ -23,7 +23,6 @@ namespace { -//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -35,7 +34,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread struct ThreadInfo { @@ -45,7 +43,6 @@ struct ThreadInfo tQueue; // per thread command queue to improve performance }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -57,7 +54,6 @@ struct TestInfoBase int ftz; // non-zero if running in flush to zero mode }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase 
&base): TestInfoBase(base) {} @@ -73,114 +69,7 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); - -//////////////////////////////////////////////////////////////////////////////// -int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfoBase test_info_base; - cl_int error; - size_t i, j; - - logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); - // Init test_info - memset(&test_info_base, 0, sizeof(test_info_base)); - TestInfo test_info(test_info_base); - - test_info.threadCount = GetThreadCount(); - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_half)); - - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = - std::max((cl_uint)1, - (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); - } - - test_info.f = f; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); - - test_info.tinfo.resize(test_info.threadCount); - - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { i * test_info.subBufferSize - * sizeof(cl_half), - test_info.subBufferSize * sizeof(cl_half) }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, 
CL_BUFFER_CREATE_TYPE_REGION, - ®ion, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gOutBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed. (%d)\n", error); - return error; - } - } - - // Init the kernels - { - BuildKernelInfo build_info = { test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode }; - error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info); - test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); - } - - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); - - test_error(error, "ThreadPool_Do: TestHalf failed\n"); - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - vlog("\n"); - - return error; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; @@ -246,7 +135,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Fill the result buffer with garbage, so that old results don't carry // over - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(out[j], &pattern, buffer_size); if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) @@ -353,7 +242,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) short err = t[j] - q[j]; if (q[j] > t[j]) err = q[j] - t[j]; - vlog_error("\nERROR: %s: %d ulp error at %a (0x%0.4x)\nExpected: " + vlog_error("\nERROR: %s: %d ulp error at %a 
(0x%04x)\nExpected: " "%d vs. %d\n", name, err, s[j], p[j], t[j], q[j]); error = -1; @@ -381,7 +270,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) short err = -t[j] - q[j]; if (q[j] > -t[j]) err = q[j] + t[j]; vlog_error("\nERROR: %s%s: %d ulp error at %a " - "(0x%0.4x)\nExpected: %d \nActual: %d\n", + "(0x%04x)\nExpected: %d \nActual: %d\n", name, sizeNames[k], err, s[j], p[j], -t[j], q[j]); error = -1; return error; @@ -419,3 +308,105 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) } return error; } + +} // anonymous namespace + +int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = + std::max((cl_uint)1, + (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); + } + + test_info.f = f; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer 
for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp index ef6f2b776..5cb73d4b1 100644 --- a/test_conformance/math_brute_force/mad_half.cpp +++ b/test_conformance/math_brute_force/mad_half.cpp @@ -21,7 +21,8 @@ #include -//////////////////////////////////////////////////////////////////////////////// +namespace { + cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -34,7 +35,8 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } 
-//////////////////////////////////////////////////////////////////////////////// +} // anonymous namespace + int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) { int error; @@ -42,7 +44,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) KernelMatrix kernels; const unsigned thread_id = 0; // Test is currently not multithreaded. float maxError = 0.0f; - // int ftz = f->ftz || gForceFTZ; + float maxErrorVal = 0.0f; float maxErrorVal2 = 0.0f; float maxErrorVal3 = 0.0f; @@ -96,7 +98,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) // write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp index 3739199ac..93dc612f7 100644 --- a/test_conformance/math_brute_force/ternary_half.cpp +++ b/test_conformance/math_brute_force/ternary_half.cpp @@ -41,14 +41,17 @@ cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) // A table of more difficult cases to get right static const cl_half specialValuesHalf[] = { - 0xffff, - 0x0000, - 0x0001, - 0x7c00 /*INFINITY*/, - 0xfc00 /*-INFINITY*/, - 0x8000 /*-0*/, - 0x7bff /*HALF_MAX*/, - 0x0400 /*HALF_MIN*/ + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ }; constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); @@ -78,8 +81,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); // 
Init the kernels - BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode, - relaxedMode }; + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -294,7 +296,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) test != correct ? Ulp_Error_Half(test, ref1) : 0.f; fail = !(fabsf(err) <= half_ulps); - if (fail && (ftz || relaxedMode)) + if (fail && ftz) { // retry per section 6.5.3.2 with flushing on if (0.0f == test diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h index 16f57013c..16b361d53 100644 --- a/test_conformance/math_brute_force/test_functions.h +++ b/test_conformance/math_brute_force/test_functions.h @@ -87,6 +87,9 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata, int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata, bool relaxedMode); +// half op half +int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata, bool relaxedMode); + // float foo(float, int) int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode); @@ -135,6 +138,9 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, double, int*) int TestFunc_DoubleI_Double_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, half, int*) +int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode); + // Special handling for mad. 
// float mad(float, float, float) int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode); diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp index 5b0eab4c6..f6e914c8a 100644 --- a/test_conformance/math_brute_force/unary_half.cpp +++ b/test_conformance/math_brute_force/unary_half.cpp @@ -23,7 +23,6 @@ namespace { -//////////////////////////////////////////////////////////////////////////////// cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { BuildKernelInfo &info = *(BuildKernelInfo *)p; @@ -35,7 +34,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) return BuildKernels(info, job_id, generator); } -//////////////////////////////////////////////////////////////////////////////// // Thread specific data for a worker thread typedef struct ThreadInfo { @@ -47,7 +45,6 @@ typedef struct ThreadInfo tQueue; // per thread command queue to improve performance } ThreadInfo; -//////////////////////////////////////////////////////////////////////////////// struct TestInfoBase { size_t subBufferSize; // Size of the sub-buffer in elements @@ -64,7 +61,6 @@ struct TestInfoBase float half_sin_cos_tan_limit; }; -//////////////////////////////////////////////////////////////////////////////// struct TestInfo : public TestInfoBase { TestInfo(const TestInfoBase &base): TestInfoBase(base) {} @@ -80,147 +76,7 @@ struct TestInfo : public TestInfoBase KernelMatrix k; }; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p); - -//////////////////////////////////////////////////////////////////////////////// -int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode) -{ - TestInfoBase test_info_base; - cl_int error; - size_t i, j; - float maxError = 0.0f; - double maxErrorVal = 0.0; - - logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); - - // Init 
test_info - memset(&test_info_base, 0, sizeof(test_info_base)); - TestInfo test_info(test_info_base); - - test_info.threadCount = GetThreadCount(); - - test_info.subBufferSize = BUFFER_SIZE - / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); - test_info.scale = getTestScale(sizeof(cl_half)); - test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; - if (test_info.step / test_info.subBufferSize != test_info.scale) - { - // there was overflow - test_info.jobCount = 1; - } - else - { - test_info.jobCount = - std::max((cl_uint)1, - (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); - } - - test_info.f = f; - test_info.ulps = f->half_ulps; - test_info.ftz = - f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); - - test_info.tinfo.resize(test_info.threadCount); - - for (i = 0; i < test_info.threadCount; i++) - { - cl_buffer_region region = { i * test_info.subBufferSize - * sizeof(cl_half), - test_info.subBufferSize * sizeof(cl_half) }; - test_info.tinfo[i].inBuf = - clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, - CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) - { - vlog_error("Error: Unable to create sub-buffer of gInBuffer for " - "region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) - { - test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( - gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, - &region, &error); - if (error || NULL == test_info.tinfo[i].outBuf[j]) - { - vlog_error("Error: Unable to create sub-buffer of gOutBuffer " - "for region {%zd, %zd}\n", - region.origin, region.size); - return error; - } - } - test_info.tinfo[i].tQueue = - clCreateCommandQueue(gContext, gDevice, 0, &error); - if (NULL == test_info.tinfo[i].tQueue || error) - { - vlog_error("clCreateCommandQueue failed.
(%d)\n", error); - return error; - } - } - - // Check for special cases for unary float - test_info.isRangeLimited = 0; - test_info.half_sin_cos_tan_limit = 0; - if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) - { - test_info.isRangeLimited = 1; - test_info.half_sin_cos_tan_limit = 1.0f - + test_info.ulps - * (FLT_EPSILON / 2.0f); // out of range results from finite - // inputs must be in [-1,1] - } - else if (0 == strcmp(f->name, "half_tan")) - { - test_info.isRangeLimited = 1; - test_info.half_sin_cos_tan_limit = - INFINITY; // out of range resut from finite inputs must be numeric - } - - // Init the kernels - { - BuildKernelInfo build_info = { test_info.threadCount, test_info.k, - test_info.programs, f->nameInCode }; - error = ThreadPool_Do(BuildKernel_HalfFn, - gMaxVectorSizeIndex - gMinVectorSizeIndex, - &build_info); - test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); - } - - if (!gSkipCorrectnessTesting) - { - error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); - - // Accumulate the arithmetic errors - for (i = 0; i < test_info.threadCount; i++) - { - if (test_info.tinfo[i].maxError > maxError) - { - maxError = test_info.tinfo[i].maxError; - maxErrorVal = test_info.tinfo[i].maxErrorValue; - } - } - - test_error(error, "ThreadPool_Do: TestHalf failed\n"); - - if (gWimpyMode) - vlog("Wimp pass"); - else - vlog("passed"); - } - - if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); - vlog("\n"); - - return error; -} - -//////////////////////////////////////////////////////////////////////////////// -static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; size_t buffer_elements = job->subBufferSize; @@ -288,7 +144,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Fill the result buffer with garbage, so that old results don't carry // over - 
uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(out[j], &pattern, buffer_size); if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL))) @@ -333,12 +189,11 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // Calculate the correctly rounded reference result cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements; - cl_ushort *t = (cl_ushort *)r; s.resize(buffer_elements); for (j = 0; j < buffer_elements; j++) { s[j] = (float)cl_half_to_float(p[j]); - r[j] = cl_half_from_float(func.f_f(s[j]), CL_HALF_RTE); + r[j] = HFF(func.f_f(s[j])); } // Read the data back -- no need to wait for the first N-1 buffers. This is @@ -373,7 +228,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) cl_ushort *q = out[k]; // If we aren't getting the correctly rounded result - if (t[j] != q[j]) + if (r[j] != q[j]) { float test = cl_half_to_float(q[j]); double correct = func.f_f(s[j]); @@ -397,8 +252,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (ftz) { // retry per section 6.5.3.2 - if (IsHalfSubnormal( - cl_half_from_float(correct, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct, ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -418,10 +272,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fabsf(err3) < fabsf(err)) err = err3; // retry per section 6.5.3.4 - if (IsHalfSubnormal( - cl_half_from_float(correct2, CL_HALF_RTE)) - || IsHalfSubnormal( - cl_half_from_float(correct3, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -437,10 +289,10 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) if (fail) { vlog_error("\nERROR: %s%s: %f ulp error at %a " - "(0x%0.4x)\nExpected: %a (half 0x%0.4x) " - "\nActual: %a (half 0x%0.4x)\n", + "(half 0x%04x)\nExpected: %a 
(half 0x%04x) " + "\nActual: %a (half 0x%04x)\n", job->f->name, sizeNames[k], err, s[j], p[j], - cl_half_to_float(r[j]), t[j], test, q[j]); + cl_half_to_float(r[j]), r[j], test, q[j]); error = -1; return error; } @@ -480,3 +332,138 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) return error; } + +} // anonymous namespace + +int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = + std::max((cl_uint)1, + (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, &region, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { +
test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + &region, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + } + + // Check for special cases for unary float + test_info.isRangeLimited = 0; + test_info.half_sin_cos_tan_limit = 0; + if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = 1.0f + + test_info.ulps + * (FLT_EPSILON / 2.0f); // out of range results from finite + // inputs must be in [-1,1] + } + else if (0 == strcmp(f->name, "half_tan")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = + INFINITY; // out of range resut from finite inputs must be numeric + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError,
maxErrorVal); + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp index 3f8d71168..18d4dadd0 100644 --- a/test_conformance/math_brute_force/unary_two_results_half.cpp +++ b/test_conformance/math_brute_force/unary_two_results_half.cpp @@ -62,8 +62,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) float half_ulps = f->half_ulps; // Init the kernels - BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode, - relaxedMode }; + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -77,22 +76,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) const unsigned m_size = 0x1ff; const unsigned e_size = 0xf; const unsigned s_size = 0x2; - const unsigned sclamp = 0xffff; for (size_t j = 0; j < half_buffer_size; j++) { unsigned ind = j % (s_size * e_size * m_size); unsigned val = (((ind / (e_size * m_size)) << 15) | (((ind / m_size) % e_size + 1) << 10) - | (ind % m_size + 1)) - & sclamp; + | (ind % m_size + 1)); pIn[j] = val; - - if (relaxedMode && strcmp(f->name, "sincos") == 0) - { - float pj = HTF(pIn[j]); - if (fabs(pj) > M_PI) pIn[j] = 0x7e00; // HALF_NAN - } } } @@ -106,7 +97,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) // Write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint32_t pattern = 0xffffdead; + uint32_t pattern = 0xacdcacdc; if (gHostFill) { memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); @@ -200,7 +191,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) { // Calculate the correctly rounded reference result memset(&oldMode, 0, sizeof(oldMode)); - if (ftz || relaxedMode) ForceFTZ(&oldMode); + if (ftz) ForceFTZ(&oldMode); // Set the rounding mode to match the device if 
(gIsInRTZMode) @@ -218,11 +209,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) double dd; feclearexcept(FE_OVERFLOW); - if (relaxedMode) - ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd)); - else - ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); - + ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); ref2[j] = HFF((float)dd); overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); @@ -233,11 +220,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) for (size_t j = 0; j < half_buffer_size; j++) { double dd; - if (relaxedMode) - ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd)); - else - ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); - + ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); ref2[j] = HFF((float)dd); } } @@ -283,17 +266,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) double fp_correct1 = 0, fp_correct2 = 0; float err = 0, err2 = 0; - if (relaxedMode) - fp_correct1 = f->rfunc.f_fpf(HTF(pIn[j]), &fp_correct2); - else - fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2); + fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2); cl_half correct1 = HFF(fp_correct1); cl_half correct2 = HFF(fp_correct2); // Per section 10 paragraph 6, accept any result if an input // or output is a infinity or NaN or overflow - if (relaxedMode || skipNanInf) + if (skipNanInf) { if (skipNanInf && overflow[j]) continue; // Note: no double rounding here. Reference functions @@ -304,35 +284,18 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) continue; } - // If we are in fast relaxed math, we - // have a different calculation for the - // subnormal threshold. 
- typedef int (*CheckForSubnormal)(double, float); - CheckForSubnormal isFloatResultSubnormalPtr; - if (relaxedMode) - { - err = Abs_Error(HTF(test1[j]), fp_correct1); - err2 = Abs_Error(HTF(test2[j]), fp_correct2); - isFloatResultSubnormalPtr = - &IsFloatResultSubnormalAbsError; - } - else - { - err = Ulp_Error_Half(test1[j], fp_correct1); - err2 = Ulp_Error_Half(test2[j], fp_correct2); - isFloatResultSubnormalPtr = &IsFloatResultSubnormal; - } + err = Ulp_Error_Half(test1[j], fp_correct1); + err2 = Ulp_Error_Half(test2[j], fp_correct2); + int fail = !(fabsf(err) <= half_ulps && fabsf(err2) <= half_ulps); - if (ftz || relaxedMode) + if (ftz) { // retry per section 6.5.3.2 - if ((*isFloatResultSubnormalPtr)(fp_correct1, - half_ulps)) + if (IsHalfResultSubnormal(fp_correct1, half_ulps)) { - if ((*isFloatResultSubnormalPtr)(fp_correct2, - half_ulps)) + if (IsHalfResultSubnormal(fp_correct2, half_ulps)) { fail = fail && !(HTF(test1[j]) == 0.0f @@ -351,8 +314,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) if (!fail) err = 0.0f; } } - else if ((*isFloatResultSubnormalPtr)(fp_correct2, - half_ulps)) + else if (IsHalfResultSubnormal(fp_correct2, half_ulps)) { fail = fail && !(HTF(test2[j]) == 0.0f @@ -369,19 +331,8 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) float errp, err2p, errn, err2n; if (skipNanInf) feclearexcept(FE_OVERFLOW); - if (relaxedMode) - { - fp_correctp = - f->rfunc.f_fpf(0.0, &fp_correct2p); - fp_correctn = - f->rfunc.f_fpf(-0.0, &fp_correct2n); - } - else - { - fp_correctp = f->func.f_fpf(0.0, &fp_correct2p); - fp_correctn = - f->func.f_fpf(-0.0, &fp_correct2n); - } + fp_correctp = f->func.f_fpf(0.0, &fp_correct2p); + fp_correctn = f->func.f_fpf(-0.0, &fp_correct2n); cl_half correctp = HFF(fp_correctp); cl_half correctn = HFF(fp_correctn); @@ -408,20 +359,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) continue; } - if (relaxedMode) - { - errp = Abs_Error(HTF(test1[j]), 
fp_correctp); - err2p = Abs_Error(HTF(test1[j]), fp_correct2p); - errn = Abs_Error(HTF(test1[j]), fp_correctn); - err2n = Abs_Error(HTF(test1[j]), fp_correct2n); - } - else - { - errp = Ulp_Error_Half(test1[j], fp_correctp); - err2p = Ulp_Error_Half(test1[j], fp_correct2p); - errn = Ulp_Error_Half(test1[j], fp_correctn); - err2n = Ulp_Error_Half(test1[j], fp_correct2n); - } + errp = Ulp_Error_Half(test1[j], fp_correctp); + err2p = Ulp_Error_Half(test1[j], fp_correct2p); + errn = Ulp_Error_Half(test1[j], fp_correctn); + err2n = Ulp_Error_Half(test1[j], fp_correct2n); fail = fail && ((!(fabsf(errp) <= half_ulps)) @@ -434,15 +375,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) if (fabsf(err2n) < fabsf(err2)) err2 = err2n; // retry per section 6.5.3.4 - if ((*isFloatResultSubnormalPtr)(fp_correctp, - half_ulps) - || (*isFloatResultSubnormalPtr)(fp_correctn, - half_ulps)) + if (IsHalfResultSubnormal(fp_correctp, half_ulps) + || IsHalfResultSubnormal(fp_correctn, + half_ulps)) { - if ((*isFloatResultSubnormalPtr)(fp_correct2p, - half_ulps) - || (*isFloatResultSubnormalPtr)( - fp_correct2n, half_ulps)) + if (IsHalfResultSubnormal(fp_correct2p, + half_ulps) + || IsHalfResultSubnormal(fp_correct2n, + half_ulps)) { fail = fail && !(HTF(test1[j]) == 0.0f @@ -457,10 +397,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) if (!fail) err = 0.0f; } } - else if ((*isFloatResultSubnormalPtr)(fp_correct2p, - half_ulps) - || (*isFloatResultSubnormalPtr)( - fp_correct2n, half_ulps)) + else if (IsHalfResultSubnormal(fp_correct2p, + half_ulps) + || IsHalfResultSubnormal(fp_correct2n, + half_ulps)) { fail = fail && !(HTF(test2[j]) == 0.0f diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp index 241377dda..9a769447f 100644 --- a/test_conformance/math_brute_force/unary_two_results_i_half.cpp +++ 
b/test_conformance/math_brute_force/unary_two_results_i_half.cpp @@ -72,8 +72,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) maxiError = half_ulps == INFINITY ? CL_ULONG_MAX : 0; // Init the kernels - BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode, - relaxedMode }; + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, gMaxVectorSizeIndex - gMinVectorSizeIndex, &build_info))) @@ -88,15 +87,13 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) const unsigned m_size = 0x1ff; const unsigned e_size = 0xf; const unsigned s_size = 0x2; - const unsigned sclamp = 0xffff; for (size_t j = 0; j < half_buffer_size; j++) { unsigned ind = j % (s_size * e_size * m_size); unsigned val = (((ind / (e_size * m_size)) << 15) | (((ind / m_size) % e_size + 1) << 10) - | (ind % m_size + 1)) - & sclamp; + | (ind % m_size + 1)); pIn[j] = val; } } @@ -111,7 +108,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) // Write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint32_t pattern = 0xffffdead; + uint32_t pattern = 0xacdcacdc; if (gHostFill) { memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); @@ -161,9 +158,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) // Run the kernels for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - // sizeof(cl_half) < sizeof (int32_t) - // to prevent overflowing gOut_Ref2 it is necessary to use - // bigger type as denominator for buffer size calculation + // align working group size with the bigger output type size_t vectorSize = sizeValues[j] * sizeof(int32_t); size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; if ((error = clSetKernelArg(kernels[j][thread_id], 0, @@ -211,15 +206,17 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) // Read the data back for (auto j = gMinVectorSizeIndex; j < 
gMaxVectorSizeIndex; j++) { + cl_bool blocking = + (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0, BUFFER_SIZE, gOut[j], 0, NULL, NULL))) { vlog_error("ReadArray failed %d\n", error); return error; } if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0, BUFFER_SIZE, gOut2[j], 0, NULL, NULL))) { vlog_error("ReadArray2 failed %d\n", error); @@ -251,10 +248,10 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) cl_long iErr = (int64_t)test2[j] - (int64_t)correct2; int fail = !(fabsf(err) <= half_ulps && abs_cl_long(iErr) <= maxiError); - if (ftz || relaxedMode) + if (ftz) { // retry per section 6.5.3.2 - if (IsFloatResultSubnormal(fp_correct, half_ulps)) + if (IsHalfResultSubnormal(fp_correct, half_ulps)) { fail = fail && !(test == 0.0f && iErr == 0); if (!fail) err = 0.0f; @@ -294,9 +291,9 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) // retry per section 6.5.3.4 if (fail - && (IsFloatResultSubnormal(correct2, half_ulps) - || IsFloatResultSubnormal(fp_correct3, - half_ulps))) + && (IsHalfResultSubnormal(correct2, half_ulps) + || IsHalfResultSubnormal(fp_correct3, + half_ulps))) { fail = fail && !(test == 0.0f diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp index 842e85a9b..e2ff93705 100644 --- a/test_conformance/math_brute_force/unary_u_half.cpp +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -23,7 +23,8 @@ #include #include -//////////////////////////////////////////////////////////////////////////////// +namespace { + static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) { @@ -36,7 +37,8 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, return BuildKernels(info, job_id, generator); } 
-//////////////////////////////////////////////////////////////////////////////// +} // anonymous namespace + int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) { int error; @@ -90,7 +92,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) // write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint16_t pattern = 0xdead; + uint32_t pattern = 0xACDCACDC; memset_pattern4(gOut[j], &pattern, bufferSize); if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, @@ -139,7 +141,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) if (!strcmp(name, "nan")) r[j] = reference_nanh(p[j]); else - r[j] = cl_half_from_float(f->func.f_u(p[j]), CL_HALF_RTE); + r[j] = HFF(f->func.f_u(p[j])); } // Read the data back for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) @@ -181,8 +183,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) if (ftz) { // retry per section 6.5.3.2 - if (IsHalfSubnormal( - cl_half_from_float(correct, CL_HALF_RTE))) + if (IsHalfResultSubnormal(correct, half_ulps)) { fail = fail && (test != 0.0f); if (!fail) err = 0.0f; @@ -197,8 +198,8 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) if (fail) { vlog_error( - "\n%s%s: %f ulp error at 0x%0.4x \nExpected: %a " - "(0x%0.4x) \nActual: %a (0x%0.4x)\n", + "\n%s%s: %f ulp error at 0x%04x \nExpected: %a " + "(0x%04x) \nActual: %a (0x%04x)\n", f->name, sizeNames[k], err, p[j], cl_half_to_float(r[j]), r[j], test, q[j]); return -1; diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h index d11ce6f36..264fc7a43 100644 --- a/test_conformance/math_brute_force/utility.h +++ b/test_conformance/math_brute_force/utility.h @@ -126,6 +126,12 @@ inline int IsFloatResultSubnormal(double x, float ulps) return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126); } +inline int IsHalfResultSubnormal(float x, float ulps) +{ + 
x = fabs(x) - MAKE_HEX_FLOAT(0x1.0p-24, 0x1, -24) * ulps; + return x < MAKE_HEX_FLOAT(0x1.0p-14, 0x1, -14); +} + inline int IsFloatResultSubnormalAbsError(double x, float abs_err) { x = x - abs_err; From 9133686b4a30e0ab64c01cd43b2a5438b1bbc5f9 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Wed, 8 Nov 2023 23:29:17 +0100 Subject: [PATCH 16/24] Print format correction due to failed CI check --- test_conformance/math_brute_force/binary_half.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp index 4b495c953..fdf54268d 100644 --- a/test_conformance/math_brute_force/binary_half.cpp +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -577,7 +577,7 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " "%a (0x%04x)}\nExpected: %a (half 0x%04x) " - "\nActual: %a (half 0x%04x) at index: %zu\n", + "\nActual: %a (half 0x%04x) at index: %u\n", name, sizeNames[k], err, s[j], p[j], s2[j], p2[j], cl_half_to_float(r[j]), r[j], test, q[j], j); From 11e45a793f70ae57211a2887c1eadc77a8cc13f4 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Thu, 9 Nov 2023 15:16:04 +0100 Subject: [PATCH 17/24] Corrected bug found in code review (fp16 bruteforce) --- test_conformance/math_brute_force/binary_half.cpp | 2 +- test_conformance/math_brute_force/binary_i_half.cpp | 2 +- test_conformance/math_brute_force/macro_binary_half.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp index fdf54268d..3ddc64b89 100644 --- a/test_conformance/math_brute_force/binary_half.cpp +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -683,7 +683,7 @@ int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter, test_info.tinfo[i].inBuf2 = clCreateSubBuffer(gInBuffer2, 
CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) + if (error || NULL == test_info.tinfo[i].inBuf2) { vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " "region {%zd, %zd}\n", diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp index dcfd28551..97692c142 100644 --- a/test_conformance/math_brute_force/binary_i_half.cpp +++ b/test_conformance/math_brute_force/binary_i_half.cpp @@ -450,7 +450,7 @@ int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode) test_info.tinfo[i].inBuf2 = clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region2, &error); - if (error || NULL == test_info.tinfo[i].inBuf) + if (error || NULL == test_info.tinfo[i].inBuf2) { vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " "region {%zd, %zd}\n", diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp index 6157a9ebb..842ef61f8 100644 --- a/test_conformance/math_brute_force/macro_binary_half.cpp +++ b/test_conformance/math_brute_force/macro_binary_half.cpp @@ -460,7 +460,7 @@ int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) test_info.tinfo[i].inBuf2 = clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &error); - if (error || NULL == test_info.tinfo[i].inBuf) + if (error || NULL == test_info.tinfo[i].inBuf2) { vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " "region {%zd, %zd}\n", From b5ed4f081bd09d8e563fde30793fb145f85e42f3 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Fri, 17 Nov 2023 08:36:55 +0100 Subject: [PATCH 18/24] Corrections related to code review (cl_khr_fp16 support according to #142) -gHostFill missing support added -special half values array extended -cosmetics and unifying --- .../math_brute_force/binary_half.cpp | 75 +++++++++++------- 
.../math_brute_force/binary_i_half.cpp | 79 ++++++++++++------- .../math_brute_force/binary_operator_half.cpp | 73 ++++++++++------- .../binary_two_results_i_half.cpp | 26 +++--- .../math_brute_force/i_unary_half.cpp | 32 +++++--- .../math_brute_force/macro_binary_half.cpp | 78 +++++++++++------- .../math_brute_force/macro_unary_half.cpp | 67 ++++++++++------ .../math_brute_force/mad_half.cpp | 27 +++++-- .../math_brute_force/ternary_half.cpp | 35 ++++---- .../math_brute_force/unary_half.cpp | 68 +++++++++------- .../unary_two_results_half.cpp | 20 ++--- .../unary_two_results_i_half.cpp | 26 +++--- .../math_brute_force/unary_u_half.cpp | 29 ++++--- 13 files changed, 375 insertions(+), 260 deletions(-) diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp index 3ddc64b89..f80a08537 100644 --- a/test_conformance/math_brute_force/binary_half.cpp +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -101,9 +101,14 @@ const cl_half specialValuesHalf[] = { 0x3555, /*nearest value to 1/3*/ 0x3bff, /*largest number less than one*/ 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ }; -size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { @@ -133,21 +138,26 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // start the map of the output arrays cl_event e[VECTOR_SIZE_COUNT]; cl_ushort *out[VECTOR_SIZE_COUNT]; - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + + if (gHostFill) { - out[j] = (cl_ushort *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_size, 0, NULL, e + j, &error); - if (error || NULL == out[j]) + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < 
gMaxVectorSizeIndex; j++) { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - return error; + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } } - } - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } // Init input array cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; @@ -200,28 +210,37 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - return error; - } - if ((error = clReleaseEvent(e[j]))) + if (gHostFill) { - vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - return error; + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } } // Fill the result buffer with garbage, so that old results don't carry // over - uint32_t pattern = 0xACDCACDC; - memset_pattern4(out[j], &pattern, buffer_size); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) + uint32_t pattern = 0xacdcacdc; + if (gHostFill) { - vlog_error("Error: clEnqueueUnmapMemObject failed! 
err: %d\n", - error); - return error; + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); } // run the kernel diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp index 97692c142..001e2b4f5 100644 --- a/test_conformance/math_brute_force/binary_i_half.cpp +++ b/test_conformance/math_brute_force/binary_i_half.cpp @@ -47,7 +47,7 @@ typedef struct ThreadInfo maxErrorValue; // position of the max error value (param 1). Init to 0. cl_int maxErrorValue2; // position of the max error value (param 2). Init // to 0. - MTdata d; + MTdataHolder d; clCommandQueueWrapper tQueue; // per thread command queue to improve performance } ThreadInfo; @@ -93,9 +93,14 @@ const cl_half specialValuesHalf[] = { 0x3555, /*nearest value to 1/3*/ 0x3bff, /*largest number less than one*/ 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ }; -size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); const int specialValuesInt3[] = { 0, 1, 2, 3, 1022, 1023, 1024, INT_MIN, INT_MAX, -1, -2, -3, @@ -123,21 +128,26 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // start the map of the output arrays cl_event e[VECTOR_SIZE_COUNT]; cl_ushort *out[VECTOR_SIZE_COUNT]; - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + + if (gHostFill) { - out[j] = (cl_ushort *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_elements * sizeof(cl_ushort), 0, NULL, e 
+ j, &error); - if (error || NULL == out[j]) + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - return error; + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_elements * sizeof(cl_ushort), 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } } - } - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } // Init input array cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; @@ -191,27 +201,38 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - return error; - } - if ((error = clReleaseEvent(e[j]))) + if (gHostFill) { - vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - return error; + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } } // Fill the result buffer with garbage, so that old results don't carry // over - uint32_t pattern = 0xACDCACDC; - memset_pattern4(out[j], &pattern, buffer_elements * sizeof(cl_half)); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) + uint32_t pattern = 0xacdcacdc; + if (gHostFill) { - vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); - return error; + memset_pattern4(out[j], &pattern, + buffer_elements * sizeof(cl_half)); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer( + tinfo->tQueue, tinfo->outBuf[j], &pattern, sizeof(pattern), 0, + buffer_elements * sizeof(cl_half), 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); } // run the kernel @@ -479,7 +500,7 @@ int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode) return error; } - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); } diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp index 2d3196474..e7f53af87 100644 --- a/test_conformance/math_brute_force/binary_operator_half.cpp +++ b/test_conformance/math_brute_force/binary_operator_half.cpp @@ -93,6 +93,11 @@ const cl_half specialValuesHalf[] = { 0x3555, /*nearest value to 1/3*/ 0x3bff, /*largest number less than one*/ 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ }; constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); @@ -118,22 +123,25 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) cl_event e[VECTOR_SIZE_COUNT]; cl_half *out[VECTOR_SIZE_COUNT]; - // start the map of the output arrays - for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + if (gHostFill) { - out[j] = (cl_ushort *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_size, 0, NULL, e + j, &error); - if (error || NULL == out[j]) + // start the map of the output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, - error); - return error; + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } } - } - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } bool divide = strcmp(name, "divide") == 0; @@ -207,28 +215,37 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - return error; - } - if ((error = clReleaseEvent(e[j]))) + if (gHostFill) { - vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - return error; + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } } // Fill the result buffer with garbage, so that old results don't carry // over - uint32_t pattern = 0xACDCACDC; - memset_pattern4(out[j], &pattern, buffer_size); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) + uint32_t pattern = 0xacdcacdc; + if (gHostFill) { - vlog_error("Error: clEnqueueUnmapMemObject failed! 
err: %d\n", - error); - return error; + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); } // Run the kernel diff --git a/test_conformance/math_brute_force/binary_two_results_i_half.cpp b/test_conformance/math_brute_force/binary_two_results_i_half.cpp index 3900e62d5..bc2519e95 100644 --- a/test_conformance/math_brute_force/binary_two_results_i_half.cpp +++ b/test_conformance/math_brute_force/binary_two_results_i_half.cpp @@ -159,23 +159,15 @@ int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode) } else { - if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], - &pattern, sizeof(pattern), 0, - BUFFER_SIZE, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n", - error); - return error; - } - - if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], - &pattern, sizeof(pattern), 0, - BUFFER_SIZE, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueFillBuffer 2 failed! 
err: %d\n", - error); - return error; - } + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 1 failed!\n"); + + error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 2 failed!\n"); } } diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp index ada2aa89a..d51f5ddb3 100644 --- a/test_conformance/math_brute_force/i_unary_half.cpp +++ b/test_conformance/math_brute_force/i_unary_half.cpp @@ -48,8 +48,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ; size_t bufferSize = BUFFER_SIZE; uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); - uint64_t bufferElements = bufferSize / sizeof(cl_int); - std::vector s(0); + size_t bufferElements = bufferSize / sizeof(cl_int); int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1); @@ -69,7 +68,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) &build_info))) return error; } - s.resize(bufferElements); + std::vector s(bufferElements); for (uint64_t i = 0; i < (1ULL << 16); i += step) { @@ -94,15 +93,26 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) // write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint32_t pattern = 0xffffdead; - memset_pattern4(gOut[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) + uint32_t pattern = 0xacdcacdc; + if (gHostFill) { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", - error, j); - return error; + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, bufferSize, + 
gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSize, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); } } diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp index 842ef61f8..bcda06e4e 100644 --- a/test_conformance/math_brute_force/macro_binary_half.cpp +++ b/test_conformance/math_brute_force/macro_binary_half.cpp @@ -40,7 +40,7 @@ struct ThreadInfo clMemWrapper inBuf; // input buffer for the thread clMemWrapper inBuf2; // input buffer for the thread clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread - MTdata d; + MTdataHolder d; clCommandQueueWrapper tQueue; // per thread command queue to improve performance }; @@ -85,9 +85,14 @@ const cl_half specialValuesHalf[] = { 0x3555, /*nearest value to 1/3*/ 0x3bff, /*largest number less than one*/ 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ }; -size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) { @@ -108,21 +113,26 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // start the map of the output arrays cl_event e[VECTOR_SIZE_COUNT]; cl_short *out[VECTOR_SIZE_COUNT]; - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + + if (gHostFill) { - out[j] = (cl_short *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_size, 0, NULL, e + j, &error); - if (error || NULL == out[j]) + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - vlog_error("Error: 
clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - return error; + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } } - } - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } // Init input array cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; @@ -176,27 +186,37 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - return error; - } - if ((error = clReleaseEvent(e[j]))) + if (gHostFill) { - vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - return error; + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } } // Fill the result buffer with garbage, so that old results don't carry // over - uint32_t pattern = 0xACDCACDC; - memset_pattern4(out[j], &pattern, buffer_size); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) + uint32_t pattern = 0xacdcacdc; + if (gHostFill) { - vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); - return error; + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); } // run the kernel @@ -489,7 +509,7 @@ int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) return error; } - test_info.tinfo[i].d = init_genrand(genrand_int32(d)); + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); } // Init the kernels diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp index ae359b3e5..a755ddb15 100644 --- a/test_conformance/math_brute_force/macro_unary_half.cpp +++ b/test_conformance/math_brute_force/macro_unary_half.cpp @@ -92,21 +92,26 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) // start the map of the output arrays cl_event e[VECTOR_SIZE_COUNT]; cl_short *out[VECTOR_SIZE_COUNT]; - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + + if (gHostFill) { - out[j] = (cl_short *)clEnqueueMapBuffer( - tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_size, 0, NULL, e + j, &error); - if (error || NULL == out[j]) + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - return error; + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! 
err: %d\n", j, + error); + return error; + } } - } - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } // Write the new values to the input array cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; @@ -121,27 +126,37 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - return error; - } - if ((error = clReleaseEvent(e[j]))) + if (gHostFill) { - vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - return error; + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } } // Fill the result buffer with garbage, so that old results don't carry // over - uint32_t pattern = 0xACDCACDC; - memset_pattern4(out[j], &pattern, buffer_size); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) + uint32_t pattern = 0xacdcacdc; + if (gHostFill) { - vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); - return error; + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); } // run the kernel diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp index 5cb73d4b1..4545c93ea 100644 --- a/test_conformance/math_brute_force/mad_half.cpp +++ b/test_conformance/math_brute_force/mad_half.cpp @@ -98,15 +98,26 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) // write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint32_t pattern = 0xACDCACDC; - memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, - BUFFER_SIZE, gOut[j], 0, NULL, NULL))) + uint32_t pattern = 0xacdcacdc; + if (gHostFill) { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", - error, j); - return error; + memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); } } diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp index 93dc612f7..ba6dd4d48 100644 --- a/test_conformance/math_brute_force/ternary_half.cpp +++ b/test_conformance/math_brute_force/ternary_half.cpp @@ -52,6 +52,11 @@ static const 
cl_half specialValuesHalf[] = { 0x3555, /*nearest value to 1/3*/ 0x3bff, /*largest number less than one*/ 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ }; constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); @@ -72,9 +77,9 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) float maxErrorVal3 = 0.0f; uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); - constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(cl_half); + constexpr size_t bufferElements = BUFFER_SIZE / sizeof(cl_half); - cl_uchar overflow[half_buffer_size]; + cl_uchar overflow[bufferElements]; float half_ulps = f->half_ulps; int skipNanInf = (0 == strcmp("fma", f->nameInCode)); @@ -99,7 +104,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) { // test edge cases uint32_t x, y, z; x = y = z = 0; - for (; idx < half_buffer_size; idx++) + for (; idx < bufferElements; idx++) { hp0[idx] = specialValuesHalf[x]; hp1[idx] = specialValuesHalf[y]; @@ -115,7 +120,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) } } } - if (idx == half_buffer_size) + if (idx == bufferElements) vlog_error("Test Error: not all special cases tested!\n"); } @@ -124,7 +129,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) return HFF((1.0f - t) * CL_HALF_MIN + t * CL_HALF_MAX); }; - for (; idx < half_buffer_size; idx++) + for (; idx < bufferElements; idx++) { hp0[idx] = any_value(); hp1[idx] = any_value(); @@ -155,7 +160,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) // Write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint32_t pattern = 0xffffdead; + uint32_t pattern = 0xacdcacdc; if (gHostFill) { memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); @@ -171,14 +176,10 @@ int 
TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) } else { - if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], - &pattern, sizeof(pattern), 0, - BUFFER_SIZE, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n", - error); - return error; - } + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); } } @@ -233,7 +234,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) cl_half *res = (cl_half *)gOut_Ref; if (skipNanInf) { - for (size_t j = 0; j < half_buffer_size; j++) + for (size_t j = 0; j < bufferElements; j++) { feclearexcept(FE_OVERFLOW); res[j] = HFF((float)f->func.f_fma( @@ -244,7 +245,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) } else { - for (size_t j = 0; j < half_buffer_size; j++) + for (size_t j = 0; j < bufferElements; j++) res[j] = HFF((float)f->func.f_fma( HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED)); } @@ -265,7 +266,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) // Verify data uint16_t *t = (uint16_t *)gOut_Ref; - for (size_t j = 0; j < half_buffer_size; j++) + for (size_t j = 0; j < bufferElements; j++) { for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp index f6e914c8a..0ac71df16 100644 --- a/test_conformance/math_brute_force/unary_half.cpp +++ b/test_conformance/math_brute_force/unary_half.cpp @@ -95,24 +95,28 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) std::vector s(0); - // start the map of the output arrays cl_event e[VECTOR_SIZE_COUNT]; cl_ushort *out[VECTOR_SIZE_COUNT]; - for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + + if (gHostFill) { - out[j] = (uint16_t *)clEnqueueMapBuffer( - tinfo->tQueue, 
tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, - buffer_size, 0, NULL, e + j, &error); - if (error || NULL == out[j]) + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, - error); - return error; + out[j] = (uint16_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } } - } - // Get that moving - if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } // Write the new values to the input array cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; @@ -130,27 +134,37 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - // Wait for the map to finish - if ((error = clWaitForEvents(1, e + j))) - { - vlog_error("Error: clWaitForEvents failed! err: %d\n", error); - return error; - } - if ((error = clReleaseEvent(e[j]))) + if (gHostFill) { - vlog_error("Error: clReleaseEvent failed! err: %d\n", error); - return error; + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } } // Fill the result buffer with garbage, so that old results don't carry // over - uint32_t pattern = 0xACDCACDC; - memset_pattern4(out[j], &pattern, buffer_size); - if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], - out[j], 0, NULL, NULL))) + uint32_t pattern = 0xacdcacdc; + if (gHostFill) { - vlog_error("Error: clEnqueueMapBuffer failed! 
err: %d\n", error); - return error; + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); } // run the kernel diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp index 18d4dadd0..86a1a3f0a 100644 --- a/test_conformance/math_brute_force/unary_two_results_half.cpp +++ b/test_conformance/math_brute_force/unary_two_results_half.cpp @@ -124,23 +124,15 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) } else { - if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, sizeof(pattern), 0, - BUFFER_SIZE, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n", - error); - return error; - } + BUFFER_SIZE, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer 1 failed!\n"); - if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, sizeof(pattern), 0, - BUFFER_SIZE, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueFillBuffer 2 failed! 
err: %d\n", - error); - return error; - } + BUFFER_SIZE, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer 2 failed!\n"); } } diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp index 9a769447f..ee6c5dd35 100644 --- a/test_conformance/math_brute_force/unary_two_results_i_half.cpp +++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp @@ -135,23 +135,15 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) } else { - if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], - &pattern, sizeof(pattern), 0, - BUFFER_SIZE, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n", - error); - return error; - } - - if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], - &pattern, sizeof(pattern), 0, - BUFFER_SIZE, 0, NULL, NULL))) - { - vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n", - error); - return error; - } + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 1 failed!\n"); + + error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 2 failed!\n"); } } diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp index e2ff93705..083ab94dc 100644 --- a/test_conformance/math_brute_force/unary_u_half.cpp +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -92,15 +92,26 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) // write garbage into output arrays for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - uint32_t pattern = 0xACDCACDC; - memset_pattern4(gOut[j], &pattern, bufferSize); - if ((error = - clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) + uint32_t 
pattern = 0xacdcacdc; + if (gHostFill) { - vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", - error, j); - return error; + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, bufferSize, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], + &pattern, sizeof(pattern), 0, + bufferSize, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); } } @@ -126,7 +137,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) 1, NULL, &localCount, NULL, 0, NULL, NULL))) { - vlog_error("FAILURE -- could not execute kernel\n"); + vlog_error("FAILED -- could not execute kernel\n"); return error; } } From f51a0b5d612075c0a55efef4cf4feb547eb0fbc9 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Fri, 17 Nov 2023 08:37:46 +0100 Subject: [PATCH 19/24] clang format applied --- .../math_brute_force/macro_binary_half.cpp | 4 ++-- test_conformance/math_brute_force/unary_half.cpp | 4 ++-- .../math_brute_force/unary_two_results_half.cpp | 12 ++++++------ test_conformance/math_brute_force/unary_u_half.cpp | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp index bcda06e4e..d25342dda 100644 --- a/test_conformance/math_brute_force/macro_binary_half.cpp +++ b/test_conformance/math_brute_force/macro_binary_half.cpp @@ -214,8 +214,8 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) else { error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], - &pattern, sizeof(pattern), 0, - buffer_size, 0, NULL, NULL); + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); test_error(error, "clEnqueueFillBuffer failed!\n"); } diff --git a/test_conformance/math_brute_force/unary_half.cpp 
b/test_conformance/math_brute_force/unary_half.cpp index 0ac71df16..9b230f96b 100644 --- a/test_conformance/math_brute_force/unary_half.cpp +++ b/test_conformance/math_brute_force/unary_half.cpp @@ -162,8 +162,8 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) else { error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], - &pattern, sizeof(pattern), 0, - buffer_size, 0, NULL, NULL); + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); test_error(error, "clEnqueueFillBuffer failed!\n"); } diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp index 86a1a3f0a..23889c66b 100644 --- a/test_conformance/math_brute_force/unary_two_results_half.cpp +++ b/test_conformance/math_brute_force/unary_two_results_half.cpp @@ -124,14 +124,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) } else { - error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], - &pattern, sizeof(pattern), 0, - BUFFER_SIZE, 0, NULL, NULL); + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); test_error(error, "clEnqueueFillBuffer 1 failed!\n"); - error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], - &pattern, sizeof(pattern), 0, - BUFFER_SIZE, 0, NULL, NULL); + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); test_error(error, "clEnqueueFillBuffer 2 failed!\n"); } } diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp index 083ab94dc..388dadd4b 100644 --- a/test_conformance/math_brute_force/unary_u_half.cpp +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -108,9 +108,9 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) } else { - error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], - &pattern, sizeof(pattern), 0, - bufferSize, 0, NULL, NULL); + error = 
clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSize, 0, + NULL, NULL); test_error(error, "clEnqueueFillBuffer failed!\n"); } } From 207b7587a8ba1e35ff4f1cc2329f95b81bee874b Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Fri, 17 Nov 2023 11:50:28 +0100 Subject: [PATCH 20/24] consistency correction --- test_conformance/math_brute_force/mad_half.cpp | 7 ++----- test_conformance/math_brute_force/unary_u_half.cpp | 4 ---- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp index 4545c93ea..d8aefde38 100644 --- a/test_conformance/math_brute_force/mad_half.cpp +++ b/test_conformance/math_brute_force/mad_half.cpp @@ -51,11 +51,8 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) size_t bufferSize = BUFFER_SIZE; logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); - uint64_t step = bufferSize / sizeof(cl_half); - if (gWimpyMode) - { - step = (1ULL << 32) * gWimpyReductionFactor / (512); - } + uint64_t step = getTestStep(sizeof(cl_half), bufferSize); + // Init the kernels { BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode }; diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp index 388dadd4b..04b2b16b2 100644 --- a/test_conformance/math_brute_force/unary_u_half.cpp +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -55,10 +55,6 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); const char *name = f->name; float half_ulps = f->half_ulps; - if (gWimpyMode) - { - step = (1ULL << 32) * gWimpyReductionFactor / (512); - } // Init the kernels BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode }; From c49e8259d27695bfcb44344773840390ff851119 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Sun, 19 Nov 2023 14:02:56 +0100 Subject: [PATCH 
21/24] more consistency corrections for cl_fp16_khr supported tests --- .../unary_two_results_half.cpp | 34 +++++++++---------- .../unary_two_results_i_half.cpp | 30 +++++++--------- 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp index 23889c66b..ae3a4a733 100644 --- a/test_conformance/math_brute_force/unary_two_results_half.cpp +++ b/test_conformance/math_brute_force/unary_two_results_half.cpp @@ -51,9 +51,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) float maxErrorVal1 = 0.0f; uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); - constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(cl_half); + constexpr size_t bufferElements = BUFFER_SIZE / sizeof(cl_half); + int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1); - cl_uchar overflow[half_buffer_size]; + cl_uchar overflow[bufferElements]; int isFract = 0 == strcmp("fract", f->nameInCode); int skipNanInf = isFract; @@ -68,23 +69,19 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) &build_info))) return error; - for (uint64_t i = 0; i < (1ULL << 32); i += step) + for (uint64_t i = 0; i < (1ULL << 16); i += step) { // Init input array cl_half *pIn = (cl_half *)gIn; + if (gWimpyMode) { - const unsigned m_size = 0x1ff; - const unsigned e_size = 0xf; - const unsigned s_size = 0x2; - - for (size_t j = 0; j < half_buffer_size; j++) - { - unsigned ind = j % (s_size * e_size * m_size); - unsigned val = (((ind / (e_size * m_size)) << 15) - | (((ind / m_size) % e_size + 1) << 10) - | (ind % m_size + 1)); - pIn[j] = val; - } + for (size_t j = 0; j < bufferElements; j++) + pIn[j] = (cl_ushort)i + j * scale; + } + else + { + for (size_t j = 0; j < bufferElements; j++) + pIn[j] = (cl_ushort)i + j; } if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, @@ -196,20 +193,21 @@ int TestFunc_Half2_Half(const Func *f, 
MTdata d, bool relaxedMode) if (skipNanInf) { - for (size_t j = 0; j < half_buffer_size; j++) + for (size_t j = 0; j < bufferElements; j++) { double dd; feclearexcept(FE_OVERFLOW); ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); ref2[j] = HFF((float)dd); + overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); } } else { - for (size_t j = 0; j < half_buffer_size; j++) + for (size_t j = 0; j < bufferElements; j++) { double dd; ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); @@ -245,7 +243,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) } // Verify data - for (size_t j = 0; j < half_buffer_size; j++) + for (size_t j = 0; j < bufferElements; j++) { for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp index ee6c5dd35..007f16968 100644 --- a/test_conformance/math_brute_force/unary_two_results_i_half.cpp +++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp @@ -61,7 +61,8 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) // sizeof(cl_half) < sizeof (int32_t) // to prevent overflowing gOut_Ref2 it is necessary to use // bigger type as denominator for buffer size calculation - constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(int32_t); + constexpr size_t bufferElements = BUFFER_SIZE / sizeof(int32_t); + int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1); cl_ulong maxiError = 0; @@ -78,24 +79,19 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) &build_info))) return error; - for (uint64_t i = 0; i < (1ULL << 32); i += step) + for (uint64_t i = 0; i < (1ULL << 16); i += step) { // Init input array cl_half *pIn = (cl_half *)gIn; - + if (gWimpyMode) { - const unsigned m_size = 0x1ff; - const unsigned e_size = 0xf; - const unsigned s_size = 0x2; - - for (size_t j = 0; j < half_buffer_size; j++) - { - 
unsigned ind = j % (s_size * e_size * m_size); - unsigned val = (((ind / (e_size * m_size)) << 15) - | (((ind / m_size) % e_size + 1) << 10) - | (ind % m_size + 1)); - pIn[j] = val; - } + for (size_t j = 0; j < bufferElements; j++) + pIn[j] = (cl_ushort)i + j * scale; + } + else + { + for (size_t j = 0; j < bufferElements; j++) + pIn[j] = (cl_ushort)i + j; } if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, @@ -192,7 +188,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) // Calculate the correctly rounded reference result cl_half *ref1 = (cl_half *)gOut_Ref; int32_t *ref2 = (int32_t *)gOut_Ref2; - for (size_t j = 0; j < half_buffer_size; j++) + for (size_t j = 0; j < bufferElements; j++) ref1[j] = HFF((float)f->func.f_fpI(HTF(pIn[j]), ref2 + j)); // Read the data back @@ -219,7 +215,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) if (gSkipCorrectnessTesting) break; // Verify data - for (size_t j = 0; j < half_buffer_size; j++) + for (size_t j = 0; j < bufferElements; j++) { for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) { From 6f859068dbec99baae2552ffc96112c89e29d5cf Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Tue, 28 Nov 2023 11:52:58 +0100 Subject: [PATCH 22/24] Corrections related to code review (bruteforce #142) --- .../math_brute_force/i_unary_half.cpp | 18 ++++++------------ .../unary_two_results_half.cpp | 3 +++ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp index d51f5ddb3..22971bfc0 100644 --- a/test_conformance/math_brute_force/i_unary_half.cpp +++ b/test_conformance/math_brute_force/i_unary_half.cpp @@ -19,6 +19,7 @@ #include "test_functions.h" #include "utility.h" +#include #include #include #include @@ -48,9 +49,8 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || 
gForceFTZ; size_t bufferSize = BUFFER_SIZE; uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); - size_t bufferElements = bufferSize / sizeof(cl_int); - - int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1); + size_t bufferElements = std::min(bufferSize / sizeof(cl_int), + size_t(1ULL << (sizeof(cl_half) * 8))); logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); // This test is not using ThreadPool so we need to disable FTZ here @@ -74,15 +74,9 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) { // Init input array cl_ushort *p = (cl_ushort *)gIn; - if (gWimpyMode) - { - for (size_t j = 0; j < bufferElements; j++) - p[j] = (cl_ushort)i + j * scale; - } - else - { - for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j; - } + + for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j; + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL))) { diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp index ae3a4a733..9284fbd76 100644 --- a/test_conformance/math_brute_force/unary_two_results_half.cpp +++ b/test_conformance/math_brute_force/unary_two_results_half.cpp @@ -201,6 +201,9 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); ref2[j] = HFF((float)dd); + // ensure correct rounding of fract result is not reaching 1 + if (isFract && HTF(ref1[j]) >= 1.f) ref1[j] = 0x3bff; + overflow[j] = FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); } From 64db7f52b224d9c59e74cebf13ab9bdbdd26f0d0 Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Tue, 28 Nov 2023 15:14:49 +0100 Subject: [PATCH 23/24] Correction for i_unary_half test capacity --- .../math_brute_force/i_unary_half.cpp | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git 
a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp index 22971bfc0..baff3ee20 100644 --- a/test_conformance/math_brute_force/i_unary_half.cpp +++ b/test_conformance/math_brute_force/i_unary_half.cpp @@ -47,10 +47,11 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) KernelMatrix kernels; const unsigned thread_id = 0; // Test is currently not multithreaded. int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ; - size_t bufferSize = BUFFER_SIZE; uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); - size_t bufferElements = std::min(bufferSize / sizeof(cl_int), + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_int), size_t(1ULL << (sizeof(cl_half) * 8))); + size_t bufferSizeIn = bufferElements * sizeof(cl_half); + size_t bufferSizeOut = bufferElements * sizeof(cl_int); logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); // This test is not using ThreadPool so we need to disable FTZ here @@ -78,7 +79,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j; if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - bufferSize, gIn, 0, NULL, NULL))) + bufferSizeIn, gIn, 0, NULL, NULL))) { vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; @@ -90,9 +91,9 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) uint32_t pattern = 0xacdcacdc; if (gHostFill) { - memset_pattern4(gOut[j], &pattern, bufferSize); + memset_pattern4(gOut[j], &pattern, bufferSizeOut); if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], - CL_FALSE, 0, bufferSize, + CL_FALSE, 0, bufferSizeOut, gOut[j], 0, NULL, NULL))) { vlog_error( @@ -104,8 +105,8 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) else { error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, - sizeof(pattern), 0, bufferSize, 0, - NULL, NULL); + sizeof(pattern), 0, 
bufferSizeOut, + 0, NULL, NULL); test_error(error, "clEnqueueFillBuffer failed!\n"); } } @@ -114,7 +115,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_int); - size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + size_t localCount = (bufferSizeOut + vectorSize - 1) / vectorSize; if ((error = clSetKernelArg(kernels[j][thread_id], 0, sizeof(gOutBuffer[j]), &gOutBuffer[j]))) { @@ -144,15 +145,15 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) int *r = (int *)gOut_Ref; for (size_t j = 0; j < bufferElements; j++) { - s[j] = cl_half_to_float(p[j]); + s[j] = HTF(p[j]); r[j] = f->func.i_f(s[j]); } // Read the data back for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, - bufferSize, gOut[j], 0, NULL, NULL))) + if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSizeOut, gOut[j], 0, NULL, + NULL))) { vlog_error("ReadArray failed %d\n", error); return error; @@ -195,7 +196,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) { vlog("base:%14" PRIu64 " step:%10" PRIu64 " bufferSize:%10zd \n", - i, step, bufferSize); + i, step, bufferSizeOut); } else { From 29449119f66cd2797abc38b88523f8570015f0bc Mon Sep 17 00:00:00 2001 From: Marcin Hajder Date: Tue, 28 Nov 2023 18:24:30 +0100 Subject: [PATCH 24/24] Corrections related to capacity of cl_khr_fp16 tests in bruteforce (#142) --- .../unary_two_results_half.cpp | 42 +++++++--------- .../unary_two_results_i_half.cpp | 48 ++++++++----------- .../math_brute_force/unary_u_half.cpp | 15 ++---- 3 files changed, 42 insertions(+), 63 deletions(-) diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp index 9284fbd76..70a9f4c79 100644 --- 
a/test_conformance/math_brute_force/unary_two_results_half.cpp +++ b/test_conformance/math_brute_force/unary_two_results_half.cpp @@ -51,10 +51,11 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) float maxErrorVal1 = 0.0f; uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); - constexpr size_t bufferElements = BUFFER_SIZE / sizeof(cl_half); - int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1); + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_half), + size_t(1ULL << (sizeof(cl_half) * 8))); + size_t bufferSize = bufferElements * sizeof(cl_half); - cl_uchar overflow[bufferElements]; + std::vector<cl_uchar> overflow(bufferElements); int isFract = 0 == strcmp("fract", f->nameInCode); int skipNanInf = isFract; @@ -73,19 +74,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) { // Init input array cl_half *pIn = (cl_half *)gIn; - if (gWimpyMode) - { - for (size_t j = 0; j < bufferElements; j++) - pIn[j] = (cl_ushort)i + j * scale; - } - else - { - for (size_t j = 0; j < bufferElements; j++) - pIn[j] = (cl_ushort)i + j; - } + for (size_t j = 0; j < bufferElements; j++) pIn[j] = (cl_ushort)i + j; if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - BUFFER_SIZE, gIn, 0, NULL, NULL))) + bufferSize, gIn, 0, NULL, NULL))) { vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; @@ -97,9 +89,9 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) uint32_t pattern = 0xacdcacdc; if (gHostFill) { - memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + memset_pattern4(gOut[j], &pattern, bufferSize); if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], - CL_FALSE, 0, BUFFER_SIZE, + CL_FALSE, 0, bufferSize, gOut[j], 0, NULL, NULL))) { vlog_error( @@ -108,9 +100,9 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) return error; } - memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE); + memset_pattern4(gOut2[j], &pattern, bufferSize); if ((error = 
clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], - CL_FALSE, 0, BUFFER_SIZE, + CL_FALSE, 0, bufferSize, gOut2[j], 0, NULL, NULL))) { vlog_error( @@ -122,12 +114,12 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) else { error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, - sizeof(pattern), 0, BUFFER_SIZE, 0, + sizeof(pattern), 0, bufferSize, 0, NULL, NULL); test_error(error, "clEnqueueFillBuffer 1 failed!\n"); error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, - sizeof(pattern), 0, BUFFER_SIZE, 0, + sizeof(pattern), 0, bufferSize, 0, NULL, NULL); test_error(error, "clEnqueueFillBuffer 2 failed!\n"); } @@ -137,7 +129,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { size_t vectorSize = sizeValues[j] * sizeof(cl_half); - size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; if ((error = clSetKernelArg(kernels[j][thread_id], 0, sizeof(gOutBuffer[j]), &gOutBuffer[j]))) { @@ -225,14 +217,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) { if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, - BUFFER_SIZE, gOut[j], 0, NULL, NULL))) + bufferSize, gOut[j], 0, NULL, NULL))) { vlog_error("ReadArray failed %d\n", error); return error; } if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, - BUFFER_SIZE, gOut2[j], 0, NULL, NULL))) + bufferSize, gOut2[j], 0, NULL, NULL))) { vlog_error("ReadArray2 failed %d\n", error); return error; @@ -432,8 +424,8 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) if (gVerboseBruteForce) { vlog("base:%14" PRIu64 " step:%10" PRIu64 - " bufferSize:%10d \n", - i, step, BUFFER_SIZE); + " bufferSize:%10zu \n", + i, step, bufferSize); } else { diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp 
b/test_conformance/math_brute_force/unary_two_results_i_half.cpp index 007f16968..5906c2837 100644 --- a/test_conformance/math_brute_force/unary_two_results_i_half.cpp +++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp @@ -61,8 +61,11 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) // sizeof(cl_half) < sizeof (int32_t) // to prevent overflowing gOut_Ref2 it is necessary to use // bigger type as denominator for buffer size calculation - constexpr size_t bufferElements = BUFFER_SIZE / sizeof(int32_t); - int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1); + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_int), + size_t(1ULL << (sizeof(cl_half) * 8))); + + size_t bufferSizeLo = bufferElements * sizeof(cl_half); + size_t bufferSizeHi = bufferElements * sizeof(cl_int); cl_ulong maxiError = 0; @@ -83,19 +86,10 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) { // Init input array cl_half *pIn = (cl_half *)gIn; - if (gWimpyMode) - { - for (size_t j = 0; j < bufferElements; j++) - pIn[j] = (cl_ushort)i + j * scale; - } - else - { - for (size_t j = 0; j < bufferElements; j++) - pIn[j] = (cl_ushort)i + j; - } + for (size_t j = 0; j < bufferElements; j++) pIn[j] = (cl_ushort)i + j; if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, - BUFFER_SIZE, gIn, 0, NULL, NULL))) + bufferSizeLo, gIn, 0, NULL, NULL))) { vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); return error; @@ -107,9 +101,9 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) uint32_t pattern = 0xacdcacdc; if (gHostFill) { - memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + memset_pattern4(gOut[j], &pattern, bufferSizeLo); if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], - CL_FALSE, 0, BUFFER_SIZE, + CL_FALSE, 0, bufferSizeLo, gOut[j], 0, NULL, NULL))) { vlog_error( @@ -118,9 +112,9 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) return error; } - 
memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE); + memset_pattern4(gOut2[j], &pattern, bufferSizeHi); if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], - CL_FALSE, 0, BUFFER_SIZE, + CL_FALSE, 0, bufferSizeHi, gOut2[j], 0, NULL, NULL))) { vlog_error( @@ -132,12 +126,12 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) else { error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, - sizeof(pattern), 0, BUFFER_SIZE, 0, + sizeof(pattern), 0, bufferSizeLo, 0, NULL, NULL); test_error(error, "clEnqueueFillBuffer 1 failed!\n"); error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern, - sizeof(pattern), 0, BUFFER_SIZE, 0, + sizeof(pattern), 0, bufferSizeHi, 0, NULL, NULL); test_error(error, "clEnqueueFillBuffer 2 failed!\n"); } @@ -147,8 +141,8 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) { // align working group size with the bigger output type - size_t vectorSize = sizeValues[j] * sizeof(int32_t); - size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; + size_t vectorSize = sizeValues[j] * sizeof(cl_int); + size_t localCount = (bufferSizeHi + vectorSize - 1) / vectorSize; if ((error = clSetKernelArg(kernels[j][thread_id], 0, sizeof(gOutBuffer[j]), &gOutBuffer[j]))) { @@ -198,14 +192,14 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) (j + 1 < gMaxVectorSizeIndex) ? 
CL_FALSE : CL_TRUE; if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0, - BUFFER_SIZE, gOut[j], 0, NULL, NULL))) + bufferSizeLo, gOut[j], 0, NULL, NULL))) { vlog_error("ReadArray failed %d\n", error); return error; } - if ((error = - clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0, - BUFFER_SIZE, gOut2[j], 0, NULL, NULL))) + if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, + 0, bufferSizeHi, gOut2[j], 0, NULL, + NULL))) { vlog_error("ReadArray2 failed %d\n", error); return error; @@ -325,8 +319,8 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) if (gVerboseBruteForce) { vlog("base:%14" PRIu64 " step:%10" PRIu64 - " bufferSize:%10d \n", - i, step, BUFFER_SIZE); + " bufferSize:%10zu \n", + i, step, bufferSizeHi); } else { diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp index 04b2b16b2..6f21ef3ee 100644 --- a/test_conformance/math_brute_force/unary_u_half.cpp +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -48,10 +48,10 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) float maxError = 0.0f; int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); float maxErrorVal = 0.0f; - size_t bufferSize = BUFFER_SIZE; - size_t bufferElements = bufferSize / sizeof(cl_half); uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); - int scale = (int)((1ULL << 32) / (16 * bufferElements) + 1); + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_half), + size_t(1ULL << (sizeof(cl_half) * 8))); + size_t bufferSize = bufferElements * sizeof(cl_half); logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); const char *name = f->name; float half_ulps = f->half_ulps; @@ -69,14 +69,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) { // Init input array cl_ushort *p = (cl_ushort *)gIn; - if (gWimpyMode) - { - for (size_t j = 0; j < bufferElements; j++) p[j] = i + j * scale; - } 
- else - { - for (size_t j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j; - } + for (size_t j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j; if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, bufferSize, gIn, 0, NULL, NULL)))