diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt index a221f05add..d53911e433 100644 --- a/test_conformance/math_brute_force/CMakeLists.txt +++ b/test_conformance/math_brute_force/CMakeLists.txt @@ -3,24 +3,32 @@ set(MODULE_NAME BRUTEFORCE) set(${MODULE_NAME}_SOURCES binary_double.cpp binary_float.cpp + binary_half.cpp binary_i_double.cpp binary_i_float.cpp + binary_i_half.cpp binary_operator_double.cpp binary_operator_float.cpp + binary_operator_half.cpp binary_two_results_i_double.cpp binary_two_results_i_float.cpp + binary_two_results_i_half.cpp common.cpp common.h function_list.cpp function_list.h i_unary_double.cpp i_unary_float.cpp + i_unary_half.cpp macro_binary_double.cpp macro_binary_float.cpp + macro_binary_half.cpp macro_unary_double.cpp macro_unary_float.cpp + macro_unary_half.cpp mad_double.cpp mad_float.cpp + mad_half.cpp main.cpp reference_math.cpp reference_math.h @@ -28,15 +36,20 @@ set(${MODULE_NAME}_SOURCES sleep.h ternary_double.cpp ternary_float.cpp + ternary_half.cpp test_functions.h unary_double.cpp unary_float.cpp + unary_half.cpp unary_two_results_double.cpp unary_two_results_float.cpp + unary_two_results_half.cpp unary_two_results_i_double.cpp unary_two_results_i_float.cpp + unary_two_results_i_half.cpp unary_u_double.cpp unary_u_float.cpp + unary_u_half.cpp utility.cpp utility.h ) diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp new file mode 100644 index 0000000000..f80a085370 --- /dev/null +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -0,0 +1,784 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "harness/errorHelpers.h" + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" +#include "reference_math.h" + +#include +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +struct ThreadInfo +{ + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper inBuf2; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. + double maxErrorValue2; // position of the max error value (param 2). Init + // to 0. + MTdataHolder d; + + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +}; + +struct TestInfoBase +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + int isFDim; + int skipNanInf; + int isNextafter; +}; + +struct TestInfo : public TestInfoBase +{ + TestInfo(const TestInfoBase &base): TestInfoBase(base) {} + + // Array of thread specific information + std::vector tinfo; + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; +}; + +// A table of more difficult cases to get right +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + float ulps = job->ulps; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_int error; + const char *name = job->f->name; + + int isFDim = job->isFDim; + int skipNanInf = job->skipNanInf; + int isNextafter = job->isNextafter; + cl_ushort *t; + cl_half *r; + std::vector s(0), s2(0); + cl_uint j = 0; + + RoundingMode oldRoundMode; + cl_int copysign_test = 0; + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_ushort *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + // Init input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + cl_ushort *p2 = (cl_ushort *)gIn2 + thread_id * buffer_elements; + j = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesHalfCount; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)indx) + { // test edge cases + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; j < buffer_elements; j++) + { + p[j] = specialValuesHalf[x]; + p2[j] = specialValuesHalf[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesHalfCount) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = (cl_ushort)genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) + { + return CL_SUCCESS; + } + + FPU_mode_type oldMode; + oldRoundMode = kRoundToNearestEven; + if (isFDim) + { + // Calculate the correctly rounded reference result + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); + + // Set the rounding mode to match the device + if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat); + } + + if (!strcmp(name, "copysign")) copysign_test = 1; + +#define ref_func(s, s2) (copysign_test ? func.f_ff_f(s, s2) : func.f_ff(s, s2)) + + // Calculate the correctly rounded reference result + r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + t = (cl_ushort *)r; + s.resize(buffer_elements); + s2.resize(buffer_elements); + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + s2[j] = cl_half_to_float(p2[j]); + if (isNextafter) + r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]), + CL_HALF_RTE); + else + r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE); + } + + if (isFDim && ftz) RestoreFPState(&oldMode); + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Verify data + + for (j = 0; j < buffer_elements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = out[k]; + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + double correct; + if (isNextafter) + correct = reference_nextafterh(s[j], s2[j]); + else + correct = ref_func(s[j], s2[j]); + + float test = cl_half_to_float(q[j]); + + // Per section 10 paragraph 6, accept any result if an input or + // output is a infinity or NaN or overflow + if (skipNanInf) + { + // Note: no double rounding here. Reference functions + // calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j]) + || IsFloatInfinity(s[j]) || IsFloatNaN(s[j])) + continue; + } + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, ulps)) + { + if (isNextafter) + { + correct = reference_nextafterh(s[j], s2[j], false); + err = Ulp_Error_Half(q[j], correct); + fail = !(fabsf(err) <= ulps); + } + + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + if (IsHalfSubnormal(p[j])) + { + double correct2, correct3; + float err2, err3; + if (isNextafter) + { + correct2 = reference_nextafterh(0.0, s2[j]); + correct3 = reference_nextafterh(-0.0, s2[j]); + } + else + { + correct2 = ref_func(0.0, s2[j]); + correct3 = ref_func(-0.0, s2[j]); + } + if (skipNanInf) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) + continue; + } + + auto check_error = [&]() { + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + }; + check_error(); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + if (fail && isNextafter) + { + correct2 = + reference_nextafterh(0.0, s2[j], false); + correct3 = + reference_nextafterh(-0.0, s2[j], false); + check_error(); + } + + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // allow to omit denorm values for platforms with no + // denorm support for nextafter + if (fail && (isNextafter) + && (correct <= cl_half_to_float(0x3FF)) + && (correct >= cl_half_to_float(0x83FF))) + { + fail = fail && (q[j] != p[j]); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsHalfSubnormal(p2[j])) + { + double correct4, correct5; + float err4, err5; + + if (isNextafter) + { + correct2 = reference_nextafterh(0.0, 0.0); + correct3 = reference_nextafterh(-0.0, 0.0); + correct4 = reference_nextafterh(0.0, -0.0); + correct5 = reference_nextafterh(-0.0, -0.0); + } + else + { + correct2 = ref_func(0.0, 0.0); + correct3 = ref_func(-0.0, 0.0); + correct4 = ref_func(0.0, -0.0); + correct5 = ref_func(-0.0, -0.0); + } + + // Per section 10 paragraph 6, accept any result if + // an input or output is a infinity or NaN or + // overflow + if (skipNanInf) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + err4 = Ulp_Error_Half(q[j], correct4); + err5 = Ulp_Error_Half(q[j], correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps) + || IsHalfResultSubnormal(correct4, ulps) + || IsHalfResultSubnormal(correct5, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // allow to omit denorm values for platforms with no + // denorm support for nextafter + if (fail && (isNextafter) + && (correct <= cl_half_to_float(0x3FF)) + && (correct >= cl_half_to_float(0x83FF))) + { + fail = fail && (q[j] != p2[j]); + if (!fail) err = 0.0f; + } + } + } + else if (IsHalfSubnormal(p2[j])) + { + double correct2, correct3; + float err2, err3; + + if (isNextafter) + { + correct2 = reference_nextafterh(s[j], 0.0); + correct3 = reference_nextafterh(s[j], -0.0); + } + else + { + correct2 = ref_func(s[j], 0.0); + correct3 = ref_func(s[j], -0.0); + } + + if (skipNanInf) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(correct2) + || IsFloatNaN(correct2)) + continue; + } + + auto check_error = [&]() { + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + }; + check_error(); + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + if (fail && isNextafter) + { + correct2 = + reference_nextafterh(s[j], 0.0, false); + correct3 = + reference_nextafterh(s[j], -0.0, false); + check_error(); + } + + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // allow to omit denorm values for platforms with no + // denorm support for nextafter + if (fail && (isNextafter) + && (correct <= cl_half_to_float(0x3FF)) + && (correct >= cl_half_to_float(0x83FF))) + { + fail = fail && (q[j] != p2[j]); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%a (0x%04x)}\nExpected: %a (half 0x%04x) " + "\nActual: %a (half 0x%04x) at index: %u\n", + name, sizeNames[k], err, s[j], p[j], s2[j], + p2[j], cl_half_to_float(r[j]), r[j], test, q[j], + j); + error = -1; + return error; + } + } + } + } + + if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return error; +} + +} // anonymous namespace + +int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter, + bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); + test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; + test_info.isNextafter = isNextafter; + + test_info.tinfo.resize(test_info.threadCount); + + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error( + "Error: Unable to create sub-buffer of gOutBuffer[%d] " + "for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} + +int TestFunc_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + return TestFunc_Half_Half_Half_common(f, d, 0, relaxedMode); +} + +int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata d, bool relaxedMode) +{ + return TestFunc_Half_Half_Half_common(f, d, 1, relaxedMode); +} diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp index a6c2855735..a0561422e9 100644 --- a/test_conformance/math_brute_force/binary_i_double.cpp +++ b/test_conformance/math_brute_force/binary_i_double.cpp @@ -193,16 +193,14 @@ const double specialValues[] = { +0.0, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); const int specialValuesInt[] = { 0, 1, 2, 3, 1022, 1023, 1024, INT_MIN, INT_MAX, -1, -2, -3, -1022, -1023, -11024, -INT_MAX, }; -constexpr size_t specialValuesIntCount = - sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); +constexpr size_t specialValuesIntCount = ARRAY_SIZE(specialValuesInt); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp index dfe25efc69..f9e13abaaf 100644 --- a/test_conformance/math_brute_force/binary_i_float.cpp +++ b/test_conformance/math_brute_force/binary_i_float.cpp @@ -184,8 +184,7 @@ const float specialValues[] = { +0.0f, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); const int specialValuesInt[] = { 0, 1, 2, 3, 126, 127, @@ -194,9 +193,7 @@ const int specialValuesInt[] = { -0x04000001, -1465264071, -1488522147, }; -constexpr size_t specialValuesIntCount = - sizeof(specialValuesInt) / sizeof(specialValuesInt[0]); - +constexpr size_t specialValuesIntCount = ARRAY_SIZE(specialValuesInt); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { TestInfo *job = (TestInfo *)data; diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp new file mode 100644 index 0000000000..001e2b4f54 --- /dev/null +++ b/test_conformance/math_brute_force/binary_i_half.cpp @@ -0,0 +1,548 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Int, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper inBuf2; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. + cl_int maxErrorValue2; // position of the max error value (param 2). Init + // to 0. + MTdataHolder d; + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +} ThreadInfo; + +struct TestInfoBase +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode +}; + +struct TestInfo : public TestInfoBase +{ + TestInfo(const TestInfoBase &base): TestInfoBase(base) {} + + // Array of thread specific information + std::vector tinfo; + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; +}; + +// A table of more difficult cases to get right +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +const int specialValuesInt3[] = { 0, 1, 2, 3, 1022, 1023, + 1024, INT_MIN, INT_MAX, -1, -2, -3, + -1022, -1023, -11024, -INT_MAX }; +size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3); + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + float ulps = job->ulps; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_ushort *t; + cl_half *r; + std::vector s; + cl_int *s2; + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_ushort *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_elements * sizeof(cl_ushort), 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + // Init input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + cl_int *p2 = (cl_int *)gIn2 + thread_id * buffer_elements; + j = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesInt3Count; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + if (job_id <= (cl_uint)indx) + { // test edge cases + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; j < buffer_elements; j++) + { + p[j] = specialValuesHalf[x]; + p2[j] = specialValuesInt3[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesInt3Count) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_elements * sizeof(cl_half), p, 0, + NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_elements * sizeof(cl_int), p2, 0, + NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, + buffer_elements * sizeof(cl_half)); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer( + tinfo->tQueue, tinfo->outBuf[j], &pattern, sizeof(pattern), 0, + buffer_elements * sizeof(cl_half), 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + t = (cl_ushort *)r; + s.resize(buffer_elements); + s2 = (cl_int *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + r[j] = HFF(func.f_fi(s[j], s2[j])); + } + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_elements * sizeof(cl_ushort), 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Wait for the last buffer + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, + buffer_elements * sizeof(cl_ushort), 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + return error; + } + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = out[k]; + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + float test = cl_half_to_float(q[j]); + double correct = func.f_fi(s[j], s2[j]); + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + double correct2, correct3; + float err2, err3; + correct2 = func.f_fi(0.0, s2[j]); + correct3 = func.f_fi(-0.0, s2[j]); + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%d}\nExpected: %a (half 0x%04x) \nActual: %a " + "(half 0x%04x) at index: %d\n", + name, sizeNames[k], err, s[j], p[j], s2[j], + cl_half_to_float(r[j]), r[j], test, q[j], + (cl_uint)j); + error = -1; + return error; + } + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + return error; +} + +} // anonymous namespace + +int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + cl_int maxErrorVal2 = 0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + cl_buffer_region region2 = { i * test_info.subBufferSize + * sizeof(cl_int), + test_info.subBufferSize * sizeof(cl_int) }; + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp index 7600ab16a3..517188030b 100644 --- a/test_conformance/math_brute_force/binary_operator_double.cpp +++ b/test_conformance/math_brute_force/binary_operator_double.cpp @@ -192,8 +192,7 @@ const double specialValues[] = { +0.0, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp index 741c396ca8..3eb1041834 100644 --- a/test_conformance/math_brute_force/binary_operator_float.cpp +++ b/test_conformance/math_brute_force/binary_operator_float.cpp @@ -184,8 +184,7 @@ const float specialValues[] = { +0.0f, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp new file mode 100644 index 0000000000..e7f53af871 --- /dev/null +++ b/test_conformance/math_brute_force/binary_operator_half.cpp @@ -0,0 +1,680 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +struct ThreadInfo +{ + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + + // max error value. Init to 0. + float maxError; + // position of the max error value (param 1). Init to 0. + double maxErrorValue; + // position of the max error value (param 2). Init to 0. + double maxErrorValue2; + MTdataHolder d; + + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; + +struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector tinfo; + + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + // no special fields +}; + +// A table of more difficult cases to get right +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + float ulps = job->ulps; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_int error; + + const char *name = job->f->name; + cl_half *r = 0; + std::vector s(0), s2(0); + RoundingMode oldRoundMode; + + cl_event e[VECTOR_SIZE_COUNT]; + cl_half *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + bool divide = strcmp(name, "divide") == 0; + + // Init input array + cl_half *p = (cl_half *)gIn + thread_id * buffer_elements; + cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements; + cl_uint idx = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesHalfCount; + int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)lastSpecialJobIndex) + { + // Insert special values + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; idx < buffer_elements; idx++) + { + p[idx] = specialValuesHalf[x]; + p2[idx] = specialValuesHalf[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesHalfCount) break; + } + + if (divide) + { + cl_half pj = p[idx] & 0x7fff; + cl_half p2j = p2[idx] & 0x7fff; + // Replace values outside [2^-7, 2^7] with QNaN + if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN + if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00; + } + } + } + + // Init any remaining values + for (; idx < buffer_elements; idx++) + { + p[idx] = (cl_half)genrand_int32(d); + p2[idx] = (cl_half)genrand_int32(d); + + if (divide) + { + cl_half pj = p[idx] & 0x7fff; + cl_half p2j = p2[idx] & 0x7fff; + // Replace values outside [2^-7, 2^7] with QNaN + if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN + if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00; + } + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // Run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) + { + return CL_SUCCESS; + } + + // Calculate the correctly rounded reference result + FPU_mode_type oldMode; + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); + + // Set the rounding mode to match the device + oldRoundMode = kRoundToNearestEven; + if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat); + + // Calculate the correctly rounded reference result + r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + s.resize(buffer_elements); + s2.resize(buffer_elements); + + for (size_t j = 0; j < buffer_elements; j++) + { + s[j] = HTF(p[j]); + s2[j] = HTF(p2[j]); + r[j] = HFF(func.f_ff(s[j], s2[j])); + } + + if (ftz) RestoreFPState(&oldMode); + + // Read the data back -- no need to wait for the first N-1 buffers but wait + // for the last buffer. This is an in order queue. + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Verify data + + for (size_t j = 0; j < buffer_elements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *q = out[k]; + + // If we aren't getting the correctly rounded result + if (r[j] != q[j]) + { + float test = HTF(q[j]); + float correct = func.f_ff(s[j], s2[j]); + + // Per section 10 paragraph 6, accept any result if an input or + // output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference functions + // calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j]) + || IsFloatInfinity(s[j]) || IsFloatNaN(s[j])) + continue; + } + + float err = Ulp_Error_Half(q[j], correct); + + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + double correct2, correct3; + float err2, err3; + + correct2 = HTF(func.f_ff(0.0, s2[j])); + correct3 = HTF(func.f_ff(-0.0, s2[j])); + + // Per section 10 paragraph 6, accept any result if an + // input or output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsHalfSubnormal(p2[j])) + { + double correct4, correct5; + float err4, err5; + + correct2 = HTF(func.f_ff(0.0, 0.0)); + correct3 = HTF(func.f_ff(-0.0, 0.0)); + correct4 = HTF(func.f_ff(0.0, -0.0)); + correct5 = HTF(func.f_ff(-0.0, -0.0)); + + // Per section 10 paragraph 6, accept any result if + // an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + err4 = Ulp_Error_Half(q[j], correct4); + err5 = Ulp_Error_Half(q[j], correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps) + || IsHalfResultSubnormal(correct4, ulps) + || IsHalfResultSubnormal(correct5, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + else if (IsHalfSubnormal(p2[j])) + { + double correct2, correct3; + float err2, err3; + + correct2 = HTF(func.f_ff(s[j], 0.0)); + correct3 = HTF(func.f_ff(s[j], -0.0)); + + // Per section 10 paragraph 6, accept any result if an + // input or output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(correct2) + || IsFloatNaN(correct2)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%a (0x%04x)}\nExpected: %a (half 0x%04x) " + "\nActual: %a (half 0x%04x) at index: %zu\n", + name, sizeNames[k], err, s[j], p[j], s2[j], + p2[j], HTF(r[j]), r[j], test, q[j], j); + return -1; + } + } + } + } + + if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return CL_SUCCESS; +} + +} // anonymous namespace + +int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/binary_two_results_i_half.cpp b/test_conformance/math_brute_force/binary_two_results_i_half.cpp new file mode 100644 index 0000000000..bc2519e95b --- /dev/null +++ b/test_conformance/math_brute_force/binary_two_results_i_half.cpp @@ -0,0 +1,477 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include +#include + +namespace { + +cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Int, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +struct ComputeReferenceInfoF +{ + const cl_half *x; + const cl_half *y; + cl_half *r; + int32_t *i; + double (*f_ffpI)(double, double, int *); + cl_uint lim; + cl_uint count; +}; + +cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) +{ + ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo; + cl_uint lim = cri->lim; + cl_uint count = cri->count; + cl_uint off = jid * count; + const cl_half *x = cri->x + off; + const cl_half *y = cri->y + off; + cl_half *r = cri->r + off; + int32_t *i = cri->i + off; + double (*f)(double, double, int *) = cri->f_ffpI; + + if (off + count > lim) count = lim - off; + + for (cl_uint j = 0; j < count; ++j) + r[j] = HFF((float)f((double)HTF(x[j]), (double)HTF(y[j]), i + j)); + + return CL_SUCCESS; +} + +} // anonymous namespace + +int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + Programs programs; + const unsigned thread_id = 0; // Test is currently not multithreaded. + KernelMatrix kernels; + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + int64_t maxError2 = 0; + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + + // use larger type of output data to prevent overflowing buffer size + constexpr size_t buffer_size = BUFFER_SIZE / sizeof(int32_t); + + cl_uint threadCount = GetThreadCount(); + + float half_ulps = f->half_ulps; + + int testingRemquo = !strcmp(f->name, "remquo"); + + // Init the kernels + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + + for (uint64_t i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_half *p = (cl_half *)gIn; + cl_half *p2 = (cl_half *)gIn2; + for (size_t j = 0; j < buffer_size; j++) + { + p[j] = (cl_half)genrand_int32(d); + p2[j] = (cl_half)genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + buffer_size * sizeof(cl_half), gIn, 0, + NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + buffer_size * sizeof(cl_half), gIn2, + 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + // Write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + + memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut2[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 1 failed!\n"); + + error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 2 failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // align working group size with the bigger output type + size_t vectorSize = sizeValues[j] * sizeof(int32_t); + size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = + clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gOutBuffer2[j]), &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 3, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + if (threadCount > 1) + { + ComputeReferenceInfoF cri; + cri.x = p; + cri.y = p2; + cri.r = (cl_half *)gOut_Ref; + cri.i = (int32_t *)gOut_Ref2; + cri.f_ffpI = f->func.f_ffpI; + cri.lim = buffer_size; + cri.count = (cri.lim + threadCount - 1) / threadCount; + ThreadPool_Do(ReferenceF, threadCount, &cri); + } + else + { + cl_half *r = (cl_half *)gOut_Ref; + int32_t *r2 = (int32_t *)gOut_Ref2; + for (size_t j = 0; j < buffer_size; j++) + r[j] = + HFF((float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), r2 + j)); + } + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = + (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0, + BUFFER_SIZE, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0, + BUFFER_SIZE, gOut2[j], 0, NULL, NULL))) + { + vlog_error("ReadArray2 failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + cl_half *t = (cl_half *)gOut_Ref; + int32_t *t2 = (int32_t *)gOut_Ref2; + for (size_t j = 0; j < buffer_size; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *q = (cl_half *)(gOut[k]); + int32_t *q2 = (int32_t *)gOut2[k]; + + // Check for exact match to correctly rounded result + if (t[j] == q[j] && t2[j] == q2[j]) continue; + + // Check for paired NaNs + if (IsHalfNaN(t[j]) && IsHalfNaN(q[j]) && t2[j] == q2[j]) + continue; + + cl_half test = ((cl_half *)q)[j]; + int correct2 = INT_MIN; + float correct = + (float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), &correct2); + float err = Ulp_Error_Half(test, correct); + int64_t iErr; + + // in case of remquo, we only care about the sign and last + // seven bits of integer as per the spec. + if (testingRemquo) + iErr = (long long)(q2[j] & 0x0000007f) + - (long long)(correct2 & 0x0000007f); + else + iErr = (long long)q2[j] - (long long)correct2; + + // For remquo, if y = 0, x is infinite, or either is NaN + // then the standard either neglects to say what is returned + // in iptr or leaves it undefined or implementation defined. + int iptrUndefined = IsHalfInfinity(p[j]) || (HTF(p2[j]) == 0.0f) + || IsHalfNaN(p2[j]) || IsHalfNaN(p[j]); + if (iptrUndefined) iErr = 0; + + int fail = !(fabsf(err) <= half_ulps && iErr == 0); + if (ftz && fail) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, half_ulps)) + { + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + int correct3i, correct4i; + float correct3 = + (float)f->func.f_ffpI(0.0, HTF(p2[j]), &correct3i); + float correct4 = + (float)f->func.f_ffpI(-0.0, HTF(p2[j]), &correct4i); + float err2 = Ulp_Error_Half(test, correct3); + float err3 = Ulp_Error_Half(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, half_ulps) + || IsHalfResultSubnormal(correct3, half_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsHalfSubnormal(p2[j])) + { + int correct7i, correct8i; + correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i); + correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i); + double correct7 = + f->func.f_ffpI(0.0, -0.0, &correct7i); + double correct8 = + f->func.f_ffpI(-0.0, -0.0, &correct8i); + err2 = Ulp_Error_Half(test, correct3); + err3 = Ulp_Error_Half(test, correct4); + float err4 = Ulp_Error_Half(test, correct7); + float err5 = Ulp_Error_Half(test, correct8); + iErr3 = (long long)q2[j] - (long long)correct3i; + iErr4 = (long long)q2[j] - (long long)correct4i; + int64_t iErr7 = + (long long)q2[j] - (long long)correct7i; + int64_t iErr8 = + (long long)q2[j] - (long long)correct8i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps + && iErr4 == 0)) + && (!(fabsf(err4) <= half_ulps + && iErr7 == 0)) + && (!(fabsf(err5) <= half_ulps + && iErr8 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + if (llabs(iErr7) < llabs(iErr)) iErr = iErr7; + if (llabs(iErr8) < llabs(iErr)) iErr = iErr8; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct3, half_ulps) + || IsHalfResultSubnormal(correct4, half_ulps) + || IsHalfResultSubnormal(correct7, half_ulps) + || IsHalfResultSubnormal(correct8, half_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0 + || iErr7 == 0 || iErr8 == 0)); + if (!fail) err = 0.0f; + } + } + } + else if (IsHalfSubnormal(p2[j])) + { + int correct3i, correct4i; + double correct3 = + f->func.f_ffpI(HTF(p[j]), 0.0, &correct3i); + double correct4 = + f->func.f_ffpI(HTF(p[j]), -0.0, &correct4i); + float err2 = Ulp_Error_Half(test, correct3); + float err3 = Ulp_Error_Half(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, half_ulps) + || IsHalfResultSubnormal(correct3, half_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + } + } + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = HTF(p[j]); + } + if (llabs(iErr) > maxError2) + { + maxError2 = llabs(iErr); + maxErrorVal2 = HTF(p[j]); + } + + if (fail) + { + vlog_error("\nERROR: %s%s: {%f, %" PRId64 + "} ulp error at {%a, %a} " + "({0x%04x, 0x%04x}): *{%a, %d} ({0x%04x, " + "0x%8.8x}) vs. {%a, %d} ({0x%04x, 0x%8.8x})\n", + f->name, sizeNames[k], err, iErr, HTF(p[j]), + HTF(p2[j]), p[j], p2[j], HTF(t[j]), t2[j], t[j], + t2[j], HTF(test), q2[j], test, q2[j]); + return -1; + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2, + maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/common.cpp b/test_conformance/math_brute_force/common.cpp index 47f493e7a6..3771a6fb00 100644 --- a/test_conformance/math_brute_force/common.cpp +++ b/test_conformance/math_brute_force/common.cpp @@ -27,8 +27,11 @@ const char *GetTypeName(ParameterType type) { switch (type) { + case ParameterType::Half: return "half"; case ParameterType::Float: return "float"; case ParameterType::Double: return "double"; + case ParameterType::Short: return "short"; + case ParameterType::UShort: return "ushort"; case ParameterType::Int: return "int"; case ParameterType::UInt: return "uint"; case ParameterType::Long: return "long"; @@ -41,9 +44,13 @@ const char *GetUndefValue(ParameterType type) { switch (type) { + case ParameterType::Half: case ParameterType::Float: case ParameterType::Double: return "NAN"; + case ParameterType::Short: + case ParameterType::UShort: return "0x5678"; + case ParameterType::Int: case ParameterType::UInt: return "0x12345678"; @@ -71,14 +78,17 @@ void EmitEnableExtension(std::ostringstream &kernel, const std::initializer_list &types) { bool needsFp64 = false; + bool needsFp16 = false; for (const auto &type : types) { switch (type) { case ParameterType::Double: needsFp64 = true; break; - + case ParameterType::Half: needsFp16 = true; break; case ParameterType::Float: + case ParameterType::Short: + case ParameterType::UShort: case ParameterType::Int: case ParameterType::UInt: case ParameterType::Long: @@ -89,6 +99,7 @@ void EmitEnableExtension(std::ostringstream &kernel, } if (needsFp64) kernel << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; + if (needsFp16) kernel << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; } std::string GetBuildOptions(bool relaxed_mode) diff --git a/test_conformance/math_brute_force/common.h b/test_conformance/math_brute_force/common.h index 481b3b2a29..793a00fe92 100644 --- a/test_conformance/math_brute_force/common.h +++ b/test_conformance/math_brute_force/common.h @@ -36,8 +36,11 @@ using Buffers = std::array; // Types supported for kernel code generation. enum class ParameterType { + Half, Float, Double, + Short, + UShort, Int, UInt, Long, @@ -91,4 +94,5 @@ using SourceGenerator = std::string (*)(const std::string &kernel_name, cl_int BuildKernels(BuildKernelInfo &info, cl_uint job_id, SourceGenerator generator); + #endif /* COMMON_H */ diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp index 917362852c..b2f3de82ef 100644 --- a/test_conformance/math_brute_force/function_list.cpp +++ b/test_conformance/math_brute_force/function_list.cpp @@ -29,36 +29,41 @@ // Only use ulps information in spir test #ifdef FUNCTION_LIST_ULPS_ONLY -#define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ +#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ { \ STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ - _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ + _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ RELAXED_OFF, _type \ } -#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type, \ - _relaxed_embedded_ulp) \ +#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \ + _type, _relaxed_embedded_ulp) \ { \ STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ - _ulp, _ulp, _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, \ - _rmode, RELAXED_ON, _type \ + _ulp, _ulp, _half_ulp, _embedded_ulp, _relaxed_ulp, \ + _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type \ } #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ { \ "half_" STRINGIFY(_name), "half_" STRINGIFY(_name), { NULL }, \ - { NULL }, { NULL }, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, \ - _rmode, RELAXED_OFF, _type \ + { NULL }, { NULL }, _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, \ + INFINITY, _rmode, RELAXED_OFF, _type \ } -#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type) \ +#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, \ + _rmode, _type) \ { \ STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \ - _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ + _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ + _type \ } #define unaryF NULL +#define unaryOF NULL #define i_unaryF NULL #define unaryF_u NULL #define macro_unaryF NULL #define binaryF NULL +#define binaryOF NULL +#define binaryF_nextafter NULL #define binaryOperatorF NULL #define binaryF_i NULL #define macro_binaryF NULL @@ -76,31 +81,34 @@ #else // FUNCTION_LIST_ULPS_ONLY -#define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ +#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ { \ STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ { (void*)reference_##_name##l }, { (void*)reference_##_name }, \ - _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ + _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ RELAXED_OFF, _type \ } -#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type, \ - _relaxed_embedded_ulp) \ +#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \ + _type, _relaxed_embedded_ulp) \ { \ STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ { (void*)reference_##_name##l }, \ - { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _embedded_ulp, \ - _relaxed_ulp, _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type \ + { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _half_ulp, \ + _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode, \ + RELAXED_ON, _type \ } #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ { \ "half_" STRINGIFY(_name), "half_" STRINGIFY(_name), \ { (void*)reference_##_name }, { NULL }, { NULL }, _ulp, _ulp, \ - _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ + _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ + _type \ } -#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type) \ +#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, \ + _rmode, _type) \ { \ STRINGIFY(_name), _operator, { (void*)reference_##_name }, \ - { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, \ + { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _half_ulp, \ _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ } @@ -108,85 +116,114 @@ static constexpr vtbl _unary = { "unary", TestFunc_Float_Float, TestFunc_Double_Double, + TestFunc_Half_Half, }; +static constexpr vtbl _unaryof = { "unaryof", TestFunc_Float_Float, NULL, + NULL }; + static constexpr vtbl _i_unary = { "i_unary", TestFunc_Int_Float, TestFunc_Int_Double, + TestFunc_Int_Half, }; static constexpr vtbl _unary_u = { "unary_u", TestFunc_Float_UInt, TestFunc_Double_ULong, + TestFunc_Half_UShort, }; static constexpr vtbl _macro_unary = { "macro_unary", TestMacro_Int_Float, TestMacro_Int_Double, + TestMacro_Int_Half, }; static constexpr vtbl _binary = { "binary", TestFunc_Float_Float_Float, TestFunc_Double_Double_Double, + TestFunc_Half_Half_Half, }; +static constexpr vtbl _binary_nextafter = { + "binary", + TestFunc_Float_Float_Float, + TestFunc_Double_Double_Double, + TestFunc_Half_Half_Half_nextafter, +}; + +static constexpr vtbl _binaryof = { "binaryof", TestFunc_Float_Float_Float, + NULL, NULL }; + static constexpr vtbl _binary_operator = { "binaryOperator", TestFunc_Float_Float_Float_Operator, TestFunc_Double_Double_Double_Operator, + TestFunc_Half_Half_Half_Operator, }; static constexpr vtbl _binary_i = { "binary_i", TestFunc_Float_Float_Int, TestFunc_Double_Double_Int, + TestFunc_Half_Half_Int, }; static constexpr vtbl _macro_binary = { "macro_binary", TestMacro_Int_Float_Float, TestMacro_Int_Double_Double, + TestMacro_Int_Half_Half, }; static constexpr vtbl _ternary = { "ternary", TestFunc_Float_Float_Float_Float, TestFunc_Double_Double_Double_Double, + TestFunc_Half_Half_Half_Half, }; static constexpr vtbl _unary_two_results = { "unary_two_results", TestFunc_Float2_Float, TestFunc_Double2_Double, + TestFunc_Half2_Half, }; static constexpr vtbl _unary_two_results_i = { "unary_two_results_i", TestFunc_FloatI_Float, TestFunc_DoubleI_Double, + TestFunc_HalfI_Half, }; static constexpr vtbl _binary_two_results_i = { "binary_two_results_i", TestFunc_FloatI_Float_Float, TestFunc_DoubleI_Double_Double, + TestFunc_HalfI_Half_Half, }; static constexpr vtbl _mad_tbl = { "ternary", TestFunc_mad_Float, TestFunc_mad_Double, + TestFunc_mad_Half, }; #define unaryF &_unary +#define unaryOF &_unaryof #define i_unaryF &_i_unary #define unaryF_u &_unary_u #define macro_unaryF &_macro_unary #define binaryF &_binary +#define binaryF_nextafter &_binary_nextafter +#define binaryOF &_binaryof #define binaryOperatorF &_binary_operator #define binaryF_i &_binary_i #define macro_binaryF &_macro_binary @@ -199,24 +236,24 @@ static constexpr vtbl _mad_tbl = { #endif // FUNCTION_LIST_ULPS_ONLY const Func functionList[] = { - ENTRY_EXT(acos, 4.0f, 4.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), - ENTRY(acosh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(acospi, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY_EXT(asin, 4.0f, 4.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), - ENTRY(asinh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(asinpi, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY_EXT(atan, 5.0f, 5.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), - ENTRY(atanh, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY(atanpi, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY(atan2, 6.0f, 6.0f, FTZ_OFF, binaryF), - ENTRY(atan2pi, 6.0f, 6.0f, FTZ_OFF, binaryF), - ENTRY(cbrt, 2.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(ceil, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(copysign, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY_EXT(cos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY_EXT(acos, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), + ENTRY(acosh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(acospi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY_EXT(asin, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), + ENTRY(asinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(asinpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY_EXT(atan, 5.0f, 5.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), + ENTRY(atanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(atanpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(atan2, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF), + ENTRY(atan2pi, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF), + ENTRY(cbrt, 2.0f, 4.0f, 2.f, FTZ_OFF, unaryF), + ENTRY(ceil, 0.0f, 0.0f, 0.f, FTZ_OFF, unaryF), + ENTRY(copysign, 0.0f, 0.0f, 0.f, FTZ_OFF, binaryF), + ENTRY_EXT(cos, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 - ENTRY(cosh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY_EXT(cospi, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY(cosh, 4.0f, 4.0f, 2.f, FTZ_OFF, unaryF), + ENTRY_EXT(cospi, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 // ENTRY( erfc, 16.0f, // 16.0f, FTZ_OFF, unaryF), @@ -225,81 +262,84 @@ const Func functionList[] = { // 16.0f, 16.0f, FTZ_OFF, // unaryF), //disabled for 1.0 due to lack // of reference implementation - ENTRY_EXT(exp, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF, + ENTRY_EXT(exp, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF, 4.0f), // relaxed error is actually overwritten in unary.c as it // is 3+floor(fabs(2*x)) - ENTRY_EXT(exp2, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF, + ENTRY_EXT(exp2, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF, 4.0f), // relaxed error is actually overwritten in unary.c as it // is 3+floor(fabs(2*x)) - ENTRY_EXT(exp10, 3.0f, 4.0f, 8192.0f, FTZ_OFF, unaryF, + ENTRY_EXT(exp10, 3.0f, 4.0f, 2.f, 8192.0f, FTZ_OFF, unaryF, 8192.0f), // relaxed error is actually overwritten in unary.c as // it is 3+floor(fabs(2*x)) in derived mode, // in non-derived mode it uses the ulp error for half_exp10. - ENTRY(expm1, 3.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(fabs, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(fdim, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(floor, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(fma, 0.0f, 0.0f, FTZ_OFF, ternaryF), - ENTRY(fmax, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(fmin, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(fmod, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(fract, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), - ENTRY(frexp, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i), - ENTRY(hypot, 4.0f, 4.0f, FTZ_OFF, binaryF), - ENTRY(ilogb, 0.0f, 0.0f, FTZ_OFF, i_unaryF), - ENTRY(isequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isfinite, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isgreaterequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isinf, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isless, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(islessequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(islessgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isnan, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isnormal, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isnotequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isunordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(ldexp, 0.0f, 0.0f, FTZ_OFF, binaryF_i), - ENTRY(lgamma, INFINITY, INFINITY, FTZ_OFF, unaryF), - ENTRY(lgamma_r, INFINITY, INFINITY, FTZ_OFF, unaryF_two_results_i), - ENTRY_EXT(log, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, + ENTRY(expm1, 3.0f, 4.0f, 2.f, FTZ_OFF, unaryF), + ENTRY(fabs, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(fdim, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(floor, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(fma, 0.0f, 0.0f, 0.0f, FTZ_OFF, ternaryF), + ENTRY(fmax, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(fmin, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(fmod, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(fract, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), + ENTRY(frexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i), + ENTRY(hypot, 4.0f, 4.0f, 2.0f, FTZ_OFF, binaryF), + ENTRY(ilogb, 0.0f, 0.0f, 0.0f, FTZ_OFF, i_unaryF), + ENTRY(isequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isfinite, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isgreaterequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isinf, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isless, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(islessequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(islessgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isnan, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isnormal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isnotequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isunordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(ldexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_i), + ENTRY(lgamma, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF), + ENTRY(lgamma_r, INFINITY, INFINITY, INFINITY, FTZ_OFF, + unaryF_two_results_i), + ENTRY_EXT(log, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, 4.76837158203125e-7f), // relaxed ulp 2^-21 - ENTRY_EXT(log2, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, + ENTRY_EXT(log2, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, 4.76837158203125e-7f), // relaxed ulp 2^-21 - ENTRY_EXT(log10, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, + ENTRY_EXT(log10, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, 4.76837158203125e-7f), // relaxed ulp 2^-21 - ENTRY(log1p, 2.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(logb, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, FTZ_OFF, mad_function, + ENTRY(log1p, 2.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(logb, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF, + mad_function, INFINITY), // in fast-relaxed-math mode it has to be either // exactly rounded fma or exactly rounded a*b+c - ENTRY(maxmag, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(minmag, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(modf, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), - ENTRY(nan, 0.0f, 0.0f, FTZ_OFF, unaryF_u), - ENTRY(nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY_EXT(pow, 16.0f, 16.0f, 8192.0f, FTZ_OFF, binaryF, + ENTRY(maxmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(minmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(modf, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), + ENTRY(nan, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_u), + ENTRY(nextafter, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter), + ENTRY_EXT(pow, 16.0f, 16.0f, 4.0f, 8192.0f, FTZ_OFF, binaryF, 8192.0f), // in derived mode the ulp error is calculated as // exp2(y*log2(x)) and in non-derived it is the same as // half_pow - ENTRY(pown, 16.0f, 16.0f, FTZ_OFF, binaryF_i), - ENTRY(powr, 16.0f, 16.0f, FTZ_OFF, binaryF), + ENTRY(pown, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i), + ENTRY(powr, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF), // ENTRY( reciprocal, 1.0f, // 1.0f, FTZ_OFF, unaryF), - ENTRY(remainder, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(remquo, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i), - ENTRY(rint, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(rootn, 16.0f, 16.0f, FTZ_OFF, binaryF_i), - ENTRY(round, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(rsqrt, 2.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(signbit, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY_EXT(sin, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY(remainder, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(remquo, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i), + ENTRY(rint, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(rootn, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i), + ENTRY(round, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(rsqrt, 2.0f, 4.0f, 1.0f, FTZ_OFF, unaryF), + ENTRY(signbit, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY_EXT(sin, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 - ENTRY_EXT(sincos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF_two_results, + ENTRY_EXT(sincos, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, + unaryF_two_results, 0.00048828125f), // relaxed ulp 2^-11 - ENTRY(sinh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY_EXT(sinpi, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY(sinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY_EXT(sinpi, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 { "sqrt", "sqrt", @@ -308,6 +348,7 @@ const Func functionList[] = { { NULL }, 3.0f, 0.0f, + 0.0f, 4.0f, INFINITY, INFINITY, @@ -322,41 +363,42 @@ const Func functionList[] = { 0.0f, 0.0f, 0.0f, + 0.0f, INFINITY, INFINITY, FTZ_OFF, RELAXED_OFF, unaryF }, ENTRY_EXT( - tan, 5.0f, 5.0f, 8192.0f, FTZ_OFF, unaryF, + tan, 5.0f, 5.0f, 2.0f, 8192.0f, FTZ_OFF, unaryF, 8192.0f), // in derived mode it the ulp error is calculated as sin/cos // and in non-derived mode it is the same as half_tan. - ENTRY(tanh, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY(tanpi, 6.0f, 6.0f, FTZ_OFF, unaryF), + ENTRY(tanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(tanpi, 6.0f, 6.0f, 2.0f, FTZ_OFF, unaryF), // ENTRY( tgamma, 16.0f, // 16.0f, FTZ_OFF, unaryF), // // Commented this out until we can be // sure this requirement is realistic - ENTRY(trunc, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(trunc, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), - HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryF), - HALF_ENTRY(exp, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(exp2, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(exp10, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(log, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(log2, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(log10, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(powr, 8192.0f, 8192.0f, FTZ_ON, binaryF), - HALF_ENTRY(recip, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(sin, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryF), + HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryOF), + HALF_ENTRY(exp, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(exp2, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(exp10, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(log, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(log2, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(log10, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(powr, 8192.0f, 8192.0f, FTZ_ON, binaryOF), + HALF_ENTRY(recip, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(sin, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryOF), // basic operations - OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), - OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), + OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), + OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), { "divide", "/", { (void*)reference_divide }, @@ -364,6 +406,7 @@ const Func functionList[] = { { (void*)reference_relaxed_divide }, 2.5f, 0.0f, + 0.0f, 3.0f, 2.5f, INFINITY, @@ -378,15 +421,16 @@ const Func functionList[] = { 0.0f, 0.0f, 0.0f, + 0.0f, 0.f, INFINITY, FTZ_OFF, RELAXED_OFF, binaryOperatorF }, - OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), - OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, FTZ_OFF, + OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), + OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), // A simple copy operation - OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), }; const size_t functionListCount = sizeof(functionList) / sizeof(functionList[0]); diff --git a/test_conformance/math_brute_force/function_list.h b/test_conformance/math_brute_force/function_list.h index 95a2945932..6ea0fa9e2b 100644 --- a/test_conformance/math_brute_force/function_list.h +++ b/test_conformance/math_brute_force/function_list.h @@ -70,6 +70,9 @@ struct vtbl int (*DoubleTestFunc)( const struct Func *, MTdata, bool); // may be NULL if function is single precision only + int (*HalfTestFunc)( + const struct Func *, MTdata, + bool); // may be NULL if function is single precision only }; struct Func @@ -82,6 +85,7 @@ struct Func fptr rfunc; float float_ulps; float double_ulps; + float half_ulps; float float_embedded_ulps; float relaxed_error; float relaxed_embedded_error; diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp new file mode 100644 index 0000000000..baff3ee20d --- /dev/null +++ b/test_conformance/math_brute_force/i_unary_half.cpp @@ -0,0 +1,220 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include +#include +#include + +namespace { + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Int, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +} // anonymous namespace + +int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + Programs programs; + KernelMatrix kernels; + const unsigned thread_id = 0; // Test is currently not multithreaded. + int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_int), + size_t(1ULL << (sizeof(cl_half) * 8))); + size_t bufferSizeIn = bufferElements * sizeof(cl_half); + size_t bufferSizeOut = bufferElements * sizeof(cl_int); + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // This test is not using ThreadPool so we need to disable FTZ here + // for reference computations + FPU_mode_type oldMode; + DisableFTZ(&oldMode); + std::shared_ptr at_scope_exit( + nullptr, [&oldMode](int *) { RestoreFPState(&oldMode); }); + + // Init the kernels + { + BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + std::vector s(bufferElements); + + for (uint64_t i = 0; i < (1ULL << 16); i += step) + { + // Init input array + cl_ushort *p = (cl_ushort *)gIn; + + for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j; + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSizeIn, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, bufferSizeOut); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, bufferSizeOut, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSizeOut, + 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_int); + size_t localCount = (bufferSizeOut + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the correctly rounded reference result + int *r = (int *)gOut_Ref; + for (size_t j = 0; j < bufferElements; j++) + { + s[j] = HTF(p[j]); + r[j] = f->func.i_f(s[j]); + } + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSizeOut, gOut[j], 0, NULL, + NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint32_t *t = (uint32_t *)gOut_Ref; + for (size_t j = 0; j < bufferElements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint32_t *q = (uint32_t *)(gOut[k]); + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + if (ftz && IsHalfSubnormal(p[j])) + { + unsigned int correct0 = f->func.i_f(0.0); + unsigned int correct1 = f->func.i_f(-0.0); + if (q[j] == correct0 || q[j] == correct1) continue; + } + + uint32_t err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%04x): " + "*%d vs. %d\n", + f->name, sizeNames[k], err, s[j], p[j], t[j], + q[j]); + return -1; + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10zd \n", + i, step, bufferSizeOut); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp index 51d5b64b39..9c8a61ed34 100644 --- a/test_conformance/math_brute_force/macro_binary_double.cpp +++ b/test_conformance/math_brute_force/macro_binary_double.cpp @@ -185,8 +185,7 @@ const double specialValues[] = { +0.0, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp index b00a29ff82..8e73acad8c 100644 --- a/test_conformance/math_brute_force/macro_binary_float.cpp +++ b/test_conformance/math_brute_force/macro_binary_float.cpp @@ -176,8 +176,7 @@ const float specialValues[] = { +0.0f, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); cl_int Test(cl_uint job_id, cl_uint thread_id, void *data) { diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp new file mode 100644 index 0000000000..d25342dda5 --- /dev/null +++ b/test_conformance/math_brute_force/macro_binary_half.cpp @@ -0,0 +1,540 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Short, + ParameterType::Half, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +struct ThreadInfo +{ + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper inBuf2; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + MTdataHolder d; + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +}; + +struct TestInfoBase +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + int ftz; // non-zero if running in flush to zero mode +}; + +struct TestInfo : public TestInfoBase +{ + TestInfo(const TestInfoBase &base): TestInfoBase(base) {} + + // Array of thread specific information + std::vector tinfo; + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; +}; + +// A table of more difficult cases to get right +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_short *t, *r; + std::vector s(0), s2(0); + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_short *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + // Init input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + cl_ushort *p2 = (cl_ushort *)gIn2 + thread_id * buffer_elements; + j = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesHalfCount; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)indx) + { // test edge cases + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; j < buffer_elements; j++) + { + p[j] = specialValuesHalf[x]; + p2[j] = specialValuesHalf[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesHalfCount) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = (cl_ushort)genrand_int32(d); + } + + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + r = (cl_short *)gOut_Ref + thread_id * buffer_elements; + t = (cl_short *)r; + s.resize(buffer_elements); + s2.resize(buffer_elements); + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + s2[j] = cl_half_to_float(p2[j]); + r[j] = (short)func.i_ff(s[j], s2[j]); + } + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Wait for the last buffer + out[j] = (cl_short *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + return error; + } + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + cl_short *q = (cl_short *)out[0]; + + // If we aren't getting the correctly rounded result + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) + { + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + if (IsHalfSubnormal(p2[j])) + { + short correct = (short)func.i_ff(0.0f, 0.0f); + short correct2 = (short)func.i_ff(0.0f, -0.0f); + short correct3 = (short)func.i_ff(-0.0f, 0.0f); + short correct4 = (short)func.i_ff(-0.0f, -0.0f); + + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) + continue; + } + else + { + short correct = (short)func.i_ff(0.0f, s2[j]); + short correct2 = (short)func.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + else if (IsHalfSubnormal(p2[j])) + { + short correct = (short)func.i_ff(s[j], 0.0f); + short correct2 = (short)func.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + short err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error( + "\nERROR: %s: %d ulp error at {%a (0x%04x), %a " + "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x (index: %d)\n", + name, err, s[j], p[j], s2[j], p2[j], t[j], q[j], j); + error = -1; + return error; + } + + + for (k = std::max(1U, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; + k++) + { + q = out[k]; + // If we aren't getting the correctly rounded result + if (-t[j] != q[j]) + { + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + if (IsHalfSubnormal(p2[j])) + { + short correct = (short)-func.i_ff(0.0f, 0.0f); + short correct2 = (short)-func.i_ff(0.0f, -0.0f); + short correct3 = (short)-func.i_ff(-0.0f, 0.0f); + short correct4 = (short)-func.i_ff(-0.0f, -0.0f); + + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) + continue; + } + else + { + short correct = (short)-func.i_ff(0.0f, s2[j]); + short correct2 = (short)-func.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + else if (IsHalfSubnormal(p2[j])) + { + short correct = (short)-func.i_ff(s[j], 0.0f); + short correct2 = (short)-func.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + cl_ushort err = -t[j] - q[j]; + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error("\nERROR: %s: %d ulp error at {%a (0x%04x), %a " + "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x " + "(index: %d)\n", + name, err, s[j], p[j], s2[j], p2[j], -t[j], q[j], j); + error = -1; + return error; + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return error; +} + +} // anonymous namespace + +int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp new file mode 100644 index 0000000000..a755ddb15a --- /dev/null +++ b/test_conformance/math_brute_force/macro_unary_half.cpp @@ -0,0 +1,427 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Short, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +struct ThreadInfo +{ + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +}; + +struct TestInfoBase +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + int ftz; // non-zero if running in flush to zero mode +}; + +struct TestInfo : public TestInfoBase +{ + TestInfo(const TestInfoBase &base): TestInfoBase(base) {} + + // Array of thread specific information + std::vector tinfo; + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; +}; + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint scale = job->scale; + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + fptr func = job->f->func; + int ftz = job->ftz; + cl_uint j, k; + cl_int error = CL_SUCCESS; + const char *name = job->f->name; + std::vector s(0); + + int signbit_test = 0; + if (!strcmp(name, "signbit")) signbit_test = 1; + +#define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s)) + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_short *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + // Write the new values to the input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale; + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + cl_short *r = (cl_short *)gOut_Ref + thread_id * buffer_elements; + cl_short *t = (cl_short *)r; + s.resize(buffer_elements); + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + if (!strcmp(name, "isnormal")) + { + if ((IsHalfSubnormal(p[j]) == 0) && !((p[j] & 0x7fffU) >= 0x7c00U) + && ((p[j] & 0x7fffU) != 0x0000U)) + r[j] = 1; + else + r[j] = 0; + } + else + r[j] = (short)ref_func(s[j]); + } + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + // Wait for the last buffer + out[j] = (cl_short *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + return error; + } + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + cl_short *q = out[0]; + + // If we aren't getting the correctly rounded result + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) + { + // If we aren't getting the correctly rounded result + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + short correct = (short)ref_func(+0.0f); + short correct2 = (short)ref_func(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + short err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s: %d ulp error at %a (0x%04x)\nExpected: " + "%d vs. %d\n", + name, err, s[j], p[j], t[j], q[j]); + error = -1; + return error; + } + + + for (k = std::max(1U, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; + k++) + { + q = out[k]; + // If we aren't getting the correctly rounded result + if (-t[j] != q[j]) + { + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + short correct = (short)-ref_func(+0.0f); + short correct2 = (short)-ref_func(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + short err = -t[j] - q[j]; + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error("\nERROR: %s%s: %d ulp error at %a " + "(0x%04x)\nExpected: %d \nActual: %d\n", + name, sizeNames[k], err, s[j], p[j], -t[j], q[j]); + error = -1; + return error; + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + return error; +} + +} // anonymous namespace + +int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = + std::max((cl_uint)1, + (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); + } + + test_info.f = f; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp new file mode 100644 index 0000000000..d8aefde386 --- /dev/null +++ b/test_conformance/math_brute_force/mad_half.cpp @@ -0,0 +1,201 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetTernaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +} // anonymous namespace + +int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + Programs programs; + KernelMatrix kernels; + const unsigned thread_id = 0; // Test is currently not multithreaded. + float maxError = 0.0f; + + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + float maxErrorVal3 = 0.0f; + size_t bufferSize = BUFFER_SIZE; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + uint64_t step = getTestStep(sizeof(cl_half), bufferSize); + + // Init the kernels + { + BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + for (uint64_t i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_ushort *p = (cl_ushort *)gIn; + cl_ushort *p2 = (cl_ushort *)gIn2; + cl_ushort *p3 = (cl_ushort *)gIn3; + for (size_t j = 0; j < bufferSize / sizeof(cl_ushort); j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = (cl_ushort)genrand_int32(d); + p3[j] = (cl_ushort)genrand_int32(d); + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); + return error; + } + + // write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_half) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 3, + sizeof(gInBuffer3), &gInBuffer3))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data - no verification possible. MAD is a random number + // generator. + + if (0 == (i & 0x0fffffff)) + { + vlog("."); + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("pass"); + + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + } + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp index 74dd5c47d7..8d8acb1b19 100644 --- a/test_conformance/math_brute_force/main.cpp +++ b/test_conformance/math_brute_force/main.cpp @@ -49,6 +49,8 @@ #include "harness/testHarness.h" #define kPageSize 4096 +#define HALF_REQUIRED_FEATURES_1 (CL_FP_ROUND_TO_ZERO) +#define HALF_REQUIRED_FEATURES_2 (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN) #define DOUBLE_REQUIRED_FEATURES \ (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO \ | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM) @@ -81,6 +83,8 @@ static int gTestFastRelaxed = 1; */ int gFastRelaxedDerived = 1; static int gToggleCorrectlyRoundedDivideSqrt = 0; +int gHasHalf = 0; +cl_device_fp_config gHalfCapabilities = 0; int gDeviceILogb0 = 1; int gDeviceILogbNaN = 1; int gCheckTininessBeforeRounding = 1; @@ -104,6 +108,8 @@ cl_device_fp_config gFloatCapabilities = 0; int gWimpyReductionFactor = 32; int gVerboseBruteForce = 0; +cl_half_rounding_mode gHalfRoundingMode = CL_HALF_RTE; + static int ParseArgs(int argc, const char **argv); static void PrintUsage(void); static void PrintFunctions(void); @@ -167,7 +173,6 @@ static int doTest(const char *name) return 0; } } - { if (0 == strcmp("ilogb", func_data->name)) { @@ -236,6 +241,23 @@ static int doTest(const char *name) } } } + + if (gHasHalf && NULL != func_data->vtbl_ptr->HalfTestFunc) + { + gTestCount++; + vlog("%3d: ", gTestCount); + if (func_data->vtbl_ptr->HalfTestFunc(func_data, gMTdata, + false /* relaxed mode*/)) + { + gFailCount++; + error++; + if (gStopOnError) + { + gSkipRestOfTests = true; + return error; + } + } + } } return error; @@ -408,6 +430,8 @@ static int ParseArgs(int argc, const char **argv) case 'm': singleThreaded ^= 1; break; + case 'g': gHasHalf ^= 1; break; + case 'r': gTestFastRelaxed ^= 1; break; case 's': gStopOnError ^= 1; break; @@ -540,6 +564,8 @@ static void PrintUsage(void) vlog("\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 " "on)\n"); vlog("\t\t-f\tToggle float precision testing. (Default: on)\n"); + vlog("\t\t-g\tToggle half precision testing. (Default: on if khr_fp_16 " + "on)\n"); vlog("\t\t-r\tToggle fast relaxed math precision testing. (Default: on)\n"); vlog("\t\t-e\tToggle test as derived implementations for fast relaxed math " "precision. (Default: on)\n"); @@ -640,6 +666,54 @@ test_status InitCL(cl_device_id device) #endif } + gFloatToHalfRoundingMode = kRoundToNearestEven; + if (is_extension_available(gDevice, "cl_khr_fp16")) + { + gHasHalf ^= 1; +#if defined(CL_DEVICE_HALF_FP_CONFIG) + if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_HALF_FP_CONFIG, + sizeof(gHalfCapabilities), + &gHalfCapabilities, NULL))) + { + vlog_error( + "ERROR: Unable to get device CL_DEVICE_HALF_FP_CONFIG. (%d)\n", + error); + return TEST_FAIL; + } + if (HALF_REQUIRED_FEATURES_1 + != (gHalfCapabilities & HALF_REQUIRED_FEATURES_1) + && HALF_REQUIRED_FEATURES_2 + != (gHalfCapabilities & HALF_REQUIRED_FEATURES_2)) + { + char list[300] = ""; + if (0 == (gHalfCapabilities & CL_FP_ROUND_TO_NEAREST)) + strncat(list, "CL_FP_ROUND_TO_NEAREST, ", sizeof(list) - 1); + if (0 == (gHalfCapabilities & CL_FP_ROUND_TO_ZERO)) + strncat(list, "CL_FP_ROUND_TO_ZERO, ", sizeof(list) - 1); + if (0 == (gHalfCapabilities & CL_FP_INF_NAN)) + strncat(list, "CL_FP_INF_NAN, ", sizeof(list) - 1); + vlog_error("ERROR: required half features are missing: %s\n", list); + + return TEST_FAIL; + } + + if ((gHalfCapabilities & CL_FP_ROUND_TO_NEAREST) != 0) + { + gHalfRoundingMode = CL_HALF_RTE; + } + else // due to above condition it must be RTZ + { + gHalfRoundingMode = CL_HALF_RTZ; + } + +#else + vlog_error("FAIL: device says it supports cl_khr_fp16 but " + "CL_DEVICE_HALF_FP_CONFIG is not in the headers!\n"); + return TEST_FAIL; +#endif + } + + uint32_t deviceFrequency = 0; size_t configSize = sizeof(deviceFrequency); if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, @@ -828,6 +902,7 @@ test_status InitCL(cl_device_id device) "Bruteforce_Ulp_Error_Double() for more details.\n\n"); } + vlog("\tTesting half precision? %s\n", no_yes[0 != gHasHalf]); vlog("\tIs Embedded? %s\n", no_yes[0 != gIsEmbedded]); if (gIsEmbedded) vlog("\tRunning in RTZ mode? %s\n", no_yes[0 != gIsInRTZMode]); diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp index afa072f8e0..c31221e3ab 100644 --- a/test_conformance/math_brute_force/reference_math.cpp +++ b/test_conformance/math_brute_force/reference_math.cpp @@ -4699,6 +4699,49 @@ double reference_nextafter(double xx, double yy) return a.f; } +cl_half reference_nanh(cl_ushort x) +{ + cl_ushort u; + cl_half h; + u = x | 0x7e00U; + memcpy(&h, &u, sizeof(cl_half)); + return h; +} + +float reference_nextafterh(float xx, float yy, bool allow_denorms) +{ + cl_half tmp_a = cl_half_from_float(xx, CL_HALF_RTE); + cl_half tmp_b = cl_half_from_float(yy, CL_HALF_RTE); + float x = cl_half_to_float(tmp_a); + float y = cl_half_to_float(tmp_b); + + // take care of nans + if (x != x) return x; + + if (y != y) return y; + + if (x == y) return y; + + short a_h = cl_half_from_float(x, CL_HALF_RTE); + short b_h = cl_half_from_float(y, CL_HALF_RTE); + short oa_h = a_h; + + if (a_h & 0x8000) a_h = 0x8000 - a_h; + if (b_h & 0x8000) b_h = 0x8000 - b_h; + + a_h += (a_h < b_h) ? 1 : -1; + a_h = (a_h < 0) ? (cl_short)0x8000 - a_h : a_h; + + if (!allow_denorms && IsHalfSubnormal(a_h)) + { + if (cl_half_to_float(0x7fff & oa_h) < cl_half_to_float(0x7fff & a_h)) + a_h = (a_h & 0x8000) ? 0x8400 : 0x0400; + else + a_h = 0; + } + + return cl_half_to_float(a_h); +} long double reference_nextafterl(long double xx, long double yy) { diff --git a/test_conformance/math_brute_force/reference_math.h b/test_conformance/math_brute_force/reference_math.h index 78b245105e..175ee73120 100644 --- a/test_conformance/math_brute_force/reference_math.h +++ b/test_conformance/math_brute_force/reference_math.h @@ -18,8 +18,10 @@ #if defined(__APPLE__) #include + #else #include +#include "CL/cl_half.h" #endif // -- for testing float -- @@ -160,6 +162,8 @@ long double reference_fractl(long double, long double*); long double reference_fmal(long double, long double, long double); long double reference_madl(long double, long double, long double); long double reference_nextafterl(long double, long double); +float reference_nextafterh(float, float, bool allow_denormals = true); +cl_half reference_nanh(cl_ushort); long double reference_recipl(long double); long double reference_rootnl(long double, int); long double reference_rsqrtl(long double); diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp index 2ae65424f8..7de115b294 100644 --- a/test_conformance/math_brute_force/ternary_double.cpp +++ b/test_conformance/math_brute_force/ternary_double.cpp @@ -108,8 +108,7 @@ const double specialValues[] = { +0.0, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); } // anonymous namespace diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp index d11f4ba3b9..c597d240bb 100644 --- a/test_conformance/math_brute_force/ternary_float.cpp +++ b/test_conformance/math_brute_force/ternary_float.cpp @@ -118,8 +118,7 @@ const float specialValues[] = { +0.0f, }; -constexpr size_t specialValuesCount = - sizeof(specialValues) / sizeof(specialValues[0]); +constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues); } // anonymous namespace diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp new file mode 100644 index 0000000000..ba6dd4d480 --- /dev/null +++ b/test_conformance/math_brute_force/ternary_half.cpp @@ -0,0 +1,777 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include + +#define CORRECTLY_ROUNDED 0 +#define FLUSHED 1 + +namespace { + +cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetTernaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// A table of more difficult cases to get right +static const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +} // anonymous namespace + +int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + + Programs programs; + const unsigned thread_id = 0; // Test is currently not multithreaded. + KernelMatrix kernels; + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + float maxErrorVal3 = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + + constexpr size_t bufferElements = BUFFER_SIZE / sizeof(cl_half); + + cl_uchar overflow[bufferElements]; + float half_ulps = f->half_ulps; + int skipNanInf = (0 == strcmp("fma", f->nameInCode)); + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init the kernels + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + + for (uint64_t i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_half *hp0 = (cl_half *)gIn; + cl_half *hp1 = (cl_half *)gIn2; + cl_half *hp2 = (cl_half *)gIn3; + size_t idx = 0; + + if (i == 0) + { // test edge cases + uint32_t x, y, z; + x = y = z = 0; + for (; idx < bufferElements; idx++) + { + hp0[idx] = specialValuesHalf[x]; + hp1[idx] = specialValuesHalf[y]; + hp2[idx] = specialValuesHalf[z]; + + if (++x >= specialValuesHalfCount) + { + x = 0; + if (++y >= specialValuesHalfCount) + { + y = 0; + if (++z >= specialValuesHalfCount) break; + } + } + } + if (idx == bufferElements) + vlog_error("Test Error: not all special cases tested!\n"); + } + + auto any_value = [&d]() { + float t = (float)((double)genrand_int32(d) / (double)0xFFFFFFFF); + return HFF((1.0f - t) * CL_HALF_MIN + t * CL_HALF_MAX); + }; + + for (; idx < bufferElements; idx++) + { + hp0[idx] = any_value(); + hp1[idx] = any_value(); + hp2[idx] = any_value(); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + BUFFER_SIZE, gIn3, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); + return error; + } + + // Write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_half) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 3, + sizeof(gInBuffer3), &gInBuffer3))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) + { + vlog("clFlush failed\n"); + return error; + } + + // Calculate the correctly rounded reference result + cl_half *res = (cl_half *)gOut_Ref; + if (skipNanInf) + { + for (size_t j = 0; j < bufferElements; j++) + { + feclearexcept(FE_OVERFLOW); + res[j] = HFF((float)f->func.f_fma( + HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED)); + overflow[j] = + FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); + } + } + else + { + for (size_t j = 0; j < bufferElements; j++) + res[j] = HFF((float)f->func.f_fma( + HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED)); + } + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + BUFFER_SIZE, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint16_t *t = (uint16_t *)gOut_Ref; + for (size_t j = 0; j < bufferElements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint16_t *q = (uint16_t *)(gOut[k]); + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + int fail; + cl_half test = ((cl_half *)q)[j]; + float ref1 = f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), + HTF(hp2[j]), CORRECTLY_ROUNDED); + cl_half correct = HFF(ref1); + + // Per section 10 paragraph 6, accept any result if an input + // or output is a infinity or NaN or overflow + if (skipNanInf) + { + if (overflow[j] || IsHalfInfinity(correct) + || IsHalfNaN(correct) || IsHalfInfinity(hp0[j]) + || IsHalfNaN(hp0[j]) || IsHalfInfinity(hp1[j]) + || IsHalfNaN(hp1[j]) || IsHalfInfinity(hp2[j]) + || IsHalfNaN(hp2[j])) + continue; + } + + float err = + test != correct ? Ulp_Error_Half(test, ref1) : 0.f; + fail = !(fabsf(err) <= half_ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 with flushing on + if (0.0f == test + && 0.0f + == f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), + HTF(hp2[j]), FLUSHED)) + { + fail = 0; + err = 0.0f; + } + + // retry per section 6.5.3.3 + if (fail && IsHalfSubnormal(hp0[j])) + { // look at me, + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + float ref2 = + f->func.f_fma(0.0f, HTF(hp1[j]), HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct2 = HFF(ref2); + float ref3 = + f->func.f_fma(-0.0f, HTF(hp1[j]), HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct3 = HFF(ref3); + + if (skipNanInf) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3)) + continue; + } + + float err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + float err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, HTF(hp1[j]), + HTF(hp2[j]), FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, HTF(hp1[j]), + HTF(hp2[j]), FLUSHED))) + { + fail = 0; + err = 0.0f; + } + + // try with first two args as zero + if (IsHalfSubnormal(hp1[j])) + { // its fun to have fun, + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + ref2 = f->func.f_fma(0.0f, 0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + correct2 = HFF(ref2); + ref3 = f->func.f_fma(-0.0f, 0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + correct3 = HFF(ref3); + float ref4 = + f->func.f_fma(0.0f, -0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct4 = HFF(ref4); + float ref5 = + f->func.f_fma(-0.0f, -0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct5 = HFF(ref5); + + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3) + || IsHalfInfinity(correct4) + || IsHalfNaN(correct4) + || IsHalfInfinity(correct5) + || IsHalfNaN(correct5)) + continue; + } + + err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + float err4 = test != correct4 + ? Ulp_Error_Half(test, ref4) + : 0.f; + float err5 = test != correct5 + ? Ulp_Error_Half(test, ref5) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps)) + && (!(fabsf(err4) <= half_ulps)) + && (!(fabsf(err5) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, 0.0f, + HTF(hp2[j]), + FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, 0.0f, + HTF(hp2[j]), + FLUSHED) + || 0.0f + == f->func.f_fma(0.0f, -0.0f, + HTF(hp2[j]), + FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, -0.0f, + HTF(hp2[j]), + FLUSHED))) + { + fail = 0; + err = 0.0f; + } + + if (IsHalfSubnormal(hp2[j])) + { + if (test == 0.0f) // 0*0+0 is 0 + { + fail = 0; + err = 0.0f; + } + } + } + else if (IsHalfSubnormal(hp2[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + ref2 = f->func.f_fma(0.0f, HTF(hp1[j]), 0.0f, + CORRECTLY_ROUNDED); + correct2 = HFF(ref2); + ref3 = f->func.f_fma(-0.0f, HTF(hp1[j]), 0.0f, + CORRECTLY_ROUNDED); + correct3 = HFF(ref3); + float ref4 = + f->func.f_fma(0.0f, HTF(hp1[j]), -0.0f, + CORRECTLY_ROUNDED); + cl_half correct4 = HFF(ref4); + float ref5 = + f->func.f_fma(-0.0f, HTF(hp1[j]), -0.0f, + CORRECTLY_ROUNDED); + cl_half correct5 = HFF(ref5); + + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3) + || IsHalfInfinity(correct4) + || IsHalfNaN(correct4) + || IsHalfInfinity(correct5) + || IsHalfNaN(correct5)) + continue; + } + + err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + float err4 = test != correct4 + ? Ulp_Error_Half(test, ref4) + : 0.f; + float err5 = test != correct5 + ? Ulp_Error_Half(test, ref5) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps)) + && (!(fabsf(err4) <= half_ulps)) + && (!(fabsf(err5) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, HTF(hp1[j]), + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, HTF(hp1[j]), + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(0.0f, HTF(hp1[j]), + -0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, HTF(hp1[j]), + -0.0f, FLUSHED))) + { + fail = 0; + err = 0.0f; + } + } + } + else if (fail && IsHalfSubnormal(hp1[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + float ref2 = + f->func.f_fma(HTF(hp0[j]), 0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct2 = HFF(ref2); + float ref3 = + f->func.f_fma(HTF(hp0[j]), -0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct3 = HFF(ref3); + + if (skipNanInf) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3)) + continue; + } + + float err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + float err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(HTF(hp0[j]), 0.0f, + HTF(hp2[j]), FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), -0.0f, + HTF(hp2[j]), FLUSHED))) + { + fail = 0; + err = 0.0f; + } + + // try with second two args as zero + if (IsHalfSubnormal(hp2[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + ref2 = f->func.f_fma(HTF(hp0[j]), 0.0f, 0.0f, + CORRECTLY_ROUNDED); + correct2 = HFF(ref2); + ref3 = f->func.f_fma(HTF(hp0[j]), -0.0f, 0.0f, + CORRECTLY_ROUNDED); + correct3 = HFF(ref3); + float ref4 = + f->func.f_fma(HTF(hp0[j]), 0.0f, -0.0f, + CORRECTLY_ROUNDED); + cl_half correct4 = HFF(ref4); + float ref5 = + f->func.f_fma(HTF(hp0[j]), -0.0f, -0.0f, + CORRECTLY_ROUNDED); + cl_half correct5 = HFF(ref5); + + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3) + || IsHalfInfinity(correct4) + || IsHalfNaN(correct4) + || IsHalfInfinity(correct5) + || IsHalfNaN(correct5)) + continue; + } + + err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + float err4 = test != correct4 + ? Ulp_Error_Half(test, ref4) + : 0.f; + float err5 = test != correct5 + ? Ulp_Error_Half(test, ref5) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps)) + && (!(fabsf(err4) <= half_ulps)) + && (!(fabsf(err5) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(HTF(hp0[j]), 0.0f, + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), -0.0f, + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), 0.0f, + -0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), -0.0f, + -0.0f, FLUSHED))) + { + fail = 0; + err = 0.0f; + } + } + } + else if (fail && IsHalfSubnormal(hp2[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + float ref2 = f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), + 0.0f, CORRECTLY_ROUNDED); + cl_half correct2 = HFF(ref2); + float ref3 = + f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), -0.0f, + CORRECTLY_ROUNDED); + cl_half correct3 = HFF(ref3); + + if (skipNanInf) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3)) + continue; + } + + float err2 = test != correct2 + ? Ulp_Error_Half(test, correct2) + : 0.f; + float err3 = test != correct3 + ? Ulp_Error_Half(test, correct3) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(HTF(hp0[j]), + HTF(hp1[j]), 0.0f, + FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), + HTF(hp1[j]), -0.0f, + FLUSHED))) + { + fail = 0; + err = 0.0f; + } + } + } + + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = HTF(hp0[j]); + maxErrorVal2 = HTF(hp1[j]); + maxErrorVal3 = HTF(hp2[j]); + } + + if (fail) + { + vlog_error( + "\nERROR: %s%s: %f ulp error at {%a, %a, %a} " + "({0x%4.4x, 0x%4.4x, 0x%4.4x}): *%a vs. %a\n", + f->name, sizeNames[k], err, HTF(hp0[j]), + HTF(hp1[j]), HTF(hp2[j]), hp0[j], hp1[j], hp2[j], + HTF(res[j]), HTF(test)); + return -1; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 " bufferSize:%10d \n", + i, step, BUFFER_SIZE); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + } + + vlog("\n"); + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h index 78aef9c9a6..16b361d53a 100644 --- a/test_conformance/math_brute_force/test_functions.h +++ b/test_conformance/math_brute_force/test_functions.h @@ -24,6 +24,9 @@ int TestFunc_Float_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double) int TestFunc_Double_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half) +int TestFunc_Half_Half(const Func *f, MTdata, bool relaxedMode); + // int foo(float) int TestFunc_Int_Float(const Func *f, MTdata, bool relaxedMode); @@ -36,6 +39,9 @@ int TestFunc_Float_UInt(const Func *f, MTdata, bool relaxedMode); // double foo(ulong) int TestFunc_Double_ULong(const Func *f, MTdata, bool relaxedMode); +// half (Ushort) +int TestFunc_Half_UShort(const Func *f, MTdata, bool relaxedMode); + // Returns {0, 1} for scalar and {0, -1} for vector. // int foo(float) int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode); @@ -44,21 +50,34 @@ int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode); // int foo(double) int TestMacro_Int_Double(const Func *f, MTdata, bool relaxedMode); +// int foo(half,half) +int TestMacro_Int_Half_Half(const Func *f, MTdata, bool relaxedMode); + +// int foo(half) +int TestMacro_Int_Half(const Func *f, MTdata, bool relaxedMode); + +// int foo(half) +int TestFunc_Int_Half(const Func *f, MTdata, bool relaxedMode); + // float foo(float, float) int TestFunc_Float_Float_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, double) int TestFunc_Double_Double_Double(const Func *f, MTdata, bool relaxedMode); +// Half foo(half, half) +int TestFunc_Half_Half_Half(const Func *f, MTdata, bool relaxedMode); // Special handling for nextafter. -// float foo(float, float) -int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata, - bool relaxedMode); +// Half foo(Half, Half) +int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata, bool relaxedMode); + +// Half foo(Half, Half) +int TestFunc_Half_Half_Half_common(const Func *f, MTdata, int isNextafter, + bool relaxedMode); + +// Half foo(Half, int) +int TestFunc_Half_Half_Int(const Func *f, MTdata, bool relaxedMode); -// Special handling for nextafter. -// double foo(double, double) -int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata, - bool relaxedMode); // float op float int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata, @@ -68,6 +87,9 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata, int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata, bool relaxedMode); +// half op half +int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata, bool relaxedMode); + // float foo(float, int) int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode); @@ -89,24 +111,36 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata, bool relaxedMode); int TestFunc_Double_Double_Double_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, half, half) +int TestFunc_Half_Half_Half_Half(const Func *f, MTdata, bool relaxedMode); + // float foo(float, float*) int TestFunc_Float2_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, double*) int TestFunc_Double2_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, half*) +int TestFunc_Half2_Half(const Func *f, MTdata, bool relaxedMode); + // float foo(float, int*) int TestFunc_FloatI_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, int*) int TestFunc_DoubleI_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, int*) +int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode); + // float foo(float, float, int*) int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, double, int*) int TestFunc_DoubleI_Double_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, half, int*) +int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode); + // Special handling for mad. // float mad(float, float, float) int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode); @@ -115,4 +149,7 @@ int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode); // double mad(double, double, double) int TestFunc_mad_Double(const Func *f, MTdata, bool relaxedMode); +// half mad(half, half, half) +int TestFunc_mad_Half(const Func *f, MTdata, bool relaxedMode); + #endif diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp new file mode 100644 index 0000000000..9b230f96bc --- /dev/null +++ b/test_conformance/math_brute_force/unary_half.cpp @@ -0,0 +1,483 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double maxErrorValue; // position of the max error value. Init to 0. + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +} ThreadInfo; + +struct TestInfoBase +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + int isRangeLimited; // 1 if the function is only to be evaluated over a + // range + float half_sin_cos_tan_limit; +}; + +struct TestInfo : public TestInfoBase +{ + TestInfo(const TestInfoBase &base): TestInfoBase(base) {} + + // Array of thread specific information + std::vector tinfo; + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; +}; + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint scale = job->scale; + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + float ulps = job->ulps; + fptr func = job->f->func; + cl_uint j, k; + cl_int error = CL_SUCCESS; + + int isRangeLimited = job->isRangeLimited; + float half_sin_cos_tan_limit = job->half_sin_cos_tan_limit; + int ftz = job->ftz; + + std::vector s(0); + + cl_event e[VECTOR_SIZE_COUNT]; + cl_ushort *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (uint16_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + // Write the new values to the input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + { + p[j] = base + j * scale; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + s.resize(buffer_elements); + for (j = 0; j < buffer_elements; j++) + { + s[j] = (float)cl_half_to_float(p[j]); + r[j] = HFF(func.f_f(s[j])); + } + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (uint16_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + // Wait for the last buffer + out[j] = (uint16_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + return error; + } + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = out[k]; + + // If we aren't getting the correctly rounded result + if (r[j] != q[j]) + { + float test = cl_half_to_float(q[j]); + double correct = func.f_f(s[j]); + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= ulps); + + // half_sin/cos/tan are only valid between +-2**16, Inf, NaN + if (isRangeLimited + && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) + && fabsf(s[j]) < INFINITY) + { + if (fabsf(test) <= half_sin_cos_tan_limit) + { + err = 0; + fail = 0; + } + } + + if (fail) + { + if (ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + double correct2 = func.f_f(0.0); + double correct3 = func.f_f(-0.0); + float err2 = Ulp_Error_Half(q[j], correct2); + float err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + } + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at %a " + "(half 0x%04x)\nExpected: %a (half 0x%04x) " + "\nActual: %a (half 0x%04x)\n", + job->f->name, sizeNames[k], err, s[j], p[j], + cl_half_to_float(r[j]), r[j], test, q[j]); + error = -1; + return error; + } + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return error; +} + +} // anonymous namespace + +int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = + std::max((cl_uint)1, + (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + } + + // Check for special cases for unary float + test_info.isRangeLimited = 0; + test_info.half_sin_cos_tan_limit = 0; + if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = 1.0f + + test_info.ulps + * (FLT_EPSILON / 2.0f); // out of range results from finite + // inputs must be in [-1,1] + } + else if (0 == strcmp(f->name, "half_tan")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = + INFINITY; // out of range resut from finite inputs must be numeric + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp new file mode 100644 index 0000000000..70a9f4c79e --- /dev/null +++ b/test_conformance/math_brute_force/unary_two_results_half.cpp @@ -0,0 +1,452 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include + +namespace { + +cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +} // anonymous namespace + +int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + Programs programs; + const unsigned thread_id = 0; // Test is currently not multithreaded. + KernelMatrix kernels; + float maxError0 = 0.0f; + float maxError1 = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + float maxErrorVal0 = 0.0f; + float maxErrorVal1 = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_half), + size_t(1ULL << (sizeof(cl_half) * 8))); + size_t bufferSize = bufferElements * sizeof(cl_half); + + std::vector overflow(bufferElements); + int isFract = 0 == strcmp("fract", f->nameInCode); + int skipNanInf = isFract; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + float half_ulps = f->half_ulps; + + // Init the kernels + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + + for (uint64_t i = 0; i < (1ULL << 16); i += step) + { + // Init input array + cl_half *pIn = (cl_half *)gIn; + for (size_t j = 0; j < bufferElements; j++) pIn[j] = (cl_ushort)i + j; + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // Write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, bufferSize, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + + memset_pattern4(gOut2[j], &pattern, bufferSize); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], + CL_FALSE, 0, bufferSize, + gOut2[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSize, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 1 failed!\n"); + + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSize, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 2 failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_half); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = + clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gOutBuffer2[j]), &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) + { + vlog_error("clFlush failed\n"); + return error; + } + + FPU_mode_type oldMode; + RoundingMode oldRoundMode = kRoundToNearestEven; + if (isFract) + { + // Calculate the correctly rounded reference result + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); + + // Set the rounding mode to match the device + if (gIsInRTZMode) + oldRoundMode = set_round(kRoundTowardZero, kfloat); + } + + // Calculate the correctly rounded reference result + cl_half *ref1 = (cl_half *)gOut_Ref; + cl_half *ref2 = (cl_half *)gOut_Ref2; + + if (skipNanInf) + { + for (size_t j = 0; j < bufferElements; j++) + { + double dd; + feclearexcept(FE_OVERFLOW); + + ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); + ref2[j] = HFF((float)dd); + + // ensure correct rounding of fract result is not reaching 1 + if (isFract && HTF(ref1[j]) >= 1.f) ref1[j] = 0x3bff; + + overflow[j] = + FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); + } + } + else + { + for (size_t j = 0; j < bufferElements; j++) + { + double dd; + ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); + ref2[j] = HFF((float)dd); + } + } + + if (isFract && ftz) RestoreFPState(&oldMode); + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) + { + vlog_error("ReadArray2 failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) + { + if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); + break; + } + + // Verify data + for (size_t j = 0; j < bufferElements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *test1 = (cl_half *)gOut[k]; + cl_half *test2 = (cl_half *)gOut2[k]; + + // If we aren't getting the correctly rounded result + if (ref1[j] != test1[j] || ref2[j] != test2[j]) + { + double fp_correct1 = 0, fp_correct2 = 0; + float err = 0, err2 = 0; + + fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2); + + cl_half correct1 = HFF(fp_correct1); + cl_half correct2 = HFF(fp_correct2); + + // Per section 10 paragraph 6, accept any result if an input + // or output is a infinity or NaN or overflow + if (skipNanInf) + { + if (skipNanInf && overflow[j]) continue; + // Note: no double rounding here. Reference functions + // calculate in single precision. + if (IsHalfInfinity(correct1) || IsHalfNaN(correct1) + || IsHalfInfinity(correct2) || IsHalfNaN(correct2) + || IsHalfInfinity(pIn[j]) || IsHalfNaN(pIn[j])) + continue; + } + + err = Ulp_Error_Half(test1[j], fp_correct1); + err2 = Ulp_Error_Half(test2[j], fp_correct2); + + int fail = + !(fabsf(err) <= half_ulps && fabsf(err2) <= half_ulps); + + if (ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(fp_correct1, half_ulps)) + { + if (IsHalfResultSubnormal(fp_correct2, half_ulps)) + { + fail = fail + && !(HTF(test1[j]) == 0.0f + && HTF(test2[j]) == 0.0f); + if (!fail) + { + err = 0.0f; + err2 = 0.0f; + } + } + else + { + fail = fail + && !(HTF(test1[j]) == 0.0f + && fabsf(err2) <= half_ulps); + if (!fail) err = 0.0f; + } + } + else if (IsHalfResultSubnormal(fp_correct2, half_ulps)) + { + fail = fail + && !(HTF(test2[j]) == 0.0f + && fabsf(err) <= half_ulps); + if (!fail) err2 = 0.0f; + } + + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(pIn[j])) + { + double fp_correctp, fp_correctn; + double fp_correct2p, fp_correct2n; + float errp, err2p, errn, err2n; + + if (skipNanInf) feclearexcept(FE_OVERFLOW); + fp_correctp = f->func.f_fpf(0.0, &fp_correct2p); + fp_correctn = f->func.f_fpf(-0.0, &fp_correct2n); + + cl_half correctp = HFF(fp_correctp); + cl_half correctn = HFF(fp_correctn); + cl_half correct2p = HFF(fp_correct2p); + cl_half correct2n = HFF(fp_correct2n); + + // Per section 10 paragraph 6, accept any result if + // an input or output is a infinity or NaN or + // overflow + if (skipNanInf) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correctp) + || IsHalfNaN(correctp) + || IsHalfInfinity(correctn) + || IsHalfNaN(correctn) + || IsHalfInfinity(correct2p) + || IsHalfNaN(correct2p) + || IsHalfInfinity(correct2n) + || IsHalfNaN(correct2n)) + continue; + } + + errp = Ulp_Error_Half(test1[j], fp_correctp); + err2p = Ulp_Error_Half(test1[j], fp_correct2p); + errn = Ulp_Error_Half(test1[j], fp_correctn); + err2n = Ulp_Error_Half(test1[j], fp_correct2n); + + fail = fail + && ((!(fabsf(errp) <= half_ulps)) + && (!(fabsf(err2p) <= half_ulps)) + && ((!(fabsf(errn) <= half_ulps)) + && (!(fabsf(err2n) <= half_ulps)))); + if (fabsf(errp) < fabsf(err)) err = errp; + if (fabsf(errn) < fabsf(err)) err = errn; + if (fabsf(err2p) < fabsf(err2)) err2 = err2p; + if (fabsf(err2n) < fabsf(err2)) err2 = err2n; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(fp_correctp, half_ulps) + || IsHalfResultSubnormal(fp_correctn, + half_ulps)) + { + if (IsHalfResultSubnormal(fp_correct2p, + half_ulps) + || IsHalfResultSubnormal(fp_correct2n, + half_ulps)) + { + fail = fail + && !(HTF(test1[j]) == 0.0f + && HTF(test2[j]) == 0.0f); + if (!fail) err = err2 = 0.0f; + } + else + { + fail = fail + && !(HTF(test1[j]) == 0.0f + && fabsf(err2) <= half_ulps); + if (!fail) err = 0.0f; + } + } + else if (IsHalfResultSubnormal(fp_correct2p, + half_ulps) + || IsHalfResultSubnormal(fp_correct2n, + half_ulps)) + { + fail = fail + && !(HTF(test2[j]) == 0.0f + && (fabsf(err) <= half_ulps)); + if (!fail) err2 = 0.0f; + } + } + } + if (fabsf(err) > maxError0) + { + maxError0 = fabsf(err); + maxErrorVal0 = HTF(pIn[j]); + } + if (fabsf(err2) > maxError1) + { + maxError1 = fabsf(err2); + maxErrorVal1 = HTF(pIn[j]); + } + if (fail) + { + vlog_error("\nERROR: %s%s: {%f, %f} ulp error at %a: " + "*{%a, %a} vs. {%a, %a}\n", + f->name, sizeNames[k], err, err2, + HTF(pIn[j]), HTF(ref1[j]), HTF(ref2[j]), + HTF(test1[j]), HTF(test2[j])); + return -1; + } + } + } + } + + if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10zu \n", + i, step, bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, + maxErrorVal1); + } + + vlog("\n"); + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp new file mode 100644 index 0000000000..5906c2837a --- /dev/null +++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp @@ -0,0 +1,347 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include +#include + +namespace { + +cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Int, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +cl_ulong abs_cl_long(cl_long i) +{ + cl_long mask = i >> 63; + return (i ^ mask) - mask; +} + +} // anonymous namespace + +int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + Programs programs; + const unsigned thread_id = 0; // Test is currently not multithreaded. + KernelMatrix kernels; + float maxError = 0.0f; + int64_t maxError2 = 0; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + + // sizeof(cl_half) < sizeof (int32_t) + // to prevent overflowing gOut_Ref2 it is necessary to use + // bigger type as denominator for buffer size calculation + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_int), + size_t(1ULL << (sizeof(cl_half) * 8))); + + size_t bufferSizeLo = bufferElements * sizeof(cl_half); + size_t bufferSizeHi = bufferElements * sizeof(cl_int); + + cl_ulong maxiError = 0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + float half_ulps = f->half_ulps; + + maxiError = half_ulps == INFINITY ? CL_ULONG_MAX : 0; + + // Init the kernels + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + + for (uint64_t i = 0; i < (1ULL << 16); i += step) + { + // Init input array + cl_half *pIn = (cl_half *)gIn; + for (size_t j = 0; j < bufferElements; j++) pIn[j] = (cl_ushort)i + j; + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSizeLo, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // Write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, bufferSizeLo); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, bufferSizeLo, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + + memset_pattern4(gOut2[j], &pattern, bufferSizeHi); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], + CL_FALSE, 0, bufferSizeHi, + gOut2[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSizeLo, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 1 failed!\n"); + + error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern, + sizeof(pattern), 0, bufferSizeHi, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 2 failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // align working group size with the bigger output type + size_t vectorSize = sizeValues[j] * sizeof(cl_int); + size_t localCount = (bufferSizeHi + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = + clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gOutBuffer2[j]), &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) + { + vlog_error("clFlush failed\n"); + return error; + } + + // Calculate the correctly rounded reference result + cl_half *ref1 = (cl_half *)gOut_Ref; + int32_t *ref2 = (int32_t *)gOut_Ref2; + for (size_t j = 0; j < bufferElements; j++) + ref1[j] = HFF((float)f->func.f_fpI(HTF(pIn[j]), ref2 + j)); + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = + (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0, + bufferSizeLo, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, + 0, bufferSizeHi, gOut2[j], 0, NULL, + NULL))) + { + vlog_error("ReadArray2 failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + for (size_t j = 0; j < bufferElements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *test1 = (cl_half *)(gOut[k]); + int32_t *test2 = (int32_t *)(gOut2[k]); + + // If we aren't getting the correctly rounded result + if (ref1[j] != test1[j] || ref2[j] != test2[j]) + { + cl_half test = ((cl_half *)test1)[j]; + int correct2 = INT_MIN; + float fp_correct = + (float)f->func.f_fpI(HTF(pIn[j]), &correct2); + cl_half correct = HFF(fp_correct); + float err = correct != test + ? Ulp_Error_Half(test, fp_correct) + : 0.f; + cl_long iErr = (int64_t)test2[j] - (int64_t)correct2; + int fail = !(fabsf(err) <= half_ulps + && abs_cl_long(iErr) <= maxiError); + if (ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(fp_correct, half_ulps)) + { + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(pIn[j])) + { + int correct5, correct6; + double fp_correct3 = f->func.f_fpI(0.0, &correct5); + double fp_correct4 = f->func.f_fpI(-0.0, &correct6); + + float err2 = Ulp_Error_Half(test, fp_correct3); + float err3 = Ulp_Error_Half(test, fp_correct4); + + cl_long iErr2 = + (long long)test2[j] - (long long)correct5; + cl_long iErr3 = + (long long)test2[j] - (long long)correct6; + + // Did +0 work? + if (fabsf(err2) <= half_ulps + && abs_cl_long(iErr2) <= maxiError) + { + err = err2; + iErr = iErr2; + fail = 0; + } + // Did -0 work? + else if (fabsf(err3) <= half_ulps + && abs_cl_long(iErr3) <= maxiError) + { + err = err3; + iErr = iErr3; + fail = 0; + } + + // retry per section 6.5.3.4 + if (fail + && (IsHalfResultSubnormal(correct2, half_ulps) + || IsHalfResultSubnormal(fp_correct3, + half_ulps))) + { + fail = fail + && !(test == 0.0f + && (abs_cl_long(iErr2) <= maxiError + || abs_cl_long(iErr3) + <= maxiError)); + if (!fail) + { + err = 0.0f; + iErr = 0; + } + } + } + } + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = pIn[j]; + } + if (llabs(iErr) > maxError2) + { + maxError2 = llabs(iErr); + maxErrorVal2 = pIn[j]; + } + + if (fail) + { + vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: " + "*{%a, %d} vs. {%a, %d}\n", + f->name, sizeNames[k], err, (int)iErr, + HTF(pIn[j]), HTF(ref1[j]), + ((int *)gOut_Ref2)[j], HTF(test), test2[j]); + return -1; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10zu \n", + i, step, bufferSizeHi); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2, + maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp new file mode 100644 index 0000000000..6f21ef3eee --- /dev/null +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -0,0 +1,239 @@ +// +// Copyright (c) 2023 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" +#include "reference_math.h" + +#include +#include + +namespace { + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::UShort, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +} // anonymous namespace + +int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + Programs programs; + KernelMatrix kernels; + const unsigned thread_id = 0; // Test is currently not multithreaded. + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + float maxErrorVal = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_half), + size_t(1ULL << (sizeof(cl_half) * 8))); + size_t bufferSize = bufferElements * sizeof(cl_half); + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + const char *name = f->name; + float half_ulps = f->half_ulps; + + // Init the kernels + BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + { + return error; + } + + for (uint64_t i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_ushort *p = (cl_ushort *)gIn; + for (size_t j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j; + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, bufferSize, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSize, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_half); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the correctly rounded reference result + cl_half *r = (cl_half *)gOut_Ref; + for (size_t j = 0; j < bufferElements; j++) + { + if (!strcmp(name, "nan")) + r[j] = reference_nanh(p[j]); + else + r[j] = HFF(f->func.f_u(p[j])); + } + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + cl_ushort *t = (cl_ushort *)gOut_Ref; + for (size_t j = 0; j < bufferElements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = (cl_ushort *)(gOut[k]); + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + double test = cl_half_to_float(q[j]); + double correct; + if (!strcmp(name, "nan")) + correct = cl_half_to_float(reference_nanh(p[j])); + else + correct = f->func.f_u(p[j]); + + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= half_ulps); + + if (fail) + { + if (ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, half_ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = p[j]; + } + if (fail) + { + vlog_error( + "\n%s%s: %f ulp error at 0x%04x \nExpected: %a " + "(0x%04x) \nActual: %a (0x%04x)\n", + f->name, sizeNames[k], err, p[j], + cl_half_to_float(r[j]), r[j], test, q[j]); + return -1; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10zd \n", + i, step, bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h index 652d990a21..264fc7a435 100644 --- a/test_conformance/math_brute_force/utility.h +++ b/test_conformance/math_brute_force/utility.h @@ -22,6 +22,7 @@ #include "harness/testHarness.h" #include "harness/ThreadPool.h" #include "harness/conversions.h" +#include "CL/cl_half.h" #define BUFFER_SIZE (1024 * 1024 * 2) #define EMBEDDED_REDUCTION_FACTOR (64) @@ -61,10 +62,20 @@ extern int gFastRelaxedDerived; extern int gWimpyMode; extern int gHostFill; extern int gIsInRTZMode; +extern int gHasHalf; +extern int gInfNanSupport; +extern int gIsEmbedded; extern int gVerboseBruteForce; extern uint32_t gMaxVectorSizeIndex; extern uint32_t gMinVectorSizeIndex; extern cl_device_fp_config gFloatCapabilities; +extern cl_device_fp_config gHalfCapabilities; +extern RoundingMode gFloatToHalfRoundingMode; + +extern cl_half_rounding_mode gHalfRoundingMode; + +#define HFF(num) cl_half_from_float(num, gHalfRoundingMode) +#define HTF(num) cl_half_to_float(num) #define LOWER_IS_BETTER 0 #define HIGHER_IS_BETTER 1 @@ -115,6 +126,12 @@ inline int IsFloatResultSubnormal(double x, float ulps) return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126); } +inline int IsHalfResultSubnormal(float x, float ulps) +{ + x = fabs(x) - MAKE_HEX_FLOAT(0x1.0p-24, 0x1, -24) * ulps; + return x < MAKE_HEX_FLOAT(0x1.0p-14, 0x1, -14); +} + inline int IsFloatResultSubnormalAbsError(double x, float abs_err) { x = x - abs_err; @@ -157,6 +174,26 @@ inline int IsFloatNaN(double x) return ((u.u & 0x7fffffffU) > 0x7F800000U); } +inline bool IsHalfNaN(const cl_half v) +{ + // Extract FP16 exponent and mantissa + uint16_t h_exp = (((cl_half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F; + uint16_t h_mant = ((cl_half)v) & 0x3FF; + + // NaN test + return (h_exp == 0x1F && h_mant != 0); +} + +inline bool IsHalfInfinity(const cl_half v) +{ + // Extract FP16 exponent and mantissa + uint16_t h_exp = (((cl_half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F; + uint16_t h_mant = ((cl_half)v) & 0x3FF; + + // Inf test + return (h_exp == 0x1F && h_mant == 0); +} + cl_uint RoundUpToNextPowerOfTwo(cl_uint x); // Windows (since long double got deprecated) sets the x87 to 53-bit precision