From b6941b6c61137b95bfe652ec1c0294f6f17be361 Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Tue, 18 Jun 2024 18:43:11 +0200 Subject: [PATCH] Add fp16 testing to conversions and bruteforce (#1975) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Merge the `fp16-staging` branch into `main`, adding fp16 (`half`) testing to the conversions and math bruteforce tests. --------- Signed-off-by: Ahmed Hesham Signed-off-by: Sven van Haastregt Signed-off-by: Guo, Yilong Signed-off-by: John Kesapides Co-authored-by: Marcin Hajder Co-authored-by: Ewan Crawford Co-authored-by: Wawiorko, Grzegorz Co-authored-by: Sreelakshmi Haridas Maruthur Co-authored-by: Harald van Dijk Co-authored-by: Ben Ashbaugh Co-authored-by: Haonan Yang Co-authored-by: Ahmed Hesham <117350656+ahesham-arm@users.noreply.github.com> Co-authored-by: niranjanjoshi121 <43807392+niranjanjoshi121@users.noreply.github.com> Co-authored-by: Wenwan Xing Co-authored-by: Yilong Guo Co-authored-by: Romaric Jodin <89833130+rjodinchr@users.noreply.github.com> Co-authored-by: joshqti <127994991+joshqti@users.noreply.github.com> Co-authored-by: Pekka Jääskeläinen Co-authored-by: imilenkovic00 <155085410+imilenkovic00@users.noreply.github.com> Co-authored-by: John Kesapides <46718829+JohnKesapidesARM@users.noreply.github.com> Co-authored-by: Aharon Abramson --- test_common/harness/errorHelpers.cpp | 2 +- test_common/harness/errorHelpers.h | 2 +- test_common/harness/rounding_mode.h | 11 +- .../conversions/basic_test_conversions.cpp | 403 ++++++--- .../conversions/basic_test_conversions.h | 48 +- .../conversions/conversions_data_info.h | 267 ++++-- .../conversions/test_conversions.cpp | 71 +- .../math_brute_force/CMakeLists.txt | 13 + .../math_brute_force/binary_half.cpp | 784 ++++++++++++++++++ .../math_brute_force/binary_i_half.cpp | 548 ++++++++++++ .../math_brute_force/binary_operator_half.cpp | 661 +++++++++++++++ .../binary_two_results_i_half.cpp | 477 +++++++++++ test_conformance/math_brute_force/common.cpp | 15 +- test_conformance/math_brute_force/common.h | 5 +- .../math_brute_force/function_list.cpp | 278 ++++--- .../math_brute_force/function_list.h | 6 +- .../math_brute_force/i_unary_half.cpp | 220 +++++ .../math_brute_force/macro_binary_half.cpp | 540 ++++++++++++ .../math_brute_force/macro_unary_half.cpp | 427 ++++++++++ .../math_brute_force/mad_half.cpp | 201 +++++ test_conformance/math_brute_force/main.cpp | 79 +- .../math_brute_force/reference_math.cpp | 45 +- .../math_brute_force/reference_math.h | 5 +- .../math_brute_force/ternary_half.cpp | 777 +++++++++++++++++ .../math_brute_force/test_functions.h | 53 +- .../math_brute_force/unary_half.cpp | 483 +++++++++++ .../unary_two_results_half.cpp | 452 ++++++++++ .../unary_two_results_i_half.cpp | 347 ++++++++ .../math_brute_force/unary_u_half.cpp | 239 ++++++ test_conformance/math_brute_force/utility.h | 40 +- 30 files changed, 7149 insertions(+), 350 deletions(-) create mode 100644 test_conformance/math_brute_force/binary_half.cpp create mode 100644 test_conformance/math_brute_force/binary_i_half.cpp create mode 100644 test_conformance/math_brute_force/binary_operator_half.cpp create mode 100644 test_conformance/math_brute_force/binary_two_results_i_half.cpp create mode 100644 test_conformance/math_brute_force/i_unary_half.cpp create mode 100644 test_conformance/math_brute_force/macro_binary_half.cpp create mode 100644 test_conformance/math_brute_force/macro_unary_half.cpp create mode 100644 test_conformance/math_brute_force/mad_half.cpp create mode 100644 test_conformance/math_brute_force/ternary_half.cpp create mode 100644 test_conformance/math_brute_force/unary_half.cpp create mode 100644 test_conformance/math_brute_force/unary_two_results_half.cpp create mode 100644 test_conformance/math_brute_force/unary_two_results_i_half.cpp create mode 100644 test_conformance/math_brute_force/unary_u_half.cpp diff --git a/test_common/harness/errorHelpers.cpp b/test_common/harness/errorHelpers.cpp index d8f3d5d4b..c2e1e5256 100644 --- a/test_common/harness/errorHelpers.cpp +++ b/test_common/harness/errorHelpers.cpp @@ -371,7 +371,7 @@ static float Ulp_Error_Half_Float(float test, double reference) return (float)scalbn(testVal - reference, ulp_exp); } -float Ulp_Error_Half(cl_half test, float reference) +float Ulp_Error_Half(cl_half test, double reference) { return Ulp_Error_Half_Float(cl_half_to_float(test), reference); } diff --git a/test_common/harness/errorHelpers.h b/test_common/harness/errorHelpers.h index 3f1d8fb14..e6d4620b4 100644 --- a/test_common/harness/errorHelpers.h +++ b/test_common/harness/errorHelpers.h @@ -185,7 +185,7 @@ static int vlog_win32(const char *format, ...); extern const char *IGetErrorString(int clErrorCode); -extern float Ulp_Error_Half(cl_half test, float reference); +extern float Ulp_Error_Half(cl_half test, double reference); extern float Ulp_Error(float test, double reference); extern float Ulp_Error_Double(double test, long double reference); diff --git a/test_common/harness/rounding_mode.h b/test_common/harness/rounding_mode.h index 6f52f0a00..dfd9c3eea 100644 --- a/test_common/harness/rounding_mode.h +++ b/test_common/harness/rounding_mode.h @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2017-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -42,10 +42,11 @@ typedef enum kshort = 3, kuint = 4, kint = 5, - kfloat = 6, - kdouble = 7, - kulong = 8, - klong = 9, + khalf = 6, + kfloat = 7, + kdouble = 8, + kulong = 9, + klong = 10, // This goes last kTypeCount diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp index 1020638af..a44a36366 100644 --- a/test_conformance/conversions/basic_test_conversions.cpp +++ b/test_conformance/conversions/basic_test_conversions.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2017-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -48,6 +48,7 @@ #include #include +#include #include "basic_test_conversions.h" @@ -86,9 +87,13 @@ int gWimpyReductionFactor = 128; int gSkipTesting = 0; int gForceFTZ = 0; int gIsRTZ = 0; +int gForceHalfFTZ = 0; +int gIsHalfRTZ = 0; uint32_t gSimdSize = 1; int gHasDouble = 0; int gTestDouble = 1; +int gHasHalfs = 0; +int gTestHalfs = 1; const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" }; int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 }; int gMinVectorSize = 0; @@ -100,6 +105,8 @@ int argCount = 0; double SubtractTime(uint64_t endTime, uint64_t startTime); +cl_half_rounding_mode DataInitInfo::halfRoundingMode = CL_HALF_RTE; +cl_half_rounding_mode ConversionsTest::defaultHalfRoundingMode = CL_HALF_RTE; // clang-format off // for readability sake keep this section unformatted @@ -256,8 +263,30 @@ std::vector DataInitInfo::specialValuesDouble = { MAKE_HEX_DOUBLE(0x1.fffffffefffffp62, 0x1fffffffefffffLL, 10), MAKE_HEX_DOUBLE(0x1.ffffffffp62, 0x1ffffffffLL, 30), MAKE_HEX_DOUBLE(0x1.ffffffff00001p62, 0x1ffffffff00001LL, 10), }; -// clang-format on +// A table of more difficult cases to get right +std::vector DataInitInfo::specialValuesHalf = { + 0xffff, + 0x0000, + 0x0001, + 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ +}; +// clang-format on // Windows (since long double got deprecated) sets the x87 to 53-bit precision // (that's x87 default state). This causes problems with the tests that @@ -282,15 +311,32 @@ static inline void Force64BitFPUPrecision(void) #endif } - -template -int CalcRefValsPat::check_result(void *test, uint32_t count, - int vectorSize) +template +int CalcRefValsPat::check_result(void *test, + uint32_t count, + int vectorSize) { const cl_uchar *a = (const cl_uchar *)gAllowZ; - if (std::is_integral::value) - { // char/uchar/short/ushort/int/uint/long/ulong + if (is_half()) + { + const cl_half *t = (const cl_half *)test; + const cl_half *c = (const cl_half *)gRef; + + for (uint32_t i = 0; i < count; i++) + if (t[i] != c[i] && + // Allow nan's to be binary different + !((t[i] & 0x7fff) > 0x7C00 && (c[i] & 0x7fff) > 0x7C00) + && !(a[i] != (cl_uchar)0 && t[i] == (c[i] & 0x8000))) + { + vlog( + "\nError for vector size %d found at 0x%8.8x: *%a vs %a\n", + vectorSize, i, HTF(c[i]), HTF(t[i])); + return i + 1; + } + } + else if (std::is_integral::value) + { // char/uchar/short/ushort/half/int/uint/long/ulong const OutType *t = (const OutType *)test; const OutType *c = (const OutType *)gRef; for (uint32_t i = 0; i < count; i++) @@ -388,6 +434,20 @@ cl_int CustomConversionsTest::Run() continue; } + // skip half if we don't have it + if (!gTestHalfs && (inType == khalf || outType == khalf)) + { + if (gHasHalfs) + { + vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n", + gTypeNames[outType], gSaturationNames[sat], + gRoundingModeNames[round], gTypeNames[inType]); + vlog("\t\tcl_khr_fp16 enabled, but half testing turned " + "off.\n"); + } + continue; + } + // skip longs on embedded if (!gHasLong && (inType == klong || outType == klong || inType == kulong @@ -427,8 +487,8 @@ ConversionsTest::ConversionsTest(cl_device_id device, cl_context context, cl_command_queue queue) : context(context), device(device), queue(queue), num_elements(0), typeIterator({ cl_uchar(0), cl_char(0), cl_ushort(0), cl_short(0), - cl_uint(0), cl_int(0), cl_float(0), cl_double(0), - cl_ulong(0), cl_long(0) }) + cl_uint(0), cl_int(0), cl_half(0), cl_float(0), + cl_double(0), cl_ulong(0), cl_long(0) }) {} @@ -445,11 +505,31 @@ cl_int ConversionsTest::Run() cl_int ConversionsTest::SetUp(int elements) { num_elements = elements; + if (is_extension_available(device, "cl_khr_fp16")) + { + const cl_device_fp_config fpConfigHalf = + get_default_rounding_mode(device, CL_DEVICE_HALF_FP_CONFIG); + if ((fpConfigHalf & CL_FP_ROUND_TO_NEAREST) != 0) + { + DataInitInfo::halfRoundingMode = CL_HALF_RTE; + ConversionsTest::defaultHalfRoundingMode = CL_HALF_RTE; + } + else if ((fpConfigHalf & CL_FP_ROUND_TO_ZERO) != 0) + { + DataInitInfo::halfRoundingMode = CL_HALF_RTZ; + ConversionsTest::defaultHalfRoundingMode = CL_HALF_RTZ; + } + else + { + log_error("Error while acquiring half rounding mode"); + return TEST_FAIL; + } + } + return CL_SUCCESS; } - -template +template void ConversionsTest::TestTypesConversion(const Type &inType, const Type &outType, int &testNumber, int startMinVectorSize) @@ -470,7 +550,8 @@ void ConversionsTest::TestTypesConversion(const Type &inType, sat = (SaturationMode)(sat + 1)) { // skip illegal saturated conversions to float type - if (kSaturated == sat && (outType == kfloat || outType == kdouble)) + if (kSaturated == sat + && (outType == kfloat || outType == kdouble || outType == khalf)) { continue; } @@ -507,6 +588,20 @@ void ConversionsTest::TestTypesConversion(const Type &inType, continue; } + // skip half if we don't have it + if (!gTestHalfs && (inType == khalf || outType == khalf)) + { + if (gHasHalfs) + { + vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n", + gTypeNames[outType], gSaturationNames[sat], + gRoundingModeNames[round], gTypeNames[inType]); + vlog("\t\tcl_khr_fp16 enabled, but half testing turned " + "off.\n"); + } + continue; + } + // Skip the implicit converts if the rounding mode is // not default or test is saturated if (0 == startMinVectorSize) @@ -517,7 +612,8 @@ void ConversionsTest::TestTypesConversion(const Type &inType, gMinVectorSize = 0; } - if ((error = DoTest(outType, inType, sat, round))) + if ((error = DoTest(outType, inType, + sat, round))) { vlog_error("\t *** %d) convert_%sn%s%s( %sn ) " "FAILED ** \n", @@ -529,8 +625,7 @@ void ConversionsTest::TestTypesConversion(const Type &inType, } } - -template +template int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat, RoundingMode round) { @@ -541,7 +636,7 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat, cl_uint threads = GetThreadCount(); DataInitInfo info = { 0, 0, outType, inType, sat, round, threads }; - DataInfoSpec init_info(info); + DataInfoSpec init_info(info); WriteInputBufferInfo writeInputBufferInfo; int vectorSize; int error = 0; @@ -564,7 +659,7 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat, for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++) { writeInputBufferInfo.calcInfo[vectorSize].reset( - new CalcRefValsPat()); + new CalcRefValsPat()); writeInputBufferInfo.calcInfo[vectorSize]->program = conv_test::MakeProgram( outType, inType, sat, round, vectorSize, @@ -597,6 +692,11 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat, if (round == kDefaultRoundingMode && gIsRTZ) init_info.round = round = kRoundTowardZero; } + else if (std::is_same::value && OutFP) + { + if (round == kDefaultRoundingMode && gIsHalfRTZ) + init_info.round = round = kRoundTowardZero; + } // Figure out how many elements are in a work block // we handle 64-bit types a bit differently. @@ -764,6 +864,10 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat, vlog("Input value: 0x%8.8x ", ((unsigned int *)gIn)[error - 1]); break; + case khalf: + vlog("Input value: %a ", + HTF(((cl_half *)gIn)[error - 1])); + break; case kfloat: vlog("Input value: %a ", ((float *)gIn)[error - 1]); break; @@ -901,16 +1005,6 @@ double SubtractTime(uint64_t endTime, uint64_t startTime) } #endif -//////////////////////////////////////////////////////////////////////////////// - -static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count) -{ - cl_uint i; - for (i = 0; i < count; ++i) - allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0); -} - - void MapResultValuesComplete(const std::unique_ptr &ptr); void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status, @@ -951,6 +1045,112 @@ void MapResultValuesComplete(const std::unique_ptr &info) // destroyed automatically soon after we exit. } +template static bool isnan_fp(const T &v) +{ + if (std::is_same::value) + { + uint16_t h_exp = (((cl_half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F; + uint16_t h_mant = ((cl_half)v) & 0x3FF; + return (h_exp == 0x1F && h_mant != 0); + } + else + { +#if !defined(_WIN32) + return std::isnan(v); +#else + return _isnan(v); +#endif + } +} + +template +void ZeroNanToIntCases(cl_uint count, void *mapped, Type outType) +{ + InType *inp = (InType *)gIn; + for (auto j = 0; j < count; j++) + { + if (isnan_fp(inp[j])) + memset((char *)mapped + j * gTypeSizes[outType], 0, + gTypeSizes[outType]); + } +} + +template +void FixNanToFltConversions(InType *inp, OutType *outp, cl_uint count) +{ + if (std::is_same::value) + { + for (auto j = 0; j < count; j++) + if (isnan_fp(inp[j]) && isnan_fp(outp[j])) + outp[j] = 0x7e00; // HALF_NAN + } + else + { + for (auto j = 0; j < count; j++) + if (isnan_fp(inp[j]) && isnan_fp(outp[j])) outp[j] = NAN; + } +} + +void FixNanConversions(Type outType, Type inType, void *d, cl_uint count) +{ + if (outType != kfloat && outType != kdouble && outType != khalf) + { + if (inType == kfloat) + ZeroNanToIntCases(count, d, outType); + else if (inType == kdouble) + ZeroNanToIntCases(count, d, outType); + else if (inType == khalf) + ZeroNanToIntCases(count, d, outType); + } + else if (inType == kfloat || inType == kdouble || inType == khalf) + { + // outtype and intype is float or double or half. NaN conversions for + // float/double/half could be any NaN + if (inType == kfloat) + { + float *inp = (float *)gIn; + if (outType == kdouble) + { + double *outp = (double *)d; + FixNanToFltConversions(inp, outp, count); + } + else if (outType == khalf) + { + cl_half *outp = (cl_half *)d; + FixNanToFltConversions(inp, outp, count); + } + } + else if (inType == kdouble) + { + double *inp = (double *)gIn; + if (outType == kfloat) + { + float *outp = (float *)d; + FixNanToFltConversions(inp, outp, count); + } + else if (outType == khalf) + { + cl_half *outp = (cl_half *)d; + FixNanToFltConversions(inp, outp, count); + } + } + else if (inType == khalf) + { + cl_half *inp = (cl_half *)gIn; + if (outType == kfloat) + { + float *outp = (float *)d; + FixNanToFltConversions(inp, outp, count); + } + else if (outType == kdouble) + { + double *outp = (double *)d; + FixNanToFltConversions(inp, outp, count); + } + } + } +} + void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status, void *data) @@ -963,7 +1163,6 @@ void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status, Type outType = info->parent->outType; // the data type of the conversion result Type inType = info->parent->inType; // the data type of the conversion input - size_t j; cl_int error; cl_event doneBarrier = info->parent->doneBarrier; @@ -985,51 +1184,7 @@ void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status, // Patch up NaNs conversions to integer to zero -- these can be converted to // any integer - if (outType != kfloat && outType != kdouble) - { - if (inType == kfloat) - { - float *inp = (float *)gIn; - for (j = 0; j < count; j++) - { - if (isnan(inp[j])) - memset((char *)mapped + j * gTypeSizes[outType], 0, - gTypeSizes[outType]); - } - } - if (inType == kdouble) - { - double *inp = (double *)gIn; - for (j = 0; j < count; j++) - { - if (isnan(inp[j])) - memset((char *)mapped + j * gTypeSizes[outType], 0, - gTypeSizes[outType]); - } - } - } - else if (inType == kfloat || inType == kdouble) - { // outtype and intype is float or double. NaN conversions for float <-> - // double can be any NaN - if (inType == kfloat && outType == kdouble) - { - float *inp = (float *)gIn; - double *outp = (double *)mapped; - for (j = 0; j < count; j++) - { - if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN; - } - } - if (inType == kdouble && outType == kfloat) - { - double *inp = (double *)gIn; - float *outp = (float *)mapped; - for (j = 0; j < count; j++) - { - if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN; - } - } - } + FixNanConversions(outType, inType, mapped, count); if (memcmp(mapped, gRef, count * gTypeSizes[outType])) info->result = @@ -1077,12 +1232,8 @@ void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status, // CalcReferenceValuesComplete exit. } -// - namespace conv_test { -//////////////////////////////////////////////////////////////////////////////// - cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p) { DataInitBase *info = (DataInitBase *)p; @@ -1092,8 +1243,6 @@ cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p) return CL_SUCCESS; } -//////////////////////////////////////////////////////////////////////////////// - cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p) { DataInitBase *info = (DataInitBase *)p; @@ -1102,7 +1251,6 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p) Type inType = info->inType; Type outType = info->outType; RoundingMode round = info->round; - size_t j; Force64BitFPUPrecision(); @@ -1110,7 +1258,6 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p) void *a = (cl_uchar *)gAllowZ + job_id * count; void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType]; - if (outType != inType) { // create the reference while we wait @@ -1144,7 +1291,33 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p) qcom_sat = info->sat; #endif - RoundingMode oldRound = set_round(round, outType); + RoundingMode oldRound; + if (outType == khalf) + { + oldRound = set_round(kRoundToNearestEven, kfloat); + switch (round) + { + default: + case kDefaultRoundingMode: + DataInitInfo::halfRoundingMode = + ConversionsTest::defaultHalfRoundingMode; + break; + case kRoundToNearestEven: + DataInitInfo::halfRoundingMode = CL_HALF_RTE; + break; + case kRoundUp: + DataInitInfo::halfRoundingMode = CL_HALF_RTP; + break; + case kRoundDown: + DataInitInfo::halfRoundingMode = CL_HALF_RTN; + break; + case kRoundTowardZero: + DataInitInfo::halfRoundingMode = CL_HALF_RTZ; + break; + } + } + else + oldRound = set_round(round, outType); if (info->sat) info->conv_array_sat(d, s, count); @@ -1156,10 +1329,13 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p) // Decide if we allow a zero result in addition to the correctly rounded // one memset(a, 0, count); - if (gForceFTZ) + if (gForceFTZ && (inType == kfloat || outType == kfloat)) { - if (inType == kfloat || outType == kfloat) - setAllowZ((uint8_t *)a, (uint32_t *)s, count); + info->set_allow_zero_array((uint8_t *)a, d, s, count); + } + if (gForceHalfFTZ && (inType == khalf || outType == khalf)) + { + info->set_allow_zero_array((uint8_t *)a, d, s, count); } } else @@ -1170,55 +1346,11 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p) // Patch up NaNs conversions to integer to zero -- these can be converted to // any integer - if (info->outType != kfloat && info->outType != kdouble) - { - if (inType == kfloat) - { - float *inp = (float *)s; - for (j = 0; j < count; j++) - { - if (isnan(inp[j])) - memset((char *)d + j * gTypeSizes[outType], 0, - gTypeSizes[outType]); - } - } - if (inType == kdouble) - { - double *inp = (double *)s; - for (j = 0; j < count; j++) - { - if (isnan(inp[j])) - memset((char *)d + j * gTypeSizes[outType], 0, - gTypeSizes[outType]); - } - } - } - else if (inType == kfloat || inType == kdouble) - { // outtype and intype is float or double. NaN conversions for float <-> - // double can be any NaN - if (inType == kfloat && outType == kdouble) - { - float *inp = (float *)s; - for (j = 0; j < count; j++) - { - if (isnan(inp[j])) ((double *)d)[j] = NAN; - } - } - if (inType == kdouble && outType == kfloat) - { - double *inp = (double *)s; - for (j = 0; j < count; j++) - { - if (isnan(inp[j])) ((float *)d)[j] = NAN; - } - } - } + FixNanConversions(outType, inType, d, count); return CL_SUCCESS; } -//////////////////////////////////////////////////////////////////////////////// - uint64_t GetTime(void) { #if defined(__APPLE__) @@ -1233,8 +1365,6 @@ uint64_t GetTime(void) #endif } -//////////////////////////////////////////////////////////////////////////////// - // Note: not called reentrantly void WriteInputBufferComplete(void *data) { @@ -1295,8 +1425,6 @@ void WriteInputBufferComplete(void *data) // automatically soon after we exit. } -//////////////////////////////////////////////////////////////////////////////// - cl_program MakeProgram(Type outType, Type inType, SaturationMode sat, RoundingMode round, int vectorSize, cl_kernel *outKernel) { @@ -1308,6 +1436,9 @@ cl_program MakeProgram(Type outType, Type inType, SaturationMode sat, if (outType == kdouble || inType == kdouble) source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; + if (outType == khalf || inType == khalf) + source << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; + // Create the program. This is a bit complicated because we are trying to // avoid byte and short stores. if (0 == vectorSize) @@ -1408,7 +1539,7 @@ cl_program MakeProgram(Type outType, Type inType, SaturationMode sat, *outKernel = NULL; const char *flags = NULL; - if (gForceFTZ) flags = "-cl-denorms-are-zero"; + if (gForceFTZ || gForceHalfFTZ) flags = "-cl-denorms-are-zero"; // build it std::string sourceString = source.str(); diff --git a/test_conformance/conversions/basic_test_conversions.h b/test_conformance/conversions/basic_test_conversions.h index 2314ee748..c7b24b708 100644 --- a/test_conformance/conversions/basic_test_conversions.h +++ b/test_conformance/conversions/basic_test_conversions.h @@ -1,5 +1,5 @@ // -// Copyright (c) 2023 The Khronos Group Inc. +// Copyright (c) 2023-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -30,6 +30,8 @@ #include #endif +#include + #include "harness/mt19937.h" #include "harness/testHarness.h" #include "harness/typeWrappers.h" @@ -76,6 +78,8 @@ extern cl_mem gInBuffer; extern cl_mem gOutBuffers[]; extern int gHasDouble; extern int gTestDouble; +extern int gHasHalfs; +extern int gTestHalfs; extern int gWimpyMode; extern int gWimpyReductionFactor; extern int gSkipTesting; @@ -87,6 +91,8 @@ extern int gReportAverageTimes; extern int gStartTestNumber; extern int gEndTestNumber; extern int gIsRTZ; +extern int gForceHalfFTZ; +extern int gIsHalfRTZ; extern void *gIn; extern void *gRef; extern void *gAllowZ; @@ -135,7 +141,7 @@ struct CalcRefValsBase cl_int result; }; -template +template struct CalcRefValsPat : CalcRefValsBase { int check_result(void *, uint32_t, int) override; @@ -162,8 +168,12 @@ struct WriteInputBufferInfo }; // Must be aligned with Type enums! -using TypeIter = std::tuple; +using TypeIter = + std::tuple; + +// hardcoded solution needed due to typeid confusing cl_ushort/cl_half +constexpr bool isTypeFp[] = { 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0 }; // Helper test fixture for constructing OpenCL objects used in testing // a variety of simple command-buffer enqueue scenarios. @@ -179,13 +189,13 @@ struct ConversionsTest // Test body returning an OpenCL error code cl_int Run(); - template + template int DoTest(Type outType, Type inType, SaturationMode sat, RoundingMode round); - template + template void TestTypesConversion(const Type &inType, const Type &outType, int &tn, - const int smvs); + int startMinVectorSize); protected: cl_context context; @@ -195,6 +205,9 @@ struct ConversionsTest size_t num_elements; TypeIter typeIterator; + +public: + static cl_half_rounding_mode defaultHalfRoundingMode; }; struct CustomConversionsTest : ConversionsTest @@ -221,17 +234,18 @@ int MakeAndRunTest(cl_device_id device, cl_context context, struct TestType { - template bool testType(Type in) + template bool testType(Type in) { switch (in) { default: return false; case kuchar: return std::is_same::value; case kchar: return std::is_same::value; - case kushort: return std::is_same::value; + case kushort: return std::is_same::value && !FP; case kshort: return std::is_same::value; case kuint: return std::is_same::value; case kint: return std::is_same::value; + case khalf: return std::is_same::value && FP; case kfloat: return std::is_same::value; case kdouble: return std::is_same::value; case kulong: return std::is_same::value; @@ -263,13 +277,15 @@ struct IterOverTypes : public TestType typename InType> void iterate_in_type(const InType &t) { - if (!testType(inType)) vlog_error("Unexpected data type!\n"); + if (!testType(inType)) + vlog_error("Unexpected data type!\n"); - if (!testType(outType)) vlog_error("Unexpected data type!\n"); + if (!testType(outType)) + vlog_error("Unexpected data type!\n"); // run the conversions - test.TestTypesConversion(inType, outType, testNumber, - startMinVectorSize); + test.TestTypesConversion( + inType, outType, testNumber, startMinVectorSize); inType = (Type)(inType + 1); } @@ -337,11 +353,13 @@ struct IterOverSelectedTypes : public TestType typename InType> void iterate_in_type(const InType &t) { - if (testType(inType) && testType(outType)) + if (testType(inType) + && testType(outType)) { // run selected conversion // testing of the result will happen afterwards - test.DoTest(outType, inType, saturation, rounding); + test.DoTest( + outType, inType, saturation, rounding); } } diff --git a/test_conformance/conversions/conversions_data_info.h b/test_conformance/conversions/conversions_data_info.h index d63bb07c8..9d2cbc606 100644 --- a/test_conformance/conversions/conversions_data_info.h +++ b/test_conformance/conversions/conversions_data_info.h @@ -1,5 +1,5 @@ // -// Copyright (c) 2023 The Khronos Group Inc. +// Copyright (c) 2023-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -28,8 +28,12 @@ extern bool qcom_sat; extern roundingMode qcom_rm; #endif +#include + +#include "harness/conversions.h" #include "harness/mt19937.h" #include "harness/rounding_mode.h" +#include "harness/typeWrappers.h" #include @@ -60,11 +64,17 @@ struct DataInitInfo RoundingMode round; cl_uint threads; + static cl_half_rounding_mode halfRoundingMode; static std::vector specialValuesUInt; static std::vector specialValuesFloat; static std::vector specialValuesDouble; + static std::vector specialValuesHalf; }; +#define HFF(num) cl_half_from_float(num, DataInitInfo::halfRoundingMode) +#define HTF(num) cl_half_to_float(num) +#define HFD(num) cl_half_from_double(num, DataInitInfo::halfRoundingMode) + struct DataInitBase : public DataInitInfo { virtual ~DataInitBase() = default; @@ -73,9 +83,12 @@ struct DataInitBase : public DataInitInfo virtual void conv_array(void *out, void *in, size_t n) {} virtual void conv_array_sat(void *out, void *in, size_t n) {} virtual void init(const cl_uint &, const cl_uint &) {} + virtual void set_allow_zero_array(uint8_t *allow, void *out, void *in, + size_t n) + {} }; -template +template struct DataInfoSpec : public DataInitBase { explicit DataInfoSpec(const DataInitInfo &agg); @@ -90,6 +103,9 @@ struct DataInfoSpec : public DataInitBase void conv(OutType *out, InType *in); void conv_sat(OutType *out, InType *in); + // Decide if we allow a zero result in addition to the correctly rounded one + void set_allow_zero(uint8_t *allow, OutType *out, InType *in); + // min/max ranges for output type of data std::pair ranges; @@ -98,6 +114,16 @@ struct DataInfoSpec : public DataInitBase std::vector mdv; + constexpr bool is_in_half() const + { + return (std::is_same::value && InFP); + } + + constexpr bool is_out_half() const + { + return (std::is_same::value && OutFP); + } + void conv_array(void *out, void *in, size_t n) override { for (size_t i = 0; i < n; i++) @@ -111,6 +137,12 @@ struct DataInfoSpec : public DataInitBase } void init(const cl_uint &, const cl_uint &) override; + void set_allow_zero_array(uint8_t *allow, void *out, void *in, + size_t n) override + { + for (size_t i = 0; i < n; i++) + set_allow_zero(&allow[i], &((OutType *)out)[i], &((InType *)in)[i]); + } InType clamp(const InType &); inline float fclamp(float lo, float v, float hi) { @@ -125,19 +157,22 @@ struct DataInfoSpec : public DataInitBase } }; -template -DataInfoSpec::DataInfoSpec(const DataInitInfo &agg) +template +DataInfoSpec::DataInfoSpec( + const DataInitInfo &agg) : DataInitBase(agg), mdv(0) { if (std::is_same::value) ranges = std::make_pair(CL_FLT_MIN, CL_FLT_MAX); else if (std::is_same::value) ranges = std::make_pair(CL_DBL_MIN, CL_DBL_MAX); + else if (std::is_same::value && OutFP) + ranges = std::make_pair(HFF(CL_HALF_MIN), HFF(CL_HALF_MAX)); else if (std::is_same::value) ranges = std::make_pair(0, CL_UCHAR_MAX); else if (std::is_same::value) ranges = std::make_pair(CL_CHAR_MIN, CL_CHAR_MAX); - else if (std::is_same::value) + else if (std::is_same::value && !OutFP) ranges = std::make_pair(0, CL_USHRT_MAX); else if (std::is_same::value) ranges = std::make_pair(CL_SHRT_MIN, CL_SHRT_MAX); @@ -158,12 +193,12 @@ DataInfoSpec::DataInfoSpec(const DataInitInfo &agg) InType outMax = static_cast(ranges.second); InType eps = std::is_same::value ? (InType) FLT_EPSILON : (InType) DBL_EPSILON; - if (std::is_integral::value) + if (std::is_integral::value && !OutFP) { // to char/uchar/short/ushort/int/uint/long/ulong if (sizeof(OutType)<=sizeof(cl_short)) { // to char/uchar/short/ushort clamp_ranges= - {{outMin-0.5f, outMax + 0.5f - outMax * 0.5f * eps}, + {{outMin-0.5f, outMax + 0.5f - outMax * 0.5f * eps}, {outMin-0.5f, outMax + 0.5f - outMax * 0.5f * eps}, {outMin-1.0f+(std::is_signed::value?outMax:0.5f)*eps, outMax-1.f}, {outMin-0.0f, outMax - outMax * 0.5f * eps }, @@ -249,11 +284,55 @@ DataInfoSpec::DataInfoSpec(const DataInitInfo &agg) } } } + else if (is_in_half()) + { + float outMin = static_cast(ranges.first); + float outMax = static_cast(ranges.second); + float eps = CL_HALF_EPSILON; + cl_half_rounding_mode prev_half_round = DataInitInfo::halfRoundingMode; + DataInitInfo::halfRoundingMode = CL_HALF_RTZ; + + if (std::is_integral::value) + { // to char/uchar/short/ushort/int/uint/long/ulong + if (sizeof(OutType)<=sizeof(cl_char) || std::is_same::value) + { // to char/uchar + clamp_ranges= + {{HFF(outMin-0.5f), HFF(outMax + 0.5f - outMax * 0.5f * eps)}, + {HFF(outMin-0.5f), HFF(outMax + 0.5f - outMax * 0.5f * eps)}, + {HFF(outMin-1.0f+(std::is_signed::value?outMax:0.5f)*eps), HFF(outMax-1.f)}, + {HFF(outMin-0.0f), HFF(outMax - outMax * 0.5f * eps) }, + {HFF(outMin-1.0f+(std::is_signed::value?outMax:0.5f)*eps), HFF(outMax - outMax * 0.5f * eps)}}; + } + else + { // to ushort/int/uint/long/ulong + if (std::is_signed::value) + { + clamp_ranges= + { {HFF(-CL_HALF_MAX), HFF(CL_HALF_MAX)}, + {HFF(-CL_HALF_MAX), HFF(CL_HALF_MAX)}, + {HFF(-CL_HALF_MAX), HFF(CL_HALF_MAX)}, + {HFF(-CL_HALF_MAX), HFF(CL_HALF_MAX)}, + {HFF(-CL_HALF_MAX), HFF(CL_HALF_MAX)}}; + } + else + { + clamp_ranges= + { {HFF(outMin), HFF(CL_HALF_MAX)}, + {HFF(outMin), HFF(CL_HALF_MAX)}, + {HFF(outMin), HFF(CL_HALF_MAX)}, + {HFF(outMin), HFF(CL_HALF_MAX)}, + {HFF(outMin), HFF(CL_HALF_MAX)}}; + } + } + } + + DataInitInfo::halfRoundingMode = prev_half_round; + } // clang-format on } -template -float DataInfoSpec::round_to_int(float f) +template +float DataInfoSpec::round_to_int(float f) { static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23), -MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23) }; @@ -281,8 +360,9 @@ float DataInfoSpec::round_to_int(float f) return f; } -template -long long DataInfoSpec::round_to_int_and_clamp(double f) +template +long long +DataInfoSpec::round_to_int_and_clamp(double f) { static const double magic[2] = { MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52), MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52) }; @@ -313,8 +393,8 @@ long long DataInfoSpec::round_to_int_and_clamp(double f) return (long long)f; } -template -OutType DataInfoSpec::absolute(const OutType &x) +template +OutType DataInfoSpec::absolute(const OutType &x) { union { cl_uint u; @@ -331,17 +411,30 @@ OutType DataInfoSpec::absolute(const OutType &x) return u.f; } -template -void DataInfoSpec::conv(OutType *out, InType *in) +template constexpr bool is_half() { - if (std::is_same::value) + return (std::is_same::value && fp); +} + +template +void DataInfoSpec::conv(OutType *out, InType *in) +{ + if (std::is_same::value || is_in_half()) { cl_float inVal = *in; + if (std::is_same::value) + { + inVal = HTF(*in); + } if (std::is_floating_point::value) { *out = (OutType)inVal; } + else if (is_out_half()) + { + *out = HFF(*in); + } else if (std::is_same::value) { #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) @@ -376,6 +469,8 @@ void DataInfoSpec::conv(OutType *out, InType *in) { if (std::is_same::value) *out = (OutType)*in; + else if (is_out_half()) + *out = static_cast(HFD(*in)); else *out = rint(*in); } @@ -418,7 +513,7 @@ void DataInfoSpec::conv(OutType *out, InType *in) *out = (vi == 0 ? 0.0 : static_cast(vi)); #endif } - else if (std::is_same::value) + else if (std::is_same::value || is_out_half()) { cl_float outVal = 0.f; @@ -465,7 +560,9 @@ void DataInfoSpec::conv(OutType *out, InType *in) #endif #endif - *out = outVal; + *out = std::is_same::value + ? static_cast(HFF(outVal)) + : outVal; } else { @@ -486,6 +583,8 @@ void DataInfoSpec::conv(OutType *out, InType *in) // Per IEEE-754-2008 5.4.1, 0 always converts to +0.0 *out = (*in == 0 ? 0.0 : *in); } + else if (is_out_half()) + *out = static_cast(HFF(*in == 0 ? 0.f : *in)); else { *out = (OutType)*in; @@ -496,19 +595,26 @@ void DataInfoSpec::conv(OutType *out, InType *in) #define CLAMP(_lo, _x, _hi) \ ((_x) < (_lo) ? (_lo) : ((_x) > (_hi) ? (_hi) : (_x))) -template -void DataInfoSpec::conv_sat(OutType *out, InType *in) +template +void DataInfoSpec::conv_sat(OutType *out, + InType *in) { - if (std::is_floating_point::value) + if (std::is_floating_point::value || is_in_half()) { - if (std::is_floating_point::value) - { // in float/double, out float/double - *out = (OutType)(*in); + cl_float inVal = *in; + if (is_in_half()) inVal = HTF(*in); + + if (std::is_floating_point::value || is_out_half()) + { // in half/float/double, out half/float/double + if (is_out_half()) + *out = static_cast(HFF(inVal)); + else + *out = (OutType)(is_in_half() ? inVal : *in); } - else if ((std::is_same::value) + else if ((std::is_same::value || is_in_half()) && std::is_same::value) { - cl_float x = round_to_int(*in); + cl_float x = round_to_int(is_in_half() ? HTF(*in) : *in); #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) // VS2005 (at least) on x86 uses fistp to store the float as a @@ -536,47 +642,57 @@ void DataInfoSpec::conv_sat(OutType *out, InType *in) } #else *out = x >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64) - ? 0xFFFFFFFFFFFFFFFFULL - : x < 0 ? 0 : (OutType)x; + ? (OutType)0xFFFFFFFFFFFFFFFFULL + : x < 0 ? 0 + : (OutType)x; #endif } - else if ((std::is_same::value) + else if ((std::is_same::value || is_in_half()) && std::is_same::value) { - cl_float f = round_to_int(*in); + cl_float f = round_to_int(is_in_half() ? HTF(*in) : *in); *out = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63) - ? 0x7FFFFFFFFFFFFFFFULL + ? (OutType)0x7FFFFFFFFFFFFFFFULL : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63) - ? 0x8000000000000000LL - : (OutType)f; + ? (OutType)0x8000000000000000LL + : (OutType)f; } else if (std::is_same::value && std::is_same::value) { InType f = rint(*in); *out = f >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64) - ? 0xFFFFFFFFFFFFFFFFULL - : f < 0 ? 0 : (OutType)f; + ? (OutType)0xFFFFFFFFFFFFFFFFULL + : f < 0 ? 0 + : (OutType)f; } else if (std::is_same::value && std::is_same::value) { InType f = rint(*in); *out = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63) - ? 0x7FFFFFFFFFFFFFFFULL + ? (OutType)0x7FFFFFFFFFFFFFFFULL : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63) - ? 0x8000000000000000LL - : (OutType)f; + ? (OutType)0x8000000000000000LL + : (OutType)f; } else - { // in float/double, out char/uchar/short/ushort/int/uint - *out = - CLAMP(ranges.first, round_to_int_and_clamp(*in), ranges.second); + { // in half/float/double, out char/uchar/short/ushort/int/uint + *out = CLAMP(ranges.first, + round_to_int_and_clamp(is_in_half() ? inVal : *in), + ranges.second); } } else if (std::is_integral::value && std::is_integral::value) { + if (is_out_half()) + { + *out = std::is_signed::value + ? static_cast(HFF((cl_float)*in)) + : absolute(static_cast(HFF((cl_float)*in))); + } + else { if ((std::is_signed::value && std::is_signed::value) @@ -614,14 +730,64 @@ void DataInfoSpec::conv_sat(OutType *out, InType *in) } } -template -void DataInfoSpec::init(const cl_uint &job_id, - const cl_uint &thread_id) +template +void DataInfoSpec::set_allow_zero(uint8_t *allow, + OutType *out, + InType *in) +{ + // from double + if (std::is_same::value) + *allow |= IsDoubleSubnormal(*in); + // from float + if (std::is_same::value) *allow |= IsFloatSubnormal(*in); + // from half + if (is_in_half()) *allow |= IsHalfSubnormal(*in); + + // handle the cases that the converted result is subnormal + // from double + if (std::is_same::value) + *allow |= IsDoubleSubnormal(*out); + // from float + if (std::is_same::value) + *allow |= IsFloatSubnormal(*out); + // from half + if (is_out_half()) *allow |= IsHalfSubnormal(*out); +} + +template +void DataInfoSpec::init(const cl_uint &job_id, + const cl_uint &thread_id) { uint64_t ulStart = start; void *pIn = (char *)gIn + job_id * size * gTypeSizes[inType]; - if (std::is_integral::value) + if (is_in_half()) + { + cl_half *o = (cl_half *)pIn; + int i; + + if (gIsEmbedded) + for (i = 0; i < size; i++) + o[i] = (cl_half)genrand_int32(mdv[thread_id]); + else + for (i = 0; i < size; i++) o[i] = (cl_half)((i + ulStart) % 0xffff); + + if (0 == ulStart) + { + size_t tableSize = specialValuesHalf.size() + * sizeof(decltype(specialValuesHalf)::value_type); + if (sizeof(InType) * size < tableSize) + tableSize = sizeof(InType) * size; + memcpy((char *)(o + i) - tableSize, &specialValuesHalf.front(), + tableSize); + } + + if (kUnsaturated == sat) + { + for (i = 0; i < size; i++) o[i] = clamp(o[i]); + } + } + else if (std::is_integral::value) { InType *o = (InType *)pIn; if (sizeof(InType) <= sizeof(cl_short)) @@ -776,10 +942,10 @@ void DataInfoSpec::init(const cl_uint &job_id, } } -template -InType DataInfoSpec::clamp(const InType &in) +template +InType DataInfoSpec::clamp(const InType &in) { - if (std::is_integral::value) + if (std::is_integral::value && !OutFP) { if (std::is_same::value) { @@ -791,6 +957,11 @@ InType DataInfoSpec::clamp(const InType &in) return dclamp(clamp_ranges[round].first, in, clamp_ranges[round].second); } + else if (std::is_same::value && InFP) + { + return HFF(fclamp(HTF(clamp_ranges[round].first), HTF(in), + HTF(clamp_ranges[round].second))); + } } return in; } diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp index dab61dc50..122a84107 100644 --- a/test_conformance/conversions/test_conversions.cpp +++ b/test_conformance/conversions/test_conversions.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2017-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -73,9 +73,9 @@ static void PrintUsage(void); test_status InitCL(cl_device_id device); -const char *gTypeNames[kTypeCount] = { "uchar", "char", "ushort", "short", - "uint", "int", "float", "double", - "ulong", "long" }; +const char *gTypeNames[kTypeCount] = { "uchar", "char", "ushort", "short", + "uint", "int", "half", "float", + "double", "ulong", "long" }; const char *gRoundingModeNames[kRoundingModeCount] = { "", "_rte", "_rtp", "_rtn", "_rtz" }; @@ -83,9 +83,9 @@ const char *gRoundingModeNames[kRoundingModeCount] = { "", "_rte", "_rtp", const char *gSaturationNames[2] = { "", "_sat" }; size_t gTypeSizes[kTypeCount] = { - sizeof(cl_uchar), sizeof(cl_char), sizeof(cl_ushort), sizeof(cl_short), - sizeof(cl_uint), sizeof(cl_int), sizeof(cl_float), sizeof(cl_double), - sizeof(cl_ulong), sizeof(cl_long), + sizeof(cl_uchar), sizeof(cl_char), sizeof(cl_ushort), sizeof(cl_short), + sizeof(cl_uint), sizeof(cl_int), sizeof(cl_half), sizeof(cl_float), + sizeof(cl_double), sizeof(cl_ulong), sizeof(cl_long), }; char appName[64] = "ctest"; @@ -221,13 +221,17 @@ static int ParseArgs(int argc, const char **argv) switch (*arg) { case 'd': gTestDouble ^= 1; break; + case 'h': gTestHalfs ^= 1; break; case 'l': gSkipTesting ^= 1; break; case 'm': gMultithread ^= 1; break; case 'w': gWimpyMode ^= 1; break; case '[': parseWimpyReductionFactor(arg, gWimpyReductionFactor); break; - case 'z': gForceFTZ ^= 1; break; + case 'z': + gForceFTZ ^= 1; + gForceHalfFTZ ^= 1; + break; case 't': gTimeResults ^= 1; break; case 'a': gReportAverageTimes ^= 1; break; case '1': @@ -355,7 +359,6 @@ static void PrintUsage(void) } - test_status InitCL(cl_device_id device) { int error, i; @@ -412,6 +415,50 @@ test_status InitCL(cl_device_id device) } gTestDouble &= gHasDouble; + if (is_extension_available(device, "cl_khr_fp16")) + { + gHasHalfs = 1; + + cl_device_fp_config floatCapabilities = 0; + if ((error = clGetDeviceInfo(device, CL_DEVICE_HALF_FP_CONFIG, + sizeof(floatCapabilities), + &floatCapabilities, NULL))) + floatCapabilities = 0; + + if (0 == (CL_FP_DENORM & floatCapabilities)) gForceHalfFTZ ^= 1; + + if (0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST)) + { + char profileStr[128] = ""; + // Verify that we are an embedded profile device + if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, + sizeof(profileStr), profileStr, NULL))) + { + vlog_error("FAILURE: Could not get device profile: error %d\n", + error); + return TEST_FAIL; + } + + if (strcmp(profileStr, "EMBEDDED_PROFILE")) + { + vlog_error( + "FAILURE: non-embedded profile device does not support " + "CL_FP_ROUND_TO_NEAREST\n"); + return TEST_FAIL; + } + + if (0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO)) + { + vlog_error("FAILURE: embedded profile device supports neither " + "CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n"); + return TEST_FAIL; + } + + gIsHalfRTZ = 1; + } + } + gTestHalfs &= gHasHalfs; + // detect whether profile of the device is embedded char profile[1024] = ""; if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile), @@ -492,8 +539,12 @@ test_status InitCL(cl_device_id device) vlog("\tSubnormal values supported for floats? %s\n", no_yes[0 != (CL_FP_DENORM & floatCapabilities)]); vlog("\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ]); + vlog("\tTesting with FTZ mode ON for halfs? %s\n", + no_yes[0 != gForceHalfFTZ]); vlog("\tTesting with default RTZ mode for floats? %s\n", no_yes[0 != gIsRTZ]); + vlog("\tTesting with default RTZ mode for halfs? %s\n", + no_yes[0 != gIsHalfRTZ]); vlog("\tHas Double? %s\n", no_yes[0 != gHasDouble]); if (gHasDouble) vlog("\tTest Double? %s\n", no_yes[0 != gTestDouble]); vlog("\tHas Long? %s\n", no_yes[0 != gHasLong]); @@ -503,5 +554,3 @@ test_status InitCL(cl_device_id device) vlog("\n"); return TEST_PASS; } - - diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt index a221f05ad..d53911e43 100644 --- a/test_conformance/math_brute_force/CMakeLists.txt +++ b/test_conformance/math_brute_force/CMakeLists.txt @@ -3,24 +3,32 @@ set(MODULE_NAME BRUTEFORCE) set(${MODULE_NAME}_SOURCES binary_double.cpp binary_float.cpp + binary_half.cpp binary_i_double.cpp binary_i_float.cpp + binary_i_half.cpp binary_operator_double.cpp binary_operator_float.cpp + binary_operator_half.cpp binary_two_results_i_double.cpp binary_two_results_i_float.cpp + binary_two_results_i_half.cpp common.cpp common.h function_list.cpp function_list.h i_unary_double.cpp i_unary_float.cpp + i_unary_half.cpp macro_binary_double.cpp macro_binary_float.cpp + macro_binary_half.cpp macro_unary_double.cpp macro_unary_float.cpp + macro_unary_half.cpp mad_double.cpp mad_float.cpp + mad_half.cpp main.cpp reference_math.cpp reference_math.h @@ -28,15 +36,20 @@ set(${MODULE_NAME}_SOURCES sleep.h ternary_double.cpp ternary_float.cpp + ternary_half.cpp test_functions.h unary_double.cpp unary_float.cpp + unary_half.cpp unary_two_results_double.cpp unary_two_results_float.cpp + unary_two_results_half.cpp unary_two_results_i_double.cpp unary_two_results_i_float.cpp + unary_two_results_i_half.cpp unary_u_double.cpp unary_u_float.cpp + unary_u_half.cpp utility.cpp utility.h ) diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp new file mode 100644 index 000000000..3a2395c70 --- /dev/null +++ b/test_conformance/math_brute_force/binary_half.cpp @@ -0,0 +1,784 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "harness/errorHelpers.h" + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" +#include "reference_math.h" + +#include +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +struct ThreadInfo +{ + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper inBuf2; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. + double maxErrorValue2; // position of the max error value (param 2). Init + // to 0. + MTdataHolder d; + + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +}; + +struct TestInfoBase +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + int isFDim; + int skipNanInf; + int isNextafter; +}; + +struct TestInfo : public TestInfoBase +{ + TestInfo(const TestInfoBase &base): TestInfoBase(base) {} + + // Array of thread specific information + std::vector tinfo; + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; +}; + +// A table of more difficult cases to get right +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + float ulps = job->ulps; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_int error; + const char *name = job->f->name; + + int isFDim = job->isFDim; + int skipNanInf = job->skipNanInf; + int isNextafter = job->isNextafter; + cl_ushort *t; + cl_half *r; + std::vector s(0), s2(0); + cl_uint j = 0; + + RoundingMode oldRoundMode; + cl_int copysign_test = 0; + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_ushort *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + // Init input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + cl_ushort *p2 = (cl_ushort *)gIn2 + thread_id * buffer_elements; + j = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesHalfCount; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)indx) + { // test edge cases + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; j < buffer_elements; j++) + { + p[j] = specialValuesHalf[x]; + p2[j] = specialValuesHalf[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesHalfCount) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = (cl_ushort)genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) + { + return CL_SUCCESS; + } + + FPU_mode_type oldMode; + oldRoundMode = kRoundToNearestEven; + if (isFDim) + { + // Calculate the correctly rounded reference result + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); + + // Set the rounding mode to match the device + if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat); + } + + if (!strcmp(name, "copysign")) copysign_test = 1; + +#define ref_func(s, s2) (copysign_test ? func.f_ff_f(s, s2) : func.f_ff(s, s2)) + + // Calculate the correctly rounded reference result + r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + t = (cl_ushort *)r; + s.resize(buffer_elements); + s2.resize(buffer_elements); + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + s2[j] = cl_half_to_float(p2[j]); + if (isNextafter) + r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]), + CL_HALF_RTE); + else + r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE); + } + + if (isFDim && ftz) RestoreFPState(&oldMode); + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Verify data + + for (j = 0; j < buffer_elements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = out[k]; + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + double correct; + if (isNextafter) + correct = reference_nextafterh(s[j], s2[j]); + else + correct = ref_func(s[j], s2[j]); + + float test = cl_half_to_float(q[j]); + + // Per section 10 paragraph 6, accept any result if an input or + // output is a infinity or NaN or overflow + if (skipNanInf) + { + // Note: no double rounding here. Reference functions + // calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j]) + || IsFloatInfinity(s[j]) || IsFloatNaN(s[j])) + continue; + } + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, ulps)) + { + if (isNextafter) + { + correct = reference_nextafterh(s[j], s2[j], false); + err = Ulp_Error_Half(q[j], correct); + fail = !(fabsf(err) <= ulps); + } + + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + if (IsHalfSubnormal(p[j])) + { + double correct2, correct3; + float err2, err3; + if (isNextafter) + { + correct2 = reference_nextafterh(0.0, s2[j]); + correct3 = reference_nextafterh(-0.0, s2[j]); + } + else + { + correct2 = ref_func(0.0, s2[j]); + correct3 = ref_func(-0.0, s2[j]); + } + if (skipNanInf) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) + continue; + } + + auto check_error = [&]() { + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + }; + check_error(); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + if (fail && isNextafter) + { + correct2 = + reference_nextafterh(0.0, s2[j], false); + correct3 = + reference_nextafterh(-0.0, s2[j], false); + check_error(); + } + + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // allow to omit denorm values for platforms with no + // denorm support for nextafter + if (fail && (isNextafter) + && (correct <= cl_half_to_float(0x3FF)) + && (correct >= cl_half_to_float(0x83FF))) + { + fail = fail && (q[j] != p[j]); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsHalfSubnormal(p2[j])) + { + double correct4, correct5; + float err4, err5; + + if (isNextafter) + { + correct2 = reference_nextafterh(0.0, 0.0); + correct3 = reference_nextafterh(-0.0, 0.0); + correct4 = reference_nextafterh(0.0, -0.0); + correct5 = reference_nextafterh(-0.0, -0.0); + } + else + { + correct2 = ref_func(0.0, 0.0); + correct3 = ref_func(-0.0, 0.0); + correct4 = ref_func(0.0, -0.0); + correct5 = ref_func(-0.0, -0.0); + } + + // Per section 10 paragraph 6, accept any result if + // an input or output is a infinity or NaN or + // overflow + if (skipNanInf) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + err4 = Ulp_Error_Half(q[j], correct4); + err5 = Ulp_Error_Half(q[j], correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps) + || IsHalfResultSubnormal(correct4, ulps) + || IsHalfResultSubnormal(correct5, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // allow to omit denorm values for platforms with no + // denorm support for nextafter + if (fail && (isNextafter) + && (correct <= cl_half_to_float(0x3FF)) + && (correct >= cl_half_to_float(0x83FF))) + { + fail = fail && (q[j] != p2[j]); + if (!fail) err = 0.0f; + } + } + } + else if (IsHalfSubnormal(p2[j])) + { + double correct2, correct3; + float err2, err3; + + if (isNextafter) + { + correct2 = reference_nextafterh(s[j], 0.0); + correct3 = reference_nextafterh(s[j], -0.0); + } + else + { + correct2 = ref_func(s[j], 0.0); + correct3 = ref_func(s[j], -0.0); + } + + if (skipNanInf) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(correct2) + || IsFloatNaN(correct2)) + continue; + } + + auto check_error = [&]() { + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + }; + check_error(); + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + if (fail && isNextafter) + { + correct2 = + reference_nextafterh(s[j], 0.0, false); + correct3 = + reference_nextafterh(s[j], -0.0, false); + check_error(); + } + + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // allow to omit denorm values for platforms with no + // denorm support for nextafter + if (fail && (isNextafter) + && (correct <= cl_half_to_float(0x3FF)) + && (correct >= cl_half_to_float(0x83FF))) + { + fail = fail && (q[j] != p2[j]); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%a (0x%04x)}\nExpected: %a (half 0x%04x) " + "\nActual: %a (half 0x%04x) at index: %u\n", + name, sizeNames[k], err, s[j], p[j], s2[j], + p2[j], cl_half_to_float(r[j]), r[j], test, q[j], + j); + error = -1; + return error; + } + } + } + } + + if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return error; +} + +} // anonymous namespace + +int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter, + bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.isFDim = 0 == strcmp("fdim", f->nameInCode); + test_info.skipNanInf = test_info.isFDim && !gInfNanSupport; + test_info.isNextafter = isNextafter; + + test_info.tinfo.resize(test_info.threadCount); + + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error( + "Error: Unable to create sub-buffer of gOutBuffer[%d] " + "for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} + +int TestFunc_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + return TestFunc_Half_Half_Half_common(f, d, 0, relaxedMode); +} + +int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata d, bool relaxedMode) +{ + return TestFunc_Half_Half_Half_common(f, d, 1, relaxedMode); +} diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp new file mode 100644 index 000000000..c74a845a4 --- /dev/null +++ b/test_conformance/math_brute_force/binary_i_half.cpp @@ -0,0 +1,548 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Int, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper inBuf2; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double + maxErrorValue; // position of the max error value (param 1). Init to 0. + cl_int maxErrorValue2; // position of the max error value (param 2). Init + // to 0. + MTdataHolder d; + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +} ThreadInfo; + +struct TestInfoBase +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode +}; + +struct TestInfo : public TestInfoBase +{ + TestInfo(const TestInfoBase &base): TestInfoBase(base) {} + + // Array of thread specific information + std::vector tinfo; + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; +}; + +// A table of more difficult cases to get right +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +const int specialValuesInt3[] = { 0, 1, 2, 3, 1022, 1023, + 1024, INT_MIN, INT_MAX, -1, -2, -3, + -1022, -1023, -11024, -INT_MAX }; +size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3); + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + float ulps = job->ulps; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_ushort *t; + cl_half *r; + std::vector s; + cl_int *s2; + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_ushort *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_elements * sizeof(cl_ushort), 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + // Init input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + cl_int *p2 = (cl_int *)gIn2 + thread_id * buffer_elements; + j = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesInt3Count; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + if (job_id <= (cl_uint)indx) + { // test edge cases + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; j < buffer_elements; j++) + { + p[j] = specialValuesHalf[x]; + p2[j] = specialValuesInt3[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesInt3Count) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_elements * sizeof(cl_half), p, 0, + NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_elements * sizeof(cl_int), p2, 0, + NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, + buffer_elements * sizeof(cl_half)); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer( + tinfo->tQueue, tinfo->outBuf[j], &pattern, sizeof(pattern), 0, + buffer_elements * sizeof(cl_half), 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + t = (cl_ushort *)r; + s.resize(buffer_elements); + s2 = (cl_int *)gIn2 + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + r[j] = HFF(func.f_fi(s[j], s2[j])); + } + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_elements * sizeof(cl_ushort), 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Wait for the last buffer + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, + buffer_elements * sizeof(cl_ushort), 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + return error; + } + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = out[k]; + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + float test = cl_half_to_float(q[j]); + double correct = func.f_fi(s[j], s2[j]); + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + double correct2, correct3; + float err2, err3; + correct2 = func.f_fi(0.0, s2[j]); + correct3 = func.f_fi(-0.0, s2[j]); + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%d}\nExpected: %a (half 0x%04x) \nActual: %a " + "(half 0x%04x) at index: %d\n", + name, sizeNames[k], err, s[j], p[j], s2[j], + cl_half_to_float(r[j]), r[j], test, q[j], + (cl_uint)j); + error = -1; + return error; + } + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + return error; +} + +} // anonymous namespace + +int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + cl_int maxErrorVal2 = 0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + cl_buffer_region region2 = { i * test_info.subBufferSize + * sizeof(cl_int), + test_info.subBufferSize * sizeof(cl_int) }; + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion2, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + // Run the kernels + if (!gSkipCorrectnessTesting) + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp new file mode 100644 index 000000000..366e63e8b --- /dev/null +++ b/test_conformance/math_brute_force/binary_operator_half.cpp @@ -0,0 +1,661 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +struct ThreadInfo +{ + // Input and output buffers for the thread + clMemWrapper inBuf; + clMemWrapper inBuf2; + Buffers outBuf; + + // max error value. Init to 0. + float maxError; + // position of the max error value (param 1). Init to 0. + double maxErrorValue; + // position of the max error value (param 2). Init to 0. + double maxErrorValue2; + MTdataHolder d; + + // Per thread command queue to improve performance + clCommandQueueWrapper tQueue; +}; + +struct TestInfo +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; + + // Array of thread specific information + std::vector tinfo; + + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + // no special fields +}; + +// A table of more difficult cases to get right +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + float ulps = job->ulps; + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_int error; + + const char *name = job->f->name; + cl_half *r = 0; + std::vector s(0), s2(0); + RoundingMode oldRoundMode; + + cl_event e[VECTOR_SIZE_COUNT]; + cl_half *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + // Init input array + cl_half *p = (cl_half *)gIn + thread_id * buffer_elements; + cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements; + cl_uint idx = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesHalfCount; + int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)lastSpecialJobIndex) + { + // Insert special values + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; idx < buffer_elements; idx++) + { + p[idx] = specialValuesHalf[x]; + p2[idx] = specialValuesHalf[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesHalfCount) break; + } + } + } + + // Init any remaining values + for (; idx < buffer_elements; idx++) + { + p[idx] = (cl_half)genrand_int32(d); + p2[idx] = (cl_half)genrand_int32(d); + } + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // Run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) + { + return CL_SUCCESS; + } + + // Calculate the correctly rounded reference result + FPU_mode_type oldMode; + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); + + // Set the rounding mode to match the device + oldRoundMode = kRoundToNearestEven; + if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat); + + // Calculate the correctly rounded reference result + r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + s.resize(buffer_elements); + s2.resize(buffer_elements); + + for (size_t j = 0; j < buffer_elements; j++) + { + s[j] = HTF(p[j]); + s2[j] = HTF(p2[j]); + r[j] = HFF(func.f_ff(s[j], s2[j])); + } + + if (ftz) RestoreFPState(&oldMode); + + // Read the data back -- no need to wait for the first N-1 buffers but wait + // for the last buffer. This is an in order queue. + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; + out[j] = (cl_ushort *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Verify data + + for (size_t j = 0; j < buffer_elements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *q = out[k]; + + // If we aren't getting the correctly rounded result + if (r[j] != q[j]) + { + float test = HTF(q[j]); + float correct = func.f_ff(s[j], s2[j]); + + // Per section 10 paragraph 6, accept any result if an input or + // output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference functions + // calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j]) + || IsFloatInfinity(s[j]) || IsFloatNaN(s[j])) + continue; + } + + float err = Ulp_Error_Half(q[j], correct); + + int fail = !(fabsf(err) <= ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + double correct2, correct3; + float err2, err3; + + correct2 = func.f_ff(0.0, s2[j]); + correct3 = func.f_ff(-0.0, s2[j]); + + // Per section 10 paragraph 6, accept any result if an + // input or output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + + // try with both args as zero + if (IsHalfSubnormal(p2[j])) + { + double correct4, correct5; + float err4, err5; + + correct2 = func.f_ff(0.0, 0.0); + correct3 = func.f_ff(-0.0, 0.0); + correct4 = func.f_ff(0.0, -0.0); + correct5 = func.f_ff(-0.0, -0.0); + + // Per section 10 paragraph 6, accept any result if + // an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct2) + || IsFloatNaN(correct2) + || IsFloatInfinity(correct3) + || IsFloatNaN(correct3) + || IsFloatInfinity(correct4) + || IsFloatNaN(correct4) + || IsFloatInfinity(correct5) + || IsFloatNaN(correct5)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + err4 = Ulp_Error_Half(q[j], correct4); + err5 = Ulp_Error_Half(q[j], correct5); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps)) + && (!(fabsf(err4) <= ulps)) + && (!(fabsf(err5) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps) + || IsHalfResultSubnormal(correct4, ulps) + || IsHalfResultSubnormal(correct5, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + else if (IsHalfSubnormal(p2[j])) + { + double correct2, correct3; + float err2, err3; + + correct2 = func.f_ff(s[j], 0.0); + correct3 = func.f_ff(s[j], -0.0); + + + // Per section 10 paragraph 6, accept any result if an + // input or output is a infinity or NaN or overflow + if (!gInfNanSupport) + { + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsFloatInfinity(correct) || IsFloatNaN(correct) + || IsFloatInfinity(correct2) + || IsFloatNaN(correct2)) + continue; + } + + err2 = Ulp_Error_Half(q[j], correct2); + err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + tinfo->maxErrorValue2 = s2[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), " + "%a (0x%04x)}\nExpected: %a (half 0x%04x) " + "\nActual: %a (half 0x%04x) at index: %zu\n", + name, sizeNames[k], err, s[j], p[j], s2[j], + p2[j], HTF(r[j]), r[j], test, q[j], j); + return -1; + } + } + } + } + + if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return CL_SUCCESS; +} + +} // anonymous namespace + +int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfo test_info{}; + cl_int error; + float maxError = 0.0f; + double maxErrorVal = 0.0; + double maxErrorVal2 = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of " + "gOutBuffer[%d] for region {%zd, %zd}\n", + (int)j, region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info{ test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + // Run the kernels + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (cl_uint i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + maxErrorVal2 = test_info.tinfo[i].maxErrorValue2; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/binary_two_results_i_half.cpp b/test_conformance/math_brute_force/binary_two_results_i_half.cpp new file mode 100644 index 000000000..adeada7ca --- /dev/null +++ b/test_conformance/math_brute_force/binary_two_results_i_half.cpp @@ -0,0 +1,477 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include +#include + +namespace { + +cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Int, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +struct ComputeReferenceInfoF +{ + const cl_half *x; + const cl_half *y; + cl_half *r; + int32_t *i; + double (*f_ffpI)(double, double, int *); + cl_uint lim; + cl_uint count; +}; + +cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo) +{ + ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo; + cl_uint lim = cri->lim; + cl_uint count = cri->count; + cl_uint off = jid * count; + const cl_half *x = cri->x + off; + const cl_half *y = cri->y + off; + cl_half *r = cri->r + off; + int32_t *i = cri->i + off; + double (*f)(double, double, int *) = cri->f_ffpI; + + if (off + count > lim) count = lim - off; + + for (cl_uint j = 0; j < count; ++j) + r[j] = HFF((float)f((double)HTF(x[j]), (double)HTF(y[j]), i + j)); + + return CL_SUCCESS; +} + +} // anonymous namespace + +int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + Programs programs; + const unsigned thread_id = 0; // Test is currently not multithreaded. + KernelMatrix kernels; + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + int64_t maxError2 = 0; + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + + // use larger type of output data to prevent overflowing buffer size + constexpr size_t buffer_size = BUFFER_SIZE / sizeof(int32_t); + + cl_uint threadCount = GetThreadCount(); + + float half_ulps = f->half_ulps; + + int testingRemquo = !strcmp(f->name, "remquo"); + + // Init the kernels + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + + for (uint64_t i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_half *p = (cl_half *)gIn; + cl_half *p2 = (cl_half *)gIn2; + for (size_t j = 0; j < buffer_size; j++) + { + p[j] = (cl_half)genrand_int32(d); + p2[j] = (cl_half)genrand_int32(d); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + buffer_size * sizeof(cl_half), gIn, 0, + NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + buffer_size * sizeof(cl_half), gIn2, + 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + // Write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + + memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut2[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 1 failed!\n"); + + error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 2 failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // align working group size with the bigger output type + size_t vectorSize = sizeValues[j] * sizeof(int32_t); + size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = + clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gOutBuffer2[j]), &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 3, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + if (threadCount > 1) + { + ComputeReferenceInfoF cri; + cri.x = p; + cri.y = p2; + cri.r = (cl_half *)gOut_Ref; + cri.i = (int32_t *)gOut_Ref2; + cri.f_ffpI = f->func.f_ffpI; + cri.lim = buffer_size; + cri.count = (cri.lim + threadCount - 1) / threadCount; + ThreadPool_Do(ReferenceF, threadCount, &cri); + } + else + { + cl_half *r = (cl_half *)gOut_Ref; + int32_t *r2 = (int32_t *)gOut_Ref2; + for (size_t j = 0; j < buffer_size; j++) + r[j] = + HFF((float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), r2 + j)); + } + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = + (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0, + BUFFER_SIZE, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0, + BUFFER_SIZE, gOut2[j], 0, NULL, NULL))) + { + vlog_error("ReadArray2 failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + cl_half *t = (cl_half *)gOut_Ref; + int32_t *t2 = (int32_t *)gOut_Ref2; + for (size_t j = 0; j < buffer_size; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *q = (cl_half *)(gOut[k]); + int32_t *q2 = (int32_t *)gOut2[k]; + + // Check for exact match to correctly rounded result + if (t[j] == q[j] && t2[j] == q2[j]) continue; + + // Check for paired NaNs + if (IsHalfNaN(t[j]) && IsHalfNaN(q[j]) && t2[j] == q2[j]) + continue; + + cl_half test = ((cl_half *)q)[j]; + int correct2 = INT_MIN; + float correct = + (float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), &correct2); + float err = Ulp_Error_Half(test, correct); + int64_t iErr; + + // in case of remquo, we only care about the sign and last + // seven bits of integer as per the spec. + if (testingRemquo) + iErr = (long long)(q2[j] & 0x0000007f) + - (long long)(correct2 & 0x0000007f); + else + iErr = (long long)q2[j] - (long long)correct2; + + // For remquo, if y = 0, x is infinite, or either is NaN + // then the standard either neglects to say what is returned + // in iptr or leaves it undefined or implementation defined. + int iptrUndefined = IsHalfInfinity(p[j]) || (HTF(p2[j]) == 0.0f) + || IsHalfNaN(p2[j]) || IsHalfNaN(p[j]); + if (iptrUndefined) iErr = 0; + + int fail = !(fabsf(err) <= half_ulps && iErr == 0); + if (ftz && fail) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, half_ulps)) + { + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + int correct3i, correct4i; + float correct3 = + (float)f->func.f_ffpI(0.0, HTF(p2[j]), &correct3i); + float correct4 = + (float)f->func.f_ffpI(-0.0, HTF(p2[j]), &correct4i); + float err2 = Ulp_Error_Half(test, correct3); + float err3 = Ulp_Error_Half(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, half_ulps) + || IsHalfResultSubnormal(correct3, half_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + + // try with both args as zero + if (IsHalfSubnormal(p2[j])) + { + int correct7i, correct8i; + correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i); + correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i); + double correct7 = + f->func.f_ffpI(0.0, -0.0, &correct7i); + double correct8 = + f->func.f_ffpI(-0.0, -0.0, &correct8i); + err2 = Ulp_Error_Half(test, correct3); + err3 = Ulp_Error_Half(test, correct4); + float err4 = Ulp_Error_Half(test, correct7); + float err5 = Ulp_Error_Half(test, correct8); + iErr3 = (long long)q2[j] - (long long)correct3i; + iErr4 = (long long)q2[j] - (long long)correct4i; + int64_t iErr7 = + (long long)q2[j] - (long long)correct7i; + int64_t iErr8 = + (long long)q2[j] - (long long)correct8i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps + && iErr4 == 0)) + && (!(fabsf(err4) <= half_ulps + && iErr7 == 0)) + && (!(fabsf(err5) <= half_ulps + && iErr8 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + if (llabs(iErr7) < llabs(iErr)) iErr = iErr7; + if (llabs(iErr8) < llabs(iErr)) iErr = iErr8; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct3, half_ulps) + || IsHalfResultSubnormal(correct4, half_ulps) + || IsHalfResultSubnormal(correct7, half_ulps) + || IsHalfResultSubnormal(correct8, half_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0 + || iErr7 == 0 || iErr8 == 0)); + if (!fail) err = 0.0f; + } + } + } + else if (IsHalfSubnormal(p2[j])) + { + int correct3i, correct4i; + double correct3 = + f->func.f_ffpI(HTF(p[j]), 0.0, &correct3i); + double correct4 = + f->func.f_ffpI(HTF(p[j]), -0.0, &correct4i); + float err2 = Ulp_Error_Half(test, correct3); + float err3 = Ulp_Error_Half(test, correct4); + int64_t iErr3 = (long long)q2[j] - (long long)correct3i; + int64_t iErr4 = (long long)q2[j] - (long long)correct4i; + fail = fail + && ((!(fabsf(err2) <= half_ulps && iErr3 == 0)) + && (!(fabsf(err3) <= half_ulps && iErr4 == 0))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (llabs(iErr3) < llabs(iErr)) iErr = iErr3; + if (llabs(iErr4) < llabs(iErr)) iErr = iErr4; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, half_ulps) + || IsHalfResultSubnormal(correct3, half_ulps)) + { + fail = fail + && !(test == 0.0f + && (iErr3 == 0 || iErr4 == 0)); + if (!fail) err = 0.0f; + } + } + } + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = HTF(p[j]); + } + if (llabs(iErr) > maxError2) + { + maxError2 = llabs(iErr); + maxErrorVal2 = HTF(p[j]); + } + + if (fail) + { + vlog_error("\nERROR: %s%s: {%f, %" PRId64 + "} ulp error at {%a, %a} " + "({0x%04x, 0x%04x}): *{%a, %d} ({0x%04x, " + "0x%8.8x}) vs. {%a, %d} ({0x%04x, 0x%8.8x})\n", + f->name, sizeNames[k], err, iErr, HTF(p[j]), + HTF(p2[j]), p[j], p2[j], HTF(t[j]), t2[j], t[j], + t2[j], HTF(test), q2[j], test, q2[j]); + return -1; + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10d \n", + i, step, BUFFER_SIZE); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2, + maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/common.cpp b/test_conformance/math_brute_force/common.cpp index 47f493e7a..df45a7007 100644 --- a/test_conformance/math_brute_force/common.cpp +++ b/test_conformance/math_brute_force/common.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2022 The Khronos Group Inc. +// Copyright (c) 2022-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -27,8 +27,11 @@ const char *GetTypeName(ParameterType type) { switch (type) { + case ParameterType::Half: return "half"; case ParameterType::Float: return "float"; case ParameterType::Double: return "double"; + case ParameterType::Short: return "short"; + case ParameterType::UShort: return "ushort"; case ParameterType::Int: return "int"; case ParameterType::UInt: return "uint"; case ParameterType::Long: return "long"; @@ -41,9 +44,13 @@ const char *GetUndefValue(ParameterType type) { switch (type) { + case ParameterType::Half: case ParameterType::Float: case ParameterType::Double: return "NAN"; + case ParameterType::Short: + case ParameterType::UShort: return "0x5678"; + case ParameterType::Int: case ParameterType::UInt: return "0x12345678"; @@ -71,14 +78,17 @@ void EmitEnableExtension(std::ostringstream &kernel, const std::initializer_list &types) { bool needsFp64 = false; + bool needsFp16 = false; for (const auto &type : types) { switch (type) { case ParameterType::Double: needsFp64 = true; break; - + case ParameterType::Half: needsFp16 = true; break; case ParameterType::Float: + case ParameterType::Short: + case ParameterType::UShort: case ParameterType::Int: case ParameterType::UInt: case ParameterType::Long: @@ -89,6 +99,7 @@ void EmitEnableExtension(std::ostringstream &kernel, } if (needsFp64) kernel << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; + if (needsFp16) kernel << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; } std::string GetBuildOptions(bool relaxed_mode) diff --git a/test_conformance/math_brute_force/common.h b/test_conformance/math_brute_force/common.h index 481b3b2a2..eb82c5f8f 100644 --- a/test_conformance/math_brute_force/common.h +++ b/test_conformance/math_brute_force/common.h @@ -1,5 +1,5 @@ // -// Copyright (c) 2021 The Khronos Group Inc. +// Copyright (c) 2021-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -36,8 +36,11 @@ using Buffers = std::array; // Types supported for kernel code generation. enum class ParameterType { + Half, Float, Double, + Short, + UShort, Int, UInt, Long, diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp index 917362852..bd76ef2cc 100644 --- a/test_conformance/math_brute_force/function_list.cpp +++ b/test_conformance/math_brute_force/function_list.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2017-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -29,36 +29,41 @@ // Only use ulps information in spir test #ifdef FUNCTION_LIST_ULPS_ONLY -#define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ +#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ { \ STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ - _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ + _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ RELAXED_OFF, _type \ } -#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type, \ - _relaxed_embedded_ulp) \ +#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \ + _type, _relaxed_embedded_ulp) \ { \ STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL }, \ - _ulp, _ulp, _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, \ - _rmode, RELAXED_ON, _type \ + _ulp, _ulp, _half_ulp, _embedded_ulp, _relaxed_ulp, \ + _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type \ } #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ { \ "half_" STRINGIFY(_name), "half_" STRINGIFY(_name), { NULL }, \ - { NULL }, { NULL }, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, \ - _rmode, RELAXED_OFF, _type \ + { NULL }, { NULL }, _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, \ + INFINITY, _rmode, RELAXED_OFF, _type \ } -#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type) \ +#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, \ + _rmode, _type) \ { \ STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \ - _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ + _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ + _type \ } #define unaryF NULL +#define unaryOF NULL #define i_unaryF NULL #define unaryF_u NULL #define macro_unaryF NULL #define binaryF NULL +#define binaryOF NULL +#define binaryF_nextafter NULL #define binaryOperatorF NULL #define binaryF_i NULL #define macro_binaryF NULL @@ -76,31 +81,34 @@ #else // FUNCTION_LIST_ULPS_ONLY -#define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ +#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type) \ { \ STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ { (void*)reference_##_name##l }, { (void*)reference_##_name }, \ - _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ + _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, \ RELAXED_OFF, _type \ } -#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type, \ - _relaxed_embedded_ulp) \ +#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \ + _type, _relaxed_embedded_ulp) \ { \ STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name }, \ { (void*)reference_##_name##l }, \ - { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _embedded_ulp, \ - _relaxed_ulp, _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type \ + { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _half_ulp, \ + _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode, \ + RELAXED_ON, _type \ } #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type) \ { \ "half_" STRINGIFY(_name), "half_" STRINGIFY(_name), \ { (void*)reference_##_name }, { NULL }, { NULL }, _ulp, _ulp, \ - _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ + _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \ + _type \ } -#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type) \ +#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, \ + _rmode, _type) \ { \ STRINGIFY(_name), _operator, { (void*)reference_##_name }, \ - { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, \ + { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _half_ulp, \ _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type \ } @@ -108,85 +116,114 @@ static constexpr vtbl _unary = { "unary", TestFunc_Float_Float, TestFunc_Double_Double, + TestFunc_Half_Half, }; +static constexpr vtbl _unaryof = { "unaryof", TestFunc_Float_Float, NULL, + NULL }; + static constexpr vtbl _i_unary = { "i_unary", TestFunc_Int_Float, TestFunc_Int_Double, + TestFunc_Int_Half, }; static constexpr vtbl _unary_u = { "unary_u", TestFunc_Float_UInt, TestFunc_Double_ULong, + TestFunc_Half_UShort, }; static constexpr vtbl _macro_unary = { "macro_unary", TestMacro_Int_Float, TestMacro_Int_Double, + TestMacro_Int_Half, }; static constexpr vtbl _binary = { "binary", TestFunc_Float_Float_Float, TestFunc_Double_Double_Double, + TestFunc_Half_Half_Half, }; +static constexpr vtbl _binary_nextafter = { + "binary", + TestFunc_Float_Float_Float, + TestFunc_Double_Double_Double, + TestFunc_Half_Half_Half_nextafter, +}; + +static constexpr vtbl _binaryof = { "binaryof", TestFunc_Float_Float_Float, + NULL, NULL }; + static constexpr vtbl _binary_operator = { "binaryOperator", TestFunc_Float_Float_Float_Operator, TestFunc_Double_Double_Double_Operator, + TestFunc_Half_Half_Half_Operator, }; static constexpr vtbl _binary_i = { "binary_i", TestFunc_Float_Float_Int, TestFunc_Double_Double_Int, + TestFunc_Half_Half_Int, }; static constexpr vtbl _macro_binary = { "macro_binary", TestMacro_Int_Float_Float, TestMacro_Int_Double_Double, + TestMacro_Int_Half_Half, }; static constexpr vtbl _ternary = { "ternary", TestFunc_Float_Float_Float_Float, TestFunc_Double_Double_Double_Double, + TestFunc_Half_Half_Half_Half, }; static constexpr vtbl _unary_two_results = { "unary_two_results", TestFunc_Float2_Float, TestFunc_Double2_Double, + TestFunc_Half2_Half, }; static constexpr vtbl _unary_two_results_i = { "unary_two_results_i", TestFunc_FloatI_Float, TestFunc_DoubleI_Double, + TestFunc_HalfI_Half, }; static constexpr vtbl _binary_two_results_i = { "binary_two_results_i", TestFunc_FloatI_Float_Float, TestFunc_DoubleI_Double_Double, + TestFunc_HalfI_Half_Half, }; static constexpr vtbl _mad_tbl = { "ternary", TestFunc_mad_Float, TestFunc_mad_Double, + TestFunc_mad_Half, }; #define unaryF &_unary +#define unaryOF &_unaryof #define i_unaryF &_i_unary #define unaryF_u &_unary_u #define macro_unaryF &_macro_unary #define binaryF &_binary +#define binaryF_nextafter &_binary_nextafter +#define binaryOF &_binaryof #define binaryOperatorF &_binary_operator #define binaryF_i &_binary_i #define macro_binaryF &_macro_binary @@ -198,25 +235,26 @@ static constexpr vtbl _mad_tbl = { #endif // FUNCTION_LIST_ULPS_ONLY +// clang-format off const Func functionList[] = { - ENTRY_EXT(acos, 4.0f, 4.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), - ENTRY(acosh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(acospi, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY_EXT(asin, 4.0f, 4.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), - ENTRY(asinh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(asinpi, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY_EXT(atan, 5.0f, 5.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), - ENTRY(atanh, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY(atanpi, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY(atan2, 6.0f, 6.0f, FTZ_OFF, binaryF), - ENTRY(atan2pi, 6.0f, 6.0f, FTZ_OFF, binaryF), - ENTRY(cbrt, 2.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(ceil, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(copysign, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY_EXT(cos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY_EXT(acos, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), + ENTRY(acosh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(acospi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY_EXT(asin, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), + ENTRY(asinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(asinpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY_EXT(atan, 5.0f, 5.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f), + ENTRY(atanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(atanpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(atan2, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF), + ENTRY(atan2pi, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF), + ENTRY(cbrt, 2.0f, 4.0f, 2.f, FTZ_OFF, unaryF), + ENTRY(ceil, 0.0f, 0.0f, 0.f, FTZ_OFF, unaryF), + ENTRY(copysign, 0.0f, 0.0f, 0.f, FTZ_OFF, binaryF), + ENTRY_EXT(cos, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 - ENTRY(cosh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY_EXT(cospi, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY(cosh, 4.0f, 4.0f, 2.f, FTZ_OFF, unaryF), + ENTRY_EXT(cospi, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 // ENTRY( erfc, 16.0f, // 16.0f, FTZ_OFF, unaryF), @@ -225,81 +263,84 @@ const Func functionList[] = { // 16.0f, 16.0f, FTZ_OFF, // unaryF), //disabled for 1.0 due to lack // of reference implementation - ENTRY_EXT(exp, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF, + ENTRY_EXT(exp, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF, 4.0f), // relaxed error is actually overwritten in unary.c as it // is 3+floor(fabs(2*x)) - ENTRY_EXT(exp2, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF, + ENTRY_EXT(exp2, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF, 4.0f), // relaxed error is actually overwritten in unary.c as it // is 3+floor(fabs(2*x)) - ENTRY_EXT(exp10, 3.0f, 4.0f, 8192.0f, FTZ_OFF, unaryF, + ENTRY_EXT(exp10, 3.0f, 4.0f, 2.f, 8192.0f, FTZ_OFF, unaryF, 8192.0f), // relaxed error is actually overwritten in unary.c as // it is 3+floor(fabs(2*x)) in derived mode, // in non-derived mode it uses the ulp error for half_exp10. - ENTRY(expm1, 3.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(fabs, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(fdim, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(floor, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(fma, 0.0f, 0.0f, FTZ_OFF, ternaryF), - ENTRY(fmax, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(fmin, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(fmod, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(fract, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), - ENTRY(frexp, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i), - ENTRY(hypot, 4.0f, 4.0f, FTZ_OFF, binaryF), - ENTRY(ilogb, 0.0f, 0.0f, FTZ_OFF, i_unaryF), - ENTRY(isequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isfinite, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isgreaterequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isinf, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isless, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(islessequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(islessgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isnan, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isnormal, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY(isnotequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(isunordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), - ENTRY(ldexp, 0.0f, 0.0f, FTZ_OFF, binaryF_i), - ENTRY(lgamma, INFINITY, INFINITY, FTZ_OFF, unaryF), - ENTRY(lgamma_r, INFINITY, INFINITY, FTZ_OFF, unaryF_two_results_i), - ENTRY_EXT(log, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, + ENTRY(expm1, 3.0f, 4.0f, 2.f, FTZ_OFF, unaryF), + ENTRY(fabs, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(fdim, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(floor, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(fma, 0.0f, 0.0f, 0.0f, FTZ_OFF, ternaryF), + ENTRY(fmax, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(fmin, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(fmod, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(fract, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), + ENTRY(frexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i), + ENTRY(hypot, 4.0f, 4.0f, 2.0f, FTZ_OFF, binaryF), + ENTRY(ilogb, 0.0f, 0.0f, 0.0f, FTZ_OFF, i_unaryF), + ENTRY(isequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isfinite, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isgreaterequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isinf, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isless, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(islessequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(islessgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isnan, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isnormal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY(isnotequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(isunordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF), + ENTRY(ldexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_i), + ENTRY(lgamma, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF), + ENTRY(lgamma_r, INFINITY, INFINITY, INFINITY, FTZ_OFF, + unaryF_two_results_i), + ENTRY_EXT(log, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, 4.76837158203125e-7f), // relaxed ulp 2^-21 - ENTRY_EXT(log2, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, + ENTRY_EXT(log2, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, 4.76837158203125e-7f), // relaxed ulp 2^-21 - ENTRY_EXT(log10, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, + ENTRY_EXT(log10, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF, 4.76837158203125e-7f), // relaxed ulp 2^-21 - ENTRY(log1p, 2.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(logb, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, FTZ_OFF, mad_function, + ENTRY(log1p, 2.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(logb, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF, + mad_function, INFINITY), // in fast-relaxed-math mode it has to be either // exactly rounded fma or exactly rounded a*b+c - ENTRY(maxmag, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(minmag, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(modf, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), - ENTRY(nan, 0.0f, 0.0f, FTZ_OFF, unaryF_u), - ENTRY(nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY_EXT(pow, 16.0f, 16.0f, 8192.0f, FTZ_OFF, binaryF, + ENTRY(maxmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(minmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(modf, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results), + ENTRY(nan, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_u), + ENTRY(nextafter, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter), + ENTRY_EXT(pow, 16.0f, 16.0f, 4.0f, 8192.0f, FTZ_OFF, binaryF, 8192.0f), // in derived mode the ulp error is calculated as // exp2(y*log2(x)) and in non-derived it is the same as // half_pow - ENTRY(pown, 16.0f, 16.0f, FTZ_OFF, binaryF_i), - ENTRY(powr, 16.0f, 16.0f, FTZ_OFF, binaryF), + ENTRY(pown, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i), + ENTRY(powr, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF), // ENTRY( reciprocal, 1.0f, // 1.0f, FTZ_OFF, unaryF), - ENTRY(remainder, 0.0f, 0.0f, FTZ_OFF, binaryF), - ENTRY(remquo, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i), - ENTRY(rint, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(rootn, 16.0f, 16.0f, FTZ_OFF, binaryF_i), - ENTRY(round, 0.0f, 0.0f, FTZ_OFF, unaryF), - ENTRY(rsqrt, 2.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY(signbit, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), - ENTRY_EXT(sin, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY(remainder, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF), + ENTRY(remquo, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i), + ENTRY(rint, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(rootn, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i), + ENTRY(round, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(rsqrt, 2.0f, 4.0f, 1.0f, FTZ_OFF, unaryF), + ENTRY(signbit, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + ENTRY_EXT(sin, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 - ENTRY_EXT(sincos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF_two_results, + ENTRY_EXT(sincos, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, + unaryF_two_results, 0.00048828125f), // relaxed ulp 2^-11 - ENTRY(sinh, 4.0f, 4.0f, FTZ_OFF, unaryF), - ENTRY_EXT(sinpi, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF, + ENTRY(sinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY_EXT(sinpi, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF, 0.00048828125f), // relaxed ulp 2^-11 { "sqrt", "sqrt", @@ -308,6 +349,7 @@ const Func functionList[] = { { NULL }, 3.0f, 0.0f, + 0.0f, 4.0f, INFINITY, INFINITY, @@ -322,41 +364,42 @@ const Func functionList[] = { 0.0f, 0.0f, 0.0f, + 0.0f, INFINITY, INFINITY, FTZ_OFF, RELAXED_OFF, unaryF }, ENTRY_EXT( - tan, 5.0f, 5.0f, 8192.0f, FTZ_OFF, unaryF, + tan, 5.0f, 5.0f, 2.0f, 8192.0f, FTZ_OFF, unaryF, 8192.0f), // in derived mode it the ulp error is calculated as sin/cos // and in non-derived mode it is the same as half_tan. - ENTRY(tanh, 5.0f, 5.0f, FTZ_OFF, unaryF), - ENTRY(tanpi, 6.0f, 6.0f, FTZ_OFF, unaryF), + ENTRY(tanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF), + ENTRY(tanpi, 6.0f, 6.0f, 2.0f, FTZ_OFF, unaryF), // ENTRY( tgamma, 16.0f, // 16.0f, FTZ_OFF, unaryF), // // Commented this out until we can be // sure this requirement is realistic - ENTRY(trunc, 0.0f, 0.0f, FTZ_OFF, unaryF), + ENTRY(trunc, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), - HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryF), - HALF_ENTRY(exp, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(exp2, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(exp10, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(log, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(log2, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(log10, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(powr, 8192.0f, 8192.0f, FTZ_ON, binaryF), - HALF_ENTRY(recip, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(sin, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF), - HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryF), + HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryOF), + HALF_ENTRY(exp, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(exp2, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(exp10, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(log, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(log2, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(log10, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(powr, 8192.0f, 8192.0f, FTZ_ON, binaryOF), + HALF_ENTRY(recip, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(sin, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryOF), + HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryOF), // basic operations - OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), - OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), + OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), + OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), { "divide", "/", { (void*)reference_divide }, @@ -364,6 +407,7 @@ const Func functionList[] = { { (void*)reference_relaxed_divide }, 2.5f, 0.0f, + 0.0f, 3.0f, 2.5f, INFINITY, @@ -378,15 +422,17 @@ const Func functionList[] = { 0.0f, 0.0f, 0.0f, + 0.0f, 0.f, INFINITY, FTZ_OFF, RELAXED_OFF, binaryOperatorF }, - OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), - OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, FTZ_OFF, + OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF), + OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF), // A simple copy operation - OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, FTZ_OFF, macro_unaryF), + OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF), }; +// clang-format on const size_t functionListCount = sizeof(functionList) / sizeof(functionList[0]); diff --git a/test_conformance/math_brute_force/function_list.h b/test_conformance/math_brute_force/function_list.h index 95a294593..71bde9fdb 100644 --- a/test_conformance/math_brute_force/function_list.h +++ b/test_conformance/math_brute_force/function_list.h @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2017-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -70,6 +70,9 @@ struct vtbl int (*DoubleTestFunc)( const struct Func *, MTdata, bool); // may be NULL if function is single precision only + int (*HalfTestFunc)( + const struct Func *, MTdata, + bool); // may be NULL if function is single precision only }; struct Func @@ -82,6 +85,7 @@ struct Func fptr rfunc; float float_ulps; float double_ulps; + float half_ulps; float float_embedded_ulps; float relaxed_error; float relaxed_embedded_error; diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp new file mode 100644 index 000000000..3a8991a38 --- /dev/null +++ b/test_conformance/math_brute_force/i_unary_half.cpp @@ -0,0 +1,220 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include +#include +#include + +namespace { + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Int, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +} // anonymous namespace + +int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + Programs programs; + KernelMatrix kernels; + const unsigned thread_id = 0; // Test is currently not multithreaded. + int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_int), + size_t(1ULL << (sizeof(cl_half) * 8))); + size_t bufferSizeIn = bufferElements * sizeof(cl_half); + size_t bufferSizeOut = bufferElements * sizeof(cl_int); + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // This test is not using ThreadPool so we need to disable FTZ here + // for reference computations + FPU_mode_type oldMode; + DisableFTZ(&oldMode); + std::shared_ptr at_scope_exit( + nullptr, [&oldMode](int *) { RestoreFPState(&oldMode); }); + + // Init the kernels + { + BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + std::vector s(bufferElements); + + for (uint64_t i = 0; i < (1ULL << 16); i += step) + { + // Init input array + cl_ushort *p = (cl_ushort *)gIn; + + for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j; + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSizeIn, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, bufferSizeOut); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, bufferSizeOut, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSizeOut, + 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_int); + size_t localCount = (bufferSizeOut + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the correctly rounded reference result + int *r = (int *)gOut_Ref; + for (size_t j = 0; j < bufferElements; j++) + { + s[j] = HTF(p[j]); + r[j] = f->func.i_f(s[j]); + } + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSizeOut, gOut[j], 0, NULL, + NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint32_t *t = (uint32_t *)gOut_Ref; + for (size_t j = 0; j < bufferElements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint32_t *q = (uint32_t *)(gOut[k]); + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + if (ftz && IsHalfSubnormal(p[j])) + { + unsigned int correct0 = f->func.i_f(0.0); + unsigned int correct1 = f->func.i_f(-0.0); + if (q[j] == correct0 || q[j] == correct1) continue; + } + + uint32_t err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%04x): " + "*%d vs. %d\n", + f->name, sizeNames[k], err, s[j], p[j], t[j], + q[j]); + return -1; + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10zd \n", + i, step, bufferSizeOut); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp new file mode 100644 index 000000000..ea4ef8128 --- /dev/null +++ b/test_conformance/math_brute_force/macro_binary_half.cpp @@ -0,0 +1,540 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetBinaryKernel(kernel_name, builtin, ParameterType::Short, + ParameterType::Half, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +struct ThreadInfo +{ + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper inBuf2; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + MTdataHolder d; + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +}; + +struct TestInfoBase +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + int ftz; // non-zero if running in flush to zero mode +}; + +struct TestInfo : public TestInfoBase +{ + TestInfo(const TestInfoBase &base): TestInfoBase(base) {} + + // Array of thread specific information + std::vector tinfo; + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; +}; + +// A table of more difficult cases to get right +const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + fptr func = job->f->func; + int ftz = job->ftz; + MTdata d = tinfo->d; + cl_uint j, k; + cl_int error; + const char *name = job->f->name; + cl_short *t, *r; + std::vector s(0), s2(0); + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_short *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + // Init input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + cl_ushort *p2 = (cl_ushort *)gIn2 + thread_id * buffer_elements; + j = 0; + int totalSpecialValueCount = + specialValuesHalfCount * specialValuesHalfCount; + int indx = (totalSpecialValueCount - 1) / buffer_elements; + + if (job_id <= (cl_uint)indx) + { // test edge cases + uint32_t x, y; + + x = (job_id * buffer_elements) % specialValuesHalfCount; + y = (job_id * buffer_elements) / specialValuesHalfCount; + + for (; j < buffer_elements; j++) + { + p[j] = specialValuesHalf[x]; + p2[j] = specialValuesHalf[y]; + if (++x >= specialValuesHalfCount) + { + x = 0; + y++; + if (y >= specialValuesHalfCount) break; + } + } + } + + // Init any remaining values. + for (; j < buffer_elements; j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = (cl_ushort)genrand_int32(d); + } + + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0, + buffer_size, p2, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2), + &tinfo->inBuf2))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + r = (cl_short *)gOut_Ref + thread_id * buffer_elements; + t = (cl_short *)r; + s.resize(buffer_elements); + s2.resize(buffer_elements); + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + s2[j] = cl_half_to_float(p2[j]); + r[j] = (short)func.i_ff(s[j], s2[j]); + } + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Wait for the last buffer + out[j] = (cl_short *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + return error; + } + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + cl_short *q = (cl_short *)out[0]; + + // If we aren't getting the correctly rounded result + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) + { + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + if (IsHalfSubnormal(p2[j])) + { + short correct = (short)func.i_ff(0.0f, 0.0f); + short correct2 = (short)func.i_ff(0.0f, -0.0f); + short correct3 = (short)func.i_ff(-0.0f, 0.0f); + short correct4 = (short)func.i_ff(-0.0f, -0.0f); + + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) + continue; + } + else + { + short correct = (short)func.i_ff(0.0f, s2[j]); + short correct2 = (short)func.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + else if (IsHalfSubnormal(p2[j])) + { + short correct = (short)func.i_ff(s[j], 0.0f); + short correct2 = (short)func.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + short err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error( + "\nERROR: %s: %d ulp error at {%a (0x%04x), %a " + "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x (index: %d)\n", + name, err, s[j], p[j], s2[j], p2[j], t[j], q[j], j); + error = -1; + return error; + } + + + for (k = std::max(1U, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; + k++) + { + q = out[k]; + // If we aren't getting the correctly rounded result + if (-t[j] != q[j]) + { + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + if (IsHalfSubnormal(p2[j])) + { + short correct = (short)-func.i_ff(0.0f, 0.0f); + short correct2 = (short)-func.i_ff(0.0f, -0.0f); + short correct3 = (short)-func.i_ff(-0.0f, 0.0f); + short correct4 = (short)-func.i_ff(-0.0f, -0.0f); + + if (correct == q[j] || correct2 == q[j] + || correct3 == q[j] || correct4 == q[j]) + continue; + } + else + { + short correct = (short)-func.i_ff(0.0f, s2[j]); + short correct2 = (short)-func.i_ff(-0.0f, s2[j]); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + else if (IsHalfSubnormal(p2[j])) + { + short correct = (short)-func.i_ff(s[j], 0.0f); + short correct2 = (short)-func.i_ff(s[j], -0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + cl_ushort err = -t[j] - q[j]; + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error("\nERROR: %s: %d ulp error at {%a (0x%04x), %a " + "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x " + "(index: %d)\n", + name, err, s[j], p[j], s2[j], p2[j], -t[j], q[j], j); + error = -1; + return error; + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return error; +} + +} // anonymous namespace + +int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step); + } + + test_info.f = f; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + test_info.tinfo[i].inBuf2 = + clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf2) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + + test_info.tinfo[i].d = MTdataHolder(genrand_int32(d)); + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp new file mode 100644 index 000000000..cb2020551 --- /dev/null +++ b/test_conformance/math_brute_force/macro_unary_half.cpp @@ -0,0 +1,427 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Short, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +struct ThreadInfo +{ + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +}; + +struct TestInfoBase +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + int ftz; // non-zero if running in flush to zero mode +}; + +struct TestInfo : public TestInfoBase +{ + TestInfo(const TestInfoBase &base): TestInfoBase(base) {} + + // Array of thread specific information + std::vector tinfo; + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; +}; + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint scale = job->scale; + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + fptr func = job->f->func; + int ftz = job->ftz; + cl_uint j, k; + cl_int error = CL_SUCCESS; + const char *name = job->f->name; + std::vector s(0); + + int signbit_test = 0; + if (!strcmp(name, "signbit")) signbit_test = 1; + +#define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s)) + + // start the map of the output arrays + cl_event e[VECTOR_SIZE_COUNT]; + cl_short *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + // Write the new values to the input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale; + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + cl_short *r = (cl_short *)gOut_Ref + thread_id * buffer_elements; + cl_short *t = (cl_short *)r; + s.resize(buffer_elements); + for (j = 0; j < buffer_elements; j++) + { + s[j] = cl_half_to_float(p[j]); + if (!strcmp(name, "isnormal")) + { + if ((IsHalfSubnormal(p[j]) == 0) && !((p[j] & 0x7fffU) >= 0x7c00U) + && ((p[j] & 0x7fffU) != 0x0000U)) + r[j] = 1; + else + r[j] = 0; + } + else + r[j] = (short)ref_func(s[j]); + } + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (cl_short *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + // Wait for the last buffer + out[j] = (cl_short *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + return error; + } + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + cl_short *q = out[0]; + + // If we aren't getting the correctly rounded result + if (gMinVectorSizeIndex == 0 && t[j] != q[j]) + { + // If we aren't getting the correctly rounded result + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + short correct = (short)ref_func(+0.0f); + short correct2 = (short)ref_func(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + short err = t[j] - q[j]; + if (q[j] > t[j]) err = q[j] - t[j]; + vlog_error("\nERROR: %s: %d ulp error at %a (0x%04x)\nExpected: " + "%d vs. %d\n", + name, err, s[j], p[j], t[j], q[j]); + error = -1; + return error; + } + + + for (k = std::max(1U, gMinVectorSizeIndex); k < gMaxVectorSizeIndex; + k++) + { + q = out[k]; + // If we aren't getting the correctly rounded result + if (-t[j] != q[j]) + { + if (ftz) + { + if (IsHalfSubnormal(p[j])) + { + short correct = (short)-ref_func(+0.0f); + short correct2 = (short)-ref_func(-0.0f); + if (correct == q[j] || correct2 == q[j]) continue; + } + } + + short err = -t[j] - q[j]; + if (q[j] > -t[j]) err = q[j] + t[j]; + vlog_error("\nERROR: %s%s: %d ulp error at %a " + "(0x%04x)\nExpected: %d \nActual: %d\n", + name, sizeNames[k], err, s[j], p[j], -t[j], q[j]); + error = -1; + return error; + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + return error; +} + +} // anonymous namespace + +int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = + std::max((cl_uint)1, + (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); + } + + test_info.f = f; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp new file mode 100644 index 000000000..b6cdda640 --- /dev/null +++ b/test_conformance/math_brute_force/mad_half.cpp @@ -0,0 +1,201 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetTernaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +} // anonymous namespace + +int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + Programs programs; + KernelMatrix kernels; + const unsigned thread_id = 0; // Test is currently not multithreaded. + float maxError = 0.0f; + + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + float maxErrorVal3 = 0.0f; + size_t bufferSize = BUFFER_SIZE; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + uint64_t step = getTestStep(sizeof(cl_half), bufferSize); + + // Init the kernels + { + BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + } + for (uint64_t i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_ushort *p = (cl_ushort *)gIn; + cl_ushort *p2 = (cl_ushort *)gIn2; + cl_ushort *p3 = (cl_ushort *)gIn3; + for (size_t j = 0; j < bufferSize / sizeof(cl_ushort); j++) + { + p[j] = (cl_ushort)genrand_int32(d); + p2[j] = (cl_ushort)genrand_int32(d); + p3[j] = (cl_ushort)genrand_int32(d); + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + bufferSize, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + bufferSize, gIn3, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); + return error; + } + + // write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_half) * sizeValues[j]; + size_t localCount = (bufferSize + vectorSize - 1) + / vectorSize; // bufferSize / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 3, + sizeof(gInBuffer3), &gInBuffer3))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data - no verification possible. MAD is a random number + // generator. + + if (0 == (i & 0x0fffffff)) + { + vlog("."); + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("pass"); + + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + } + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp index 947b94509..d939984e5 100644 --- a/test_conformance/math_brute_force/main.cpp +++ b/test_conformance/math_brute_force/main.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2017-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -49,6 +49,8 @@ #include "harness/testHarness.h" #define kPageSize 4096 +#define HALF_REQUIRED_FEATURES_1 (CL_FP_ROUND_TO_ZERO) +#define HALF_REQUIRED_FEATURES_2 (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN) #define DOUBLE_REQUIRED_FEATURES \ (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO \ | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM) @@ -81,6 +83,8 @@ static int gTestFastRelaxed = 1; */ int gFastRelaxedDerived = 1; static int gToggleCorrectlyRoundedDivideSqrt = 0; +int gHasHalf = 0; +cl_device_fp_config gHalfCapabilities = 0; int gDeviceILogb0 = 1; int gDeviceILogbNaN = 1; int gCheckTininessBeforeRounding = 1; @@ -104,6 +108,8 @@ cl_device_fp_config gFloatCapabilities = 0; int gWimpyReductionFactor = 32; int gVerboseBruteForce = 0; +cl_half_rounding_mode gHalfRoundingMode = CL_HALF_RTE; + static int ParseArgs(int argc, const char **argv); static void PrintUsage(void); static void PrintFunctions(void); @@ -167,7 +173,6 @@ static int doTest(const char *name) return 0; } } - { if (0 == strcmp("ilogb", func_data->name)) { @@ -236,6 +241,23 @@ static int doTest(const char *name) } } } + + if (gHasHalf && NULL != func_data->vtbl_ptr->HalfTestFunc) + { + gTestCount++; + vlog("%3d: ", gTestCount); + if (func_data->vtbl_ptr->HalfTestFunc(func_data, gMTdata, + false /* relaxed mode*/)) + { + gFailCount++; + error++; + if (gStopOnError) + { + gSkipRestOfTests = true; + return error; + } + } + } } return error; @@ -408,6 +430,8 @@ static int ParseArgs(int argc, const char **argv) case 'm': singleThreaded ^= 1; break; + case 'g': gHasHalf ^= 1; break; + case 'r': gTestFastRelaxed ^= 1; break; case 's': gStopOnError ^= 1; break; @@ -540,6 +564,8 @@ static void PrintUsage(void) vlog("\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 " "on)\n"); vlog("\t\t-f\tToggle float precision testing. (Default: on)\n"); + vlog("\t\t-g\tToggle half precision testing. (Default: on if khr_fp_16 " + "on)\n"); vlog("\t\t-r\tToggle fast relaxed math precision testing. (Default: on)\n"); vlog("\t\t-e\tToggle test as derived implementations for fast relaxed math " "precision. (Default: on)\n"); @@ -640,6 +666,54 @@ test_status InitCL(cl_device_id device) #endif } + gFloatToHalfRoundingMode = kRoundToNearestEven; + if (is_extension_available(gDevice, "cl_khr_fp16")) + { + gHasHalf ^= 1; +#if defined(CL_DEVICE_HALF_FP_CONFIG) + if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_HALF_FP_CONFIG, + sizeof(gHalfCapabilities), + &gHalfCapabilities, NULL))) + { + vlog_error( + "ERROR: Unable to get device CL_DEVICE_HALF_FP_CONFIG. (%d)\n", + error); + return TEST_FAIL; + } + if (HALF_REQUIRED_FEATURES_1 + != (gHalfCapabilities & HALF_REQUIRED_FEATURES_1) + && HALF_REQUIRED_FEATURES_2 + != (gHalfCapabilities & HALF_REQUIRED_FEATURES_2)) + { + char list[300] = ""; + if (0 == (gHalfCapabilities & CL_FP_ROUND_TO_NEAREST)) + strncat(list, "CL_FP_ROUND_TO_NEAREST, ", sizeof(list) - 1); + if (0 == (gHalfCapabilities & CL_FP_ROUND_TO_ZERO)) + strncat(list, "CL_FP_ROUND_TO_ZERO, ", sizeof(list) - 1); + if (0 == (gHalfCapabilities & CL_FP_INF_NAN)) + strncat(list, "CL_FP_INF_NAN, ", sizeof(list) - 1); + vlog_error("ERROR: required half features are missing: %s\n", list); + + return TEST_FAIL; + } + + if ((gHalfCapabilities & CL_FP_ROUND_TO_NEAREST) != 0) + { + gHalfRoundingMode = CL_HALF_RTE; + } + else // due to above condition it must be RTZ + { + gHalfRoundingMode = CL_HALF_RTZ; + } + +#else + vlog_error("FAIL: device says it supports cl_khr_fp16 but " + "CL_DEVICE_HALF_FP_CONFIG is not in the headers!\n"); + return TEST_FAIL; +#endif + } + + uint32_t deviceFrequency = 0; size_t configSize = sizeof(deviceFrequency); if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, @@ -829,6 +903,7 @@ test_status InitCL(cl_device_id device) "Bruteforce_Ulp_Error_Double() for more details.\n\n"); } + vlog("\tTesting half precision? %s\n", no_yes[0 != gHasHalf]); vlog("\tIs Embedded? %s\n", no_yes[0 != gIsEmbedded]); if (gIsEmbedded) vlog("\tRunning in RTZ mode? %s\n", no_yes[0 != gIsInRTZMode]); diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp index afa072f8e..c43b03b66 100644 --- a/test_conformance/math_brute_force/reference_math.cpp +++ b/test_conformance/math_brute_force/reference_math.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2017-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -4699,6 +4699,49 @@ double reference_nextafter(double xx, double yy) return a.f; } +cl_half reference_nanh(cl_ushort x) +{ + cl_ushort u; + cl_half h; + u = x | 0x7e00U; + memcpy(&h, &u, sizeof(cl_half)); + return h; +} + +float reference_nextafterh(float xx, float yy, bool allow_denorms) +{ + cl_half tmp_a = cl_half_from_float(xx, CL_HALF_RTE); + cl_half tmp_b = cl_half_from_float(yy, CL_HALF_RTE); + float x = cl_half_to_float(tmp_a); + float y = cl_half_to_float(tmp_b); + + // take care of nans + if (x != x) return x; + + if (y != y) return y; + + if (x == y) return y; + + short a_h = cl_half_from_float(x, CL_HALF_RTE); + short b_h = cl_half_from_float(y, CL_HALF_RTE); + short oa_h = a_h; + + if (a_h & 0x8000) a_h = 0x8000 - a_h; + if (b_h & 0x8000) b_h = 0x8000 - b_h; + + a_h += (a_h < b_h) ? 1 : -1; + a_h = (a_h < 0) ? (cl_short)0x8000 - a_h : a_h; + + if (!allow_denorms && IsHalfSubnormal(a_h)) + { + if (cl_half_to_float(0x7fff & oa_h) < cl_half_to_float(0x7fff & a_h)) + a_h = (a_h & 0x8000) ? 0x8400 : 0x0400; + else + a_h = 0; + } + + return cl_half_to_float(a_h); +} long double reference_nextafterl(long double xx, long double yy) { diff --git a/test_conformance/math_brute_force/reference_math.h b/test_conformance/math_brute_force/reference_math.h index 78b245105..03eeacb8a 100644 --- a/test_conformance/math_brute_force/reference_math.h +++ b/test_conformance/math_brute_force/reference_math.h @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2017-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #else #include +#include "CL/cl_half.h" #endif // -- for testing float -- @@ -160,6 +161,8 @@ long double reference_fractl(long double, long double*); long double reference_fmal(long double, long double, long double); long double reference_madl(long double, long double, long double); long double reference_nextafterl(long double, long double); +float reference_nextafterh(float, float, bool allow_denormals = true); +cl_half reference_nanh(cl_ushort); long double reference_recipl(long double); long double reference_rootnl(long double, int); long double reference_rsqrtl(long double); diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp new file mode 100644 index 000000000..18075379d --- /dev/null +++ b/test_conformance/math_brute_force/ternary_half.cpp @@ -0,0 +1,777 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include + +#define CORRECTLY_ROUNDED 0 +#define FLUSHED 1 + +namespace { + +cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetTernaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// A table of more difficult cases to get right +static const cl_half specialValuesHalf[] = { + 0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/ + 0xfc00, /*-INFINITY*/ + 0x8000, /*-0*/ + 0x7bff, /*HALF_MAX*/ + 0x0400, /*HALF_MIN*/ + 0x03ff, /* Largest denormal */ + 0x3c00, /* 1 */ + 0xbc00, /* -1 */ + 0x3555, /*nearest value to 1/3*/ + 0x3bff, /*largest number less than one*/ + 0xc000, /* -2 */ + 0xfbff, /* -HALF_MAX */ + 0x8400, /* -HALF_MIN */ + 0x4248, /* M_PI_H */ + 0xc248, /* -M_PI_H */ + 0xbbff, /* Largest negative fraction */ +}; + +constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf); + +} // anonymous namespace + +int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + + Programs programs; + const unsigned thread_id = 0; // Test is currently not multithreaded. + KernelMatrix kernels; + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + float maxErrorVal3 = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + + constexpr size_t bufferElements = BUFFER_SIZE / sizeof(cl_half); + + cl_uchar overflow[bufferElements]; + float half_ulps = f->half_ulps; + int skipNanInf = (0 == strcmp("fma", f->nameInCode)); + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init the kernels + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + + for (uint64_t i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_half *hp0 = (cl_half *)gIn; + cl_half *hp1 = (cl_half *)gIn2; + cl_half *hp2 = (cl_half *)gIn3; + size_t idx = 0; + + if (i == 0) + { // test edge cases + uint32_t x, y, z; + x = y = z = 0; + for (; idx < bufferElements; idx++) + { + hp0[idx] = specialValuesHalf[x]; + hp1[idx] = specialValuesHalf[y]; + hp2[idx] = specialValuesHalf[z]; + + if (++x >= specialValuesHalfCount) + { + x = 0; + if (++y >= specialValuesHalfCount) + { + y = 0; + if (++z >= specialValuesHalfCount) break; + } + } + } + if (idx == bufferElements) + vlog_error("Test Error: not all special cases tested!\n"); + } + + auto any_value = [&d]() { + float t = (float)((double)genrand_int32(d) / (double)0xFFFFFFFF); + return HFF((1.0f - t) * CL_HALF_MIN + t * CL_HALF_MAX); + }; + + for (; idx < bufferElements; idx++) + { + hp0[idx] = any_value(); + hp1[idx] = any_value(); + hp2[idx] = any_value(); + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + BUFFER_SIZE, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0, + BUFFER_SIZE, gIn2, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error); + return error; + } + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0, + BUFFER_SIZE, gIn3, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error); + return error; + } + + // Write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, BUFFER_SIZE); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, BUFFER_SIZE, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, BUFFER_SIZE, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeof(cl_half) * sizeValues[j]; + size_t localCount = (BUFFER_SIZE + vectorSize - 1) + / vectorSize; // BUFFER_SIZE / vectorSize rounded up + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer2), &gInBuffer2))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 3, + sizeof(gInBuffer3), &gInBuffer3))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) + { + vlog("clFlush failed\n"); + return error; + } + + // Calculate the correctly rounded reference result + cl_half *res = (cl_half *)gOut_Ref; + if (skipNanInf) + { + for (size_t j = 0; j < bufferElements; j++) + { + feclearexcept(FE_OVERFLOW); + res[j] = HFD((double)f->dfunc.f_fff(HTF(hp0[j]), HTF(hp1[j]), + HTF(hp2[j]))); + overflow[j] = + FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); + } + } + else + { + for (size_t j = 0; j < bufferElements; j++) + res[j] = HFD((double)f->dfunc.f_fff(HTF(hp0[j]), HTF(hp1[j]), + HTF(hp2[j]))); + } + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + BUFFER_SIZE, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + uint16_t *t = (uint16_t *)gOut_Ref; + for (size_t j = 0; j < bufferElements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + uint16_t *q = (uint16_t *)(gOut[k]); + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + int fail; + cl_half test = ((cl_half *)q)[j]; + double ref1 = (double)f->dfunc.f_fff( + HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j])); + cl_half correct = HFD(ref1); + + // Per section 10 paragraph 6, accept any result if an input + // or output is a infinity or NaN or overflow + if (skipNanInf) + { + if (overflow[j] || IsHalfInfinity(correct) + || IsHalfNaN(correct) || IsHalfInfinity(hp0[j]) + || IsHalfNaN(hp0[j]) || IsHalfInfinity(hp1[j]) + || IsHalfNaN(hp1[j]) || IsHalfInfinity(hp2[j]) + || IsHalfNaN(hp2[j])) + continue; + } + + float err = + test != correct ? Ulp_Error_Half(test, ref1) : 0.f; + fail = !(fabsf(err) <= half_ulps); + + if (fail && ftz) + { + // retry per section 6.5.3.2 with flushing on + if (0.0f == test + && 0.0f + == f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), + HTF(hp2[j]), FLUSHED)) + { + fail = 0; + err = 0.0f; + } + + // retry per section 6.5.3.3 + if (fail && IsHalfSubnormal(hp0[j])) + { // look at me, + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + float ref2 = + f->func.f_fma(0.0f, HTF(hp1[j]), HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct2 = HFF(ref2); + float ref3 = + f->func.f_fma(-0.0f, HTF(hp1[j]), HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct3 = HFF(ref3); + + if (skipNanInf) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3)) + continue; + } + + float err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + float err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, HTF(hp1[j]), + HTF(hp2[j]), FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, HTF(hp1[j]), + HTF(hp2[j]), FLUSHED))) + { + fail = 0; + err = 0.0f; + } + + // try with first two args as zero + if (IsHalfSubnormal(hp1[j])) + { // its fun to have fun, + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + ref2 = f->func.f_fma(0.0f, 0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + correct2 = HFF(ref2); + ref3 = f->func.f_fma(-0.0f, 0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + correct3 = HFF(ref3); + float ref4 = + f->func.f_fma(0.0f, -0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct4 = HFF(ref4); + float ref5 = + f->func.f_fma(-0.0f, -0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct5 = HFF(ref5); + + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3) + || IsHalfInfinity(correct4) + || IsHalfNaN(correct4) + || IsHalfInfinity(correct5) + || IsHalfNaN(correct5)) + continue; + } + + err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + float err4 = test != correct4 + ? Ulp_Error_Half(test, ref4) + : 0.f; + float err5 = test != correct5 + ? Ulp_Error_Half(test, ref5) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps)) + && (!(fabsf(err4) <= half_ulps)) + && (!(fabsf(err5) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, 0.0f, + HTF(hp2[j]), + FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, 0.0f, + HTF(hp2[j]), + FLUSHED) + || 0.0f + == f->func.f_fma(0.0f, -0.0f, + HTF(hp2[j]), + FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, -0.0f, + HTF(hp2[j]), + FLUSHED))) + { + fail = 0; + err = 0.0f; + } + + if (IsHalfSubnormal(hp2[j])) + { + if (test == 0.0f) // 0*0+0 is 0 + { + fail = 0; + err = 0.0f; + } + } + } + else if (IsHalfSubnormal(hp2[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + ref2 = f->func.f_fma(0.0f, HTF(hp1[j]), 0.0f, + CORRECTLY_ROUNDED); + correct2 = HFF(ref2); + ref3 = f->func.f_fma(-0.0f, HTF(hp1[j]), 0.0f, + CORRECTLY_ROUNDED); + correct3 = HFF(ref3); + float ref4 = + f->func.f_fma(0.0f, HTF(hp1[j]), -0.0f, + CORRECTLY_ROUNDED); + cl_half correct4 = HFF(ref4); + float ref5 = + f->func.f_fma(-0.0f, HTF(hp1[j]), -0.0f, + CORRECTLY_ROUNDED); + cl_half correct5 = HFF(ref5); + + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3) + || IsHalfInfinity(correct4) + || IsHalfNaN(correct4) + || IsHalfInfinity(correct5) + || IsHalfNaN(correct5)) + continue; + } + + err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + float err4 = test != correct4 + ? Ulp_Error_Half(test, ref4) + : 0.f; + float err5 = test != correct5 + ? Ulp_Error_Half(test, ref5) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps)) + && (!(fabsf(err4) <= half_ulps)) + && (!(fabsf(err5) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(0.0f, HTF(hp1[j]), + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, HTF(hp1[j]), + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(0.0f, HTF(hp1[j]), + -0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(-0.0f, HTF(hp1[j]), + -0.0f, FLUSHED))) + { + fail = 0; + err = 0.0f; + } + } + } + else if (fail && IsHalfSubnormal(hp1[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + float ref2 = + f->func.f_fma(HTF(hp0[j]), 0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct2 = HFF(ref2); + float ref3 = + f->func.f_fma(HTF(hp0[j]), -0.0f, HTF(hp2[j]), + CORRECTLY_ROUNDED); + cl_half correct3 = HFF(ref3); + + if (skipNanInf) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3)) + continue; + } + + float err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + float err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(HTF(hp0[j]), 0.0f, + HTF(hp2[j]), FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), -0.0f, + HTF(hp2[j]), FLUSHED))) + { + fail = 0; + err = 0.0f; + } + + // try with second two args as zero + if (IsHalfSubnormal(hp2[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + ref2 = f->func.f_fma(HTF(hp0[j]), 0.0f, 0.0f, + CORRECTLY_ROUNDED); + correct2 = HFF(ref2); + ref3 = f->func.f_fma(HTF(hp0[j]), -0.0f, 0.0f, + CORRECTLY_ROUNDED); + correct3 = HFF(ref3); + float ref4 = + f->func.f_fma(HTF(hp0[j]), 0.0f, -0.0f, + CORRECTLY_ROUNDED); + cl_half correct4 = HFF(ref4); + float ref5 = + f->func.f_fma(HTF(hp0[j]), -0.0f, -0.0f, + CORRECTLY_ROUNDED); + cl_half correct5 = HFF(ref5); + + // Per section 10 paragraph 6, accept any result + // if an input or output is a infinity or NaN or + // overflow + if (!gInfNanSupport) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3) + || IsHalfInfinity(correct4) + || IsHalfNaN(correct4) + || IsHalfInfinity(correct5) + || IsHalfNaN(correct5)) + continue; + } + + err2 = test != correct2 + ? Ulp_Error_Half(test, ref2) + : 0.f; + err3 = test != correct3 + ? Ulp_Error_Half(test, ref3) + : 0.f; + float err4 = test != correct4 + ? Ulp_Error_Half(test, ref4) + : 0.f; + float err5 = test != correct5 + ? Ulp_Error_Half(test, ref5) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps)) + && (!(fabsf(err4) <= half_ulps)) + && (!(fabsf(err5) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + if (fabsf(err4) < fabsf(err)) err = err4; + if (fabsf(err5) < fabsf(err)) err = err5; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(HTF(hp0[j]), 0.0f, + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), -0.0f, + 0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), 0.0f, + -0.0f, FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), -0.0f, + -0.0f, FLUSHED))) + { + fail = 0; + err = 0.0f; + } + } + } + else if (fail && IsHalfSubnormal(hp2[j])) + { + if (skipNanInf) feclearexcept(FE_OVERFLOW); + + float ref2 = f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), + 0.0f, CORRECTLY_ROUNDED); + cl_half correct2 = HFF(ref2); + float ref3 = + f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), -0.0f, + CORRECTLY_ROUNDED); + cl_half correct3 = HFF(ref3); + + if (skipNanInf) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correct2) + || IsHalfNaN(correct2) + || IsHalfInfinity(correct3) + || IsHalfNaN(correct3)) + continue; + } + + float err2 = test != correct2 + ? Ulp_Error_Half(test, correct2) + : 0.f; + float err3 = test != correct3 + ? Ulp_Error_Half(test, correct3) + : 0.f; + fail = fail + && ((!(fabsf(err2) <= half_ulps)) + && (!(fabsf(err3) <= half_ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (0.0f == test + && (0.0f + == f->func.f_fma(HTF(hp0[j]), + HTF(hp1[j]), 0.0f, + FLUSHED) + || 0.0f + == f->func.f_fma(HTF(hp0[j]), + HTF(hp1[j]), -0.0f, + FLUSHED))) + { + fail = 0; + err = 0.0f; + } + } + } + + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = HTF(hp0[j]); + maxErrorVal2 = HTF(hp1[j]); + maxErrorVal3 = HTF(hp2[j]); + } + + if (fail) + { + vlog_error( + "\nERROR: %s%s: %f ulp error at {%a, %a, %a} " + "({0x%4.4x, 0x%4.4x, 0x%4.4x}): *%a vs. %a\n", + f->name, sizeNames[k], err, HTF(hp0[j]), + HTF(hp1[j]), HTF(hp2[j]), hp0[j], hp1[j], hp2[j], + HTF(res[j]), HTF(test)); + return -1; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 " bufferSize:%10d \n", + i, step, BUFFER_SIZE); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2, + maxErrorVal3); + } + + vlog("\n"); + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h index 78aef9c9a..9abb41f15 100644 --- a/test_conformance/math_brute_force/test_functions.h +++ b/test_conformance/math_brute_force/test_functions.h @@ -1,5 +1,5 @@ // -// Copyright (c) 2021 The Khronos Group Inc. +// Copyright (c) 2021-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -24,6 +24,9 @@ int TestFunc_Float_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double) int TestFunc_Double_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half) +int TestFunc_Half_Half(const Func *f, MTdata, bool relaxedMode); + // int foo(float) int TestFunc_Int_Float(const Func *f, MTdata, bool relaxedMode); @@ -36,6 +39,9 @@ int TestFunc_Float_UInt(const Func *f, MTdata, bool relaxedMode); // double foo(ulong) int TestFunc_Double_ULong(const Func *f, MTdata, bool relaxedMode); +// half (Ushort) +int TestFunc_Half_UShort(const Func *f, MTdata, bool relaxedMode); + // Returns {0, 1} for scalar and {0, -1} for vector. // int foo(float) int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode); @@ -44,21 +50,34 @@ int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode); // int foo(double) int TestMacro_Int_Double(const Func *f, MTdata, bool relaxedMode); +// int foo(half,half) +int TestMacro_Int_Half_Half(const Func *f, MTdata, bool relaxedMode); + +// int foo(half) +int TestMacro_Int_Half(const Func *f, MTdata, bool relaxedMode); + +// int foo(half) +int TestFunc_Int_Half(const Func *f, MTdata, bool relaxedMode); + // float foo(float, float) int TestFunc_Float_Float_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, double) int TestFunc_Double_Double_Double(const Func *f, MTdata, bool relaxedMode); +// Half foo(half, half) +int TestFunc_Half_Half_Half(const Func *f, MTdata, bool relaxedMode); // Special handling for nextafter. -// float foo(float, float) -int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata, - bool relaxedMode); +// Half foo(Half, Half) +int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata, bool relaxedMode); + +// Half foo(Half, Half) +int TestFunc_Half_Half_Half_common(const Func *f, MTdata, int isNextafter, + bool relaxedMode); + +// Half foo(Half, int) +int TestFunc_Half_Half_Int(const Func *f, MTdata, bool relaxedMode); -// Special handling for nextafter. -// double foo(double, double) -int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata, - bool relaxedMode); // float op float int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata, @@ -68,6 +87,9 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata, int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata, bool relaxedMode); +// half op half +int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata, bool relaxedMode); + // float foo(float, int) int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode); @@ -89,24 +111,36 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata, bool relaxedMode); int TestFunc_Double_Double_Double_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, half, half) +int TestFunc_Half_Half_Half_Half(const Func *f, MTdata, bool relaxedMode); + // float foo(float, float*) int TestFunc_Float2_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, double*) int TestFunc_Double2_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, half*) +int TestFunc_Half2_Half(const Func *f, MTdata, bool relaxedMode); + // float foo(float, int*) int TestFunc_FloatI_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, int*) int TestFunc_DoubleI_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, int*) +int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode); + // float foo(float, float, int*) int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode); // double foo(double, double, int*) int TestFunc_DoubleI_Double_Double(const Func *f, MTdata, bool relaxedMode); +// half foo(half, half, int*) +int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode); + // Special handling for mad. // float mad(float, float, float) int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode); @@ -115,4 +149,7 @@ int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode); // double mad(double, double, double) int TestFunc_mad_Double(const Func *f, MTdata, bool relaxedMode); +// half mad(half, half, half) +int TestFunc_mad_Half(const Func *f, MTdata, bool relaxedMode); + #endif diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp new file mode 100644 index 000000000..0980fb16c --- /dev/null +++ b/test_conformance/math_brute_force/unary_half.cpp @@ -0,0 +1,483 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include + +namespace { + +cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +// Thread specific data for a worker thread +typedef struct ThreadInfo +{ + clMemWrapper inBuf; // input buffer for the thread + clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread + float maxError; // max error value. Init to 0. + double maxErrorValue; // position of the max error value. Init to 0. + clCommandQueueWrapper + tQueue; // per thread command queue to improve performance +} ThreadInfo; + +struct TestInfoBase +{ + size_t subBufferSize; // Size of the sub-buffer in elements + const Func *f; // A pointer to the function info + cl_uint threadCount; // Number of worker threads + cl_uint jobCount; // Number of jobs + cl_uint step; // step between each chunk and the next. + cl_uint scale; // stride between individual test values + float ulps; // max_allowed ulps + int ftz; // non-zero if running in flush to zero mode + + int isRangeLimited; // 1 if the function is only to be evaluated over a + // range + float half_sin_cos_tan_limit; +}; + +struct TestInfo : public TestInfoBase +{ + TestInfo(const TestInfoBase &base): TestInfoBase(base) {} + + // Array of thread specific information + std::vector tinfo; + + // Programs for various vector sizes. + Programs programs; + + // Thread-specific kernels for each vector size: + // k[vector_size][thread_id] + KernelMatrix k; +}; + +cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data) +{ + TestInfo *job = (TestInfo *)data; + size_t buffer_elements = job->subBufferSize; + size_t buffer_size = buffer_elements * sizeof(cl_half); + cl_uint scale = job->scale; + cl_uint base = job_id * (cl_uint)job->step; + ThreadInfo *tinfo = &(job->tinfo[thread_id]); + float ulps = job->ulps; + fptr func = job->f->func; + cl_uint j, k; + cl_int error = CL_SUCCESS; + + int isRangeLimited = job->isRangeLimited; + float half_sin_cos_tan_limit = job->half_sin_cos_tan_limit; + int ftz = job->ftz; + + std::vector s(0); + + cl_event e[VECTOR_SIZE_COUNT]; + cl_ushort *out[VECTOR_SIZE_COUNT]; + + if (gHostFill) + { + // start the map of the output arrays + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + out[j] = (uint16_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0, + buffer_size, 0, NULL, e + j, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n"); + } + + // Write the new values to the input array + cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements; + for (j = 0; j < buffer_elements; j++) + { + p[j] = base + j * scale; + } + + if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0, + buffer_size, p, 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if (gHostFill) + { + // Wait for the map to finish + if ((error = clWaitForEvents(1, e + j))) + { + vlog_error("Error: clWaitForEvents failed! err: %d\n", error); + return error; + } + if ((error = clReleaseEvent(e[j]))) + { + vlog_error("Error: clReleaseEvent failed! err: %d\n", error); + return error; + } + } + + // Fill the result buffer with garbage, so that old results don't carry + // over + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(out[j], &pattern, buffer_size); + error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL); + test_error(error, "clEnqueueUnmapMemObject failed!\n"); + } + else + { + error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j], + &pattern, sizeof(pattern), 0, + buffer_size, 0, NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + + // run the kernel + size_t vectorCount = + (buffer_elements + sizeValues[j] - 1) / sizeValues[j]; + cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its + // own copy of the cl_kernel + cl_program program = job->programs[j]; + + if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]), + &tinfo->outBuf[j]))) + { + LogBuildError(program); + return error; + } + if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf), + &tinfo->inBuf))) + { + LogBuildError(program); + return error; + } + + if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL, + &vectorCount, NULL, 0, NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + + // Get that moving + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n"); + + if (gSkipCorrectnessTesting) return CL_SUCCESS; + + // Calculate the correctly rounded reference result + cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements; + s.resize(buffer_elements); + for (j = 0; j < buffer_elements; j++) + { + s[j] = (float)cl_half_to_float(p[j]); + r[j] = HFF(func.f_f(s[j])); + } + + // Read the data back -- no need to wait for the first N-1 buffers. This is + // an in order queue. + for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++) + { + out[j] = (uint16_t *)clEnqueueMapBuffer( + tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, + error); + return error; + } + } + // Wait for the last buffer + out[j] = (uint16_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j], + CL_TRUE, CL_MAP_READ, 0, + buffer_size, 0, NULL, NULL, &error); + if (error || NULL == out[j]) + { + vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error); + return error; + } + + // Verify data + for (j = 0; j < buffer_elements; j++) + { + for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = out[k]; + + // If we aren't getting the correctly rounded result + if (r[j] != q[j]) + { + float test = cl_half_to_float(q[j]); + double correct = func.f_f(s[j]); + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= ulps); + + // half_sin/cos/tan are only valid between +-2**16, Inf, NaN + if (isRangeLimited + && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16) + && fabsf(s[j]) < INFINITY) + { + if (fabsf(test) <= half_sin_cos_tan_limit) + { + err = 0; + fail = 0; + } + } + + if (fail) + { + if (ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(p[j])) + { + double correct2 = func.f_f(0.0); + double correct3 = func.f_f(-0.0); + float err2 = Ulp_Error_Half(q[j], correct2); + float err3 = Ulp_Error_Half(q[j], correct3); + fail = fail + && ((!(fabsf(err2) <= ulps)) + && (!(fabsf(err3) <= ulps))); + if (fabsf(err2) < fabsf(err)) err = err2; + if (fabsf(err3) < fabsf(err)) err = err3; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(correct2, ulps) + || IsHalfResultSubnormal(correct3, ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + } + if (fabsf(err) > tinfo->maxError) + { + tinfo->maxError = fabsf(err); + tinfo->maxErrorValue = s[j]; + } + if (fail) + { + vlog_error("\nERROR: %s%s: %f ulp error at %a " + "(half 0x%04x)\nExpected: %a (half 0x%04x) " + "\nActual: %a (half 0x%04x)\n", + job->f->name, sizeNames[k], err, s[j], p[j], + cl_half_to_float(r[j]), r[j], test, q[j]); + error = -1; + return error; + } + } + } + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j], + out[j], 0, NULL, NULL))) + { + vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n", + j, error); + return error; + } + } + + if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n"); + + + if (0 == (base & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f " + "ThreadCount:%2u\n", + base, job->step, job->scale, buffer_elements, job->ulps, + job->threadCount); + } + else + { + vlog("."); + } + fflush(stdout); + } + + return error; +} + +} // anonymous namespace + +int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode) +{ + TestInfoBase test_info_base; + cl_int error; + size_t i, j; + float maxError = 0.0f; + double maxErrorVal = 0.0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + // Init test_info + memset(&test_info_base, 0, sizeof(test_info_base)); + TestInfo test_info(test_info_base); + + test_info.threadCount = GetThreadCount(); + + test_info.subBufferSize = BUFFER_SIZE + / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount)); + test_info.scale = getTestScale(sizeof(cl_half)); + test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale; + if (test_info.step / test_info.subBufferSize != test_info.scale) + { + // there was overflow + test_info.jobCount = 1; + } + else + { + test_info.jobCount = + std::max((cl_uint)1, + (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step)); + } + + test_info.f = f; + test_info.ulps = f->half_ulps; + test_info.ftz = + f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + + test_info.tinfo.resize(test_info.threadCount); + + for (i = 0; i < test_info.threadCount; i++) + { + cl_buffer_region region = { i * test_info.subBufferSize + * sizeof(cl_half), + test_info.subBufferSize * sizeof(cl_half) }; + test_info.tinfo[i].inBuf = + clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY, + CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error); + if (error || NULL == test_info.tinfo[i].inBuf) + { + vlog_error("Error: Unable to create sub-buffer of gInBuffer for " + "region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + + for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + test_info.tinfo[i].outBuf[j] = clCreateSubBuffer( + gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, + ®ion, &error); + if (error || NULL == test_info.tinfo[i].outBuf[j]) + { + vlog_error("Error: Unable to create sub-buffer of gOutBuffer " + "for region {%zd, %zd}\n", + region.origin, region.size); + return error; + } + } + test_info.tinfo[i].tQueue = + clCreateCommandQueue(gContext, gDevice, 0, &error); + if (NULL == test_info.tinfo[i].tQueue || error) + { + vlog_error("clCreateCommandQueue failed. (%d)\n", error); + return error; + } + } + + // Check for special cases for unary float + test_info.isRangeLimited = 0; + test_info.half_sin_cos_tan_limit = 0; + if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = 1.0f + + test_info.ulps + * (FLT_EPSILON / 2.0f); // out of range results from finite + // inputs must be in [-1,1] + } + else if (0 == strcmp(f->name, "half_tan")) + { + test_info.isRangeLimited = 1; + test_info.half_sin_cos_tan_limit = + INFINITY; // out of range resut from finite inputs must be numeric + } + + // Init the kernels + { + BuildKernelInfo build_info = { test_info.threadCount, test_info.k, + test_info.programs, f->nameInCode }; + error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info); + test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n"); + } + + if (!gSkipCorrectnessTesting) + { + error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info); + + // Accumulate the arithmetic errors + for (i = 0; i < test_info.threadCount; i++) + { + if (test_info.tinfo[i].maxError > maxError) + { + maxError = test_info.tinfo[i].maxError; + maxErrorVal = test_info.tinfo[i].maxErrorValue; + } + } + + test_error(error, "ThreadPool_Do: TestHalf failed\n"); + + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp new file mode 100644 index 000000000..7d75c67ae --- /dev/null +++ b/test_conformance/math_brute_force/unary_two_results_half.cpp @@ -0,0 +1,452 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include + +namespace { + +cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Half, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +} // anonymous namespace + +int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + Programs programs; + const unsigned thread_id = 0; // Test is currently not multithreaded. + KernelMatrix kernels; + float maxError0 = 0.0f; + float maxError1 = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + float maxErrorVal0 = 0.0f; + float maxErrorVal1 = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_half), + size_t(1ULL << (sizeof(cl_half) * 8))); + size_t bufferSize = bufferElements * sizeof(cl_half); + + std::vector overflow(bufferElements); + int isFract = 0 == strcmp("fract", f->nameInCode); + int skipNanInf = isFract; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + float half_ulps = f->half_ulps; + + // Init the kernels + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + + for (uint64_t i = 0; i < (1ULL << 16); i += step) + { + // Init input array + cl_half *pIn = (cl_half *)gIn; + for (size_t j = 0; j < bufferElements; j++) pIn[j] = (cl_ushort)i + j; + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // Write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, bufferSize, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + + memset_pattern4(gOut2[j], &pattern, bufferSize); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], + CL_FALSE, 0, bufferSize, + gOut2[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSize, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 1 failed!\n"); + + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSize, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 2 failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_half); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = + clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gOutBuffer2[j]), &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) + { + vlog_error("clFlush failed\n"); + return error; + } + + FPU_mode_type oldMode; + RoundingMode oldRoundMode = kRoundToNearestEven; + if (isFract) + { + // Calculate the correctly rounded reference result + memset(&oldMode, 0, sizeof(oldMode)); + if (ftz) ForceFTZ(&oldMode); + + // Set the rounding mode to match the device + if (gIsInRTZMode) + oldRoundMode = set_round(kRoundTowardZero, kfloat); + } + + // Calculate the correctly rounded reference result + cl_half *ref1 = (cl_half *)gOut_Ref; + cl_half *ref2 = (cl_half *)gOut_Ref2; + + if (skipNanInf) + { + for (size_t j = 0; j < bufferElements; j++) + { + double dd; + feclearexcept(FE_OVERFLOW); + + ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); + ref2[j] = HFF((float)dd); + + // ensure correct rounding of fract result is not reaching 1 + if (isFract && HTF(ref1[j]) >= 1.f) ref1[j] = 0x3bff; + + overflow[j] = + FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW)); + } + } + else + { + for (size_t j = 0; j < bufferElements; j++) + { + double dd; + ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd)); + ref2[j] = HFF((float)dd); + } + } + + if (isFract && ftz) RestoreFPState(&oldMode); + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0, + bufferSize, gOut2[j], 0, NULL, NULL))) + { + vlog_error("ReadArray2 failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) + { + if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); + break; + } + + // Verify data + for (size_t j = 0; j < bufferElements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *test1 = (cl_half *)gOut[k]; + cl_half *test2 = (cl_half *)gOut2[k]; + + // If we aren't getting the correctly rounded result + if (ref1[j] != test1[j] || ref2[j] != test2[j]) + { + double fp_correct1 = 0, fp_correct2 = 0; + float err = 0, err2 = 0; + + fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2); + + cl_half correct1 = HFF(fp_correct1); + cl_half correct2 = HFF(fp_correct2); + + // Per section 10 paragraph 6, accept any result if an input + // or output is a infinity or NaN or overflow + if (skipNanInf) + { + if (skipNanInf && overflow[j]) continue; + // Note: no double rounding here. Reference functions + // calculate in single precision. + if (IsHalfInfinity(correct1) || IsHalfNaN(correct1) + || IsHalfInfinity(correct2) || IsHalfNaN(correct2) + || IsHalfInfinity(pIn[j]) || IsHalfNaN(pIn[j])) + continue; + } + + err = Ulp_Error_Half(test1[j], fp_correct1); + err2 = Ulp_Error_Half(test2[j], fp_correct2); + + int fail = + !(fabsf(err) <= half_ulps && fabsf(err2) <= half_ulps); + + if (ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(fp_correct1, half_ulps)) + { + if (IsHalfResultSubnormal(fp_correct2, half_ulps)) + { + fail = fail + && !(HTF(test1[j]) == 0.0f + && HTF(test2[j]) == 0.0f); + if (!fail) + { + err = 0.0f; + err2 = 0.0f; + } + } + else + { + fail = fail + && !(HTF(test1[j]) == 0.0f + && fabsf(err2) <= half_ulps); + if (!fail) err = 0.0f; + } + } + else if (IsHalfResultSubnormal(fp_correct2, half_ulps)) + { + fail = fail + && !(HTF(test2[j]) == 0.0f + && fabsf(err) <= half_ulps); + if (!fail) err2 = 0.0f; + } + + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(pIn[j])) + { + double fp_correctp, fp_correctn; + double fp_correct2p, fp_correct2n; + float errp, err2p, errn, err2n; + + if (skipNanInf) feclearexcept(FE_OVERFLOW); + fp_correctp = f->func.f_fpf(0.0, &fp_correct2p); + fp_correctn = f->func.f_fpf(-0.0, &fp_correct2n); + + cl_half correctp = HFF(fp_correctp); + cl_half correctn = HFF(fp_correctn); + cl_half correct2p = HFF(fp_correct2p); + cl_half correct2n = HFF(fp_correct2n); + + // Per section 10 paragraph 6, accept any result if + // an input or output is a infinity or NaN or + // overflow + if (skipNanInf) + { + if (fetestexcept(FE_OVERFLOW)) continue; + + // Note: no double rounding here. Reference + // functions calculate in single precision. + if (IsHalfInfinity(correctp) + || IsHalfNaN(correctp) + || IsHalfInfinity(correctn) + || IsHalfNaN(correctn) + || IsHalfInfinity(correct2p) + || IsHalfNaN(correct2p) + || IsHalfInfinity(correct2n) + || IsHalfNaN(correct2n)) + continue; + } + + errp = Ulp_Error_Half(test1[j], fp_correctp); + err2p = Ulp_Error_Half(test1[j], fp_correct2p); + errn = Ulp_Error_Half(test1[j], fp_correctn); + err2n = Ulp_Error_Half(test1[j], fp_correct2n); + + fail = fail + && ((!(fabsf(errp) <= half_ulps)) + && (!(fabsf(err2p) <= half_ulps)) + && ((!(fabsf(errn) <= half_ulps)) + && (!(fabsf(err2n) <= half_ulps)))); + if (fabsf(errp) < fabsf(err)) err = errp; + if (fabsf(errn) < fabsf(err)) err = errn; + if (fabsf(err2p) < fabsf(err2)) err2 = err2p; + if (fabsf(err2n) < fabsf(err2)) err2 = err2n; + + // retry per section 6.5.3.4 + if (IsHalfResultSubnormal(fp_correctp, half_ulps) + || IsHalfResultSubnormal(fp_correctn, + half_ulps)) + { + if (IsHalfResultSubnormal(fp_correct2p, + half_ulps) + || IsHalfResultSubnormal(fp_correct2n, + half_ulps)) + { + fail = fail + && !(HTF(test1[j]) == 0.0f + && HTF(test2[j]) == 0.0f); + if (!fail) err = err2 = 0.0f; + } + else + { + fail = fail + && !(HTF(test1[j]) == 0.0f + && fabsf(err2) <= half_ulps); + if (!fail) err = 0.0f; + } + } + else if (IsHalfResultSubnormal(fp_correct2p, + half_ulps) + || IsHalfResultSubnormal(fp_correct2n, + half_ulps)) + { + fail = fail + && !(HTF(test2[j]) == 0.0f + && (fabsf(err) <= half_ulps)); + if (!fail) err2 = 0.0f; + } + } + } + if (fabsf(err) > maxError0) + { + maxError0 = fabsf(err); + maxErrorVal0 = HTF(pIn[j]); + } + if (fabsf(err2) > maxError1) + { + maxError1 = fabsf(err2); + maxErrorVal1 = HTF(pIn[j]); + } + if (fail) + { + vlog_error("\nERROR: %s%s: {%f, %f} ulp error at %a: " + "*{%a, %a} vs. {%a, %a}\n", + f->name, sizeNames[k], err, err2, + HTF(pIn[j]), HTF(ref1[j]), HTF(ref2[j]), + HTF(test1[j]), HTF(test2[j])); + return -1; + } + } + } + } + + if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat); + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10zu \n", + i, step, bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0, + maxErrorVal1); + } + + vlog("\n"); + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp new file mode 100644 index 000000000..d81ff12cb --- /dev/null +++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp @@ -0,0 +1,347 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" + +#include +#include +#include + +namespace { + +cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::Int, ParameterType::Half, + vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +cl_ulong abs_cl_long(cl_long i) +{ + cl_long mask = i >> 63; + return (i ^ mask) - mask; +} + +} // anonymous namespace + +int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + Programs programs; + const unsigned thread_id = 0; // Test is currently not multithreaded. + KernelMatrix kernels; + float maxError = 0.0f; + int64_t maxError2 = 0; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + float maxErrorVal = 0.0f; + float maxErrorVal2 = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + + // sizeof(cl_half) < sizeof (int32_t) + // to prevent overflowing gOut_Ref2 it is necessary to use + // bigger type as denominator for buffer size calculation + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_int), + size_t(1ULL << (sizeof(cl_half) * 8))); + + size_t bufferSizeLo = bufferElements * sizeof(cl_half); + size_t bufferSizeHi = bufferElements * sizeof(cl_int); + + cl_ulong maxiError = 0; + + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + + float half_ulps = f->half_ulps; + + maxiError = half_ulps == INFINITY ? CL_ULONG_MAX : 0; + + // Init the kernels + BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernelFn_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + return error; + + for (uint64_t i = 0; i < (1ULL << 16); i += step) + { + // Init input array + cl_half *pIn = (cl_half *)gIn; + for (size_t j = 0; j < bufferElements; j++) pIn[j] = (cl_ushort)i + j; + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSizeLo, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // Write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, bufferSizeLo); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, bufferSizeLo, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + + memset_pattern4(gOut2[j], &pattern, bufferSizeHi); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j], + CL_FALSE, 0, bufferSizeHi, + gOut2[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSizeLo, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 1 failed!\n"); + + error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern, + sizeof(pattern), 0, bufferSizeHi, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer 2 failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + // align working group size with the bigger output type + size_t vectorSize = sizeValues[j] * sizeof(cl_int); + size_t localCount = (bufferSizeHi + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = + clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gOutBuffer2[j]), &gOutBuffer2[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 2, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) + { + vlog_error("clFlush failed\n"); + return error; + } + + // Calculate the correctly rounded reference result + cl_half *ref1 = (cl_half *)gOut_Ref; + int32_t *ref2 = (int32_t *)gOut_Ref2; + for (size_t j = 0; j < bufferElements; j++) + ref1[j] = HFF((float)f->func.f_fpI(HTF(pIn[j]), ref2 + j)); + + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + cl_bool blocking = + (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE; + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0, + bufferSizeLo, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, + 0, bufferSizeHi, gOut2[j], 0, NULL, + NULL))) + { + vlog_error("ReadArray2 failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + for (size_t j = 0; j < bufferElements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_half *test1 = (cl_half *)(gOut[k]); + int32_t *test2 = (int32_t *)(gOut2[k]); + + // If we aren't getting the correctly rounded result + if (ref1[j] != test1[j] || ref2[j] != test2[j]) + { + cl_half test = ((cl_half *)test1)[j]; + int correct2 = INT_MIN; + float fp_correct = + (float)f->func.f_fpI(HTF(pIn[j]), &correct2); + cl_half correct = HFF(fp_correct); + float err = correct != test + ? Ulp_Error_Half(test, fp_correct) + : 0.f; + cl_long iErr = (int64_t)test2[j] - (int64_t)correct2; + int fail = !(fabsf(err) <= half_ulps + && abs_cl_long(iErr) <= maxiError); + if (ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(fp_correct, half_ulps)) + { + fail = fail && !(test == 0.0f && iErr == 0); + if (!fail) err = 0.0f; + } + + // retry per section 6.5.3.3 + if (IsHalfSubnormal(pIn[j])) + { + int correct5, correct6; + double fp_correct3 = f->func.f_fpI(0.0, &correct5); + double fp_correct4 = f->func.f_fpI(-0.0, &correct6); + + float err2 = Ulp_Error_Half(test, fp_correct3); + float err3 = Ulp_Error_Half(test, fp_correct4); + + cl_long iErr2 = + (long long)test2[j] - (long long)correct5; + cl_long iErr3 = + (long long)test2[j] - (long long)correct6; + + // Did +0 work? + if (fabsf(err2) <= half_ulps + && abs_cl_long(iErr2) <= maxiError) + { + err = err2; + iErr = iErr2; + fail = 0; + } + // Did -0 work? + else if (fabsf(err3) <= half_ulps + && abs_cl_long(iErr3) <= maxiError) + { + err = err3; + iErr = iErr3; + fail = 0; + } + + // retry per section 6.5.3.4 + if (fail + && (IsHalfResultSubnormal(correct2, half_ulps) + || IsHalfResultSubnormal(fp_correct3, + half_ulps))) + { + fail = fail + && !(test == 0.0f + && (abs_cl_long(iErr2) <= maxiError + || abs_cl_long(iErr3) + <= maxiError)); + if (!fail) + { + err = 0.0f; + iErr = 0; + } + } + } + } + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = pIn[j]; + } + if (llabs(iErr) > maxError2) + { + maxError2 = llabs(iErr); + maxErrorVal2 = pIn[j]; + } + + if (fail) + { + vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: " + "*{%a, %d} vs. {%a, %d}\n", + f->name, sizeNames[k], err, (int)iErr, + HTF(pIn[j]), HTF(ref1[j]), + ((int *)gOut_Ref2)[j], HTF(test), test2[j]); + return -1; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10zu \n", + i, step, bufferSizeHi); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + + vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2, + maxErrorVal, maxErrorVal2); + } + + vlog("\n"); + + return CL_SUCCESS; +} diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp new file mode 100644 index 000000000..5c83d04e4 --- /dev/null +++ b/test_conformance/math_brute_force/unary_u_half.cpp @@ -0,0 +1,239 @@ +// +// Copyright (c) 2017-2024 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.h" +#include "function_list.h" +#include "test_functions.h" +#include "utility.h" +#include "reference_math.h" + +#include +#include + +namespace { + +static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, + void *p) +{ + BuildKernelInfo &info = *(BuildKernelInfo *)p; + auto generator = [](const std::string &kernel_name, const char *builtin, + cl_uint vector_size_index) { + return GetUnaryKernel(kernel_name, builtin, ParameterType::Half, + ParameterType::UShort, vector_size_index); + }; + return BuildKernels(info, job_id, generator); +} + +} // anonymous namespace + +int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode) +{ + int error; + Programs programs; + KernelMatrix kernels; + const unsigned thread_id = 0; // Test is currently not multithreaded. + float maxError = 0.0f; + int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities); + float maxErrorVal = 0.0f; + uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE); + size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_half), + size_t(1ULL << (sizeof(cl_half) * 8))); + size_t bufferSize = bufferElements * sizeof(cl_half); + logFunctionInfo(f->name, sizeof(cl_half), relaxedMode); + const char *name = f->name; + float half_ulps = f->half_ulps; + + // Init the kernels + BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode }; + if ((error = ThreadPool_Do(BuildKernel_HalfFn, + gMaxVectorSizeIndex - gMinVectorSizeIndex, + &build_info))) + { + return error; + } + + for (uint64_t i = 0; i < (1ULL << 32); i += step) + { + // Init input array + cl_ushort *p = (cl_ushort *)gIn; + for (size_t j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j; + + if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0, + bufferSize, gIn, 0, NULL, NULL))) + { + vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error); + return error; + } + + // write garbage into output arrays + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + uint32_t pattern = 0xacdcacdc; + if (gHostFill) + { + memset_pattern4(gOut[j], &pattern, bufferSize); + if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j], + CL_FALSE, 0, bufferSize, + gOut[j], 0, NULL, NULL))) + { + vlog_error( + "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n", + error, j); + return error; + } + } + else + { + error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern, + sizeof(pattern), 0, bufferSize, 0, + NULL, NULL); + test_error(error, "clEnqueueFillBuffer failed!\n"); + } + } + + // Run the kernels + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + size_t vectorSize = sizeValues[j] * sizeof(cl_half); + size_t localCount = (bufferSize + vectorSize - 1) / vectorSize; + if ((error = clSetKernelArg(kernels[j][thread_id], 0, + sizeof(gOutBuffer[j]), &gOutBuffer[j]))) + { + LogBuildError(programs[j]); + return error; + } + if ((error = clSetKernelArg(kernels[j][thread_id], 1, + sizeof(gInBuffer), &gInBuffer))) + { + LogBuildError(programs[j]); + return error; + } + + if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id], + 1, NULL, &localCount, NULL, 0, + NULL, NULL))) + { + vlog_error("FAILED -- could not execute kernel\n"); + return error; + } + } + + // Get that moving + if ((error = clFlush(gQueue))) vlog("clFlush failed\n"); + + // Calculate the correctly rounded reference result + cl_half *r = (cl_half *)gOut_Ref; + for (size_t j = 0; j < bufferElements; j++) + { + if (!strcmp(name, "nan")) + r[j] = reference_nanh(p[j]); + else + r[j] = HFF(f->func.f_u(p[j])); + } + // Read the data back + for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++) + { + if ((error = + clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0, + bufferSize, gOut[j], 0, NULL, NULL))) + { + vlog_error("ReadArray failed %d\n", error); + return error; + } + } + + if (gSkipCorrectnessTesting) break; + + // Verify data + cl_ushort *t = (cl_ushort *)gOut_Ref; + for (size_t j = 0; j < bufferElements; j++) + { + for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++) + { + cl_ushort *q = (cl_ushort *)(gOut[k]); + + // If we aren't getting the correctly rounded result + if (t[j] != q[j]) + { + double test = cl_half_to_float(q[j]); + double correct; + if (!strcmp(name, "nan")) + correct = cl_half_to_float(reference_nanh(p[j])); + else + correct = f->func.f_u(p[j]); + + float err = Ulp_Error_Half(q[j], correct); + int fail = !(fabsf(err) <= half_ulps); + + if (fail) + { + if (ftz) + { + // retry per section 6.5.3.2 + if (IsHalfResultSubnormal(correct, half_ulps)) + { + fail = fail && (test != 0.0f); + if (!fail) err = 0.0f; + } + } + } + if (fabsf(err) > maxError) + { + maxError = fabsf(err); + maxErrorVal = p[j]; + } + if (fail) + { + vlog_error( + "\n%s%s: %f ulp error at 0x%04x \nExpected: %a " + "(0x%04x) \nActual: %a (0x%04x)\n", + f->name, sizeNames[k], err, p[j], + cl_half_to_float(r[j]), r[j], test, q[j]); + return -1; + } + } + } + } + + if (0 == (i & 0x0fffffff)) + { + if (gVerboseBruteForce) + { + vlog("base:%14" PRIu64 " step:%10" PRIu64 + " bufferSize:%10zd \n", + i, step, bufferSize); + } + else + { + vlog("."); + } + fflush(stdout); + } + } + + if (!gSkipCorrectnessTesting) + { + if (gWimpyMode) + vlog("Wimp pass"); + else + vlog("passed"); + } + + if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal); + vlog("\n"); + + return error; +} diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h index 652d990a2..8e9b3a1ad 100644 --- a/test_conformance/math_brute_force/utility.h +++ b/test_conformance/math_brute_force/utility.h @@ -1,5 +1,5 @@ // -// Copyright (c) 2017 The Khronos Group Inc. +// Copyright (c) 2017-2024 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include "harness/testHarness.h" #include "harness/ThreadPool.h" #include "harness/conversions.h" +#include "CL/cl_half.h" #define BUFFER_SIZE (1024 * 1024 * 2) #define EMBEDDED_REDUCTION_FACTOR (64) @@ -61,10 +62,21 @@ extern int gFastRelaxedDerived; extern int gWimpyMode; extern int gHostFill; extern int gIsInRTZMode; +extern int gHasHalf; +extern int gInfNanSupport; +extern int gIsEmbedded; extern int gVerboseBruteForce; extern uint32_t gMaxVectorSizeIndex; extern uint32_t gMinVectorSizeIndex; extern cl_device_fp_config gFloatCapabilities; +extern cl_device_fp_config gHalfCapabilities; +extern RoundingMode gFloatToHalfRoundingMode; + +extern cl_half_rounding_mode gHalfRoundingMode; + +#define HFF(num) cl_half_from_float(num, gHalfRoundingMode) +#define HFD(num) cl_half_from_double(num, gHalfRoundingMode) +#define HTF(num) cl_half_to_float(num) #define LOWER_IS_BETTER 0 #define HIGHER_IS_BETTER 1 @@ -115,6 +127,12 @@ inline int IsFloatResultSubnormal(double x, float ulps) return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126); } +inline int IsHalfResultSubnormal(float x, float ulps) +{ + x = fabs(x) - MAKE_HEX_FLOAT(0x1.0p-24, 0x1, -24) * ulps; + return x < MAKE_HEX_FLOAT(0x1.0p-14, 0x1, -14); +} + inline int IsFloatResultSubnormalAbsError(double x, float abs_err) { x = x - abs_err; @@ -157,6 +175,26 @@ inline int IsFloatNaN(double x) return ((u.u & 0x7fffffffU) > 0x7F800000U); } +inline bool IsHalfNaN(const cl_half v) +{ + // Extract FP16 exponent and mantissa + uint16_t h_exp = (((cl_half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F; + uint16_t h_mant = ((cl_half)v) & 0x3FF; + + // NaN test + return (h_exp == 0x1F && h_mant != 0); +} + +inline bool IsHalfInfinity(const cl_half v) +{ + // Extract FP16 exponent and mantissa + uint16_t h_exp = (((cl_half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F; + uint16_t h_mant = ((cl_half)v) & 0x3FF; + + // Inf test + return (h_exp == 0x1F && h_mant == 0); +} + cl_uint RoundUpToNextPowerOfTwo(cl_uint x); // Windows (since long double got deprecated) sets the x87 to 53-bit precision