From d25b828df75278c34e9c1e325cb7da34416dd3d8 Mon Sep 17 00:00:00 2001
From: "Wawiorko, Grzegorz" <grzegorz.wawiorko@intel.com>
Date: Wed, 8 Jun 2022 10:16:56 +0200
Subject: [PATCH 01/24] Enable fp16 in math bruteforce

---
 .../math_brute_force/CMakeLists.txt           |   8 +
 .../math_brute_force/binary_half.cpp          | 879 ++++++++++++++++++
 .../math_brute_force/binary_i_double.cpp      |   6 +-
 .../math_brute_force/binary_i_float.cpp       |   7 +-
 .../math_brute_force/binary_i_half.cpp        | 669 +++++++++++++
 .../binary_operator_double.cpp                |   3 +-
 .../binary_operator_float.cpp                 |   3 +-
 .../math_brute_force/function_list.cpp        |  91 +-
 .../math_brute_force/function_list.h          |   4 +
 .../math_brute_force/i_unary_half.cpp         | 306 ++++++
 .../math_brute_force/macro_binary_double.cpp  |   3 +-
 .../math_brute_force/macro_binary_float.cpp   |   3 +-
 .../math_brute_force/macro_binary_half.cpp    | 652 +++++++++++++
 .../math_brute_force/macro_unary_half.cpp     | 543 +++++++++++
 .../math_brute_force/mad_half.cpp             | 295 ++++++
 test_conformance/math_brute_force/main.cpp    |  65 +-
 .../math_brute_force/reference_math.cpp       |  34 +
 .../math_brute_force/reference_math.h         |   4 +
 .../math_brute_force/ternary_double.cpp       |   3 +-
 .../math_brute_force/ternary_float.cpp        |   3 +-
 .../math_brute_force/test_functions.h         |  36 +-
 .../math_brute_force/unary_half.cpp           | 600 ++++++++++++
 .../math_brute_force/unary_u_half.cpp         | 334 +++++++
 test_conformance/math_brute_force/utility.h   |   4 +
 24 files changed, 4500 insertions(+), 55 deletions(-)
 create mode 100644 test_conformance/math_brute_force/binary_half.cpp
 create mode 100644 test_conformance/math_brute_force/binary_i_half.cpp
 create mode 100644 test_conformance/math_brute_force/i_unary_half.cpp
 create mode 100644 test_conformance/math_brute_force/macro_binary_half.cpp
 create mode 100644 test_conformance/math_brute_force/macro_unary_half.cpp
 create mode 100644 test_conformance/math_brute_force/mad_half.cpp
 create mode 100644 test_conformance/math_brute_force/unary_half.cpp
 create mode 100644 test_conformance/math_brute_force/unary_u_half.cpp

diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index 28d2716f85..1b9c28f85b 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -3,8 +3,10 @@ set(MODULE_NAME BRUTEFORCE)
 set(${MODULE_NAME}_SOURCES
     binary_double.cpp
     binary_float.cpp
+    binary_half.cpp
     binary_i_double.cpp
     binary_i_float.cpp
+    binary_i_half.cpp
     binary_operator_double.cpp
     binary_operator_float.cpp
     binary_two_results_i_double.cpp
@@ -14,12 +16,16 @@ set(${MODULE_NAME}_SOURCES
     function_list.h
     i_unary_double.cpp
     i_unary_float.cpp
+    i_unary_half.cpp
     macro_binary_double.cpp
     macro_binary_float.cpp
+    macro_binary_half.cpp
     macro_unary_double.cpp
     macro_unary_float.cpp
+    macro_unary_half.cpp
     mad_double.cpp
     mad_float.cpp
+    mad_half.cpp
     main.cpp
     reference_math.cpp
     reference_math.h
@@ -30,12 +36,14 @@ set(${MODULE_NAME}_SOURCES
     test_functions.h
     unary_double.cpp
     unary_float.cpp
+    unary_half.cpp
     unary_two_results_double.cpp
     unary_two_results_float.cpp
     unary_two_results_i_double.cpp
     unary_two_results_i_float.cpp
     unary_u_double.cpp
     unary_u_float.cpp
+    unary_u_half.cpp
     utility.cpp
     utility.h
 )
diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp
new file mode 100644
index 0000000000..770472c5c6
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -0,0 +1,879 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+#include "reference_math.h"
+#include <cstring>
+
+
+// Assembles the OpenCL source of a two-operand half kernel that calls `name`
+// for the vector size at index vectorSize and hands it to MakeKernels together
+// with kernel_count, k, p and relaxedMode. Returns whatever MakeKernels returns.
+static int BuildKernelHalf(const char *name, int vectorSize,
+                           cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                           bool relaxedMode)
+{
+    // Generic source: each work-item computes out[i] = name(in1[i], in2[i]).
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global half",
+                        sizeNames[vectorSize],
+                        "* out, __global half",
+                        sizeNames[vectorSize],
+                        "* in1, __global half",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" };
+    // 3-wide source: vload3/vstore3 on packed halves, last work-item does the tail.
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global half* out, __global half* in, __global half* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       half3 d0 = vload3( 0, in + 3 * i );\n"
+        "       half3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       half3 d0, d1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (half3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (half3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **source = isVec3 ? c3 : c;
+    const size_t sourceCount =
+        isVec3 ? sizeof(c3) / sizeof(c3[0]) : sizeof(c) / sizeof(c[0]);
+
+    char kernelName[32];
+    snprintf(kernelName, sizeof(kernelName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernels(source, (cl_uint)sourceCount, kernelName, kernel_count,
+                       k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo // arguments read by BuildKernel_HalfFn
+{
+    cl_uint offset; // first vector size index to build; job_id is added to it
+    cl_uint kernel_count; // forwarded to BuildKernelHalf as kernel_count
+    cl_kernel **kernels; // indexed by vector size index; entry is passed as k
+    cl_program *programs; // indexed by vector size index; entry receives p
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // forwarded unchanged to BuildKernelHalf
+} BuildKernelInfo;
+
+
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p) // p points at a BuildKernelInfo
+{
+    const BuildKernelInfo *args = static_cast<const BuildKernelInfo *>(p);
+    const cl_uint v = args->offset + job_id; // vector size index to build
+    return BuildKernelHalf(args->nameInCode, v, args->kernel_count,
+                           args->kernels[v], &args->programs[v],
+                           args->relaxedMode);
+}
+// Thread specific data for a worker thread (one entry of TestInfo::tinfo)
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // sub-buffer of gInBuffer: this thread's first operands
+    cl_mem inBuf2; // sub-buffer of gInBuffer2: this thread's second operands
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // sub-buffers of gOutBuffer[], per size
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d; // per-thread random generator, seeded from the caller's MTdata
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo // state shared by every TestHalf job of one function
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    int isFDim; // non-zero when f->nameInCode is "fdim"
+    int skipNanInf; // non-zero when isFDim is set and gInfNanSupport is off
+    int isNextafter; // caller-supplied flag: reference is reference_nextafterh
+} TestInfo;
+
+// Raw half bit patterns for hard cases; TestHalf tries every ordered pair.
+static const cl_half specialValuesHalf[] = {
+    0xffff /*NaN (sign bit set)*/,
+    0x0000 /*+0*/,
+    0x0001 /*smallest positive subnormal*/,
+    0x7c00 /*INFINITY*/,
+    0xfc00 /*-INFINITY*/,
+    0x8000 /*-0*/,
+    0x7bff /*HALF_MAX*/,
+    0x0400 /*HALF_MIN*/
+};
+
+static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+// Per-job worker handed to ThreadPool_Do; defined after the driver below.
+static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
+
+// Brute-force test driver for a two-operand half function f (nextafter when
+// isNextafter is set). Creates per-thread sub-buffers, queues and kernels,
+// runs TestHalf over all jobs unless gSkipCorrectnessTesting, logs the largest
+// recorded error, then releases everything. Returns 0 or the failing error.
+int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter,
+                                   bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
+    test_info.isNextafter = isNextafter;
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Init the kernels; relaxedMode has to reach the build step as well
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex,
+                                       test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode,
+                                       relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        if (error) goto exit;
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_half);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+
+    int isFDim = job->isFDim;
+    int skipNanInf = job->skipNanInf;
+    int isNextafter = job->isNextafter;
+    cl_ushort *t;
+    cl_half *r;
+    float *s = 0, *s2 = 0;
+
+    RoundingMode oldRoundMode;
+    cl_int copysign_test = 0;
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ushort *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ushort *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Init input array
+    cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
+    cl_ushort *p2 = (cl_ushort *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+    int totalSpecialValueCount =
+        specialValuesHalfCount * specialValuesHalfCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesHalfCount;
+        y = (job_id * buffer_elements) / specialValuesHalfCount;
+
+        for (; j < buffer_elements; j++)
+        {
+            p[j] = specialValuesHalf[x];
+            p2[j] = specialValuesHalf[y];
+            if (++x >= specialValuesHalfCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesHalfCount) break;
+            }
+        }
+    }
+
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = (cl_ushort)genrand_int32(d);
+        p2[j] = (cl_ushort)genrand_int32(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            goto exit;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint16_t pattern = 0xdead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            goto exit;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting)
+    {
+        return CL_SUCCESS;
+    }
+
+    FPU_mode_type oldMode;
+    oldRoundMode = kRoundToNearestEven;
+    if (isFDim)
+    {
+        // Calculate the correctly rounded reference result
+        memset(&oldMode, 0, sizeof(oldMode));
+        if (ftz) ForceFTZ(&oldMode);
+
+        // Set the rounding mode to match the device
+        if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
+    }
+
+    if (!strcmp(name, "copysign")) copysign_test = 1;
+
+#define ref_func(s, s2) (copysign_test ? func.f_ff_f(s, s2) : func.f_ff(s, s2))
+
+    // Calculate the correctly rounded reference result
+    r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
+    t = (cl_ushort *)r;
+    s = (float *)malloc(buffer_elements * sizeof(float));
+    s2 = (float *)malloc(buffer_elements * sizeof(float));
+    for (j = 0; j < buffer_elements; j++)
+        for (j = 0; j < buffer_elements; j++)
+        {
+            s[j] = cl_half_to_float(p[j]);
+            s2[j] = cl_half_to_float(p2[j]);
+            if (isNextafter)
+                r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]),
+                                          CL_HALF_RTE);
+            else
+                r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE);
+        }
+
+    if (isFDim && ftz) RestoreFPState(&oldMode);
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ushort *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_ushort *)clEnqueueMapBuffer(
+        tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size,
+        0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data
+
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ushort *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                double correct;
+                if (isNextafter)
+                    correct = reference_nextafterh(s[j], s2[j]);
+                else
+                    correct = ref_func(s[j], s2[j]);
+
+                float test = cl_half_to_float(q[j]);
+
+                // Per section 10 paragraph 6, accept any result if an input or
+                // output is a infinity or NaN or overflow
+                if (skipNanInf)
+                {
+                    // Note: no double rounding here.  Reference functions
+                    // calculate in single precision.
+                    if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                        || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j])
+                        || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
+                        continue;
+                }
+                float err = Ulp_Error_Half(q[j], correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsHalfSubnormal(
+                            cl_half_from_float(correct, CL_HALF_RTE)))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+                        if (isNextafter)
+                            correct2 = reference_nextafterh(0.0, s2[j]);
+                        else
+                            correct2 = ref_func(0.0, s2[j]);
+                        if (isNextafter)
+                            correct3 = reference_nextafterh(-0.0, s2[j]);
+                        else
+                            correct3 = ref_func(-0.0, s2[j]);
+                        if (skipNanInf)
+                        {
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2)
+                                || IsFloatInfinity(correct3)
+                                || IsFloatNaN(correct3))
+                                continue;
+                        }
+
+                        err2 = Ulp_Error_Half(q[j], correct2);
+                        err3 = Ulp_Error_Half(q[j], correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfSubnormal(
+                                cl_half_from_float(correct2, CL_HALF_RTE))
+                            || IsHalfSubnormal(
+                                cl_half_from_float(correct3, CL_HALF_RTE)))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // allow to omit denorm values for platforms with no
+                        // denorm support for nextafter
+                        if (fail && (isNextafter)
+                            && (correct <= cl_half_to_float(0x3FF))
+                            && (correct >= cl_half_to_float(0x83FF)))
+                        {
+                            fail = fail && (q[j] != p[j]);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsHalfSubnormal(p2[j]))
+                        {
+                            double correct4, correct5;
+                            float err4, err5;
+
+                            if (isNextafter)
+                                correct2 = reference_nextafterh(0.0, 0.0);
+                            else
+                                correct2 = ref_func(0.0, 0.0);
+                            if (isNextafter)
+                                correct3 = reference_nextafterh(-0.0, 0.0);
+                            else
+                                correct3 = ref_func(-0.0, 0.0);
+                            if (isNextafter)
+                                correct4 = reference_nextafterh(0.0, -0.0);
+                            else
+                                correct4 = ref_func(0.0, -0.0);
+                            if (isNextafter)
+                                correct5 = reference_nextafterh(-0.0, -0.0);
+                            else
+                                correct5 = ref_func(-0.0, -0.0);
+
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is a infinity or NaN or
+                            // overflow
+                            if (skipNanInf)
+                            {
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3)
+                                    || IsFloatInfinity(correct4)
+                                    || IsFloatNaN(correct4)
+                                    || IsFloatInfinity(correct5)
+                                    || IsFloatNaN(correct5))
+                                    continue;
+                            }
+                            err2 = Ulp_Error_Half(q[j], correct2);
+                            err3 = Ulp_Error_Half(q[j], correct3);
+                            err4 = Ulp_Error_Half(q[j], correct4);
+                            err5 = Ulp_Error_Half(q[j], correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+
+                            // retry per section 6.5.3.4
+                            if (IsHalfSubnormal(
+                                    cl_half_from_float(correct2, CL_HALF_RTE))
+                                || IsHalfSubnormal(
+                                    cl_half_from_float(correct3, CL_HALF_RTE))
+                                || IsHalfSubnormal(
+                                    cl_half_from_float(correct4, CL_HALF_RTE))
+                                || IsHalfSubnormal(
+                                    cl_half_from_float(correct5, CL_HALF_RTE)))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+
+                            // allow to omit denorm values for platforms with no
+                            // denorm support for nextafter
+                            if (fail && (isNextafter)
+                                && (correct <= cl_half_to_float(0x3FF))
+                                && (correct >= cl_half_to_float(0x83FF)))
+                            {
+                                fail = fail && (q[j] != p2[j]);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsHalfSubnormal(p2[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+
+                        if (isNextafter)
+                            correct2 = reference_nextafterh(s[j], 0.0);
+                        else
+                            correct2 = ref_func(s[j], 0.0);
+                        if (isNextafter)
+                            correct3 = reference_nextafterh(s[j], -0.0);
+                        else
+                            correct3 = ref_func(s[j], -0.0);
+                        if (skipNanInf)
+                        {
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                                || IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2))
+                                continue;
+                        }
+
+                        err2 = Ulp_Error_Half(q[j], correct2);
+                        err3 = Ulp_Error_Half(q[j], correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfSubnormal(
+                                cl_half_from_float(correct2, CL_HALF_RTE))
+                            || IsHalfSubnormal(
+                                cl_half_from_float(correct3, CL_HALF_RTE)))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // allow to omit denorm values for platforms with no
+                        // denorm support for nextafter
+                        if (fail && (isNextafter)
+                            && (correct <= cl_half_to_float(0x3FF))
+                            && (correct >= cl_half_to_float(0x83FF)))
+                        {
+                            fail = fail && (q[j] != p2[j]);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), "
+                               "%a (0x%0.4x)}\nExpected: %a  (half 0x%0.4x) "
+                               "\nActual: %a (half 0x%0.4x) at index: %d\n",
+                               name, sizeNames[k], err, s[j], p[j], s2[j],
+                               p2[j], cl_half_to_float(r[j]), r[j], test, q[j],
+                               j);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+    }
+
+    if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+exit:
+    if (s) free(s);
+    if (s2) free(s2);
+    return error;
+}
+
+
+int TestFunc_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{ // Thin wrapper over the shared driver; passes 0 as its third argument.
+    return TestFunc_Half_Half_Half_common(f, d, 0, relaxedMode);
+}
+
+int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata d, bool relaxedMode)
+{ // Same wrapper with 1 as the third argument (presumably isNextafter).
+    return TestFunc_Half_Half_Half_common(f, d, 1, relaxedMode);
+}
\ No newline at end of file
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index f15c21ede2..8c83b9bf56 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -279,16 +279,14 @@ const double specialValues[] = {
     +0.0,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 const int specialValuesInt[] = {
     0,       1,  2,  3,  1022,  1023,  1024,   INT_MIN,
     INT_MAX, -1, -2, -3, -1022, -1023, -11024, -INT_MAX,
 };
 
-constexpr size_t specialValuesIntCount =
-    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
+constexpr size_t specialValuesIntCount = ARRAY_SIZE(specialValuesInt);
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index 9e27b00730..527861c12b 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -269,8 +269,7 @@ const float specialValues[] = {
     +0.0f,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 const int specialValuesInt[] = {
     0,           1,           2,           3,          126,        127,
@@ -279,9 +278,7 @@ const int specialValuesInt[] = {
     -0x04000001, -1465264071, -1488522147,
 };
 
-constexpr size_t specialValuesIntCount =
-    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
-
+constexpr size_t specialValuesIntCount = ARRAY_SIZE(specialValuesInt);
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp
new file mode 100644
index 0000000000..63196f3243
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_i_half.cpp
@@ -0,0 +1,669 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <climits>
+#include <cstring>
+static int BuildKernelHalf(const char *name, int vectorSize,
+                           cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                           bool relaxedMode)
+{ // Assemble the math_kernel<N> source that calls `name` on (half, int) data.
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global half",
+                        sizeNames[vectorSize],
+                        "* out, __global half",
+                        sizeNames[vectorSize],
+                        "* in1, __global int",
+                        sizeNames[vectorSize],
+                        "* in2 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i] );\n"
+                        "}\n" }; // generic source: one vector element per item
+    // Width-3 variant: vload3/vstore3 on packed data, scalar path for the tail.
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global half* out, __global half* in, __global int* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       half3 d0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, i0 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       half3 d0;\n"
+        "       int3 i0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (half3)( in[3*i], NAN, NAN ); \n"
+        "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, i0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Default to the generic source; switch to c3 only for 3-element vectors.
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+    // Entry-point name; must match the "math_kernel" + suffix spelled above.
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]); // NOTE(review): the "- 1" is redundant
+
+    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
+                       relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count; // count handed to MakeKernels (caller: thread count)
+    cl_kernel **kernels; // kernels[vector_size] -> array passed to the builder
+    cl_program *programs; // programs[vector_size] receives the built program
+    const char *nameInCode; // function name as spelled in the kernel source
+    bool relaxedMode; // forwarded unchanged to BuildKernelHalf
+} BuildKernelInfo;
+
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p)
+{ // ThreadPool job: build the kernels for vector size (offset + job_id).
+    const BuildKernelInfo *const buildInfo = (const BuildKernelInfo *)p;
+    const cl_uint vectorSize = buildInfo->offset + job_id;
+    cl_program *const program = &buildInfo->programs[vectorSize];
+    return BuildKernelHalf(buildInfo->nameInCode, vectorSize, buildInfo->kernel_count,
+                           buildInfo->kernels[vectorSize], program, buildInfo->relaxedMode);
+}
+
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // half input sub-buffer (carved out of gInBuffer)
+    cl_mem inBuf2; // int input sub-buffer (carved out of gInBuffer2)
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output sub-buffers, one per vector size
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    cl_int maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdata d; // per-thread random generator, seeded from the caller's MTdata
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs handed to ThreadPool_Do
+    cl_uint step; // subBufferSize * scale; a job's base is job_id * step
+    cl_uint scale; // stride between individual test values
+    float ulps; // max allowed ulps (taken from f->half_ulps)
+    int ftz; // non-zero if running in flush to zero mode
+
+    // no special values
+} TestInfo;
+
+
+// A table of more difficult cases to get right (raw cl_half bit patterns)
+static const cl_half specialValuesHalf[] = {
+    0xffff,
+    0x0000,
+    0x0001,
+    0x7c00 /*INFINITY*/,
+    0xfc00 /*-INFINITY*/,
+    0x8000 /*-0*/,
+    0x7bff /*HALF_MAX*/,
+    0x0400 /*HALF_MIN*/
+};
+// Element counts are compile-time constants, as in the float/double siblings.
+static constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+// NOTE(review): -11024 mirrors binary_i_double.cpp; possibly meant -1024.
+static const int specialValuesInt3[] = { 0,       1,     2,      3,
+                                         1022,    1023,  1024,   INT_MIN,
+                                         INT_MAX, -1,    -2,     -3,
+                                         -1022,   -1023, -11024, -INT_MAX };
+static constexpr size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3);
+
+static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
+
+int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode)
+{ // Driver for half f(half, int): set up per-thread state, build, run, log.
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region2.origin, region2.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+
+    // Init the kernels; relaxedMode is listed so it is not zero-initialised
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs, f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+
+    // Accumulate the arithmetic errors
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        if (test_info.tinfo[i].maxError > maxError)
+        {
+            maxError = test_info.tinfo[i].maxError;
+            maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+        }
+    }
+
+    if (error) goto exit;
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+
+    vlog("\n");
+
+
+exit:
+    // Release
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{ // ThreadPool job: fill inputs, run every vector size, check vs func.f_fi.
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_ushort *t;
+    cl_half *r;
+    float *s = 0;
+    cl_int *s2;
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ushort *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ushort *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_elements * sizeof(cl_ushort), 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Init input array
+    cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
+    cl_int *p2 = (cl_int *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+    int totalSpecialValueCount =
+        specialValuesHalfCount * specialValuesInt3Count;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesHalfCount;
+        y = (job_id * buffer_elements) / specialValuesHalfCount;
+
+        for (; j < buffer_elements; j++)
+        {
+            p[j] = specialValuesHalf[x];
+            p2[j] = specialValuesInt3[y];
+            if (++x >= specialValuesHalfCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesInt3Count) break;
+            }
+        }
+    }
+
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = (cl_ushort)genrand_int32(d);
+        p2[j] = genrand_int32(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_elements * sizeof(cl_half), p, 0,
+                                      NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_elements * sizeof(cl_int), p2, 0,
+                                      NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            goto exit;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over. memset_pattern4 reads 4 bytes, so the pattern must be 32-bit.
+        uint32_t pattern = 0xdeaddead;
+        memset_pattern4(out[j], &pattern, buffer_elements * sizeof(cl_half));
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            goto exit;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+    // Calculate the correctly rounded reference result
+    r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
+    t = (cl_ushort *)r;
+    s = (float *)malloc(buffer_elements * sizeof(float));
+    if (NULL == s) { error = CL_OUT_OF_HOST_MEMORY; goto exit; }
+    s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        s[j] = cl_half_to_float(p[j]);
+        r[j] = cl_half_from_float(func.f_fi(s[j], s2[j]), CL_HALF_RTE);
+    }
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ushort *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_elements * sizeof(cl_ushort), 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_ushort *)clEnqueueMapBuffer(
+        tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0,
+        buffer_elements * sizeof(cl_ushort), 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ushort *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                float test = cl_half_to_float(q[j]);
+                double correct = func.f_fi(s[j], s2[j]);
+                float err = Ulp_Error_Half(q[j], correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsHalfSubnormal(
+                            cl_half_from_float(correct, CL_HALF_RTE)))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+                        correct2 = func.f_fi(0.0, s2[j]);
+                        correct3 = func.f_fi(-0.0, s2[j]);
+                        err2 = Ulp_Error_Half(q[j], correct2);
+                        err3 = Ulp_Error_Half(q[j], correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfSubnormal(
+                                cl_half_from_float(correct2, CL_HALF_RTE))
+                            || IsHalfSubnormal(
+                                cl_half_from_float(correct3, CL_HALF_RTE)))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), "
+                               "%d}\nExpected: %a (half 0x%0.4x) \nActual: %a "
+                               "(half 0x%0.4x) at index: %d\n",
+                               name, sizeNames[k], err, s[j], p[j], s2[j],
+                               cl_half_to_float(r[j]), r[j], test, q[j],
+                               (cl_uint)j);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            goto exit;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    // Progress output: only emitted when the low 28 bits of base are zero.
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+exit:
+    if (s) free(s);
+    return error;
+}
\ No newline at end of file
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index c407fdaaf1..6bcec98bc8 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -281,8 +281,7 @@ const double specialValues[] = {
     +0.0,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index 7fbb07c280..3b1be29020 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -271,8 +271,7 @@ const float specialValues[] = {
     +0.0f,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
index 917362852c..1dcd4d9001 100644
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -32,33 +32,37 @@
 #define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                       \
     {                                                                          \
         STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
-            _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,             \
+            _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,       \
             RELAXED_OFF, _type                                                 \
     }
 #define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type,     \
                   _relaxed_embedded_ulp)                                       \
     {                                                                          \
         STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
-            _ulp, _ulp, _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp,    \
-            _rmode, RELAXED_ON, _type                                          \
+            _ulp, _ulp, _ulp, _embedded_ulp, _relaxed_ulp,                     \
+            _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type                   \
     }
 #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                  \
     {                                                                          \
         "half_" STRINGIFY(_name), "half_" STRINGIFY(_name), { NULL },          \
-            { NULL }, { NULL }, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, \
-            _rmode, RELAXED_OFF, _type                                         \
+            { NULL }, { NULL }, _ulp, _ulp, _ulp, _embedded_ulp, INFINITY,     \
+            INFINITY, _rmode, RELAXED_OFF, _type                               \
     }
 #define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type)   \
     {                                                                          \
         STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \
-            _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
+            _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF,      \
+            _type                                                              \
     }
 
 #define unaryF NULL
+#define unaryOF NULL
 #define i_unaryF NULL
 #define unaryF_u NULL
 #define macro_unaryF NULL
 #define binaryF NULL
+#define binaryOF NULL
+#define binaryF_nextafter NULL
 #define binaryOperatorF NULL
 #define binaryF_i NULL
 #define macro_binaryF NULL
@@ -80,7 +84,7 @@
     {                                                                          \
         STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
             { (void*)reference_##_name##l }, { (void*)reference_##_name },     \
-            _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,             \
+            _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,       \
             RELAXED_OFF, _type                                                 \
     }
 #define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type,     \
@@ -88,19 +92,21 @@
     {                                                                          \
         STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
             { (void*)reference_##_name##l },                                   \
-            { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _embedded_ulp, \
-            _relaxed_ulp, _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type     \
+            { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _ulp,          \
+            _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode,        \
+            RELAXED_ON, _type                                                  \
     }
 #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                  \
     {                                                                          \
         "half_" STRINGIFY(_name), "half_" STRINGIFY(_name),                    \
             { (void*)reference_##_name }, { NULL }, { NULL }, _ulp, _ulp,      \
-            _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
+            _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF,      \
+            _type                                                              \
     }
 #define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type)   \
     {                                                                          \
         STRINGIFY(_name), _operator, { (void*)reference_##_name },             \
-            { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp,             \
+            { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _ulp,       \
             _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
     }
 
@@ -108,85 +114,114 @@ static constexpr vtbl _unary = {
     "unary",
     TestFunc_Float_Float,
     TestFunc_Double_Double,
+    TestFunc_Half_Half,
 };
 
+static constexpr vtbl _unaryof = { "unaryof", TestFunc_Float_Float, NULL,
+                                   NULL }; // float only: no double/half test
+
 static constexpr vtbl _i_unary = {
     "i_unary",
     TestFunc_Int_Float,
     TestFunc_Int_Double,
+    TestFunc_Int_Half,
 };
 
 static constexpr vtbl _unary_u = {
     "unary_u",
     TestFunc_Float_UInt,
     TestFunc_Double_ULong,
+    TestFunc_Half_UShort,
 };
 
 static constexpr vtbl _macro_unary = {
     "macro_unary",
     TestMacro_Int_Float,
     TestMacro_Int_Double,
+    TestMacro_Int_Half,
 };
 
 static constexpr vtbl _binary = {
     "binary",
     TestFunc_Float_Float_Float,
     TestFunc_Double_Double_Double,
+    TestFunc_Half_Half_Half,
 };
 
+static constexpr vtbl _binary_nextafter = {
+    "binary", // same type name as _binary; only the half entry differs
+    TestFunc_Float_Float_Float,
+    TestFunc_Double_Double_Double,
+    TestFunc_Half_Half_Half_nextafter, // dedicated fp16 test for nextafter
+};
+
+static constexpr vtbl _binaryof = { "binaryof", TestFunc_Float_Float_Float,
+                                    NULL, NULL }; // float only
+
 static constexpr vtbl _binary_operator = {
     "binaryOperator",
     TestFunc_Float_Float_Float_Operator,
     TestFunc_Double_Double_Double_Operator,
+    NULL,
 };
 
 static constexpr vtbl _binary_i = {
     "binary_i",
     TestFunc_Float_Float_Int,
     TestFunc_Double_Double_Int,
+    TestFunc_Half_Half_Int,
 };
 
 static constexpr vtbl _macro_binary = {
     "macro_binary",
     TestMacro_Int_Float_Float,
     TestMacro_Int_Double_Double,
+    TestMacro_Int_Half_Half,
 };
 
 static constexpr vtbl _ternary = {
     "ternary",
     TestFunc_Float_Float_Float_Float,
     TestFunc_Double_Double_Double_Double,
+    NULL,
 };
 
 static constexpr vtbl _unary_two_results = {
     "unary_two_results",
     TestFunc_Float2_Float,
     TestFunc_Double2_Double,
+    NULL,
 };
 
 static constexpr vtbl _unary_two_results_i = {
     "unary_two_results_i",
     TestFunc_FloatI_Float,
     TestFunc_DoubleI_Double,
+    NULL,
 };
 
 static constexpr vtbl _binary_two_results_i = {
     "binary_two_results_i",
     TestFunc_FloatI_Float_Float,
     TestFunc_DoubleI_Double_Double,
+    NULL,
 };
 
 static constexpr vtbl _mad_tbl = {
     "ternary",
     TestFunc_mad_Float,
     TestFunc_mad_Double,
+    TestFunc_mad_Half,
 };
 
 #define unaryF &_unary
+#define unaryOF &_unaryof
 #define i_unaryF &_i_unary
 #define unaryF_u &_unary_u
 #define macro_unaryF &_macro_unary
 #define binaryF &_binary
+#define binaryF_nextafter &_binary_nextafter
+#define binaryOF &_binaryof
 #define binaryOperatorF &_binary_operator
 #define binaryF_i &_binary_i
 #define macro_binaryF &_macro_binary
@@ -278,7 +313,7 @@ const Func functionList[] = {
     ENTRY(minmag, 0.0f, 0.0f, FTZ_OFF, binaryF),
     ENTRY(modf, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
     ENTRY(nan, 0.0f, 0.0f, FTZ_OFF, unaryF_u),
-    ENTRY(nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter),
     ENTRY_EXT(pow, 16.0f, 16.0f, 8192.0f, FTZ_OFF, binaryF,
               8192.0f), // in derived mode the ulp error is calculated as
                         // exp2(y*log2(x)) and in non-derived it is the same as
@@ -308,6 +343,7 @@ const Func functionList[] = {
       { NULL },
       3.0f,
       0.0f,
+      0.0f,
       4.0f,
       INFINITY,
       INFINITY,
@@ -322,6 +358,7 @@ const Func functionList[] = {
       0.0f,
       0.0f,
       0.0f,
+      0.0f,
       INFINITY,
       INFINITY,
       FTZ_OFF,
@@ -339,20 +376,20 @@ const Func functionList[] = {
     //                                    sure this requirement is realistic
     ENTRY(trunc, 0.0f, 0.0f, FTZ_OFF, unaryF),
 
-    HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryF),
-    HALF_ENTRY(exp, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(exp2, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(exp10, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(log, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(log2, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(log10, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(powr, 8192.0f, 8192.0f, FTZ_ON, binaryF),
-    HALF_ENTRY(recip, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(sin, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryF),
+    HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryOF),
+    HALF_ENTRY(exp, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(exp2, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(exp10, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(log, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(log2, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(log10, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(powr, 8192.0f, 8192.0f, FTZ_ON, binaryOF),
+    HALF_ENTRY(recip, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(sin, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
 
     // basic operations
     OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
@@ -364,6 +401,7 @@ const Func functionList[] = {
       { (void*)reference_relaxed_divide },
       2.5f,
       0.0f,
+      0.0f,
       3.0f,
       2.5f,
       INFINITY,
@@ -378,6 +416,7 @@ const Func functionList[] = {
       0.0f,
       0.0f,
       0.0f,
+      0.0f,
       0.f,
       INFINITY,
       FTZ_OFF,
diff --git a/test_conformance/math_brute_force/function_list.h b/test_conformance/math_brute_force/function_list.h
index 95a2945932..6ea0fa9e2b 100644
--- a/test_conformance/math_brute_force/function_list.h
+++ b/test_conformance/math_brute_force/function_list.h
@@ -70,6 +70,9 @@ struct vtbl
     int (*DoubleTestFunc)(
         const struct Func *, MTdata,
         bool); // may be NULL if function is single precision only
+    int (*HalfTestFunc)(
+        const struct Func *, MTdata,
+        bool); // may be NULL if function has no half precision variant
 };
 
 struct Func
@@ -82,6 +85,7 @@ struct Func
     fptr rfunc;
     float float_ulps;
     float double_ulps;
+    float half_ulps;
     float float_embedded_ulps;
     float relaxed_error;
     float relaxed_embedded_error;
diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp
new file mode 100644
index 0000000000..245528e102
--- /dev/null
+++ b/test_conformance/math_brute_force/i_unary_half.cpp
@@ -0,0 +1,306 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+
+// Assembles the OpenCL C source for math_kernel<size>, which stores
+// name(in[i]) (half input, int output) into out[i], and hands it to
+// MakeKernel. Vector size 3 gets a vload3/vstore3 variant with a tail path.
+static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k,
+                           cl_program *p, bool relaxedMode)
+{
+    // Source fragments for every vector size except 3
+    const char *src[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+                          "__kernel void math_kernel",
+                          sizeNames[vectorSize],
+                          "( __global int",
+                          sizeNames[vectorSize],
+                          "* out, __global half",
+                          sizeNames[vectorSize],
+                          "* in)\n"
+                          "{\n"
+                          "   int i = get_global_id(0);\n"
+                          "   out[i] = ",
+                          name,
+                          "( in[i] );\n"
+                          "}\n" };
+
+    // Source fragments for vector size 3, addressed through scalar pointers
+    const char *src3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global int* out, __global half* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       half3 f0 = vload3( 0, in + 3 * i );\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(half)). Assume power of two "
+        "buffer size \n"
+        "       half3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (half3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       int3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **kern = isVec3 ? src3 : src;
+    const size_t kernSize = isVec3 ? sizeof(src3) / sizeof(src3[0])
+                                   : sizeof(src) / sizeof(src[0]);
+
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+struct BuildKernelInfo
+{
+    cl_uint offset; // index of the first vector size to build
+    cl_kernel *kernels; // receives one kernel per vector size index
+    cl_program *programs; // receives one program per vector size index
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // forwarded unchanged to BuildKernelHalf
+};
+
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p) // job callback for ThreadPool_Do
+{
+    const auto *job = static_cast<const BuildKernelInfo *>(p);
+    const cl_uint v = job->offset + job_id; // vector size index to build
+    return BuildKernelHalf(job->nameInCode, v, &job->kernels[v],
+                           &job->programs[v], job->relaxedMode);
+}
+
+// Tests an int-returning unary fp16 builtin: walks the 16-bit half encodings
+// in chunks, runs one kernel per vector size and compares each output with
+// the host reference f->func.i_f. Returns 0 on success, else an error code.
+int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ;
+    size_t bufferSize = BUFFER_SIZE;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+    uint64_t bufferElements = bufferSize / sizeof(cl_int);
+    float *s = 0;
+    int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1);
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    // This test is not using ThreadPool so we need to disable FTZ here
+    // for reference computations
+    FPU_mode_type oldMode;
+    DisableFTZ(&oldMode);
+
+    // Init the kernels
+    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                   f->nameInCode, relaxedMode };
+    if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+    {
+        RestoreFPState(&oldMode); // undo DisableFTZ before bailing out
+        return error;
+    }
+    s = (float *)malloc(bufferElements * sizeof(float));
+    if (NULL == s)
+    {
+        vlog_error("Error: Unable to allocate storage for inputs!\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+
+    for (uint64_t i = 0; i < (1ULL << 16); i += step)
+    {
+        // Init input array (wimpy mode samples every scale-th encoding)
+        cl_ushort *p = (cl_ushort *)gIn;
+        const uint32_t stride = gWimpyMode ? (uint32_t)scale : 1;
+        for (j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j * stride;
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release s, the kernels and the saved FPU mode
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_int);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        int *r = (int *)gOut_Ref;
+        for (j = 0; j < bufferElements; j++)
+        {
+            s[j] = cl_half_to_float(p[j]);
+            r[j] = f->func.i_f(s[j]);
+        }
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint32_t *t = (uint32_t *)gOut_Ref;
+        for (j = 0; j < bufferElements; j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint32_t *q = (uint32_t *)(gOut[k]);
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    if (ftz && IsHalfSubnormal(p[j]))
+                    {
+                        unsigned int correct0 = f->func.i_f(0.0);
+                        unsigned int correct1 = f->func.i_f(-0.0);
+                        if (q[j] == correct0 || q[j] == correct1) continue;
+                    }
+
+                    uint32_t err = t[j] - q[j];
+                    if (q[j] > t[j]) err = q[j] - t[j];
+                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%0.4x): "
+                               "*%d vs. %d\n",
+                               f->name, sizeNames[k], err, s[j], p[j], t[j],
+                               q[j]);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", (cl_uint)i,
+                     (size_t)step, bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+exit:
+    if (s) free(s);
+    RestoreFPState(&oldMode);
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
\ No newline at end of file
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index d3e8071fb3..42a813160a 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -270,8 +270,7 @@ const double specialValues[] = {
     +0.0,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index 6c7c8c05e7..c7f3538e68 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -260,8 +260,7 @@ const float specialValues[] = {
     +0.0f,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp
new file mode 100644
index 0000000000..72abb10571
--- /dev/null
+++ b/test_conformance/math_brute_force/macro_binary_half.cpp
@@ -0,0 +1,652 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+
+
+// Assembles the OpenCL C source for math_kernel<size>, which stores
+// name(in1[i], in2[i]) (half inputs, short output) into out[i], and passes
+// it to MakeKernels with kernel_count. Size 3 uses a vload3/vstore3 variant.
+static int BuildKernelHalf(const char *name, int vectorSize,
+                           cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                           bool relaxedMode)
+{
+    const char *src[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+                          "__kernel void math_kernel",
+                          sizeNames[vectorSize],
+                          "( __global short",
+                          sizeNames[vectorSize],
+                          "* out, __global half",
+                          sizeNames[vectorSize],
+                          "* in1, __global half",
+                          sizeNames[vectorSize],
+                          "* in2 )\n"
+                          "{\n"
+                          "   int i = get_global_id(0);\n"
+                          "   out[i] = ",
+                          name,
+                          "( in1[i], in2[i] );\n"
+                          "}\n" };
+    const char *src3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global short* out, __global half* in, __global half* in2)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       half3 f0 = vload3( 0, in + 3 * i );\n"
+        "       half3 f1 = vload3( 0, in2 + 3 * i );\n"
+        "       short3 i0 = ",
+        name,
+        "( f0, f1 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       half3 f0, f1;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (half3)( in[3*i], NAN, NAN ); \n"
+        "               f1 = (half3)( in2[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               f1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       short3 i0 = ",
+        name,
+        "( f0, f1 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    const bool isVec3 = (sizeValues[vectorSize] == 3);
+    const char **kern = isVec3 ? src3 : src;
+    const size_t kernSize = isVec3 ? sizeof(src3) / sizeof(src3[0])
+                                   : sizeof(src) / sizeof(src[0]);
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
+                       relaxedMode);
+}
+
+struct BuildKernelInfo
+{
+    cl_uint offset; // index of the first vector size to build
+    cl_uint kernel_count; // forwarded to BuildKernelHalf for each size
+    cl_kernel **kernels; // kernels[vector size index] -> kernel array
+    cl_program *programs; // programs[vector size index]
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // forwarded unchanged to BuildKernelHalf
+};
+
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p)
+{
+    const auto *job = static_cast<const BuildKernelInfo *>(p);
+    const cl_uint v = job->offset + job_id; // vector size index to build
+    return BuildKernelHalf(job->nameInCode, v, job->kernel_count,
+                           job->kernels[v], &job->programs[v],
+                           job->relaxedMode);
+}
+
+struct ThreadInfo
+{
+    cl_mem inBuf; // first input buffer used by this thread
+    cl_mem inBuf2; // second input buffer used by this thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // this thread's output buffers
+    MTdata d;
+    cl_command_queue tQueue; // per-thread queue, kept for performance
+};
+
+// Configuration and per-thread resources for one test run.
+struct TestInfo
+{
+    size_t subBufferSize; // Sub-buffer size, counted in elements
+    const Func *f; // The function info being tested
+    // One program for each vector size
+    cl_program programs[VECTOR_SIZE_COUNT];
+    // Thread-specific kernels per vector size: k[vector_size][thread_id]
+    cl_kernel *k[VECTOR_SIZE_COUNT];
+    // Thread-specific information, one entry for each worker thread
+    ThreadInfo *tinfo;
+    cl_uint threadCount; // How many worker threads there are
+    cl_uint jobCount; // How many jobs there are
+    cl_uint step; // Distance from one chunk to the next
+    cl_uint scale; // Stride between individual test values
+    int ftz; // Non-zero when running in flush-to-zero mode
+};
+
+// A table of more difficult cases to get right (raw IEEE half encodings)
+static const cl_half specialValuesHalf[] = {
+    0xffff /*NaN, sign bit set*/,
+    0x0000 /*+0*/,
+    0x0001 /*smallest subnormal*/,
+    0x7c00 /*INFINITY*/,
+    0xfc00 /*-INFINITY*/,
+    0x8000 /*-0*/,
+    0x7bff /*HALF_MAX*/,
+    0x0400 /*HALF_MIN*/
+};
+
+static constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+
+
+static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
+
+int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    // Create per-thread resources and kernels, run every job, then clean up.
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d)); // seeded from d
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex,
+                                       test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+exit:
+    // Release
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            free_mtdata(test_info.tinfo[i].d);
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            clReleaseMemObject(test_info.tinfo[i].inBuf2);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    // One job: fill both inputs (special values first, then random), run each
+    // vector size on this thread's queue and compare with the host reference.
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_half);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_short *t, *r;
+    float *s = 0, *s2 = 0;
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_short *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_short *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Init input array
+    cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
+    cl_ushort *p2 = (cl_ushort *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+    int totalSpecialValueCount =
+        specialValuesHalfCount * specialValuesHalfCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesHalfCount;
+        y = (job_id * buffer_elements) / specialValuesHalfCount;
+
+        // Stop via the loop condition so j also steps past the final pair.
+        for (; j < buffer_elements && y < specialValuesHalfCount; j++)
+        {
+            p[j] = specialValuesHalf[x];
+            p2[j] = specialValuesHalf[y];
+            if (++x >= specialValuesHalfCount)
+            {
+                x = 0;
+                y++;
+            }
+        }
+    }
+
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = (cl_ushort)genrand_int32(d);
+        p2[j] = (cl_ushort)genrand_int32(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        goto exit;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            goto exit;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over. memset_pattern4 consumes a 4-byte pattern, hence uint32_t.
+        uint32_t pattern = 0xdeaddead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            goto exit;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            goto exit;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    r = (cl_short *)gOut_Ref + thread_id * buffer_elements;
+    t = (cl_short *)r;
+    s = (float *)malloc(buffer_elements * sizeof(float));
+    s2 = (float *)malloc(buffer_elements * sizeof(float));
+    error = (s && s2) ? CL_SUCCESS : CL_OUT_OF_HOST_MEMORY;
+    if (error) goto exit; // exit frees whichever allocation succeeded
+    for (j = 0; j < buffer_elements; j++)
+    {
+        s[j] = cl_half_to_float(p[j]);
+        s2[j] = cl_half_to_float(p2[j]);
+        r[j] = (short)func.i_ff(s[j], s2[j]);
+    }
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_short *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_short *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data
+    for (j = 0; j < buffer_elements; j++)
+    {
+        cl_short *q = (cl_short *)out[0];
+
+        // If we aren't getting the correctly rounded result
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
+        {
+            if (ftz)
+            {
+                if (IsHalfSubnormal(p[j]))
+                {
+                    if (IsHalfSubnormal(p2[j]))
+                    {
+                        short correct = (short)func.i_ff(0.0f, 0.0f);
+                        short correct2 = (short)func.i_ff(0.0f, -0.0f);
+                        short correct3 = (short)func.i_ff(-0.0f, 0.0f);
+                        short correct4 = (short)func.i_ff(-0.0f, -0.0f);
+
+                        if (correct == q[j] || correct2 == q[j]
+                            || correct3 == q[j] || correct4 == q[j])
+                            continue;
+                    }
+                    else
+                    {
+                        short correct = (short)func.i_ff(0.0f, s2[j]);
+                        short correct2 = (short)func.i_ff(-0.0f, s2[j]);
+                        if (correct == q[j] || correct2 == q[j]) continue;
+                    }
+                }
+                else if (IsHalfSubnormal(p2[j]))
+                {
+                    short correct = (short)func.i_ff(s[j], 0.0f);
+                    short correct2 = (short)func.i_ff(s[j], -0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
+                }
+            }
+
+            short err = t[j] - q[j];
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error(
+                "\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a "
+                "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x (index: %d)\n",
+                name, err, s[j], p[j], s2[j], p2[j], t[j], q[j], j);
+            error = -1;
+            goto exit;
+        }
+
+        for (k = std::max(1U, gMinVectorSizeIndex); k < gMaxVectorSizeIndex;
+             k++)
+        {
+            q = out[k];
+            // If we aren't getting the correctly rounded result
+            if (-t[j] != q[j])
+            {
+                if (ftz)
+                {
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        if (IsHalfSubnormal(p2[j]))
+                        {
+                            short correct = (short)-func.i_ff(0.0f, 0.0f);
+                            short correct2 = (short)-func.i_ff(0.0f, -0.0f);
+                            short correct3 = (short)-func.i_ff(-0.0f, 0.0f);
+                            short correct4 = (short)-func.i_ff(-0.0f, -0.0f);
+
+                            if (correct == q[j] || correct2 == q[j]
+                                || correct3 == q[j] || correct4 == q[j])
+                                continue;
+                        }
+                        else
+                        {
+                            short correct = (short)-func.i_ff(0.0f, s2[j]);
+                            short correct2 = (short)-func.i_ff(-0.0f, s2[j]);
+                            if (correct == q[j] || correct2 == q[j]) continue;
+                        }
+                    }
+                    else if (IsHalfSubnormal(p2[j]))
+                    {
+                        short correct = (short)-func.i_ff(s[j], 0.0f);
+                        short correct2 = (short)-func.i_ff(s[j], -0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
+                    }
+                }
+
+                short err = -t[j] - q[j];
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a "
+                           "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x "
+                           "(index: %d)\n",
+                           name, err, s[j], p[j], s2[j], p2[j], -t[j], q[j], j);
+                error = -1;
+                goto exit;
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            goto exit;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+exit:
+    if (s) free(s);
+    if (s2) free(s2);
+    return error;
+}
\ No newline at end of file
diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp
new file mode 100644
index 0000000000..31c7d65ce7
--- /dev/null
+++ b/test_conformance/math_brute_force/macro_unary_half.cpp
@@ -0,0 +1,543 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+
+static int BuildKernelHalf(const char *name, int vectorSize,
+                           cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                           bool relaxedMode)
+{ // Assembles the OpenCL C source for name and passes it to MakeKernels.
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global short",
+                        sizeNames[vectorSize],
+                        "* out, __global half",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+    // 3-wide source: packed vload3/vstore3; last work-item handles the tail.
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global short* out, __global half* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       half3 f0 = vload3( 0, in + 3 * i );\n"
+        "       short3 i0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( i0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       short3 i0;\n"
+        "       half3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (half3)( in[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (half3)( in[3*i], in[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       i0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = i0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = i0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+    // vectorSize indexes sizeValues/sizeNames; only the 3-wide case differs.
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+
+    // Same "math_kernel<size>" name that the source above declares.
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
+                       relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count; // forwarded to BuildKernelHalf as kernel_count
+    cl_kernel **kernels; // indexed by vector size; kernels[i] is passed as k
+    cl_program *programs; // indexed by vector size; programs + i is passed
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // forwarded unchanged to BuildKernelHalf
+} BuildKernelInfo;
+
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p)
+{ // ThreadPool_Do job: builds the kernels for vector size offset + job_id.
+    BuildKernelInfo *info = (BuildKernelInfo *)p;
+    cl_uint i = info->offset + job_id; // vector size index for this job
+    return BuildKernelHalf(info->nameInCode, i, info->kernel_count,
+                           info->kernels[i], info->programs + i,
+                           info->relaxedMode);
+}
+
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // this thread's sub-buffer of gInBuffer (kernel input)
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // sub-buffers of gOutBuffer, per size
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs handed to ThreadPool_Do for TestHalf
+    cl_uint step; // first-value offset between jobs: subBufferSize * scale
+    cl_uint scale; // stride between individual test values
+    int ftz; // non-zero: subnormal inputs may be verified as +/-0 instead
+
+} TestInfo;
+
+static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
+
+int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    // Create per-thread resources and kernels, run every job, then clean up.
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    // TestHalf truncates its inputs to 16 bits, so the jobs only have to span
+    // 1 << 16 values. One job is enough if step overflowed or covers them all.
+    test_info.jobCount = 1;
+    if (test_info.step / test_info.subBufferSize == test_info.scale)
+    {
+        const cl_uint halfValues = 1U << (8 * sizeof(cl_half));
+        test_info.jobCount = std::max((cl_uint)1, halfValues / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ftz = f->ftz || gForceFTZ || !(gHalfCapabilities & CL_FP_DENORM);
+
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = {
+            gMinVectorSizeIndex, test_info.threadCount, test_info.k,
+            test_info.programs, f->nameInCode, relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        if (error) goto exit;
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+exit:
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    // One job: test a strided run of half bit patterns at every vector size.
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_half);
+    cl_uint scale = job->scale;
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    cl_uint j, k;
+    cl_int error = CL_SUCCESS;
+    const char *name = job->f->name;
+    float *s = 0;
+
+    int signbit_test = 0;
+    if (!strcmp(name, "signbit")) signbit_test = 1;
+    const int isnormal_test = !strcmp(name, "isnormal"); // loop-invariant
+
+#define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s))
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_short *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_short *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Write the new values to the input array
+    cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale;
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            return error;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            return error;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over. memset_pattern4 consumes a 4-byte pattern, hence uint32_t.
+        uint32_t pattern = 0xdeaddead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            return error;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    cl_short *r = (cl_short *)gOut_Ref + thread_id * buffer_elements;
+    cl_short *t = (cl_short *)r;
+    s = (float *)malloc(buffer_elements * sizeof(float));
+    if (NULL == s) return CL_OUT_OF_HOST_MEMORY;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        s[j] = cl_half_to_float(p[j]);
+        if (isnormal_test)
+        {
+            if ((IsHalfSubnormal(p[j]) == 0) && !((p[j] & 0x7fffU) >= 0x7c00U)
+                && ((p[j] & 0x7fffU) != 0x0000U))
+                r[j] = 1;
+            else
+                r[j] = 0;
+        }
+        else
+            r[j] = (short)ref_func(s[j]);
+    }
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_short *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+    // Wait for the last buffer
+    out[j] = (cl_short *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data
+    for (j = 0; j < buffer_elements; j++)
+    {
+        cl_short *q = out[0];
+
+        // If we aren't getting the correctly rounded result
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
+        {
+            // If we aren't getting the correctly rounded result
+            if (ftz)
+            {
+                if (IsHalfSubnormal(p[j]))
+                {
+                    short correct = (short)ref_func(+0.0f);
+                    short correct2 = (short)ref_func(-0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
+                }
+            }
+
+            short err = t[j] - q[j];
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error("\nERROR: %s: %d ulp error at %a (0x%0.4x)\nExpected: "
+                       "%d vs. %d\n",
+                       name, err, s[j], p[j], t[j], q[j]);
+            error = -1;
+            goto exit;
+        }
+
+
+        for (k = std::max(1U, gMinVectorSizeIndex); k < gMaxVectorSizeIndex;
+             k++)
+        {
+            q = out[k];
+            // If we aren't getting the correctly rounded result
+            if (-t[j] != q[j])
+            {
+                if (ftz)
+                {
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        short correct = (short)-ref_func(+0.0f);
+                        short correct2 = (short)-ref_func(-0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
+                    }
+                }
+
+                short err = -t[j] - q[j];
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %s%s: %d ulp error at %a "
+                           "(0x%0.4x)\nExpected: %d \nActual: %d\n",
+                           name, sizeNames[k], err, s[j], p[j], -t[j], q[j]);
+                error = -1;
+                goto exit;
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            goto exit;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+exit:
+    if (s) free(s);
+    return error;
+}
\ No newline at end of file
diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp
new file mode 100644
index 0000000000..a36e8d6653
--- /dev/null
+++ b/test_conformance/math_brute_force/mad_half.cpp
@@ -0,0 +1,295 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+
+// Build the kernel applying `name` to three half inputs for one vector size.
+static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k,
+                           cl_program *p, bool relaxedMode)
+{
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global half",
+                        sizeNames[vectorSize],
+                        "* out, __global half",
+                        sizeNames[vectorSize],
+                        "* in1, __global half",
+                        sizeNames[vectorSize],
+                        "* in2,  __global half",
+                        sizeNames[vectorSize],
+                        "* in3 )\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in1[i], in2[i], in3[i] );\n"
+                        "}\n" };
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global half* out, __global half* in, __global half* in2, __global "
+        "half* in3)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       half3 d0 = vload3( 0, in + 3 * i );\n"
+        "       half3 d1 = vload3( 0, in2 + 3 * i );\n"
+        "       half3 d2 = vload3( 0, in3 + 3 * i );\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
+        "       vstore3( d0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       half3 d0, d1, d2;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               d0 = (half3)( in[3*i], NAN, NAN ); \n"
+        "               d1 = (half3)( in2[3*i], NAN, NAN ); \n"
+        "               d2 = (half3)( in3[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               d1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n"
+        "               d2 = (half3)( in3[3*i], in3[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       d0 = ",
+        name,
+        "( d0, d1, d2 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = d0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = d0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Use the generic source unless the vector size is 3 (vload3/vstore3 one).
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+    // Kernel entry point is "math_kernel" plus the vector-size suffix.
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
+}
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // output kernels, indexed by vector-size index
+    cl_program *programs; // output programs, indexed by vector-size index
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // passed through to BuildKernelHalf
+} BuildKernelInfo;
+
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p)
+{
+    const BuildKernelInfo *args = static_cast<const BuildKernelInfo *>(p);
+    const cl_uint idx = args->offset + job_id; // vector-size index to build
+    return BuildKernelHalf(args->nameInCode, idx, args->kernels + idx,
+                           args->programs + idx, args->relaxedMode);
+}
+
+// Smoke test for a three-operand half function (used for mad): fills the
+// three input buffers with random bit patterns and runs the kernel for every
+// enabled vector size. Results are read back but not checked.
+// Returns 0 on success, otherwise the first build or OpenCL error code.
+int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    float maxErrorVal3 = 0.0f;
+    size_t bufferSize = BUFFER_SIZE;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    uint64_t step = bufferSize / sizeof(cl_half);
+    if (gWimpyMode) step = (1ULL << 32) * gWimpyReductionFactor / (512);
+    // Init the kernels (relaxedMode is forwarded to the kernel build)
+    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                   f->nameInCode, relaxedMode };
+    if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+    {
+        return error; // build failed: kernels/programs may be unset
+    }
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_ushort *p = (cl_ushort *)gIn;
+        cl_ushort *p2 = (cl_ushort *)gIn2;
+        cl_ushort *p3 = (cl_ushort *)gIn3;
+        for (j = 0; j < bufferSize / sizeof(cl_ushort); j++)
+        {
+            p[j] = (cl_ushort)genrand_int32(d);
+            p2[j] = (cl_ushort)genrand_int32(d);
+            p3[j] = (cl_ushort)genrand_int32(d);
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // release the kernels/programs built above
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            goto exit;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            goto exit;
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xdeaddead; // memset_pattern4 reads 4 bytes
+            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_half) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
+                                        &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
+                                        &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data - no verification possible. MAD is a random number
+        // generator.
+
+        if (0 == (i & 0x0fffffff))
+        {
+            vlog(".");
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("pass");
+
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    }
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
\ No newline at end of file
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 1a6e0c4e1c..5568701623 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -49,6 +49,8 @@
 #include "harness/testHarness.h"
 
 #define kPageSize 4096
+#define HALF_REQUIRED_FEATURES_1 (CL_FP_ROUND_TO_ZERO)
+#define HALF_REQUIRED_FEATURES_2 (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN)
 #define DOUBLE_REQUIRED_FEATURES                                               \
     (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO                  \
      | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM)
@@ -80,6 +82,8 @@ static int gTestFastRelaxed = 1;
 */
 int gFastRelaxedDerived = 1;
 static int gToggleCorrectlyRoundedDivideSqrt = 0;
+int gHasHalf = 0;
+cl_device_fp_config gHalfCapabilities = 0;
 int gDeviceILogb0 = 1;
 int gDeviceILogbNaN = 1;
 int gCheckTininessBeforeRounding = 1;
@@ -166,7 +170,6 @@ static int doTest(const char *name)
             return 0;
         }
     }
-
     {
         if (0 == strcmp("ilogb", func_data->name))
         {
@@ -235,6 +238,23 @@ static int doTest(const char *name)
                 }
             }
         }
+
+        if (gHasHalf && NULL != func_data->vtbl_ptr->HalfTestFunc)
+        {
+            gTestCount++;
+            vlog("%3d: ", gTestCount);
+            if (func_data->vtbl_ptr->HalfTestFunc(func_data, gMTdata,
+                                                  false /* relaxed mode*/))
+            {
+                gFailCount++;
+                error++;
+                if (gStopOnError)
+                {
+                    gSkipRestOfTests = true;
+                    return error;
+                }
+            }
+        }
     }
 
     return error;
@@ -407,6 +427,8 @@ static int ParseArgs(int argc, const char **argv)
 
                     case 'm': singleThreaded ^= 1; break;
 
+                    case 'g': gHasHalf ^= 1; break;
+
                     case 'r': gTestFastRelaxed ^= 1; break;
 
                     case 's': gStopOnError ^= 1; break;
@@ -539,6 +561,8 @@ static void PrintUsage(void)
     vlog("\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 "
          "on)\n");
     vlog("\t\t-f\tToggle float precision testing. (Default: on)\n");
+    vlog("\t\t-g\tToggle half precision testing. (Default: on if khr_fp_16 "
+         "on)\n");
     vlog("\t\t-r\tToggle fast relaxed math precision testing. (Default: on)\n");
     vlog("\t\t-e\tToggle test as derived implementations for fast relaxed math "
          "precision. (Default: on)\n");
@@ -638,6 +662,44 @@ test_status InitCL(cl_device_id device)
 #endif
     }
 
+    gFloatToHalfRoundingMode = kRoundToNearestEven;
+    if (is_extension_available(gDevice, "cl_khr_fp16"))
+    {
+        gHasHalf ^= 1;
+#if defined(CL_DEVICE_HALF_FP_CONFIG)
+        if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_HALF_FP_CONFIG,
+                                     sizeof(gHalfCapabilities),
+                                     &gHalfCapabilities, NULL)))
+        {
+            vlog_error(
+                "ERROR: Unable to get device CL_DEVICE_HALF_FP_CONFIG. (%d)\n",
+                error);
+            return TEST_FAIL;
+        }
+        if (HALF_REQUIRED_FEATURES_1
+                != (gHalfCapabilities & HALF_REQUIRED_FEATURES_1)
+            && HALF_REQUIRED_FEATURES_2
+                != (gHalfCapabilities & HALF_REQUIRED_FEATURES_2))
+        {
+            char list[300] = "";
+            if (0 == (gHalfCapabilities & CL_FP_ROUND_TO_NEAREST))
+                strncat(list, "CL_FP_ROUND_TO_NEAREST, ", sizeof(list) - 1);
+            if (0 == (gHalfCapabilities & CL_FP_ROUND_TO_ZERO))
+                strncat(list, "CL_FP_ROUND_TO_ZERO, ", sizeof(list) - 1);
+            if (0 == (gHalfCapabilities & CL_FP_INF_NAN))
+                strncat(list, "CL_FP_INF_NAN, ", sizeof(list) - 1);
+            vlog_error("ERROR: required half features are missing: %s\n", list);
+
+            return TEST_FAIL;
+        }
+#else
+        vlog_error("FAIL: device says it supports cl_khr_fp16 but "
+                   "CL_DEVICE_HALF_FP_CONFIG is not in the headers!\n");
+        return TEST_FAIL;
+#endif
+    }
+
+
     uint32_t deviceFrequency = 0;
     size_t configSize = sizeof(deviceFrequency);
     if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY,
@@ -826,6 +888,7 @@ test_status InitCL(cl_device_id device)
              "Bruteforce_Ulp_Error_Double() for more details.\n\n");
     }
 
+    vlog("\tTesting half precision? %s\n", no_yes[0 != gHasHalf]);
     vlog("\tIs Embedded? %s\n", no_yes[0 != gIsEmbedded]);
     if (gIsEmbedded)
         vlog("\tRunning in RTZ mode? %s\n", no_yes[0 != gIsInRTZMode]);
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index 16db3d672a..5c9015b053 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -4698,6 +4698,40 @@ double reference_nextafter(double xx, double yy)
     return a.f;
 }
 
+// Return the half whose bit pattern is the payload x OR-ed with 0x7e00.
+cl_half reference_nanh(cl_ushort x)
+{
+    const cl_ushort bits = x | 0x7e00U;
+    cl_half result;
+    memcpy(&result, &bits, sizeof(cl_half));
+    return result;
+}
+
+float reference_nextafterh(float xx, float yy)
+{
+    cl_half tmp_a = cl_half_from_float(xx, CL_HALF_RTE);
+    cl_half tmp_b = cl_half_from_float(yy, CL_HALF_RTE);
+    float x = cl_half_to_float(tmp_a);
+    float y = cl_half_to_float(tmp_b);
+    // Half nextafter reference: x, y are xx, yy rounded (RTE) through half.
+    // A NaN operand is returned as-is (NaN is the only value with v != v).
+    if (x != x) return x;
+
+    if (y != y) return y;
+    // Equal operands need no step; +0 and -0 compare equal here.
+    if (x == y) return y;
+
+    short a_h = cl_half_from_float(x, CL_HALF_RTE);
+    short b_h = cl_half_from_float(y, CL_HALF_RTE);
+    // Sign-bit-set patterns become 0x8000 - value, truncated back to short.
+    if (a_h & 0x8000) a_h = 0x8000 - a_h;
+    if (b_h & 0x8000) b_h = 0x8000 - b_h;
+    // Step the remapped value one unit toward b_h, then undo the remapping.
+    a_h += (a_h < b_h) ? 1 : -1;
+    a_h = (a_h < 0) ? (cl_short)0x8000 - a_h : a_h;
+
+    return cl_half_to_float(a_h);
+}
 
 long double reference_nextafterl(long double xx, long double yy)
 {
diff --git a/test_conformance/math_brute_force/reference_math.h b/test_conformance/math_brute_force/reference_math.h
index 78b245105e..b9b2e46957 100644
--- a/test_conformance/math_brute_force/reference_math.h
+++ b/test_conformance/math_brute_force/reference_math.h
@@ -18,8 +18,10 @@
 
 #if defined(__APPLE__)
 #include <OpenCL/opencl.h>
+
 #else
 #include <CL/cl.h>
+#include "CL/cl_half.h"
 #endif
 
 // --  for testing float --
@@ -160,6 +162,8 @@ long double reference_fractl(long double, long double*);
 long double reference_fmal(long double, long double, long double);
 long double reference_madl(long double, long double, long double);
 long double reference_nextafterl(long double, long double);
+float reference_nextafterh(float, float);
+cl_half reference_nanh(cl_ushort);
 long double reference_recipl(long double);
 long double reference_rootnl(long double, int);
 long double reference_rsqrtl(long double);
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
index 8af136ac27..10cca4c105 100644
--- a/test_conformance/math_brute_force/ternary_double.cpp
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -204,8 +204,7 @@ const double specialValues[] = {
     +0.0,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 } // anonymous namespace
 
diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp
index c69083ada1..cbcb0e2ef1 100644
--- a/test_conformance/math_brute_force/ternary_float.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -212,8 +212,7 @@ const float specialValues[] = {
     +0.0f,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 } // anonymous namespace
 
diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h
index 78aef9c9a6..91cca16339 100644
--- a/test_conformance/math_brute_force/test_functions.h
+++ b/test_conformance/math_brute_force/test_functions.h
@@ -24,6 +24,9 @@ int TestFunc_Float_Float(const Func *f, MTdata, bool relaxedMode);
 // double foo(double)
 int TestFunc_Double_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(half)
+int TestFunc_Half_Half(const Func *f, MTdata, bool relaxedMode);
+
 // int foo(float)
 int TestFunc_Int_Float(const Func *f, MTdata, bool relaxedMode);
 
@@ -36,6 +39,9 @@ int TestFunc_Float_UInt(const Func *f, MTdata, bool relaxedMode);
 // double foo(ulong)
 int TestFunc_Double_ULong(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(ushort)
+int TestFunc_Half_UShort(const Func *f, MTdata, bool relaxedMode);
+
 // Returns {0, 1} for scalar and {0, -1} for vector.
 // int foo(float)
 int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode);
@@ -44,21 +50,34 @@ int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode);
 // int foo(double)
 int TestMacro_Int_Double(const Func *f, MTdata, bool relaxedMode);
 
+// int foo(half,half)
+int TestMacro_Int_Half_Half(const Func *f, MTdata, bool relaxedMode);
+
+// int foo(half)
+int TestMacro_Int_Half(const Func *f, MTdata, bool relaxedMode);
+
+// int foo(half)
+int TestFunc_Int_Half(const Func *f, MTdata, bool relaxedMode);
+
 // float foo(float, float)
 int TestFunc_Float_Float_Float(const Func *f, MTdata, bool relaxedMode);
 
 // double foo(double, double)
 int TestFunc_Double_Double_Double(const Func *f, MTdata, bool relaxedMode);
 
+// Half foo(half, half)
+int TestFunc_Half_Half_Half(const Func *f, MTdata, bool relaxedMode);
 // Special handling for nextafter.
-// float foo(float, float)
-int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata,
-                                         bool relaxedMode);
+// Half foo(Half, Half)
+int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata, bool relaxedMode);
+
+// Half foo(Half, Half)
+int TestFunc_Half_Half_Half_common(const Func *f, MTdata, int isNextafter,
+                                   bool relaxedMode);
+
+// Half foo(Half, int)
+int TestFunc_Half_Half_Int(const Func *f, MTdata, bool relaxedMode);
 
-// Special handling for nextafter.
-// double foo(double, double)
-int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata,
-                                            bool relaxedMode);
 
 // float op float
 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata,
@@ -115,4 +134,7 @@ int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode);
 // double mad(double, double, double)
 int TestFunc_mad_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half mad(half, half, half)
+int TestFunc_mad_Half(const Func *f, MTdata, bool relaxedMode);
+
 #endif
diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp
new file mode 100644
index 0000000000..f60823ffaf
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_half.cpp
@@ -0,0 +1,600 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+// Build kernel_count kernels applying `name` to a half input (one vector size).
+static int BuildKernelHalf(const char *name, int vectorSize,
+                           cl_uint kernel_count, cl_kernel *k, cl_program *p,
+                           bool relaxedMode)
+{
+    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+                        "__kernel void math_kernel",
+                        sizeNames[vectorSize],
+                        "( __global half",
+                        sizeNames[vectorSize],
+                        "* out, __global half",
+                        sizeNames[vectorSize],
+                        "* in)\n"
+                        "{\n"
+                        "   int i = get_global_id(0);\n"
+                        "   out[i] = ",
+                        name,
+                        "( in[i] );\n"
+                        "}\n" };
+
+    const char *c3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global half* out, __global half* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       half3 f0 = vload3( 0, in + 3 * i );\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       half3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               f0 = (half3)( in[3*i], NAN, NAN ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( f0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+    // Use the generic source unless the vector size is 3 (vload3/vstore3 one).
+    const char **kern = c;
+    size_t kernSize = sizeof(c) / sizeof(c[0]);
+
+    if (sizeValues[vectorSize] == 3)
+    {
+        kern = c3;
+        kernSize = sizeof(c3) / sizeof(c3[0]);
+    }
+
+    // Kernel entry point is "math_kernel" plus the vector-size suffix.
+    char testName[32];
+    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
+                       relaxedMode);
+}
+
+
+typedef struct BuildKernelInfo
+{
+    cl_uint offset; // the first vector size to build
+    cl_uint kernel_count; // number of kernels to create per vector size
+    cl_kernel **kernels; // kernels[vector_size] -> array of kernel_count
+    cl_program *programs; // output programs, indexed by vector-size index
+    const char *nameInCode; // function name spliced into the kernel source
+    bool relaxedMode; // passed through to BuildKernelHalf
+} BuildKernelInfo;
+
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p)
+{
+    const BuildKernelInfo *args = static_cast<const BuildKernelInfo *>(p);
+    const cl_uint idx = args->offset + job_id; // vector-size index to build
+    return BuildKernelHalf(args->nameInCode, idx, args->kernel_count,
+                           args->kernels[idx], args->programs + idx,
+                           args->relaxedMode);
+}
+
+// Per-worker-thread state; TestFunc_Half_Half allocates one per thread.
+typedef struct ThreadInfo
+{
+    cl_mem inBuf; // input sub-buffer of gInBuffer for the thread
+    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output sub-buffers for the thread
+    float maxError; // max error value. Init to 0.
+    double maxErrorValue; // position of the max error value.  Init to 0.
+    cl_command_queue tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+typedef struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_program programs[VECTOR_SIZE_COUNT]; // programs for various vector sizes
+    cl_kernel
+        *k[VECTOR_SIZE_COUNT]; // arrays of thread-specific kernels for each
+                               // worker thread:  k[vector_size][thread_id]
+    ThreadInfo *
+        tinfo; // An array of thread specific information for each worker thread
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs handed to ThreadPool_Do
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    int isRangeLimited; // 1 if the function is only to be evaluated over a
+                        // range
+    float half_sin_cos_tan_limit; // set for half_sin/half_cos/half_tan, else 0
+} TestInfo;
+
+
+static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
+
+// Brute-force test driver for half foo(half): builds one kernel per vector
+// size and worker thread, then runs TestHalf over the thread pool.
+int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    memset(&test_info, 0, sizeof(test_info));
+    test_info.threadCount = GetThreadCount();
+
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    // cl_kernels aren't thread safe, so we make one for each vector size for
+    // every thread
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
+        test_info.k[i] = (cl_kernel *)malloc(array_size);
+        if (NULL == test_info.k[i])
+        {
+            vlog_error("Error: Unable to allocate storage for kernels!\n");
+            error = CL_OUT_OF_HOST_MEMORY;
+            goto exit;
+        }
+        memset(test_info.k[i], 0, array_size);
+    }
+    test_info.tinfo =
+        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
+    if (NULL == test_info.tinfo)
+    {
+        vlog_error(
+            "Error: Unable to allocate storage for thread specific data.\n");
+        error = CL_OUT_OF_HOST_MEMORY;
+        goto exit;
+    }
+    memset(test_info.tinfo, 0,
+           test_info.threadCount * sizeof(*test_info.tinfo));
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            goto exit;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zu, %zu}\n",
+                           region.origin, region.size);
+                goto exit;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            goto exit;
+        }
+    }
+
+    // Check for special cases for unary float
+    test_info.isRangeLimited = 0;
+    test_info.half_sin_cos_tan_limit = 0;
+    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit = 1.0f
+            + test_info.ulps
+                * (FLT_EPSILON / 2.0f); // out of range results from finite
+                                        // inputs must be in [-1,1]
+    }
+    else if (0 == strcmp(f->name, "half_tan"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit =
+            INFINITY; // out of range result from finite inputs must be numeric
+    }
+
+    // Init the kernels (relaxedMode is forwarded to the kernel build)
+    {
+        BuildKernelInfo build_info = { gMinVectorSizeIndex,
+                                       test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode,
+                                       relaxedMode };
+        if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            goto exit;
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            }
+        }
+
+        if (error) goto exit;
+
+        vlog("%s", gWimpyMode ? "Wimp pass" : "passed");
+    }
+
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
+
+exit:
+    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
+    {
+        clReleaseProgram(test_info.programs[i]);
+        if (test_info.k[i])
+        {
+            for (j = 0; j < test_info.threadCount; j++)
+                clReleaseKernel(test_info.k[i][j]);
+
+            free(test_info.k[i]);
+        }
+    }
+    if (test_info.tinfo)
+    {
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            clReleaseMemObject(test_info.tinfo[i].inBuf);
+            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
+            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
+        }
+
+        free(test_info.tinfo);
+    }
+
+    return error;
+}
+
+// ThreadPool job: runs the kernels on one slice of half inputs and verifies
+// each result against the host reference within job->ulps (0 on success).
+static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    const TestInfo *job = (const TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_half);
+    cl_uint scale = job->scale;
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = job->tinfo + thread_id;
+    float ulps = job->ulps;
+    fptr func = job->f->func;
+    cl_uint j, k;
+    cl_int error = CL_SUCCESS;
+
+    int isRangeLimited = job->isRangeLimited;
+    float half_sin_cos_tan_limit = job->half_sin_cos_tan_limit;
+    int ftz = job->ftz;
+    float *s = 0; // inputs widened to float, allocated before verification
+
+    // start the map of the output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ushort *out[VECTOR_SIZE_COUNT];
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (uint16_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    // Write the new values to the input array
+    cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        p[j] = base + j * scale;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            return error;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            return error;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over. memset_pattern4 reads 4 bytes, so the pattern must be 32-bit.
+        uint32_t pattern = 0xdeaddead;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
+            return error;
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
+    cl_ushort *t = (cl_ushort *)r;
+    s = (float *)malloc(buffer_elements * sizeof(float));
+    if (NULL == s) return CL_OUT_OF_HOST_MEMORY; // nothing to free yet
+    for (j = 0; j < buffer_elements; j++)
+    {
+        s[j] = (float)cl_half_to_float(p[j]);
+        r[j] = cl_half_from_float(func.f_f(s[j]), CL_HALF_RTE);
+    }
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (uint16_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            goto exit;
+        }
+    }
+    // Wait for the last buffer
+    out[j] = (uint16_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        goto exit;
+    }
+
+    // Verify data
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ushort *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                float test = cl_half_to_float(q[j]);
+                double correct = func.f_f(s[j]);
+                float err = Ulp_Error_Half(q[j], correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
+                if (isRangeLimited
+                    && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
+                    && fabsf(s[j]) < INFINITY)
+                {
+                    if (fabsf(test) <= half_sin_cos_tan_limit)
+                    {
+                        err = 0;
+                        fail = 0;
+                    }
+                }
+
+                if (fail)
+                {
+                    if (ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsHalfSubnormal(
+                                cl_half_from_float(correct, CL_HALF_RTE)))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsHalfSubnormal(p[j]))
+                        {
+                            double correct2 = func.f_f(0.0);
+                            double correct3 = func.f_f(-0.0);
+                            float err2 = Ulp_Error_Half(q[j], correct2);
+                            float err3 = Ulp_Error_Half(q[j], correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsHalfSubnormal(
+                                    cl_half_from_float(correct2, CL_HALF_RTE))
+                                || IsHalfSubnormal(
+                                    cl_half_from_float(correct3, CL_HALF_RTE)))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                }
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at %a "
+                               "(0x%0.4x)\nExpected: %a (half 0x%0.4x) "
+                               "\nActual: %a (half 0x%0.4x)\n",
+                               job->f->name, sizeNames[k], err, s[j], p[j],
+                               cl_half_to_float(r[j]), t[j], test, q[j]);
+                    error = -1;
+                    goto exit;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            goto exit;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+exit:
+    if (s) free(s);
+    return error;
+}
\ No newline at end of file
diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp
new file mode 100644
index 0000000000..f8e8b6231d
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -0,0 +1,334 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+#include "reference_math.h"
+
+#include <cstring>
+
+
+// Builds the program and kernel "math_kernel<size>" that stores name(in[i]),
+// with ushort input and half output, for the given vector size index.
+static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k,
+                           cl_program *p, bool relaxedMode)
+{
+    const char *src[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+                          "__kernel void math_kernel",
+                          sizeNames[vectorSize],
+                          "( __global half",
+                          sizeNames[vectorSize],
+                          "* out, __global ushort",
+                          sizeNames[vectorSize],
+                          "* in)\n"
+                          "{\n"
+                          "   int i = get_global_id(0);\n"
+                          "   out[i] = ",
+                          name,
+                          "( in[i] );\n"
+                          "}\n" };
+
+    const char *src3[] = {
+        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
+        "__kernel void math_kernel",
+        sizeNames[vectorSize],
+        "( __global half* out, __global ushort* in)\n"
+        "{\n"
+        "   size_t i = get_global_id(0);\n"
+        "   if( i + 1 < get_global_size(0) )\n"
+        "   {\n"
+        "       ushort3 u0 = vload3( 0, in + 3 * i );\n"
+        "       half3 f0 = ",
+        name,
+        "( u0 );\n"
+        "       vstore3( f0, 0, out + 3*i );\n"
+        "   }\n"
+        "   else\n"
+        "   {\n"
+        "       size_t parity = i & 1;   // Figure out how many elements are "
+        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
+        "buffer size \n"
+        "       ushort3 u0;\n"
+        "       half3 f0;\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 1:\n"
+        "               u0 = (ushort3)( in[3*i], 0xdead, 0xdead ); \n"
+        "               break;\n"
+        "           case 0:\n"
+        "               u0 = (ushort3)( in[3*i], in[3*i+1], 0xdead ); \n"
+        "               break;\n"
+        "       }\n"
+        "       f0 = ",
+        name,
+        "( u0 );\n"
+        "       switch( parity )\n"
+        "       {\n"
+        "           case 0:\n"
+        "               out[3*i+1] = f0.y; \n"
+        "               // fall through\n"
+        "           case 1:\n"
+        "               out[3*i] = f0.x; \n"
+        "               break;\n"
+        "       }\n"
+        "   }\n"
+        "}\n"
+    };
+
+    // 3-wide vectors need the vload3/vstore3 variant with tail handling.
+    const bool useVec3 = (3 == sizeValues[vectorSize]);
+    const char **source = useVec3 ? src3 : src;
+    const size_t count = useVec3 ? sizeof(src3) / sizeof(src3[0])
+                                 : sizeof(src) / sizeof(src[0]);
+
+    // The entry point name must match the one emitted in the source above.
+    char entryPoint[32];
+    snprintf(entryPoint, sizeof(entryPoint) - 1, "math_kernel%s",
+             sizeNames[vectorSize]);
+
+    return MakeKernel(source, (cl_uint)count, entryPoint, k, p, relaxedMode);
+}
+
+struct BuildKernelInfo // job arguments consumed by BuildKernel_HalfFn
+{
+    cl_uint offset; // the first vector size to build
+    cl_kernel *kernels; // receives one kernel per vector size index
+    cl_program *programs; // receives one program per vector size index
+    const char *nameInCode; // builtin name spliced into the kernel source
+    bool relaxedMode; // forwarded to MakeKernel
+};
+
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p)
+{
+    const BuildKernelInfo &args = *static_cast<BuildKernelInfo *>(p);
+    const cl_uint idx = args.offset + job_id; // vector-size index for this job
+    return BuildKernelHalf(args.nameInCode, idx, args.kernels + idx,
+                           args.programs + idx, args.relaxedMode);
+}
+
+
+// Brute-force test of a half-valued builtin taking a ushort argument, checked
+// against f->func.f_u (reference_nanh for "nan"); returns 0 on success.
+int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
+{
+    uint64_t i;
+    uint32_t j, k;
+    int error;
+    cl_program programs[VECTOR_SIZE_COUNT];
+    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    float maxErrorVal = 0.0f;
+    size_t bufferSize = BUFFER_SIZE;
+    size_t bufferElements = bufferSize / sizeof(cl_half);
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+    int scale = (int)((1ULL << 32) / (16 * bufferElements) + 1);
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    const char *name = f->name;
+    float half_ulps = f->half_ulps;
+    if (gWimpyMode)
+        step = (1ULL << 32) * gWimpyReductionFactor / (512);
+
+    // Init the kernels
+    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
+                                   f->nameInCode };
+    if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+    {
+        return error;
+    }
+
+    for (i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_ushort *p = (cl_ushort *)gIn;
+        if (gWimpyMode)
+        {
+            for (j = 0; j < bufferElements; j++) p[j] = i + j * scale;
+        }
+        else
+        {
+            for (j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            goto exit; // kernels/programs are built: release them
+        }
+
+        // write garbage into output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xdeaddead; // memset_pattern4 reads 4 bytes
+            memset_pattern4(gOut[j], &pattern, bufferSize);
+            if ((error =
+                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
+                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                           error, j);
+                goto exit;
+            }
+        }
+
+        // Run the kernels
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_half);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
+                                        &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
+                                        &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                goto exit;
+            }
+
+            if ((error =
+                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
+                                            &localCount, NULL, 0, NULL, NULL)))
+            {
+                vlog_error("FAILURE -- could not execute kernel\n");
+                goto exit;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        cl_half *r = (cl_half *)gOut_Ref;
+        for (j = 0; j < bufferElements; j++)
+        {
+            if (!strcmp(name, "nan"))
+                r[j] = reference_nanh(p[j]);
+            else
+                r[j] = cl_half_from_float(f->func.f_u(p[j]), CL_HALF_RTE);
+        }
+        // Read the data back
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                goto exit;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        cl_ushort *t = (cl_ushort *)gOut_Ref;
+        for (j = 0; j < bufferElements; j++)
+        {
+            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                cl_ushort *q = (cl_ushort *)(gOut[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    double test = cl_half_to_float(q[j]);
+                    double correct;
+                    if (!strcmp(name, "nan"))
+                        correct = cl_half_to_float(reference_nanh(p[j]));
+                    else
+                        correct = f->func.f_u(p[j]);
+
+                    float err = Ulp_Error_Half(q[j], correct);
+                    int fail = !(fabsf(err) <= half_ulps);
+
+                    if (fail)
+                    {
+                        if (ftz)
+                        {
+                            // retry per section 6.5.3.2
+                            if (IsHalfSubnormal(
+                                    cl_half_from_float(correct, CL_HALF_RTE)))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = p[j];
+                    }
+                    if (fail)
+                    {
+                        vlog_error(
+                            "\n%s%s: %f ulp error at 0x%0.4x \nExpected: %a "
+                            "(0x%0.4x) \nActual: %a (0x%0.4x)\n",
+                            f->name, sizeNames[k], err, p[j],
+                            cl_half_to_float(r[j]), r[j], test, q[j]);
+                        error = -1;
+                        goto exit;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14llu step:%10llu  bufferSize:%10zu \n",
+                     (unsigned long long)i, (unsigned long long)step,
+                     bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
+
+exit:
+    // Release
+    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+    {
+        clReleaseKernel(kernels[k]);
+        clReleaseProgram(programs[k]);
+    }
+
+    return error;
+}
\ No newline at end of file
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index b4a59edb55..d4d8571509 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -22,6 +22,7 @@
 #include "harness/testHarness.h"
 #include "harness/ThreadPool.h"
 #include "harness/conversions.h"
+#include "CL/cl_half.h"
 
 #define BUFFER_SIZE (1024 * 1024 * 2)
 #define EMBEDDED_REDUCTION_FACTOR (64)
@@ -60,12 +61,15 @@ extern int gForceFTZ;
 extern int gFastRelaxedDerived;
 extern int gWimpyMode;
 extern int gIsInRTZMode;
+extern int gHasHalf;
 extern int gInfNanSupport;
 extern int gIsEmbedded;
 extern int gVerboseBruteForce;
 extern uint32_t gMaxVectorSizeIndex;
 extern uint32_t gMinVectorSizeIndex;
 extern cl_device_fp_config gFloatCapabilities;
+extern cl_device_fp_config gHalfCapabilities;
+extern RoundingMode gFloatToHalfRoundingMode;
 
 #define LOWER_IS_BETTER 0
 #define HIGHER_IS_BETTER 1

From 30467ce0031d9dcd85ff903ac4b3d31d3819cf11 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Thu, 23 Mar 2023 12:32:44 +0100
Subject: [PATCH 02/24] Added modernization of remaining half tests for
 consistency (issue #142, bruteforce)

---
 .../math_brute_force/i_unary_half.cpp         | 97 +++++++++++--------
 .../math_brute_force/mad_half.cpp             | 87 ++++++++++-------
 .../math_brute_force/unary_u_half.cpp         | 77 +++++++++------
 3 files changed, 156 insertions(+), 105 deletions(-)

diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp
index 245528e102..c274f3e916 100644
--- a/test_conformance/math_brute_force/i_unary_half.cpp
+++ b/test_conformance/math_brute_force/i_unary_half.cpp
@@ -14,12 +14,16 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 
 #include <cstring>
+#include <memory>
+#include <cinttypes>
 
+#if 0
 static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k,
                            cl_program *p, bool relaxedMode)
 {
@@ -118,19 +122,33 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     return BuildKernelHalf(info->nameInCode, i, info->kernels + i,
                            info->programs + i, info->relaxedMode);
 }
+#else
+
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p)
+{
+    // Generator for the unary kernel source (Int/Half parameter types).
+    auto makeSource = [](const std::string &kernelName,
+                         const char *builtinName, cl_uint sizeIndex) {
+        return GetUnaryKernel(kernelName, builtinName, ParameterType::Int,
+                              ParameterType::Half, sizeIndex);
+    };
+    return BuildKernels(*static_cast<BuildKernelInfo *>(p), job_id, makeSource);
+}
+
+#endif
 
 int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    Programs programs;
+    KernelMatrix kernels;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
     int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ;
     size_t bufferSize = BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
     uint64_t bufferElements = bufferSize / sizeof(cl_int);
-    float *s = 0;
+    std::vector<float> s;
 
     int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1);
 
@@ -139,30 +157,31 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
     // for reference computations
     FPU_mode_type oldMode;
     DisableFTZ(&oldMode);
+    std::shared_ptr<int> at_scope_exit(
+        nullptr, [&oldMode](int *) { RestoreFPState(&oldMode); });
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode };
-    if ((error = ThreadPool_Do(BuildKernel_HalfFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode };
+        if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
-    s = (float *)malloc(bufferElements * sizeof(float));
+    s.resize(bufferElements);
 
-    for (i = 0; i < (1ULL << 16); i += step)
+    for (uint64_t i = 0; i < (1ULL << 16); i += step)
     {
         // Init input array
         cl_ushort *p = (cl_ushort *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < bufferElements; j++)
+            for (size_t j = 0; j < bufferElements; j++)
                 p[j] = (cl_ushort)i + j * scale;
         }
         else
         {
-            for (j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j;
+            for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j;
         }
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
@@ -172,7 +191,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint32_t pattern = 0xffffdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
@@ -182,34 +201,34 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
-                goto exit;
+                return error;
             }
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_int);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
                 LogBuildError(programs[j]);
-                goto exit;
+                return error;
             }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
+            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
+                                        sizeof(gInBuffer), &gInBuffer)))
             {
                 LogBuildError(programs[j]);
-                goto exit;
+                return error;
             }
 
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
             {
                 vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
+                return error;
             }
         }
 
@@ -218,20 +237,20 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
 
         // Calculate the correctly rounded reference result
         int *r = (int *)gOut_Ref;
-        for (j = 0; j < bufferElements; j++)
+        for (size_t j = 0; j < bufferElements; j++)
         {
             s[j] = cl_half_to_float(p[j]);
             r[j] = f->func.i_f(s[j]);
         }
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
-                goto exit;
+                return error;
             }
         }
 
@@ -239,9 +258,9 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint32_t *t = (uint32_t *)gOut_Ref;
-        for (j = 0; j < bufferElements; j++)
+        for (size_t j = 0; j < bufferElements; j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 uint32_t *q = (uint32_t *)(gOut[k]);
                 // If we aren't getting the correctly rounded result
@@ -260,8 +279,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
                                "*%d vs. %d\n",
                                f->name, sizeNames[k], err, s[j], p[j], t[j],
                                q[j]);
-                    error = -1;
-                    goto exit;
+                    return -1;
                 }
             }
         }
@@ -270,8 +288,9 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
         {
             if (gVerboseBruteForce)
             {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10zd \n",
+                     i, step, bufferSize);
             }
             else
             {
@@ -292,6 +311,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
     vlog("\n");
 
 
+#if 0
 exit:
     if (s) free(s);
     RestoreFPState(&oldMode);
@@ -301,6 +321,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
     }
+#endif
 
     return error;
-}
\ No newline at end of file
+}
diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp
index a36e8d6653..56115b10cd 100644
--- a/test_conformance/math_brute_force/mad_half.cpp
+++ b/test_conformance/math_brute_force/mad_half.cpp
@@ -14,13 +14,14 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 
 #include <cstring>
 
-
+#if 0
 static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k,
                            cl_program *p, bool relaxedMode)
 {
@@ -129,13 +130,28 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                            info->programs + i, info->relaxedMode);
 }
 
+#else
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    BuildKernelInfo &info = *(BuildKernelInfo *)p;
+    auto generator = [](const std::string &kernel_name, const char *builtin,
+                        cl_uint vector_size_index) {
+        return GetTernaryKernel(kernel_name, builtin, ParameterType::Half,
+                                ParameterType::Half, ParameterType::Half,
+                                ParameterType::Half, vector_size_index);
+    };
+    return BuildKernels(info, job_id, generator);
+}
+
+#endif
+
 int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    Programs programs;
+    KernelMatrix kernels;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
     float maxError = 0.0f;
     //    int ftz = f->ftz || gForceFTZ;
     float maxErrorVal = 0.0f;
@@ -150,21 +166,20 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
         step = (1ULL << 32) * gWimpyReductionFactor / (512);
     }
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode };
-    if ((error = ThreadPool_Do(BuildKernel_HalfFn,
-                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                               &build_info)))
     {
-        return error;
+        BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode };
+        if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
     }
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         cl_ushort *p = (cl_ushort *)gIn;
         cl_ushort *p2 = (cl_ushort *)gIn2;
         cl_ushort *p3 = (cl_ushort *)gIn3;
-        for (j = 0; j < bufferSize / sizeof(cl_ushort); j++)
+        for (size_t j = 0; j < bufferSize / sizeof(cl_ushort); j++)
         {
             p[j] = (cl_ushort)genrand_int32(d);
             p2[j] = (cl_ushort)genrand_int32(d);
@@ -190,7 +205,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint16_t pattern = 0xdead;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -200,47 +215,47 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
-                goto exit;
+                return error;
             }
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeof(cl_half) * sizeValues[j];
             size_t localCount = (bufferSize + vectorSize - 1)
                 / vectorSize; // bufferSize / vectorSize  rounded up
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
                 LogBuildError(programs[j]);
-                goto exit;
+                return error;
             }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
+            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
+                                        sizeof(gInBuffer), &gInBuffer)))
             {
                 LogBuildError(programs[j]);
-                goto exit;
+                return error;
             }
-            if ((error = clSetKernelArg(kernels[j], 2, sizeof(gInBuffer2),
-                                        &gInBuffer2)))
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
             {
                 LogBuildError(programs[j]);
-                goto exit;
+                return error;
             }
-            if ((error = clSetKernelArg(kernels[j], 3, sizeof(gInBuffer3),
-                                        &gInBuffer3)))
+            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
+                                        sizeof(gInBuffer3), &gInBuffer3)))
             {
                 LogBuildError(programs[j]);
-                goto exit;
+                return error;
             }
 
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
             {
                 vlog_error("FAILED -- could not execute kernel\n");
-                goto exit;
+                return error;
             }
         }
 
@@ -248,14 +263,14 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
         if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
 
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
-                goto exit;
+                return error;
             }
         }
 
@@ -283,6 +298,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
     }
     vlog("\n");
 
+#if 0
 exit:
     // Release
     for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
@@ -290,6 +306,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
     }
+#endif
 
     return error;
-}
\ No newline at end of file
+}
diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp
index f8e8b6231d..c063de7fa3 100644
--- a/test_conformance/math_brute_force/unary_u_half.cpp
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -14,6 +14,7 @@
 // limitations under the License.
 //
 
+#include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
@@ -21,7 +22,7 @@
 
 #include <cstring>
 
-
+#if 0
 static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k,
                            cl_program *p, bool relaxedMode)
 {
@@ -120,15 +121,28 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     return BuildKernelHalf(info->nameInCode, i, info->kernels + i,
                            info->programs + i, info->relaxedMode);
 }
+#else
+
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p)
+{
+    BuildKernelInfo &info = *(BuildKernelInfo *)p;
+    auto generator = [](const std::string &kernel_name, const char *builtin,
+                        cl_uint vector_size_index) {
+        return GetUnaryKernel(kernel_name, builtin, ParameterType::Half,
+                              ParameterType::UInt, vector_size_index);
+    };
+    return BuildKernels(info, job_id, generator);
+}
 
+#endif
 
 int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
 {
-    uint64_t i;
-    uint32_t j, k;
     int error;
-    cl_program programs[VECTOR_SIZE_COUNT];
-    cl_kernel kernels[VECTOR_SIZE_COUNT];
+    Programs programs;
+    KernelMatrix kernels;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
     float maxErrorVal = 0.0f;
@@ -145,8 +159,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
     }
 
     // Init the kernels
-    BuildKernelInfo build_info = { gMinVectorSizeIndex, kernels, programs,
-                                   f->nameInCode };
+    BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode };
     if ((error = ThreadPool_Do(BuildKernel_HalfFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
@@ -154,17 +167,17 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
         return error;
     }
 
-    for (i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
     {
         // Init input array
         cl_ushort *p = (cl_ushort *)gIn;
         if (gWimpyMode)
         {
-            for (j = 0; j < bufferElements; j++) p[j] = i + j * scale;
+            for (size_t j = 0; j < bufferElements; j++) p[j] = i + j * scale;
         }
         else
         {
-            for (j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j;
+            for (size_t j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
@@ -175,7 +188,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // write garbage into output arrays
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             uint16_t pattern = 0xdead;
             memset_pattern4(gOut[j], &pattern, bufferSize);
@@ -185,34 +198,34 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
             {
                 vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                            error, j);
-                goto exit;
+                return error;
             }
         }
 
         // Run the kernels
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_half);
             size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
-            if ((error = clSetKernelArg(kernels[j], 0, sizeof(gOutBuffer[j]),
-                                        &gOutBuffer[j])))
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
                 LogBuildError(programs[j]);
-                goto exit;
+                return error;
             }
-            if ((error = clSetKernelArg(kernels[j], 1, sizeof(gInBuffer),
-                                        &gInBuffer)))
+            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
+                                        sizeof(gInBuffer), &gInBuffer)))
             {
                 LogBuildError(programs[j]);
-                goto exit;
+                return error;
             }
 
-            if ((error =
-                     clEnqueueNDRangeKernel(gQueue, kernels[j], 1, NULL,
-                                            &localCount, NULL, 0, NULL, NULL)))
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
             {
                 vlog_error("FAILURE -- could not execute kernel\n");
-                goto exit;
+                return error;
             }
         }
 
@@ -221,7 +234,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
 
         // Calculate the correctly rounded reference result
         cl_half *r = (cl_half *)gOut_Ref;
-        for (j = 0; j < bufferElements; j++)
+        for (size_t j = 0; j < bufferElements; j++)
         {
             if (!strcmp(name, "nan"))
                 r[j] = reference_nanh(p[j]);
@@ -229,25 +242,24 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
                 r[j] = cl_half_from_float(f->func.f_u(p[j]), CL_HALF_RTE);
         }
         // Read the data back
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
                                          bufferSize, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
-                goto exit;
+                return error;
             }
         }
 
         if (gSkipCorrectnessTesting) break;
 
-
         // Verify data
         cl_ushort *t = (cl_ushort *)gOut_Ref;
-        for (j = 0; j < bufferElements; j++)
+        for (size_t j = 0; j < bufferElements; j++)
         {
-            for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
                 cl_ushort *q = (cl_ushort *)(gOut[k]);
 
@@ -289,8 +301,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
                             "(0x%0.4x) \nActual: %a (0x%0.4x)\n",
                             f->name, sizeNames[k], err, p[j],
                             cl_half_to_float(r[j]), r[j], test, q[j]);
-                        error = -1;
-                        goto exit;
+                        return -1;
                     }
                 }
             }
@@ -322,6 +333,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
     if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
     vlog("\n");
 
+#if 0
 exit:
     // Release
     for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
@@ -329,6 +341,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
         clReleaseKernel(kernels[k]);
         clReleaseProgram(programs[k]);
     }
+#endif
 
     return error;
-}
\ No newline at end of file
+}

From 5f3f2ec3279083cffc991005ee60f5e881345774 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Thu, 23 Mar 2023 15:32:10 +0100
Subject: [PATCH 03/24] Added kernel types related corrections

---
 test_conformance/math_brute_force/binary_half.cpp   | 12 +++++++-----
 test_conformance/math_brute_force/binary_i_half.cpp | 12 +++++++-----
 test_conformance/math_brute_force/common.cpp        |  7 +++++++
 test_conformance/math_brute_force/common.h          |  3 +++
 .../math_brute_force/macro_binary_half.cpp          | 12 +++++++-----
 .../math_brute_force/macro_unary_half.cpp           | 13 ++++++++-----
 test_conformance/math_brute_force/unary_half.cpp    | 11 +++++++----
 test_conformance/math_brute_force/unary_u_half.cpp  |  8 +++++---
 8 files changed, 51 insertions(+), 27 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp
index 4200b07d95..e40e7727dd 100644
--- a/test_conformance/math_brute_force/binary_half.cpp
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -132,10 +132,11 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                            info->kernels[i], info->programs + i,
                            info->relaxedMode);
 }
-#else
+#endif
 
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
+namespace {
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
     auto generator = [](const std::string &kernel_name, const char *builtin,
@@ -147,8 +148,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     return BuildKernels(info, job_id, generator);
 }
 
-#endif
-
+////////////////////////////////////////////////////////////////////////////////
 
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
@@ -201,6 +201,8 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // A table of more difficult cases to get right
 static const cl_half specialValuesHalf[] = {
diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp
index 1e772799c5..c4c7288afe 100644
--- a/test_conformance/math_brute_force/binary_i_half.cpp
+++ b/test_conformance/math_brute_force/binary_i_half.cpp
@@ -130,10 +130,11 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                            info->kernels[i], info->programs + i,
                            info->relaxedMode);
 }
-#else
+#endif
 
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
+namespace {
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
     auto generator = [](const std::string &kernel_name, const char *builtin,
@@ -145,8 +146,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     return BuildKernels(info, job_id, generator);
 }
 
-
-#endif
+////////////////////////////////////////////////////////////////////////////////
 
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
@@ -197,6 +197,8 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 // A table of more difficult cases to get right
diff --git a/test_conformance/math_brute_force/common.cpp b/test_conformance/math_brute_force/common.cpp
index 580bbacd9b..3bfde64838 100644
--- a/test_conformance/math_brute_force/common.cpp
+++ b/test_conformance/math_brute_force/common.cpp
@@ -30,6 +30,8 @@ const char *GetTypeName(ParameterType type)
         case ParameterType::Half: return "half";
         case ParameterType::Float: return "float";
         case ParameterType::Double: return "double";
+        case ParameterType::Short: return "short";
+        case ParameterType::UShort: return "ushort";
         case ParameterType::Int: return "int";
         case ParameterType::UInt: return "uint";
         case ParameterType::Long: return "long";
@@ -46,6 +48,9 @@ const char *GetUndefValue(ParameterType type)
         case ParameterType::Float:
         case ParameterType::Double: return "NAN";
 
+        case ParameterType::Short:
+        case ParameterType::UShort: return "0x5678";
+
         case ParameterType::Int:
         case ParameterType::UInt: return "0x12345678";
 
@@ -81,6 +86,8 @@ void EmitEnableExtension(std::ostringstream &kernel, ParameterType type)
             break;
 
         case ParameterType::Float:
+        case ParameterType::Short:
+        case ParameterType::UShort:
         case ParameterType::Int:
         case ParameterType::UInt:
         case ParameterType::Long:
diff --git a/test_conformance/math_brute_force/common.h b/test_conformance/math_brute_force/common.h
index f70e7d232d..793a00fe92 100644
--- a/test_conformance/math_brute_force/common.h
+++ b/test_conformance/math_brute_force/common.h
@@ -39,6 +39,8 @@ enum class ParameterType
     Half,
     Float,
     Double,
+    Short,
+    UShort,
     Int,
     UInt,
     Long,
@@ -92,4 +94,5 @@ using SourceGenerator = std::string (*)(const std::string &kernel_name,
 cl_int BuildKernels(BuildKernelInfo &info, cl_uint job_id,
                     SourceGenerator generator);
 
+
 #endif /* COMMON_H */
diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp
index 4060a8b5aa..5810380e43 100644
--- a/test_conformance/math_brute_force/macro_binary_half.cpp
+++ b/test_conformance/math_brute_force/macro_binary_half.cpp
@@ -123,23 +123,23 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                            info->kernels[i], info->programs + i,
                            info->relaxedMode);
 }
-#else
+#endif
 
+namespace {
 
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
     auto generator = [](const std::string &kernel_name, const char *builtin,
                         cl_uint vector_size_index) {
-        return GetBinaryKernel(kernel_name, builtin, ParameterType::Int,
+        return GetBinaryKernel(kernel_name, builtin, ParameterType::Short,
                                ParameterType::Half, ParameterType::Half,
                                vector_size_index);
     };
     return BuildKernels(info, job_id, generator);
 }
 
-#endif
+////////////////////////////////////////////////////////////////////////////////
 
 typedef struct ThreadInfo
 {
@@ -181,6 +181,8 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 // A table of more difficult cases to get right
diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp
index d2ba1b36a7..0e93f53240 100644
--- a/test_conformance/math_brute_force/macro_unary_half.cpp
+++ b/test_conformance/math_brute_force/macro_unary_half.cpp
@@ -125,21 +125,22 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                            info->kernels[i], info->programs + i,
                            info->relaxedMode);
 }
-#else
+#endif
 
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
+namespace {
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
     auto generator = [](const std::string &kernel_name, const char *builtin,
                         cl_uint vector_size_index) {
-        return GetUnaryKernel(kernel_name, builtin, ParameterType::Int,
+        return GetUnaryKernel(kernel_name, builtin, ParameterType::Short,
                               ParameterType::Half, vector_size_index);
     };
     return BuildKernels(info, job_id, generator);
 }
 
-#endif
+////////////////////////////////////////////////////////////////////////////////
 
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
@@ -179,6 +180,8 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp
index 320ad12c49..d88cd72970 100644
--- a/test_conformance/math_brute_force/unary_half.cpp
+++ b/test_conformance/math_brute_force/unary_half.cpp
@@ -126,10 +126,11 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                            info->relaxedMode);
 }
 
-#else
+#endif
 
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
+namespace {
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
     auto generator = [](const std::string &kernel_name, const char *builtin,
@@ -140,7 +141,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     return BuildKernels(info, job_id, generator);
 }
 
-#endif
+////////////////////////////////////////////////////////////////////////////////
 
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
@@ -187,6 +188,8 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp
index c063de7fa3..94fb880ddc 100644
--- a/test_conformance/math_brute_force/unary_u_half.cpp
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -21,6 +21,7 @@
 #include "reference_math.h"
 
 #include <cstring>
+#include <cinttypes>
 
 #if 0
 static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k,
@@ -130,7 +131,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     auto generator = [](const std::string &kernel_name, const char *builtin,
                         cl_uint vector_size_index) {
         return GetUnaryKernel(kernel_name, builtin, ParameterType::Half,
-                              ParameterType::UInt, vector_size_index);
+                              ParameterType::UShort, vector_size_index);
     };
     return BuildKernels(info, job_id, generator);
 }
@@ -311,8 +312,9 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
         {
             if (gVerboseBruteForce)
             {
-                vlog("base:%14u step:%10zu  bufferSize:%10zd \n", i, step,
-                     bufferSize);
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10zd \n",
+                     i, step, bufferSize);
             }
             else
             {

From cf97168626e15b98a69dc79e83b217f2cbdc2ab1 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Thu, 23 Mar 2023 17:04:07 +0100
Subject: [PATCH 04/24] Added more fixes and general cleanup

---
 .../math_brute_force/binary_half.cpp          | 241 +++---------------
 .../math_brute_force/binary_i_half.cpp        | 217 ++--------------
 .../math_brute_force/i_unary_half.cpp         | 122 +--------
 .../math_brute_force/macro_binary_half.cpp    | 219 ++--------------
 .../math_brute_force/macro_unary_half.cpp     | 200 ++-------------
 .../math_brute_force/mad_half.cpp             | 127 +--------
 .../math_brute_force/unary_half.cpp           | 197 ++------------
 .../math_brute_force/unary_u_half.cpp         | 117 +--------
 8 files changed, 123 insertions(+), 1317 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp
index e40e7727dd..1aeb36afff 100644
--- a/test_conformance/math_brute_force/binary_half.cpp
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,128 +14,20 @@
 // limitations under the License.
 //
 
+#include "harness/errorHelpers.h"
+
 #include "common.h"
 #include "function_list.h"
 #include "test_functions.h"
 #include "utility.h"
 #include "reference_math.h"
-#include <cstring>
-
-#include "harness/errorHelpers.h"
-
-#if 0
-
-static int BuildKernelHalf(const char *name, int vectorSize,
-                           cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                           bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global half",
-                        sizeNames[vectorSize],
-                        "* out, __global half",
-                        sizeNames[vectorSize],
-                        "* in1, __global half",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   int i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global half* out, __global half* in, __global half* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       half3 d0 = vload3( 0, in + 3 * i );\n"
-        "       half3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       d0 = ",
-        name,
-        "( d0, d1 );\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       half3 d0, d1;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (half3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (half3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = ",
-        name,
-        "( d0, d1 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
 
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode;
-} BuildKernelInfo;
-
-
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelHalf(info->nameInCode, i, info->kernel_count,
-                           info->kernels[i], info->programs + i,
-                           info->relaxedMode);
-}
-#endif
+#include <cstring>
+#include <algorithm>
 
 namespace {
 
+////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -149,24 +41,24 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    clMemWrapper inBuf; // input buffer for the thread
+    clMemWrapper inBuf2; // input buffer for the thread
+    clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
     float maxError; // max error value. Init to 0.
     double
         maxErrorValue; // position of the max error value (param 1).  Init to 0.
     double maxErrorValue2; // position of the max error value (param 2).  Init
                            // to 0.
-    MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    MTdataHolder d;
 
-////////////////////////////////////////////////////////////////////////////////
+    clCommandQueueWrapper
+        tQueue; // per thread command queue to improve performance
+};
 
+////////////////////////////////////////////////////////////////////////////////
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -185,7 +77,6 @@ struct TestInfoBase
 };
 
 ////////////////////////////////////////////////////////////////////////////////
-
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -216,10 +107,11 @@ static const cl_half specialValuesHalf[] = {
     0x0400 /*HALF_MIN*/
 };
 
+////////////////////////////////////////////////////////////////////////////////
 static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
-
 static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
 
+////////////////////////////////////////////////////////////////////////////////
 int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter,
                                    bool relaxedMode)
 {
@@ -260,40 +152,8 @@ int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter,
     test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
     test_info.isNextafter = isNextafter;
 
-#if 0
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-
-#else
-
     test_info.tinfo.resize(test_info.threadCount);
 
-#endif
-
     for (i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = { i * test_info.subBufferSize
@@ -340,7 +200,7 @@ int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter,
             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
             return error;
         }
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
     }
 
     // Init the kernels
@@ -379,39 +239,10 @@ int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter,
 
     vlog("\n");
 
-#if 0
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-#endif
-
     return error;
 }
 
+////////////////////////////////////////////////////////////////////////////////
 static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
@@ -432,7 +263,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     int isNextafter = job->isNextafter;
     cl_ushort *t;
     cl_half *r;
-    float *s = 0, *s2 = 0;
+    std::vector<float> s(0), s2(0);
 
     RoundingMode oldRoundMode;
     cl_int copysign_test = 0;
@@ -495,14 +326,14 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                       buffer_size, p, 0, NULL, NULL)))
     {
         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
+        return error;
     }
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
                                       buffer_size, p2, 0, NULL, NULL)))
     {
         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
+        return error;
     }
 
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
@@ -511,12 +342,12 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         if ((error = clWaitForEvents(1, e + j)))
         {
             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            goto exit;
+            return error;
         }
         if ((error = clReleaseEvent(e[j])))
         {
             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            goto exit;
+            return error;
         }
 
         // Fill the result buffer with garbage, so that old results don't carry
@@ -527,7 +358,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                              out[j], 0, NULL, NULL)))
         {
             vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            goto exit;
+            return error;
         }
 
         // run the kernel
@@ -560,7 +391,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                             &vectorCount, NULL, 0, NULL, NULL)))
         {
             vlog_error("FAILED -- could not execute kernel\n");
-            goto exit;
+            return error;
         }
     }
 
@@ -591,8 +422,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     // Calculate the correctly rounded reference result
     r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
     t = (cl_ushort *)r;
-    s = (float *)malloc(buffer_elements * sizeof(float));
-    s2 = (float *)malloc(buffer_elements * sizeof(float));
+    s.resize(buffer_elements);
+    s2.resize(buffer_elements);
     for (j = 0; j < buffer_elements; j++)
         for (j = 0; j < buffer_elements; j++)
         {
@@ -617,7 +448,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         {
             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                        error);
-            goto exit;
+            return error;
         }
     }
 
@@ -628,7 +459,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     if (error || NULL == out[j])
     {
         vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
+        return error;
     }
 
     // Verify data
@@ -872,7 +703,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                p2[j], cl_half_to_float(r[j]), r[j], test, q[j],
                                j);
                     error = -1;
-                    goto exit;
+                    return error;
                 }
             }
         }
@@ -893,12 +724,11 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
-
     if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
         {
-            vlog("base:%14u step:%10u scale:%10zu buf_elements:%10u ulps:%5.3f "
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
                  "ThreadCount:%2u\n",
                  base, job->step, job->scale, buffer_elements, job->ulps,
                  job->threadCount);
@@ -909,18 +739,17 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         }
         fflush(stdout);
     }
-exit:
-    if (s) free(s);
-    if (s2) free(s2);
+
     return error;
 }
 
-
+////////////////////////////////////////////////////////////////////////////////
 int TestFunc_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     return TestFunc_Half_Half_Half_common(f, d, 0, relaxedMode);
 }
 
+////////////////////////////////////////////////////////////////////////////////
 int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata d, bool relaxedMode)
 {
     return TestFunc_Half_Half_Half_common(f, d, 1, relaxedMode);
diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp
index c4c7288afe..571683e5da 100644
--- a/test_conformance/math_brute_force/binary_i_half.cpp
+++ b/test_conformance/math_brute_force/binary_i_half.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,118 +22,9 @@
 #include <climits>
 #include <cstring>
 
-#if 0
-static int BuildKernelHalf(const char *name, int vectorSize,
-                           cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                           bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global half",
-                        sizeNames[vectorSize],
-                        "* out, __global half",
-                        sizeNames[vectorSize],
-                        "* in1, __global int",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   int i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global half* out, __global half* in, __global int* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       half3 d0 = vload3( 0, in + 3 * i );\n"
-        "       int3 i0 = vload3( 0, in2 + 3 * i );\n"
-        "       d0 = ",
-        name,
-        "( d0, i0 );\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       half3 d0;\n"
-        "       int3 i0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (half3)( in[3*i], NAN, NAN ); \n"
-        "               i0 = (int3)( in2[3*i], 0xdead, 0xdead ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               i0 = (int3)( in2[3*i], in2[3*i+1], 0xdead ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = ",
-        name,
-        "( d0, i0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode;
-} BuildKernelInfo;
-
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelHalf(info->nameInCode, i, info->kernel_count,
-                           info->kernels[i], info->programs + i,
-                           info->relaxedMode);
-}
-#endif
-
 namespace {
 
+////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -147,24 +38,23 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    clMemWrapper inBuf; // input buffer for the thread
+    clMemWrapper inBuf2; // input buffer for the thread
+    clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
     float maxError; // max error value. Init to 0.
     double
         maxErrorValue; // position of the max error value (param 1).  Init to 0.
     cl_int maxErrorValue2; // position of the max error value (param 2).  Init
                            // to 0.
     MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
+    clCommandQueueWrapper
+        tQueue; // per thread command queue to improve performance
 } ThreadInfo;
 
 ////////////////////////////////////////////////////////////////////////////////
-
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -176,12 +66,9 @@ struct TestInfoBase
     cl_uint scale; // stride between individual test values
     float ulps; // max_allowed ulps
     int ftz; // non-zero if running in flush to zero mode
-
-    // no special values
 };
 
 ////////////////////////////////////////////////////////////////////////////////
-
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -258,38 +145,8 @@ int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode)
     test_info.ftz =
         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
 
-#if 0
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-#else
-
     test_info.tinfo.resize(test_info.threadCount);
 
-#endif
-
     for (i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = { i * test_info.subBufferSize
@@ -382,42 +239,12 @@ int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode)
         vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
     }
 
-
     vlog("\n");
 
-#if 0
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-#endif
-
     return error;
 }
 
+////////////////////////////////////////////////////////////////////////////////
 static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
@@ -433,7 +260,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     const char *name = job->f->name;
     cl_ushort *t;
     cl_half *r;
-    float *s = 0;
+    std::vector<float> s;
     cl_int *s2;
 
     // start the map of the output arrays
@@ -494,7 +321,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                       NULL, NULL)))
     {
         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
+        return error;
     }
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
@@ -502,7 +329,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                       NULL, NULL)))
     {
         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
+        return error;
     }
 
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
@@ -511,12 +338,12 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         if ((error = clWaitForEvents(1, e + j)))
         {
             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            goto exit;
+            return error;
         }
         if ((error = clReleaseEvent(e[j])))
         {
             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            goto exit;
+            return error;
         }
 
         // Fill the result buffer with garbage, so that old results don't carry
@@ -527,7 +354,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                              out[j], 0, NULL, NULL)))
         {
             vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            goto exit;
+            return error;
         }
 
         // run the kernel
@@ -560,7 +387,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                             &vectorCount, NULL, 0, NULL, NULL)))
         {
             vlog_error("FAILED -- could not execute kernel\n");
-            goto exit;
+            return error;
         }
     }
 
@@ -572,7 +399,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     // Calculate the correctly rounded reference result
     r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
     t = (cl_ushort *)r;
-    s = (float *)malloc(buffer_elements * sizeof(float));
+    s.resize(buffer_elements);
     s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
     for (j = 0; j < buffer_elements; j++)
     {
@@ -591,7 +418,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         {
             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                        error);
-            goto exit;
+            return error;
         }
     }
 
@@ -602,7 +429,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     if (error || NULL == out[j])
     {
         vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
+        return error;
     }
 
     // Verify data
@@ -672,7 +499,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                cl_half_to_float(r[j]), r[j], test, q[j],
                                (cl_uint)j);
                     error = -1;
-                    goto exit;
+                    return error;
                 }
             }
         }
@@ -685,13 +512,12 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         {
             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
                        j, error);
-            goto exit;
+            return error;
         }
     }
 
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
-
     if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
@@ -707,8 +533,5 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         }
         fflush(stdout);
     }
-
-exit:
-    if (s) free(s);
     return error;
 }
diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp
index c274f3e916..c78c03a494 100644
--- a/test_conformance/math_brute_force/i_unary_half.cpp
+++ b/test_conformance/math_brute_force/i_unary_half.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -23,107 +23,7 @@
 #include <memory>
 #include <cinttypes>
 
-#if 0
-static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k,
-                           cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global int",
-                        sizeNames[vectorSize],
-                        "* out, __global half",
-                        sizeNames[vectorSize],
-                        "* in)\n"
-                        "{\n"
-                        "   int i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global int* out, __global half* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       half3 f0 = vload3( 0, in + 3 * i );\n"
-        "       int3 i0 = ",
-        name,
-        "( f0 );\n"
-        "       vstore3( i0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(half)). Assume power of two "
-        "buffer size \n"
-        "       half3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (half3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       int3 i0 = ",
-        name,
-        "( f0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = i0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = i0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_kernel *kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode;
-} BuildKernelInfo;
-
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelHalf(info->nameInCode, i, info->kernels + i,
-                           info->programs + i, info->relaxedMode);
-}
-#else
-
+////////////////////////////////////////////////////////////////////////////////
 static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                  void *p)
 {
@@ -136,8 +36,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     return BuildKernels(info, job_id, generator);
 }
 
-#endif
-
+////////////////////////////////////////////////////////////////////////////////
 int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
@@ -148,7 +47,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
     size_t bufferSize = BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
     uint64_t bufferElements = bufferSize / sizeof(cl_int);
-    std::vector<float> s;
+    std::vector<float> s(0);
 
     int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1);
 
@@ -310,18 +209,5 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
 
     vlog("\n");
 
-
-#if 0
-exit:
-    if (s) free(s);
-    RestoreFPState(&oldMode);
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-#endif
-
     return error;
 }
diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp
index 5810380e43..8af034c437 100644
--- a/test_conformance/math_brute_force/macro_binary_half.cpp
+++ b/test_conformance/math_brute_force/macro_binary_half.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -21,112 +21,10 @@
 
 #include <cstring>
 
-#if 0
-static int BuildKernelHalf(const char *name, int vectorSize,
-                           cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                           bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global short",
-                        sizeNames[vectorSize],
-                        "* out, __global half",
-                        sizeNames[vectorSize],
-                        "* in1, __global half",
-                        sizeNames[vectorSize],
-                        "* in2 )\n"
-                        "{\n"
-                        "   int i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i] );\n"
-                        "}\n" };
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global short* out, __global half* in, __global half* in2)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       half3 f0 = vload3( 0, in + 3 * i );\n"
-        "       half3 f1 = vload3( 0, in2 + 3 * i );\n"
-        "       short3 i0 = ",
-        name,
-        "( f0, f1 );\n"
-        "       vstore3( i0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       half3 f0, f1;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (half3)( in[3*i], NAN, NAN ); \n"
-        "               f1 = (half3)( in2[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               f1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       short3 i0 = ",
-        name,
-        "( f0, f1 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = i0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = i0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode;
-} BuildKernelInfo;
-
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelHalf(info->nameInCode, i, info->kernel_count,
-                           info->kernels[i], info->programs + i,
-                           info->relaxedMode);
-}
-#endif
 
 namespace {
 
+////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -140,18 +38,17 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-
-typedef struct ThreadInfo
+struct ThreadInfo
 {
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem inBuf2; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    clMemWrapper inBuf; // input buffer for the thread
+    clMemWrapper inBuf2; // input buffer for the thread
+    clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
     MTdata d;
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    clCommandQueueWrapper
+        tQueue; // per thread command queue to improve performance
+};
 
 ////////////////////////////////////////////////////////////////////////////////
-
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -165,7 +62,6 @@ struct TestInfoBase
 };
 
 ////////////////////////////////////////////////////////////////////////////////
-
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -184,7 +80,6 @@ struct TestInfo : public TestInfoBase
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-
 // A table of more difficult cases to get right
 static const cl_half specialValuesHalf[] = {
     0xffff,
@@ -197,11 +92,11 @@ static const cl_half specialValuesHalf[] = {
     0x0400 /*HALF_MIN*/
 };
 
+////////////////////////////////////////////////////////////////////////////////
 static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
-
-
 static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
 
+////////////////////////////////////////////////////////////////////////////////
 int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfoBase test_info_base;
@@ -234,37 +129,8 @@ int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
     test_info.ftz =
         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
 
-#if 0
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-#else
-
     test_info.tinfo.resize(test_info.threadCount);
 
-#endif
     for (i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = { i * test_info.subBufferSize
@@ -315,7 +181,6 @@ int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
         test_info.tinfo[i].d = init_genrand(genrand_int32(d));
     }
 
-
     // Init the kernels
     {
         BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
@@ -340,39 +205,10 @@ int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
 
     vlog("\n");
 
-#if 0
-exit:
-    // Release
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            free_mtdata(test_info.tinfo[i].d);
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            clReleaseMemObject(test_info.tinfo[i].inBuf2);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-#endif
-
     return error;
 }
 
+////////////////////////////////////////////////////////////////////////////////
 static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
@@ -387,7 +223,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     cl_int error;
     const char *name = job->f->name;
     cl_short *t, *r;
-    float *s = 0, *s2 = 0;
+    std::vector<float> s(0), s2(0);
 
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
@@ -448,14 +284,14 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                       buffer_size, p, 0, NULL, NULL)))
     {
         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
+        return error;
     }
 
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
                                       buffer_size, p2, 0, NULL, NULL)))
     {
         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
-        goto exit;
+        return error;
     }
 
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
@@ -464,12 +300,12 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         if ((error = clWaitForEvents(1, e + j)))
         {
             vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            goto exit;
+            return error;
         }
         if ((error = clReleaseEvent(e[j])))
         {
             vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            goto exit;
+            return error;
         }
 
         // Fill the result buffer with garbage, so that old results don't carry
@@ -480,7 +316,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                              out[j], 0, NULL, NULL)))
         {
             vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            goto exit;
+            return error;
         }
 
         // run the kernel
@@ -513,7 +349,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                             &vectorCount, NULL, 0, NULL, NULL)))
         {
             vlog_error("FAILED -- could not execute kernel\n");
-            goto exit;
+            return error;
         }
     }
 
@@ -525,8 +361,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     // Calculate the correctly rounded reference result
     r = (cl_short *)gOut_Ref + thread_id * buffer_elements;
     t = (cl_short *)r;
-    s = (float *)malloc(buffer_elements * sizeof(float));
-    s2 = (float *)malloc(buffer_elements * sizeof(float));
+    s.resize(buffer_elements);
+    s2.resize(buffer_elements);
     for (j = 0; j < buffer_elements; j++)
     {
         s[j] = cl_half_to_float(p[j]);
@@ -546,7 +382,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         {
             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                        error);
-            goto exit;
+            return error;
         }
     }
 
@@ -557,7 +393,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     if (error || NULL == out[j])
     {
         vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
+        return error;
     }
 
     // Verify data
@@ -605,7 +441,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x (index: %d)\n",
                 name, err, s[j], p[j], s2[j], p2[j], t[j], q[j], j);
             error = -1;
-            goto exit;
+            return error;
         }
 
 
@@ -653,7 +489,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                            "(index: %d)\n",
                            name, err, s[j], p[j], s2[j], p2[j], -t[j], q[j], j);
                 error = -1;
-                goto exit;
+                return error;
             }
         }
     }
@@ -665,7 +501,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         {
             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
                        j, error);
-            goto exit;
+            return error;
         }
     }
 
@@ -688,8 +524,5 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         fflush(stdout);
     }
 
-exit:
-    if (s) free(s);
-    if (s2) free(s2);
     return error;
 }
diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp
index 0e93f53240..36d3996efe 100644
--- a/test_conformance/math_brute_force/macro_unary_half.cpp
+++ b/test_conformance/math_brute_force/macro_unary_half.cpp
@@ -21,114 +21,9 @@
 
 #include <cstring>
 
-#if 0
-static int BuildKernelHalf(const char *name, int vectorSize,
-                           cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                           bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global short",
-                        sizeNames[vectorSize],
-                        "* out, __global half",
-                        sizeNames[vectorSize],
-                        "* in)\n"
-                        "{\n"
-                        "   int i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global short* out, __global half* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       half3 f0 = vload3( 0, in + 3 * i );\n"
-        "       short3 i0 = ",
-        name,
-        "( f0 );\n"
-        "       vstore3( i0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       short3 i0;\n"
-        "       half3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (half3)( in[3*i], 0xdead, 0xdead ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (half3)( in[3*i], in[3*i+1], 0xdead ); \n"
-        "               break;\n"
-        "       }\n"
-        "       i0 = ",
-        name,
-        "( f0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = i0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = i0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode;
-} BuildKernelInfo;
-
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelHalf(info->nameInCode, i, info->kernel_count,
-                           info->kernels[i], info->programs + i,
-                           info->relaxedMode);
-}
-#endif
-
 namespace {
 
+////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -141,17 +36,16 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-
 // Thread specific data for a worker thread
-typedef struct ThreadInfo
+struct ThreadInfo
 {
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    cl_command_queue tQueue; // per thread command queue to improve performance
-} ThreadInfo;
+    clMemWrapper inBuf; // input buffer for the thread
+    clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    clCommandQueueWrapper
+        tQueue; // per thread command queue to improve performance
+};
 
 ////////////////////////////////////////////////////////////////////////////////
-
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -164,7 +58,6 @@ struct TestInfoBase
 };
 
 ////////////////////////////////////////////////////////////////////////////////
-
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -183,9 +76,9 @@ struct TestInfo : public TestInfoBase
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-
 static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
 
+////////////////////////////////////////////////////////////////////////////////
 int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfoBase test_info_base;
@@ -216,38 +109,8 @@ int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode)
     test_info.f = f;
     test_info.ftz = f->ftz || gForceFTZ;
 
-#if 0
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-#else
-
     test_info.tinfo.resize(test_info.threadCount);
 
-#endif
-
     for (i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = { i * test_info.subBufferSize
@@ -310,36 +173,10 @@ int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode)
 
     vlog("\n");
 
-#if 0
-exit:
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-#endif
-
     return error;
 }
 
+////////////////////////////////////////////////////////////////////////////////
 static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
@@ -353,7 +190,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     cl_uint j, k;
     cl_int error = CL_SUCCESS;
     const char *name = job->f->name;
-    float *s = 0;
+    std::vector<float> s(0);
 
     int signbit_test = 0;
     if (!strcmp(name, "signbit")) signbit_test = 1;
@@ -452,7 +289,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     // Calculate the correctly rounded reference result
     cl_short *r = (cl_short *)gOut_Ref + thread_id * buffer_elements;
     cl_short *t = (cl_short *)r;
-    s = (float *)malloc(buffer_elements * sizeof(float));
+    s.resize(buffer_elements);
     for (j = 0; j < buffer_elements; j++)
     {
         s[j] = cl_half_to_float(p[j]);
@@ -479,7 +316,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         {
             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                        error);
-            goto exit;
+            return error;
         }
     }
     // Wait for the last buffer
@@ -489,16 +326,14 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     if (error || NULL == out[j])
     {
         vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
+        return error;
     }
 
-
     // Verify data
     for (j = 0; j < buffer_elements; j++)
     {
         cl_short *q = out[0];
 
-
         // If we aren't getting the correctly rounded result
         if (gMinVectorSizeIndex == 0 && t[j] != q[j])
         {
@@ -519,7 +354,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                        "%d vs. %d\n",
                        name, err, s[j], p[j], t[j], q[j]);
             error = -1;
-            goto exit;
+            return error;
         }
 
 
@@ -546,7 +381,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                            "(0x%0.4x)\nExpected: %d \nActual: %d\n",
                            name, sizeNames[k], err, s[j], p[j], -t[j], q[j]);
                 error = -1;
-                goto exit;
+                return error;
             }
         }
     }
@@ -558,13 +393,12 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         {
             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
                        j, error);
-            goto exit;
+            return error;
         }
     }
 
     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
 
-
     if (0 == (base & 0x0fffffff))
     {
         if (gVerboseBruteForce)
@@ -580,7 +414,5 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         }
         fflush(stdout);
     }
-exit:
-    if (s) free(s);
     return error;
 }
diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp
index 56115b10cd..ef6f2b776c 100644
--- a/test_conformance/math_brute_force/mad_half.cpp
+++ b/test_conformance/math_brute_force/mad_half.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -21,117 +21,7 @@
 
 #include <cstring>
 
-#if 0
-static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k,
-                           cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global half",
-                        sizeNames[vectorSize],
-                        "* out, __global half",
-                        sizeNames[vectorSize],
-                        "* in1, __global half",
-                        sizeNames[vectorSize],
-                        "* in2,  __global half",
-                        sizeNames[vectorSize],
-                        "* in3 )\n"
-                        "{\n"
-                        "   int i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in1[i], in2[i], in3[i] );\n"
-                        "}\n" };
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global half* out, __global half* in, __global half* in2, __global "
-        "half* in3)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       half3 d0 = vload3( 0, in + 3 * i );\n"
-        "       half3 d1 = vload3( 0, in2 + 3 * i );\n"
-        "       half3 d2 = vload3( 0, in3 + 3 * i );\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, d2 );\n"
-        "       vstore3( d0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       half3 d0, d1, d2;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               d0 = (half3)( in[3*i], NAN, NAN ); \n"
-        "               d1 = (half3)( in2[3*i], NAN, NAN ); \n"
-        "               d2 = (half3)( in3[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               d0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               d1 = (half3)( in2[3*i], in2[3*i+1], NAN ); \n"
-        "               d2 = (half3)( in3[3*i], in3[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       d0 = ",
-        name,
-        "( d0, d1, d2 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = d0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = d0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_kernel *kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode;
-} BuildKernelInfo;
-
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelHalf(info->nameInCode, i, info->kernels + i,
-                           info->programs + i, info->relaxedMode);
-}
-
-#else
-
+////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -144,8 +34,7 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-#endif
-
+////////////////////////////////////////////////////////////////////////////////
 int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
@@ -298,15 +187,5 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
     }
     vlog("\n");
 
-#if 0
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-#endif
-
     return error;
 }
diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp
index d88cd72970..f5de28d0d5 100644
--- a/test_conformance/math_brute_force/unary_half.cpp
+++ b/test_conformance/math_brute_force/unary_half.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -21,115 +21,9 @@
 
 #include <cstring>
 
-#if 0
-static int BuildKernelHalf(const char *name, int vectorSize,
-                           cl_uint kernel_count, cl_kernel *k, cl_program *p,
-                           bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global half",
-                        sizeNames[vectorSize],
-                        "* out, __global half",
-                        sizeNames[vectorSize],
-                        "* in)\n"
-                        "{\n"
-                        "   int i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global half* out, __global half* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       half3 f0 = vload3( 0, in + 3 * i );\n"
-        "       f0 = ",
-        name,
-        "( f0 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       half3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               f0 = (half3)( in[3*i], NAN, NAN ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               f0 = (half3)( in[3*i], in[3*i+1], NAN ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( f0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernels(kern, (cl_uint)kernSize, testName, kernel_count, k, p,
-                       relaxedMode);
-}
-
-
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_uint kernel_count;
-    cl_kernel **kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode;
-} BuildKernelInfo;
-
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelHalf(info->nameInCode, i, info->kernel_count,
-                           info->kernels[i], info->programs + i,
-                           info->relaxedMode);
-}
-
-#endif
-
 namespace {
 
+////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -142,19 +36,18 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
-    cl_mem inBuf; // input buffer for the thread
-    cl_mem outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    clMemWrapper inBuf; // input buffer for the thread
+    clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
     float maxError; // max error value. Init to 0.
     double maxErrorValue; // position of the max error value.  Init to 0.
-    cl_command_queue tQueue; // per thread command queue to improve performance
+    clCommandQueueWrapper
+        tQueue; // per thread command queue to improve performance
 } ThreadInfo;
 
 ////////////////////////////////////////////////////////////////////////////////
-
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -172,7 +65,6 @@ struct TestInfoBase
 };
 
 ////////////////////////////////////////////////////////////////////////////////
-
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -191,9 +83,9 @@ struct TestInfo : public TestInfoBase
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-
 static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
 
+////////////////////////////////////////////////////////////////////////////////
 int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     TestInfoBase test_info_base;
@@ -229,38 +121,8 @@ int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode)
     test_info.ftz =
         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
 
-#if 0
-    // cl_kernels aren't thread safe, so we make one for each vector size for
-    // every thread
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        size_t array_size = test_info.threadCount * sizeof(cl_kernel);
-        test_info.k[i] = (cl_kernel *)malloc(array_size);
-        if (NULL == test_info.k[i])
-        {
-            vlog_error("Error: Unable to allocate storage for kernels!\n");
-            error = CL_OUT_OF_HOST_MEMORY;
-            goto exit;
-        }
-        memset(test_info.k[i], 0, array_size);
-    }
-    test_info.tinfo =
-        (ThreadInfo *)malloc(test_info.threadCount * sizeof(*test_info.tinfo));
-    if (NULL == test_info.tinfo)
-    {
-        vlog_error(
-            "Error: Unable to allocate storage for thread specific data.\n");
-        error = CL_OUT_OF_HOST_MEMORY;
-        goto exit;
-    }
-    memset(test_info.tinfo, 0,
-           test_info.threadCount * sizeof(*test_info.tinfo));
-#else
-
     test_info.tinfo.resize(test_info.threadCount);
 
-#endif
-
     for (i = 0; i < test_info.threadCount; i++)
     {
         cl_buffer_region region = { i * test_info.subBufferSize
@@ -352,36 +214,10 @@ int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode)
     if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
     vlog("\n");
 
-#if 0
-exit:
-    for (i = gMinVectorSizeIndex; i < gMaxVectorSizeIndex; i++)
-    {
-        clReleaseProgram(test_info.programs[i]);
-        if (test_info.k[i])
-        {
-            for (j = 0; j < test_info.threadCount; j++)
-                clReleaseKernel(test_info.k[i][j]);
-
-            free(test_info.k[i]);
-        }
-    }
-    if (test_info.tinfo)
-    {
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            clReleaseMemObject(test_info.tinfo[i].inBuf);
-            for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-                clReleaseMemObject(test_info.tinfo[i].outBuf[j]);
-            clReleaseCommandQueue(test_info.tinfo[i].tQueue);
-        }
-
-        free(test_info.tinfo);
-    }
-#endif
-
     return error;
 }
 
+////////////////////////////////////////////////////////////////////////////////
 static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
@@ -399,7 +235,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     float half_sin_cos_tan_limit = job->half_sin_cos_tan_limit;
     int ftz = job->ftz;
 
-    float *s = 0;
+    std::vector<float> s(0);
 
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
@@ -496,7 +332,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     // Calculate the correctly rounded reference result
     cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
     cl_ushort *t = (cl_ushort *)r;
-    s = (float *)malloc(buffer_elements * sizeof(float));
+    s.resize(buffer_elements);
     for (j = 0; j < buffer_elements; j++)
     {
         s[j] = (float)cl_half_to_float(p[j]);
@@ -514,7 +350,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         {
             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                        error);
-            goto exit;
+            return error;
         }
     }
     // Wait for the last buffer
@@ -524,7 +360,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     if (error || NULL == out[j])
     {
         vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        goto exit;
+        return error;
     }
 
     // Verify data
@@ -602,9 +438,9 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                "(0x%0.4x)\nExpected: %a (half 0x%0.4x) "
                                "\nActual: %a (half 0x%0.4x)\n",
                                job->f->name, sizeNames[k], err, s[j], p[j],
-                               t[j], cl_half_to_float(r[j]), test, q[j]);
+                               cl_half_to_float(r[j]), t[j], test, q[j]);
                     error = -1;
-                    goto exit;
+                    return error;
                 }
             }
         }
@@ -617,7 +453,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         {
             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
                        j, error);
-            goto exit;
+            return error;
         }
     }
 
@@ -639,7 +475,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         }
         fflush(stdout);
     }
-exit:
-    if (s) free(s);
+
     return error;
 }
diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp
index 94fb880ddc..842e85a9b0 100644
--- a/test_conformance/math_brute_force/unary_u_half.cpp
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
+// Copyright (c) 2023 The Khronos Group Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -23,107 +23,7 @@
 #include <cstring>
 #include <cinttypes>
 
-#if 0
-static int BuildKernelHalf(const char *name, int vectorSize, cl_kernel *k,
-                           cl_program *p, bool relaxedMode)
-{
-    const char *c[] = { "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-                        "__kernel void math_kernel",
-                        sizeNames[vectorSize],
-                        "( __global half",
-                        sizeNames[vectorSize],
-                        "* out, __global ushort",
-                        sizeNames[vectorSize],
-                        "* in)\n"
-                        "{\n"
-                        "   int i = get_global_id(0);\n"
-                        "   out[i] = ",
-                        name,
-                        "( in[i] );\n"
-                        "}\n" };
-
-    const char *c3[] = {
-        "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n",
-        "__kernel void math_kernel",
-        sizeNames[vectorSize],
-        "( __global half* out, __global ushort* in)\n"
-        "{\n"
-        "   size_t i = get_global_id(0);\n"
-        "   if( i + 1 < get_global_size(0) )\n"
-        "   {\n"
-        "       ushort3 u0 = vload3( 0, in + 3 * i );\n"
-        "       half3 f0 = ",
-        name,
-        "( u0 );\n"
-        "       vstore3( f0, 0, out + 3*i );\n"
-        "   }\n"
-        "   else\n"
-        "   {\n"
-        "       size_t parity = i & 1;   // Figure out how many elements are "
-        "left over after BUFFER_SIZE % (3*sizeof(float)). Assume power of two "
-        "buffer size \n"
-        "       ushort3 u0;\n"
-        "       half3 f0;\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 1:\n"
-        "               u0 = (ushort3)( in[3*i], 0xdead, 0xdead ); \n"
-        "               break;\n"
-        "           case 0:\n"
-        "               u0 = (ushort3)( in[3*i], in[3*i+1], 0xdead ); \n"
-        "               break;\n"
-        "       }\n"
-        "       f0 = ",
-        name,
-        "( u0 );\n"
-        "       switch( parity )\n"
-        "       {\n"
-        "           case 0:\n"
-        "               out[3*i+1] = f0.y; \n"
-        "               // fall through\n"
-        "           case 1:\n"
-        "               out[3*i] = f0.x; \n"
-        "               break;\n"
-        "       }\n"
-        "   }\n"
-        "}\n"
-    };
-
-    const char **kern = c;
-    size_t kernSize = sizeof(c) / sizeof(c[0]);
-
-    if (sizeValues[vectorSize] == 3)
-    {
-        kern = c3;
-        kernSize = sizeof(c3) / sizeof(c3[0]);
-    }
-
-    char testName[32];
-    snprintf(testName, sizeof(testName) - 1, "math_kernel%s",
-             sizeNames[vectorSize]);
-
-    return MakeKernel(kern, (cl_uint)kernSize, testName, k, p, relaxedMode);
-}
-
-typedef struct BuildKernelInfo
-{
-    cl_uint offset; // the first vector size to build
-    cl_kernel *kernels;
-    cl_program *programs;
-    const char *nameInCode;
-    bool relaxedMode;
-} BuildKernelInfo;
-
-static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
-                                 void *p)
-{
-    BuildKernelInfo *info = (BuildKernelInfo *)p;
-    cl_uint i = info->offset + job_id;
-    return BuildKernelHalf(info->nameInCode, i, info->kernels + i,
-                           info->programs + i, info->relaxedMode);
-}
-#else
-
+////////////////////////////////////////////////////////////////////////////////
 static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                  void *p)
 {
@@ -136,8 +36,7 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     return BuildKernels(info, job_id, generator);
 }
 
-#endif
-
+////////////////////////////////////////////////////////////////////////////////
 int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
@@ -335,15 +234,5 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
     if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
     vlog("\n");
 
-#if 0
-exit:
-    // Release
-    for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
-    {
-        clReleaseKernel(kernels[k]);
-        clReleaseProgram(programs[k]);
-    }
-#endif
-
     return error;
 }

From cc8db3b4098c18c04a217f5967de98020a7ff70c Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Thu, 23 Mar 2023 17:52:17 +0100
Subject: [PATCH 05/24] Corrected ULP values for half tests (issue #142,
 bruteforce)

---
 .../math_brute_force/function_list.cpp        | 234 +++++++++---------
 1 file changed, 117 insertions(+), 117 deletions(-)

diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
index 1dcd4d9001..cb5cf1131c 100644
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -29,18 +29,18 @@
 // Only use ulps information in spir test
 #ifdef FUNCTION_LIST_ULPS_ONLY
 
-#define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                       \
-    {                                                                          \
-        STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
-            _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,       \
-            RELAXED_OFF, _type                                                 \
+#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)            \
+    { /* third ulp slot: _half_ulp, the fp16 (half) bound */                   \
+        STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
+            _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,  \
+            RELAXED_OFF, _type                                                 \
     }
-#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type,     \
-                  _relaxed_embedded_ulp)                                       \
-    {                                                                          \
-        STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
-            _ulp, _ulp, _ulp, _embedded_ulp, _relaxed_ulp,                     \
-            _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type                   \
+#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \
+                  _type, _relaxed_embedded_ulp)                                \
+    { /* third ulp slot: _half_ulp, the fp16 (half) bound */                   \
+        STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
+            _ulp, _ulp, _half_ulp, _embedded_ulp, _relaxed_ulp,                \
+            _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type                   \
     }
 #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                  \
     {                                                                          \
@@ -48,11 +48,11 @@
             { NULL }, { NULL }, _ulp, _ulp, _ulp, _embedded_ulp, INFINITY,     \
             INFINITY, _rmode, RELAXED_OFF, _type                               \
     }
-#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type)   \
-    {                                                                          \
-        STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \
-            _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF,      \
-            _type                                                              \
+#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)      \
+    { /* third ulp slot: _half_ulp, the fp16 (half) bound */                                 \
+        STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp,               \
+            _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF,               \
+            _type                                                                            \
     }
 
 #define unaryF NULL
@@ -80,21 +80,21 @@
 
 #else // FUNCTION_LIST_ULPS_ONLY
 
-#define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                       \
-    {                                                                          \
-        STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
-            { (void*)reference_##_name##l }, { (void*)reference_##_name },     \
-            _ulp, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,       \
-            RELAXED_OFF, _type                                                 \
+#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)            \
+    { /* third ulp slot: _half_ulp, the fp16 (half) bound */                   \
+        STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
+            { (void*)reference_##_name##l }, { (void*)reference_##_name },     \
+            _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,  \
+            RELAXED_OFF, _type                                                 \
     }
-#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type,     \
-                  _relaxed_embedded_ulp)                                       \
-    {                                                                          \
-        STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
-            { (void*)reference_##_name##l },                                   \
-            { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _ulp,          \
-            _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode,        \
-            RELAXED_ON, _type                                                  \
+#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \
+                  _type, _relaxed_embedded_ulp)                                \
+    { /* third ulp slot: _half_ulp, the fp16 (half) bound */                   \
+        STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
+            { (void*)reference_##_name##l },                                   \
+            { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _half_ulp,     \
+            _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode,        \
+            RELAXED_ON, _type                                                  \
     }
 #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                  \
     {                                                                          \
@@ -103,11 +103,11 @@
             _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF,      \
             _type                                                              \
     }
-#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type)   \
-    {                                                                          \
-        STRINGIFY(_name), _operator, { (void*)reference_##_name },             \
-            { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _ulp,       \
-            _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
+#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)     \
+    { /* third ulp slot: _half_ulp, the fp16 (half) bound */                                \
+        STRINGIFY(_name), _operator, { (void*)reference_##_name },                          \
+            { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _half_ulp,               \
+            _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type                   \
     }
 
 static constexpr vtbl _unary = {
@@ -234,24 +234,24 @@ static constexpr vtbl _mad_tbl = {
 #endif // FUNCTION_LIST_ULPS_ONLY
 
 const Func functionList[] = {
-    ENTRY_EXT(acos, 4.0f, 4.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
-    ENTRY(acosh, 4.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(acospi, 5.0f, 5.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(asin, 4.0f, 4.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
-    ENTRY(asinh, 4.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(asinpi, 5.0f, 5.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(atan, 5.0f, 5.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
-    ENTRY(atanh, 5.0f, 5.0f, FTZ_OFF, unaryF),
-    ENTRY(atanpi, 5.0f, 5.0f, FTZ_OFF, unaryF),
-    ENTRY(atan2, 6.0f, 6.0f, FTZ_OFF, binaryF),
-    ENTRY(atan2pi, 6.0f, 6.0f, FTZ_OFF, binaryF),
-    ENTRY(cbrt, 2.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(ceil, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(copysign, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY_EXT(cos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY_EXT(acos, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
+    ENTRY(acosh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(acospi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(asin, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
+    ENTRY(asinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(asinpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(atan, 5.0f, 5.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
+    ENTRY(atanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(atanpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(atan2, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF),
+    ENTRY(atan2pi, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF),
+    ENTRY(cbrt, 2.0f, 4.0f, 2.f, FTZ_OFF, unaryF),
+    ENTRY(ceil, 0.0f, 0.0f, 0.f, FTZ_OFF, unaryF),
+    ENTRY(copysign, 0.0f, 0.0f, 0.f, FTZ_OFF, binaryF),
+    ENTRY_EXT(cos, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF,
               0.00048828125f), // relaxed ulp 2^-11
-    ENTRY(cosh, 4.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(cospi, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY(cosh, 4.0f, 4.0f, 2.f, FTZ_OFF, unaryF),
+    ENTRY_EXT(cospi, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF,
               0.00048828125f), // relaxed ulp 2^-11
     //                                  ENTRY( erfc,                  16.0f,
     //                                  16.0f,         FTZ_OFF,     unaryF),
@@ -260,81 +260,81 @@ const Func functionList[] = {
     //                                  16.0f,         16.0f,         FTZ_OFF,
     //                                  unaryF), //disabled for 1.0 due to lack
     //                                  of reference implementation
-    ENTRY_EXT(exp, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF,
+    ENTRY_EXT(exp, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF,
               4.0f), // relaxed error is actually overwritten in unary.c as it
                      // is 3+floor(fabs(2*x))
-    ENTRY_EXT(exp2, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF,
+    ENTRY_EXT(exp2, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF,
               4.0f), // relaxed error is actually overwritten in unary.c as it
                      // is 3+floor(fabs(2*x))
-    ENTRY_EXT(exp10, 3.0f, 4.0f, 8192.0f, FTZ_OFF, unaryF,
+    ENTRY_EXT(exp10, 3.0f, 4.0f, 2.f, 8192.0f, FTZ_OFF, unaryF,
               8192.0f), // relaxed error is actually overwritten in unary.c as
                         // it is 3+floor(fabs(2*x)) in derived mode,
     // in non-derived mode it uses the ulp error for half_exp10.
-    ENTRY(expm1, 3.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(fabs, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(fdim, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(floor, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(fma, 0.0f, 0.0f, FTZ_OFF, ternaryF),
-    ENTRY(fmax, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(fmin, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(fmod, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(fract, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
-    ENTRY(frexp, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i),
-    ENTRY(hypot, 4.0f, 4.0f, FTZ_OFF, binaryF),
-    ENTRY(ilogb, 0.0f, 0.0f, FTZ_OFF, i_unaryF),
-    ENTRY(isequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isfinite, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isgreaterequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isinf, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isless, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(islessequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(islessgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isnan, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isnormal, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isnotequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isunordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(ldexp, 0.0f, 0.0f, FTZ_OFF, binaryF_i),
-    ENTRY(lgamma, INFINITY, INFINITY, FTZ_OFF, unaryF),
-    ENTRY(lgamma_r, INFINITY, INFINITY, FTZ_OFF, unaryF_two_results_i),
-    ENTRY_EXT(log, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
+    ENTRY(expm1, 3.0f, 4.0f, 2.f, FTZ_OFF, unaryF),
+    ENTRY(fabs, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(fdim, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(floor, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(fma, 0.0f, 0.0f, 0.0f, FTZ_OFF, ternaryF),
+    ENTRY(fmax, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(fmin, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(fmod, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(fract, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
+    ENTRY(frexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i),
+    ENTRY(hypot, 4.0f, 4.0f, 2.0f, FTZ_OFF, binaryF),
+    ENTRY(ilogb, 0.0f, 0.0f, 0.0f, FTZ_OFF, i_unaryF),
+    ENTRY(isequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isfinite, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isgreaterequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isinf, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isless, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(islessequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(islessgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isnan, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isnormal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isnotequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isunordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(ldexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_i),
+    ENTRY(lgamma, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF),
+    ENTRY(lgamma_r, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF_two_results_i),
+    ENTRY_EXT(log, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
               4.76837158203125e-7f), // relaxed ulp 2^-21
-    ENTRY_EXT(log2, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
+    ENTRY_EXT(log2, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
               4.76837158203125e-7f), // relaxed ulp 2^-21
-    ENTRY_EXT(log10, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
+    ENTRY_EXT(log10, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
               4.76837158203125e-7f), // relaxed ulp 2^-21
-    ENTRY(log1p, 2.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(logb, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, FTZ_OFF, mad_function,
+    ENTRY(log1p, 2.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(logb, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF, mad_function,
               INFINITY), // in fast-relaxed-math mode it has to be either
                          // exactly rounded fma or exactly rounded a*b+c
-    ENTRY(maxmag, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(minmag, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(modf, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
-    ENTRY(nan, 0.0f, 0.0f, FTZ_OFF, unaryF_u),
-    ENTRY(nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter),
-    ENTRY_EXT(pow, 16.0f, 16.0f, 8192.0f, FTZ_OFF, binaryF,
+    ENTRY(maxmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(minmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(modf, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
+    ENTRY(nan, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_u),
+    ENTRY(nextafter, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter),
+    ENTRY_EXT(pow, 16.0f, 16.0f, 4.0f, 8192.0f, FTZ_OFF, binaryF,
               8192.0f), // in derived mode the ulp error is calculated as
                         // exp2(y*log2(x)) and in non-derived it is the same as
                         // half_pow
-    ENTRY(pown, 16.0f, 16.0f, FTZ_OFF, binaryF_i),
-    ENTRY(powr, 16.0f, 16.0f, FTZ_OFF, binaryF),
+    ENTRY(pown, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i),
+    ENTRY(powr, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF),
     //                                  ENTRY( reciprocal,            1.0f,
     //                                  1.0f,         FTZ_OFF,     unaryF),
-    ENTRY(remainder, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(remquo, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i),
-    ENTRY(rint, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(rootn, 16.0f, 16.0f, FTZ_OFF, binaryF_i),
-    ENTRY(round, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(rsqrt, 2.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(signbit, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY_EXT(sin, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY(remainder, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(remquo, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i),
+    ENTRY(rint, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(rootn, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i),
+    ENTRY(round, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(rsqrt, 2.0f, 4.0f, 1.0f, FTZ_OFF, unaryF),
+    ENTRY(signbit, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY_EXT(sin, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,
               0.00048828125f), // relaxed ulp 2^-11
-    ENTRY_EXT(sincos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF_two_results,
+    ENTRY_EXT(sincos, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF_two_results,
               0.00048828125f), // relaxed ulp 2^-11
-    ENTRY(sinh, 4.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(sinpi, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY(sinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(sinpi, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,
               0.00048828125f), // relaxed ulp 2^-11
     { "sqrt",
       "sqrt",
@@ -365,16 +365,16 @@ const Func functionList[] = {
       RELAXED_OFF,
       unaryF },
     ENTRY_EXT(
-        tan, 5.0f, 5.0f, 8192.0f, FTZ_OFF, unaryF,
+        tan, 5.0f, 5.0f, 2.0f, 8192.0f, FTZ_OFF, unaryF,
         8192.0f), // in derived mode it the ulp error is calculated as sin/cos
                   // and in non-derived mode it is the same as half_tan.
-    ENTRY(tanh, 5.0f, 5.0f, FTZ_OFF, unaryF),
-    ENTRY(tanpi, 6.0f, 6.0f, FTZ_OFF, unaryF),
+    ENTRY(tanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(tanpi, 6.0f, 6.0f, 2.0f, FTZ_OFF, unaryF),
     //                                    ENTRY( tgamma,                 16.0f,
     //                                    16.0f,         FTZ_OFF,     unaryF),
     //                                    // Commented this out until we can be
     //                                    sure this requirement is realistic
-    ENTRY(trunc, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(trunc, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
 
     HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
     HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryOF),
@@ -392,8 +392,8 @@ const Func functionList[] = {
     HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
 
     // basic operations
-    OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
-    OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
+    OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
+    OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
     { "divide",
       "/",
       { (void*)reference_divide },
@@ -422,10 +422,10 @@ const Func functionList[] = {
       FTZ_OFF,
       RELAXED_OFF,
       binaryOperatorF },
-    OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
-    OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, FTZ_OFF,
+    OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
+    OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, 0.0f, FTZ_OFF,
                    unaryF), // A simple copy operation
-    OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
 };
 
 const size_t functionListCount = sizeof(functionList) / sizeof(functionList[0]);

From dd42b073830b7588774d2834d611ca212972f65f Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Thu, 23 Mar 2023 18:57:04 +0100
Subject: [PATCH 06/24] Corrected presubmit check for clang format

---
 .../math_brute_force/function_list.cpp        | 81 ++++++++++---------
 1 file changed, 43 insertions(+), 38 deletions(-)

diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
index cb5cf1131c..bfe3ff8ae0 100644
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -29,18 +29,18 @@
 // Only use ulps information in spir test
 #ifdef FUNCTION_LIST_ULPS_ONLY
 
-#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)                          \
-    {                                                                                        \
-        STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },                    \
-            _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,                \
-            RELAXED_OFF, _type                                                               \
+#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)            \
+    {                                                                          \
+        STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
+            _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,  \
+            RELAXED_OFF, _type                                                 \
     }
-#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, _type,        \
-                  _relaxed_embedded_ulp)                                                     \
-    {                                                                                        \
-        STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },                    \
-            _ulp, _ulp, _half_ulp, _embedded_ulp, _relaxed_ulp,                              \
-            _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type                                 \
+#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \
+                  _type, _relaxed_embedded_ulp)                                \
+    {                                                                          \
+        STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
+            _ulp, _ulp, _half_ulp, _embedded_ulp, _relaxed_ulp,                \
+            _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type                   \
     }
 #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                  \
     {                                                                          \
@@ -48,11 +48,12 @@
             { NULL }, { NULL }, _ulp, _ulp, _ulp, _embedded_ulp, INFINITY,     \
             INFINITY, _rmode, RELAXED_OFF, _type                               \
     }
-#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)      \
-    {                                                                                        \
-        STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp,               \
-            _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF,               \
-            _type                                                                            \
+#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp,       \
+                       _rmode, _type)                                          \
+    {                                                                          \
+        STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \
+            _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \
+            _type                                                              \
     }
 
 #define unaryF NULL
@@ -80,21 +81,21 @@
 
 #else // FUNCTION_LIST_ULPS_ONLY
 
-#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)                         \
-    {                                                                                       \
-        STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },                   \
-            { (void*)reference_##_name##l }, { (void*)reference_##_name },                  \
-            _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,               \
-            RELAXED_OFF, _type                                                              \
+#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)            \
+    {                                                                          \
+        STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
+            { (void*)reference_##_name##l }, { (void*)reference_##_name },     \
+            _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,  \
+            RELAXED_OFF, _type                                                 \
     }
-#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, _type,       \
-                  _relaxed_embedded_ulp)                                                    \
-    {                                                                                       \
-        STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },                   \
-            { (void*)reference_##_name##l },                                                \
-            { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _half_ulp,                  \
-            _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode,                     \
-            RELAXED_ON, _type                                                               \
+#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \
+                  _type, _relaxed_embedded_ulp)                                \
+    {                                                                          \
+        STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
+            { (void*)reference_##_name##l },                                   \
+            { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _half_ulp,     \
+            _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode,        \
+            RELAXED_ON, _type                                                  \
     }
 #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                  \
     {                                                                          \
@@ -103,11 +104,12 @@
             _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF,      \
             _type                                                              \
     }
-#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)     \
-    {                                                                                       \
-        STRINGIFY(_name), _operator, { (void*)reference_##_name },                          \
-            { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _half_ulp,               \
-            _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type                   \
+#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp,       \
+                       _rmode, _type)                                          \
+    {                                                                          \
+        STRINGIFY(_name), _operator, { (void*)reference_##_name },             \
+            { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _half_ulp,  \
+            _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
     }
 
 static constexpr vtbl _unary = {
@@ -297,7 +299,8 @@ const Func functionList[] = {
     ENTRY(isunordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
     ENTRY(ldexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_i),
     ENTRY(lgamma, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF),
-    ENTRY(lgamma_r, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF_two_results_i),
+    ENTRY(lgamma_r, INFINITY, INFINITY, INFINITY, FTZ_OFF,
+          unaryF_two_results_i),
     ENTRY_EXT(log, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
               4.76837158203125e-7f), // relaxed ulp 2^-21
     ENTRY_EXT(log2, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
@@ -306,7 +309,8 @@ const Func functionList[] = {
               4.76837158203125e-7f), // relaxed ulp 2^-21
     ENTRY(log1p, 2.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
     ENTRY(logb, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF, mad_function,
+    ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF,
+              mad_function,
               INFINITY), // in fast-relaxed-math mode it has to be either
                          // exactly rounded fma or exactly rounded a*b+c
     ENTRY(maxmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
@@ -331,7 +335,8 @@ const Func functionList[] = {
     ENTRY(signbit, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
     ENTRY_EXT(sin, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,
               0.00048828125f), // relaxed ulp 2^-11
-    ENTRY_EXT(sincos, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF_two_results,
+    ENTRY_EXT(sincos, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF,
+              unaryF_two_results,
               0.00048828125f), // relaxed ulp 2^-11
     ENTRY(sinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
     ENTRY_EXT(sinpi, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,

From 04033bf8bd958c9c053e753363c4588f7dc75ccf Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 18 Apr 2023 22:10:51 +0200
Subject: [PATCH 07/24] Added support for ternary, unary_two_result and
 unary_two_result_i tests for cl_half (issue #142, bruteforce)

---
 .../math_brute_force/CMakeLists.txt           |   3 +
 .../math_brute_force/function_list.cpp        |   6 +-
 test_conformance/math_brute_force/main.cpp    |  12 +
 .../math_brute_force/ternary_half.cpp         | 774 ++++++++++++++++++
 .../math_brute_force/test_functions.h         |   9 +
 .../unary_two_results_half.cpp                | 527 ++++++++++++
 .../unary_two_results_i_half.cpp              | 368 +++++++++
 test_conformance/math_brute_force/utility.h   |  25 +
 8 files changed, 1721 insertions(+), 3 deletions(-)
 create mode 100644 test_conformance/math_brute_force/ternary_half.cpp
 create mode 100644 test_conformance/math_brute_force/unary_two_results_half.cpp
 create mode 100644 test_conformance/math_brute_force/unary_two_results_i_half.cpp

diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index ec5b3dae6c..50f1fd00fc 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -34,14 +34,17 @@ set(${MODULE_NAME}_SOURCES
     sleep.h
     ternary_double.cpp
     ternary_float.cpp
+    ternary_half.cpp
     test_functions.h
     unary_double.cpp
     unary_float.cpp
     unary_half.cpp
     unary_two_results_double.cpp
     unary_two_results_float.cpp
+    unary_two_results_half.cpp
     unary_two_results_i_double.cpp
     unary_two_results_i_float.cpp
+    unary_two_results_i_half.cpp
     unary_u_double.cpp
     unary_u_float.cpp
     unary_u_half.cpp
diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
index bfe3ff8ae0..67ed0d8ac1 100644
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -185,21 +185,21 @@ static constexpr vtbl _ternary = {
     "ternary",
     TestFunc_Float_Float_Float_Float,
     TestFunc_Double_Double_Double_Double,
-    NULL,
+    TestFunc_Half_Half_Half_Half,
 };
 
 static constexpr vtbl _unary_two_results = {
     "unary_two_results",
     TestFunc_Float2_Float,
     TestFunc_Double2_Double,
-    NULL,
+    TestFunc_Half2_Half,
 };
 
 static constexpr vtbl _unary_two_results_i = {
     "unary_two_results_i",
     TestFunc_FloatI_Float,
     TestFunc_DoubleI_Double,
-    NULL,
+    TestFunc_HalfI_Half,
 };
 
 static constexpr vtbl _binary_two_results_i = {
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index b31d1e5ef0..e5d3545ca1 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -108,6 +108,8 @@ cl_device_fp_config gFloatCapabilities = 0;
 int gWimpyReductionFactor = 32;
 int gVerboseBruteForce = 0;
 
+cl_half_rounding_mode gHalfRoundingMode = CL_HALF_RTE;
+
 static int ParseArgs(int argc, const char **argv);
 static void PrintUsage(void);
 static void PrintFunctions(void);
@@ -694,6 +696,16 @@ test_status InitCL(cl_device_id device)
 
             return TEST_FAIL;
         }
+
+        if ((gHalfCapabilities & CL_FP_ROUND_TO_NEAREST) != 0)
+        {
+            gHalfRoundingMode = CL_HALF_RTE;
+        }
+        else // due to above condition it must be RTZ
+        {
+            gHalfRoundingMode = CL_HALF_RTZ;
+        }
+
 #else
         vlog_error("FAIL: device says it supports cl_khr_fp16 but "
                    "CL_DEVICE_HALF_FP_CONFIG is not in the headers!\n");
diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp
new file mode 100644
index 0000000000..0d8bb8cf2f
--- /dev/null
+++ b/test_conformance/math_brute_force/ternary_half.cpp
@@ -0,0 +1,774 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cinttypes>
+#include <cstring>
+
+#define CORRECTLY_ROUNDED 0
+#define FLUSHED 1
+
+namespace {
+
+cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    // Generates ternary kernel source with every type parameter set to Half.
+    auto make_src = [](const std::string &name, const char *fn, cl_uint vec) {
+        return GetTernaryKernel(name, fn, ParameterType::Half,
+                                ParameterType::Half, ParameterType::Half,
+                                ParameterType::Half, vec);
+    };
+    // p is the BuildKernelInfo that the caller handed to ThreadPool_Do.
+    return BuildKernels(*static_cast<BuildKernelInfo *>(p), job_id, make_src);
+}
+
+// Half-precision bit patterns for the more difficult cases to get right
+static const cl_half specialValuesHalf[] = {
+    0xffff /*NaN*/,
+    0x0000 /*+0*/,
+    0x0001 /*smallest positive subnormal*/,
+    0x7c00 /*INFINITY*/,
+    0xfc00 /*-INFINITY*/,
+    0x8000 /*-0*/,
+    0x7bff /*HALF_MAX*/,
+    0x0400 /*HALF_MIN*/
+};
+
+constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+
+} // anonymous namespace
+
+int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+
+    Programs programs;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    KernelMatrix kernels;
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    float maxErrorVal3 = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+
+    constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(cl_half);
+
+    cl_uchar overflow[half_buffer_size];
+    float half_ulps = f->half_ulps;
+    int skipNanInf = (0 == strcmp("fma", f->nameInCode));
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init the kernels
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
+                                relaxedMode };
+    if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+        return error;
+
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_half *hp0 = (cl_half *)gIn;
+        cl_half *hp1 = (cl_half *)gIn2;
+        cl_half *hp2 = (cl_half *)gIn3;
+        size_t idx = 0;
+
+        if (i == 0)
+        { // test edge cases
+            uint32_t x, y, z;
+            x = y = z = 0;
+            for (; idx < half_buffer_size; idx++)
+            {
+                hp0[idx] = specialValuesHalf[x];
+                hp1[idx] = specialValuesHalf[y];
+                hp2[idx] = specialValuesHalf[z];
+
+                if (++x >= specialValuesHalfCount)
+                {
+                    x = 0;
+                    if (++y >= specialValuesHalfCount)
+                    {
+                        y = 0;
+                        if (++z >= specialValuesHalfCount) break;
+                    }
+                }
+            }
+            if (idx == half_buffer_size)
+                vlog_error("Test Error: not all special cases tested!\n");
+        }
+
+        auto any_value = [&d]() {
+            float t = (float)((double)genrand_int32(d) / (double)0xFFFFFFFF);
+            return HFF((1.0f - t) * CL_HALF_MIN + t * CL_HALF_MAX);
+        };
+
+        for (; idx < half_buffer_size; idx++)
+        {
+            hp0[idx] = any_value();
+            hp1[idx] = any_value();
+            hp2[idx] = any_value();
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            return error;
+        }
+
+        // Write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
+                               error);
+                    return error;
+                }
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_half) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
+                                        sizeof(gInBuffer3), &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue)))
+        {
+            vlog("clFlush failed\n");
+            return error;
+        }
+
+        // Calculate the correctly rounded reference result
+        cl_half *res = (cl_half *)gOut_Ref;
+        if (skipNanInf)
+        {
+            for (size_t j = 0; j < half_buffer_size; j++)
+            {
+                feclearexcept(FE_OVERFLOW);
+                res[j] = HFF((float)f->func.f_fma(
+                    HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED));
+                overflow[j] =
+                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+            }
+        }
+        else
+        {
+            for (size_t j = 0; j < half_buffer_size; j++)
+                res[j] = HFF((float)f->func.f_fma(
+                    HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED));
+        }
+
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint32_t *t = (uint32_t *)gOut_Ref;
+        for (size_t j = 0; j < half_buffer_size; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint32_t *q = (uint32_t *)(gOut[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    int fail;
+                    cl_half test = ((cl_half *)q)[j];
+                    float ref1 = f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]),
+                                               HTF(hp2[j]), CORRECTLY_ROUNDED);
+                    cl_half correct = HFF(ref1);
+
+                    // Per section 10 paragraph 6, accept any result if an input
+                    // or output is an infinity or NaN, or on overflow
+                    if (skipNanInf)
+                    {
+                        if (overflow[j] || IsHalfInfinity(correct)
+                            || IsHalfNaN(correct) || IsHalfInfinity(hp0[j])
+                            || IsHalfNaN(hp0[j]) || IsHalfInfinity(hp1[j])
+                            || IsHalfNaN(hp1[j]) || IsHalfInfinity(hp2[j])
+                            || IsHalfNaN(hp2[j]))
+                            continue;
+                    }
+
+                    float err =
+                        test != correct ? Ulp_Error_Half(test, ref1) : 0.f;
+                    fail = !(fabsf(err) <= half_ulps);
+
+                    if (fail && (ftz || relaxedMode))
+                    {
+                        // retry per section 6.5.3.2  with flushing on
+                        if (0.0f == test
+                            && 0.0f
+                                == f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]),
+                                                 HTF(hp2[j]), FLUSHED))
+                        {
+                            fail = 0;
+                            err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (fail && IsHalfSubnormal(hp0[j]))
+                        { // look at me,
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                            float ref2 =
+                                f->func.f_fma(0.0f, HTF(hp1[j]), HTF(hp2[j]),
+                                              CORRECTLY_ROUNDED);
+                            cl_half correct2 = HFF(ref2);
+                            float ref3 =
+                                f->func.f_fma(-0.0f, HTF(hp1[j]), HTF(hp2[j]),
+                                              CORRECTLY_ROUNDED);
+                            cl_half correct3 = HFF(ref3);
+
+                            if (skipNanInf)
+                            {
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsHalfInfinity(correct2)
+                                    || IsHalfNaN(correct2)
+                                    || IsHalfInfinity(correct3)
+                                    || IsHalfNaN(correct3))
+                                    continue;
+                            }
+
+                            float err2 = test != correct2
+                                ? Ulp_Error_Half(test, ref2)
+                                : 0.f;
+                            float err3 = test != correct3
+                                ? Ulp_Error_Half(test, ref3)
+                                : 0.f;
+                            fail = fail
+                                && ((!(fabsf(err2) <= half_ulps))
+                                    && (!(fabsf(err3) <= half_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(0.0f, HTF(hp1[j]),
+                                                         HTF(hp2[j]), FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(-0.0f, HTF(hp1[j]),
+                                                         HTF(hp2[j]), FLUSHED)))
+                            {
+                                fail = 0;
+                                err = 0.0f;
+                            }
+
+                            // try with first two args as zero
+                            if (IsHalfSubnormal(hp1[j]))
+                            { // its fun to have fun,
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                                ref2 = f->func.f_fma(0.0f, 0.0f, HTF(hp2[j]),
+                                                     CORRECTLY_ROUNDED);
+                                correct2 = HFF(ref2);
+                                ref3 = f->func.f_fma(-0.0f, 0.0f, HTF(hp2[j]),
+                                                     CORRECTLY_ROUNDED);
+                                correct3 = HFF(ref3);
+                                float ref4 =
+                                    f->func.f_fma(0.0f, -0.0f, HTF(hp2[j]),
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct4 = HFF(ref4);
+                                float ref5 =
+                                    f->func.f_fma(-0.0f, -0.0f, HTF(hp2[j]),
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct5 = HFF(ref5);
+
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is an infinity or NaN,
+                                // or on overflow
+                                if (!gInfNanSupport)
+                                {
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsHalfInfinity(correct2)
+                                        || IsHalfNaN(correct2)
+                                        || IsHalfInfinity(correct3)
+                                        || IsHalfNaN(correct3)
+                                        || IsHalfInfinity(correct4)
+                                        || IsHalfNaN(correct4)
+                                        || IsHalfInfinity(correct5)
+                                        || IsHalfNaN(correct5))
+                                        continue;
+                                }
+
+                                err2 = test != correct2
+                                    ? Ulp_Error_Half(test, ref2)
+                                    : 0.f;
+                                err3 = test != correct3
+                                    ? Ulp_Error_Half(test, ref3)
+                                    : 0.f;
+                                float err4 = test != correct4
+                                    ? Ulp_Error_Half(test, ref4)
+                                    : 0.f;
+                                float err5 = test != correct5
+                                    ? Ulp_Error_Half(test, ref5)
+                                    : 0.f;
+                                fail = fail
+                                    && ((!(fabsf(err2) <= half_ulps))
+                                        && (!(fabsf(err3) <= half_ulps))
+                                        && (!(fabsf(err4) <= half_ulps))
+                                        && (!(fabsf(err5) <= half_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(0.0f, 0.0f,
+                                                             HTF(hp2[j]),
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, 0.0f,
+                                                             HTF(hp2[j]),
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(0.0f, -0.0f,
+                                                             HTF(hp2[j]),
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, -0.0f,
+                                                             HTF(hp2[j]),
+                                                             FLUSHED)))
+                                {
+                                    fail = 0;
+                                    err = 0.0f;
+                                }
+
+                                if (IsHalfSubnormal(hp2[j]))
+                                {
+                                    if (test == 0.0f) // 0*0+0 is 0
+                                    {
+                                        fail = 0;
+                                        err = 0.0f;
+                                    }
+                                }
+                            }
+                            else if (IsHalfSubnormal(hp2[j]))
+                            {
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                                ref2 = f->func.f_fma(0.0f, HTF(hp1[j]), 0.0f,
+                                                     CORRECTLY_ROUNDED);
+                                correct2 = HFF(ref2);
+                                ref3 = f->func.f_fma(-0.0f, HTF(hp1[j]), 0.0f,
+                                                     CORRECTLY_ROUNDED);
+                                correct3 = HFF(ref3);
+                                float ref4 =
+                                    f->func.f_fma(0.0f, HTF(hp1[j]), -0.0f,
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct4 = HFF(ref4);
+                                float ref5 =
+                                    f->func.f_fma(-0.0f, HTF(hp1[j]), -0.0f,
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct5 = HFF(ref5);
+
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is a infinity or NaN or
+                                // overflow
+                                if (!gInfNanSupport)
+                                {
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsHalfInfinity(correct2)
+                                        || IsHalfNaN(correct2)
+                                        || IsHalfInfinity(correct3)
+                                        || IsHalfNaN(correct3)
+                                        || IsHalfInfinity(correct4)
+                                        || IsHalfNaN(correct4)
+                                        || IsHalfInfinity(correct5)
+                                        || IsHalfNaN(correct5))
+                                        continue;
+                                }
+
+                                err2 = test != correct2
+                                    ? Ulp_Error_Half(test, ref2)
+                                    : 0.f;
+                                err3 = test != correct3
+                                    ? Ulp_Error_Half(test, ref3)
+                                    : 0.f;
+                                float err4 = test != correct4
+                                    ? Ulp_Error_Half(test, ref4)
+                                    : 0.f;
+                                float err5 = test != correct5
+                                    ? Ulp_Error_Half(test, ref5)
+                                    : 0.f;
+                                fail = fail
+                                    && ((!(fabsf(err2) <= half_ulps))
+                                        && (!(fabsf(err3) <= half_ulps))
+                                        && (!(fabsf(err4) <= half_ulps))
+                                        && (!(fabsf(err5) <= half_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(0.0f, HTF(hp1[j]),
+                                                             0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, HTF(hp1[j]),
+                                                             0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(0.0f, HTF(hp1[j]),
+                                                             -0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, HTF(hp1[j]),
+                                                             -0.0f, FLUSHED)))
+                                {
+                                    fail = 0;
+                                    err = 0.0f;
+                                }
+                            }
+                        }
+                        else if (fail && IsHalfSubnormal(hp1[j]))
+                        {
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                            float ref2 =
+                                f->func.f_fma(HTF(hp0[j]), 0.0f, HTF(hp2[j]),
+                                              CORRECTLY_ROUNDED);
+                            cl_half correct2 = HFF(ref2);
+                            float ref3 =
+                                f->func.f_fma(HTF(hp0[j]), -0.0f, HTF(hp2[j]),
+                                              CORRECTLY_ROUNDED);
+                            cl_half correct3 = HFF(ref3);
+
+                            if (skipNanInf)
+                            {
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsHalfInfinity(correct2)
+                                    || IsHalfNaN(correct2)
+                                    || IsHalfInfinity(correct3)
+                                    || IsHalfNaN(correct3))
+                                    continue;
+                            }
+
+                            float err2 = test != correct2
+                                ? Ulp_Error_Half(test, ref2)
+                                : 0.f;
+                            float err3 = test != correct3
+                                ? Ulp_Error_Half(test, ref3)
+                                : 0.f;
+                            fail = fail
+                                && ((!(fabsf(err2) <= half_ulps))
+                                    && (!(fabsf(err3) <= half_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(HTF(hp0[j]), 0.0f,
+                                                         HTF(hp2[j]), FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(HTF(hp0[j]), -0.0f,
+                                                         HTF(hp2[j]), FLUSHED)))
+                            {
+                                fail = 0;
+                                err = 0.0f;
+                            }
+
+                            // try with second two args as zero
+                            if (IsHalfSubnormal(hp2[j]))
+                            {
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                                ref2 = f->func.f_fma(HTF(hp0[j]), 0.0f, 0.0f,
+                                                     CORRECTLY_ROUNDED);
+                                correct2 = HFF(ref2);
+                                ref3 = f->func.f_fma(HTF(hp0[j]), -0.0f, 0.0f,
+                                                     CORRECTLY_ROUNDED);
+                                correct3 = HFF(ref3);
+                                float ref4 =
+                                    f->func.f_fma(HTF(hp0[j]), 0.0f, -0.0f,
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct4 = HFF(ref4);
+                                float ref5 =
+                                    f->func.f_fma(HTF(hp0[j]), -0.0f, -0.0f,
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct5 = HFF(ref5);
+
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is a infinity or NaN or
+                                // overflow
+                                if (!gInfNanSupport)
+                                {
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsHalfInfinity(correct2)
+                                        || IsHalfNaN(correct2)
+                                        || IsHalfInfinity(correct3)
+                                        || IsHalfNaN(correct3)
+                                        || IsHalfInfinity(correct4)
+                                        || IsHalfNaN(correct4)
+                                        || IsHalfInfinity(correct5)
+                                        || IsHalfNaN(correct5))
+                                        continue;
+                                }
+
+                                err2 = test != correct2
+                                    ? Ulp_Error_Half(test, ref2)
+                                    : 0.f;
+                                err3 = test != correct3
+                                    ? Ulp_Error_Half(test, ref3)
+                                    : 0.f;
+                                float err4 = test != correct4
+                                    ? Ulp_Error_Half(test, ref4)
+                                    : 0.f;
+                                float err5 = test != correct5
+                                    ? Ulp_Error_Half(test, ref5)
+                                    : 0.f;
+                                fail = fail
+                                    && ((!(fabsf(err2) <= half_ulps))
+                                        && (!(fabsf(err3) <= half_ulps))
+                                        && (!(fabsf(err4) <= half_ulps))
+                                        && (!(fabsf(err5) <= half_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(HTF(hp0[j]), 0.0f,
+                                                             0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(HTF(hp0[j]), -0.0f,
+                                                             0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(HTF(hp0[j]), 0.0f,
+                                                             -0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(HTF(hp0[j]), -0.0f,
+                                                             -0.0f, FLUSHED)))
+                                {
+                                    fail = 0;
+                                    err = 0.0f;
+                                }
+                            }
+                        }
+                        else if (fail && IsHalfSubnormal(hp2[j]))
+                        {
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                            float ref2 = f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]),
+                                                       0.0f, CORRECTLY_ROUNDED);
+                            cl_half correct2 = HFF(ref2);
+                            float ref3 =
+                                f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), -0.0f,
+                                              CORRECTLY_ROUNDED);
+                            cl_half correct3 = HFF(ref3);
+
+                            if (skipNanInf)
+                            {
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsHalfInfinity(correct2)
+                                    || IsHalfNaN(correct2)
+                                    || IsHalfInfinity(correct3)
+                                    || IsHalfNaN(correct3))
+                                    continue;
+                            }
+
+                            float err2 = test != correct2
+                                ? Ulp_Error_Half(test, correct2)
+                                : 0.f;
+                            float err3 = test != correct3
+                                ? Ulp_Error_Half(test, correct3)
+                                : 0.f;
+                            fail = fail
+                                && ((!(fabsf(err2) <= half_ulps))
+                                    && (!(fabsf(err3) <= half_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(HTF(hp0[j]),
+                                                         HTF(hp1[j]), 0.0f,
+                                                         FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(HTF(hp0[j]),
+                                                         HTF(hp1[j]), -0.0f,
+                                                         FLUSHED)))
+                            {
+                                fail = 0;
+                                err = 0.0f;
+                            }
+                        }
+                    }
+
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = HTF(hp0[j]);
+                        maxErrorVal2 = HTF(hp1[j]);
+                        maxErrorVal3 = HTF(hp2[j]);
+                    }
+
+                    if (fail)
+                    {
+                        vlog_error(
+                            "\nERROR: %s%s: %f ulp error at {%a, %a, %a} "
+                            "({0x%4.4x, 0x%4.4x, 0x%4.4x}): *%a vs. %a\n",
+                            f->name, sizeNames[k], err, HTF(hp0[j]),
+                            HTF(hp1[j]), HTF(hp2[j]), hp0[j], hp1[j], hp2[j],
+                            HTF(res[j]), HTF(test));
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64 " bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    }
+
+    vlog("\n");
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h
index 91cca16339..16f57013ce 100644
--- a/test_conformance/math_brute_force/test_functions.h
+++ b/test_conformance/math_brute_force/test_functions.h
@@ -108,18 +108,27 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata, bool relaxedMode);
 int TestFunc_Double_Double_Double_Double(const Func *f, MTdata,
                                          bool relaxedMode);
 
+// half foo(half, half, half)
+int TestFunc_Half_Half_Half_Half(const Func *f, MTdata, bool relaxedMode);
+
 // float foo(float, float*)
 int TestFunc_Float2_Float(const Func *f, MTdata, bool relaxedMode);
 
 // double foo(double, double*)
 int TestFunc_Double2_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(half, half*)
+int TestFunc_Half2_Half(const Func *f, MTdata, bool relaxedMode);
+
 // float foo(float, int*)
 int TestFunc_FloatI_Float(const Func *f, MTdata, bool relaxedMode);
 
 // double foo(double, int*)
 int TestFunc_DoubleI_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(half, int*)
+int TestFunc_HalfI_Half(const Func *f, MTdata, bool relaxedMode);
+
 // float foo(float, float, int*)
 int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode);
 
diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp
new file mode 100644
index 0000000000..3f8d71168d
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_two_results_half.cpp
@@ -0,0 +1,527 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cinttypes>
+#include <cstring>
+
+namespace {
+// ThreadPool_Do job: hands BuildKernels a GetUnaryKernel-based generator.
+cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    BuildKernelInfo &info = *(BuildKernelInfo *)p; // caller passes &build_info
+    auto generator = [](const std::string &kernel_name, const char *builtin,
+                        cl_uint vector_size_index) {
+        return GetUnaryKernel(kernel_name, builtin, ParameterType::Half,
+                              ParameterType::Half, ParameterType::Half,
+                              vector_size_index); // all three types are half
+    };
+    return BuildKernels(info, job_id, generator); // result returned unchanged
+}
+
+} // anonymous namespace
+
+int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+    Programs programs;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    KernelMatrix kernels;
+    float maxError0 = 0.0f;
+    float maxError1 = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    float maxErrorVal0 = 0.0f;
+    float maxErrorVal1 = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+
+    constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(cl_half);
+
+    cl_uchar overflow[half_buffer_size];
+    int isFract = 0 == strcmp("fract", f->nameInCode);
+    int skipNanInf = isFract;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    float half_ulps = f->half_ulps;
+
+    // Init the kernels
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
+                                relaxedMode };
+    if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+        return error;
+
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_half *pIn = (cl_half *)gIn;
+        {
+            const unsigned m_size = 0x1ff;
+            const unsigned e_size = 0xf;
+            const unsigned s_size = 0x2;
+            const unsigned sclamp = 0xffff;
+
+            for (size_t j = 0; j < half_buffer_size; j++)
+            {
+                unsigned ind = j % (s_size * e_size * m_size);
+                unsigned val = (((ind / (e_size * m_size)) << 15)
+                                | (((ind / m_size) % e_size + 1) << 10)
+                                | (ind % m_size + 1))
+                    & sclamp;
+                pIn[j] = val;
+
+                if (relaxedMode && strcmp(f->name, "sincos") == 0)
+                {
+                    float pj = HTF(pIn[j]);
+                    if (fabs(pj) > M_PI) pIn[j] = 0x7e00; // HALF_NAN
+                }
+            }
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        // Write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+
+                memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut2[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_half);
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error =
+                     clSetKernelArg(kernels[j][thread_id], 1,
+                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue)))
+        {
+            vlog_error("clFlush failed\n");
+            return error;
+        }
+
+        FPU_mode_type oldMode;
+        RoundingMode oldRoundMode = kRoundToNearestEven;
+        if (isFract)
+        {
+            // Calculate the correctly rounded reference result
+            memset(&oldMode, 0, sizeof(oldMode));
+            if (ftz || relaxedMode) ForceFTZ(&oldMode);
+
+            // Set the rounding mode to match the device
+            if (gIsInRTZMode)
+                oldRoundMode = set_round(kRoundTowardZero, kfloat);
+        }
+
+        // Calculate the correctly rounded reference result
+        cl_half *ref1 = (cl_half *)gOut_Ref;
+        cl_half *ref2 = (cl_half *)gOut_Ref2;
+
+        if (skipNanInf)
+        {
+            for (size_t j = 0; j < half_buffer_size; j++)
+            {
+                double dd;
+                feclearexcept(FE_OVERFLOW);
+
+                if (relaxedMode)
+                    ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd));
+                else
+                    ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
+
+                ref2[j] = HFF((float)dd);
+                overflow[j] =
+                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+            }
+        }
+        else
+        {
+            for (size_t j = 0; j < half_buffer_size; j++)
+            {
+                double dd;
+                if (relaxedMode)
+                    ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd));
+                else
+                    ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
+
+                ref2[j] = HFF((float)dd);
+            }
+        }
+
+        if (isFract && ftz) RestoreFPState(&oldMode);
+
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting)
+        {
+            if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
+            break;
+        }
+
+        // Verify data
+        for (size_t j = 0; j < half_buffer_size; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                cl_half *test1 = (cl_half *)gOut[k];
+                cl_half *test2 = (cl_half *)gOut2[k];
+
+                // If we aren't getting the correctly rounded result
+                if (ref1[j] != test1[j] || ref2[j] != test2[j])
+                {
+                    double fp_correct1 = 0, fp_correct2 = 0;
+                    float err = 0, err2 = 0;
+
+                    if (relaxedMode)
+                        fp_correct1 = f->rfunc.f_fpf(HTF(pIn[j]), &fp_correct2);
+                    else
+                        fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2);
+
+                    cl_half correct1 = HFF(fp_correct1);
+                    cl_half correct2 = HFF(fp_correct2);
+
+                    // Per section 10 paragraph 6, accept any result if an input
+                    // or output is a infinity or NaN or overflow
+                    if (relaxedMode || skipNanInf)
+                    {
+                        if (skipNanInf && overflow[j]) continue;
+                        // Note: no double rounding here.  Reference functions
+                        // calculate in single precision.
+                        if (IsHalfInfinity(correct1) || IsHalfNaN(correct1)
+                            || IsHalfInfinity(correct2) || IsHalfNaN(correct2)
+                            || IsHalfInfinity(pIn[j]) || IsHalfNaN(pIn[j]))
+                            continue;
+                    }
+
+                    // If we are in fast relaxed math, we
+                    // have a different calculation for the
+                    // subnormal threshold.
+                    typedef int (*CheckForSubnormal)(double, float);
+                    CheckForSubnormal isFloatResultSubnormalPtr;
+                    if (relaxedMode)
+                    {
+                        err = Abs_Error(HTF(test1[j]), fp_correct1);
+                        err2 = Abs_Error(HTF(test2[j]), fp_correct2);
+                        isFloatResultSubnormalPtr =
+                            &IsFloatResultSubnormalAbsError;
+                    }
+                    else
+                    {
+                        err = Ulp_Error_Half(test1[j], fp_correct1);
+                        err2 = Ulp_Error_Half(test2[j], fp_correct2);
+                        isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
+                    }
+                    int fail =
+                        !(fabsf(err) <= half_ulps && fabsf(err2) <= half_ulps);
+
+                    if (ftz || relaxedMode)
+                    {
+                        // retry per section 6.5.3.2
+                        if ((*isFloatResultSubnormalPtr)(fp_correct1,
+                                                         half_ulps))
+                        {
+                            if ((*isFloatResultSubnormalPtr)(fp_correct2,
+                                                             half_ulps))
+                            {
+                                fail = fail
+                                    && !(HTF(test1[j]) == 0.0f
+                                         && HTF(test2[j]) == 0.0f);
+                                if (!fail)
+                                {
+                                    err = 0.0f;
+                                    err2 = 0.0f;
+                                }
+                            }
+                            else
+                            {
+                                fail = fail
+                                    && !(HTF(test1[j]) == 0.0f
+                                         && fabsf(err2) <= half_ulps);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                        else if ((*isFloatResultSubnormalPtr)(fp_correct2,
+                                                              half_ulps))
+                        {
+                            fail = fail
+                                && !(HTF(test2[j]) == 0.0f
+                                     && fabsf(err) <= half_ulps);
+                            if (!fail) err2 = 0.0f;
+                        }
+
+
+                        // retry per section 6.5.3.3
+                        if (IsHalfSubnormal(pIn[j]))
+                        {
+                            double fp_correctp, fp_correctn;
+                            double fp_correct2p, fp_correct2n;
+                            float errp, err2p, errn, err2n;
+
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
+                            if (relaxedMode)
+                            {
+                                fp_correctp =
+                                    f->rfunc.f_fpf(0.0, &fp_correct2p);
+                                fp_correctn =
+                                    f->rfunc.f_fpf(-0.0, &fp_correct2n);
+                            }
+                            else
+                            {
+                                fp_correctp = f->func.f_fpf(0.0, &fp_correct2p);
+                                fp_correctn =
+                                    f->func.f_fpf(-0.0, &fp_correct2n);
+                            }
+
+                            cl_half correctp = HFF(fp_correctp);
+                            cl_half correctn = HFF(fp_correctn);
+                            cl_half correct2p = HFF(fp_correct2p);
+                            cl_half correct2n = HFF(fp_correct2n);
+
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is a infinity or NaN or
+                            // overflow
+                            if (skipNanInf)
+                            {
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsHalfInfinity(correctp)
+                                    || IsHalfNaN(correctp)
+                                    || IsHalfInfinity(correctn)
+                                    || IsHalfNaN(correctn)
+                                    || IsHalfInfinity(correct2p)
+                                    || IsHalfNaN(correct2p)
+                                    || IsHalfInfinity(correct2n)
+                                    || IsHalfNaN(correct2n))
+                                    continue;
+                            }
+
+                            if (relaxedMode)
+                            {
+                                errp = Abs_Error(HTF(test1[j]), fp_correctp);
+                                err2p = Abs_Error(HTF(test1[j]), fp_correct2p);
+                                errn = Abs_Error(HTF(test1[j]), fp_correctn);
+                                err2n = Abs_Error(HTF(test1[j]), fp_correct2n);
+                            }
+                            else
+                            {
+                                errp = Ulp_Error_Half(test1[j], fp_correctp);
+                                err2p = Ulp_Error_Half(test1[j], fp_correct2p);
+                                errn = Ulp_Error_Half(test1[j], fp_correctn);
+                                err2n = Ulp_Error_Half(test1[j], fp_correct2n);
+                            }
+
+                            fail = fail
+                                && ((!(fabsf(errp) <= half_ulps))
+                                    && (!(fabsf(err2p) <= half_ulps))
+                                    && ((!(fabsf(errn) <= half_ulps))
+                                        && (!(fabsf(err2n) <= half_ulps))));
+                            if (fabsf(errp) < fabsf(err)) err = errp;
+                            if (fabsf(errn) < fabsf(err)) err = errn;
+                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
+                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
+
+                            // retry per section 6.5.3.4
+                            if ((*isFloatResultSubnormalPtr)(fp_correctp,
+                                                             half_ulps)
+                                || (*isFloatResultSubnormalPtr)(fp_correctn,
+                                                                half_ulps))
+                            {
+                                if ((*isFloatResultSubnormalPtr)(fp_correct2p,
+                                                                 half_ulps)
+                                    || (*isFloatResultSubnormalPtr)(
+                                        fp_correct2n, half_ulps))
+                                {
+                                    fail = fail
+                                        && !(HTF(test1[j]) == 0.0f
+                                             && HTF(test2[j]) == 0.0f);
+                                    if (!fail) err = err2 = 0.0f;
+                                }
+                                else
+                                {
+                                    fail = fail
+                                        && !(HTF(test1[j]) == 0.0f
+                                             && fabsf(err2) <= half_ulps);
+                                    if (!fail) err = 0.0f;
+                                }
+                            }
+                            else if ((*isFloatResultSubnormalPtr)(fp_correct2p,
+                                                                  half_ulps)
+                                     || (*isFloatResultSubnormalPtr)(
+                                         fp_correct2n, half_ulps))
+                            {
+                                fail = fail
+                                    && !(HTF(test2[j]) == 0.0f
+                                         && (fabsf(err) <= half_ulps));
+                                if (!fail) err2 = 0.0f;
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError0)
+                    {
+                        maxError0 = fabsf(err);
+                        maxErrorVal0 = HTF(pIn[j]);
+                    }
+                    if (fabsf(err2) > maxError1)
+                    {
+                        maxError1 = fabsf(err2);
+                        maxErrorVal1 = HTF(pIn[j]);
+                    }
+                    if (fail)
+                    {
+                        vlog_error("\nERROR: %s%s: {%f, %f} ulp error at %a: "
+                                   "*{%a, %a} vs. {%a, %a}\n",
+                                   f->name, sizeNames[k], err, err2,
+                                   HTF(pIn[j]), HTF(ref1[j]), HTF(ref2[j]),
+                                   HTF(test1[j]), HTF(test2[j]));
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
+             maxErrorVal1);
+    }
+
+    vlog("\n");
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
new file mode 100644
index 0000000000..241377ddac
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
@@ -0,0 +1,368 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cinttypes>
+#include <climits>
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    // Generator for BuildKernels: unary kernel typed Half, Int, Half.
+    auto make_source = [](const std::string &name, const char *fn,
+                          cl_uint size_index) {
+        return GetUnaryKernel(name, fn, ParameterType::Half,
+                              ParameterType::Int, ParameterType::Half,
+                              size_index);
+    };
+    return BuildKernels(*(BuildKernelInfo *)p, job_id, make_source);
+}
+
+cl_ulong abs_cl_long(cl_long i)
+{
+    // |i| as unsigned; negating in unsigned space keeps CL_LONG_MIN defined.
+    return i < 0 ? 0 - (cl_ulong)i : (cl_ulong)i;
+}
+
+} // anonymous namespace
+
+// Brute-force test for half builtins that produce a half result plus an int
+// result (host reference: f->func.f_fpI), checked for every vector size.
+// Returns CL_SUCCESS, the error of a failing call, or -1 on a wrong result.
+int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+    Programs programs;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    KernelMatrix kernels;
+    float maxError = 0.0f;
+    int64_t maxError2 = 0;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+
+    // sizeof(cl_half) < sizeof(int32_t): the element count is derived from the
+    // wider int32_t so that the int results written to gOut_Ref2 do not
+    // overflow it.
+    constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(int32_t);
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    const float half_ulps = f->half_ulps;
+
+    // The int result must match exactly unless the ULP budget is infinite.
+    const cl_ulong maxiError = half_ulps == INFINITY ? CL_ULONG_MAX : 0;
+
+    // Init the kernels
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
+                                relaxedMode };
+    if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+        return error;
+
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array: sign x exponent field 1..15 x mantissa 1..0x1ff
+        cl_half *pIn = (cl_half *)gIn;
+
+        {
+            const unsigned m_size = 0x1ff;
+            const unsigned e_size = 0xf;
+            const unsigned s_size = 0x2;
+            const unsigned sclamp = 0xffff;
+
+            for (size_t j = 0; j < half_buffer_size; j++)
+            {
+                unsigned ind = j % (s_size * e_size * m_size);
+                unsigned val = (((ind / (e_size * m_size)) << 15)
+                                | (((ind / m_size) % e_size + 1) << 10)
+                                | (ind % m_size + 1))
+                    & sclamp;
+                pIn[j] = val;
+            }
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        // Write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xffffdead;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+
+                memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut2[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            // As for half_buffer_size, size the launch by the wider int32_t.
+            size_t vectorSize = sizeValues[j] * sizeof(int32_t);
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error =
+                     clSetKernelArg(kernels[j][thread_id], 1,
+                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue)))
+        {
+            vlog_error("clFlush failed\n");
+            return error;
+        }
+
+        // Calculate the correctly rounded reference result
+        cl_half *ref1 = (cl_half *)gOut_Ref;
+        int32_t *ref2 = (int32_t *)gOut_Ref2;
+        for (size_t j = 0; j < half_buffer_size; j++)
+            ref1[j] = HFF((float)f->func.f_fpI(HTF(pIn[j]), ref2 + j));
+
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        for (size_t j = 0; j < half_buffer_size; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                cl_half *test1 = (cl_half *)(gOut[k]);
+                int32_t *test2 = (int32_t *)(gOut2[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (ref1[j] != test1[j] || ref2[j] != test2[j])
+                {
+                    cl_half test = ((cl_half *)test1)[j];
+                    int correct2 = INT_MIN;
+                    float fp_correct =
+                        (float)f->func.f_fpI(HTF(pIn[j]), &correct2);
+                    cl_half correct = HFF(fp_correct);
+                    float err = correct != test
+                        ? Ulp_Error_Half(test, fp_correct)
+                        : 0.f;
+                    cl_long iErr = (int64_t)test2[j] - (int64_t)correct2;
+                    int fail = !(fabsf(err) <= half_ulps
+                                 && abs_cl_long(iErr) <= maxiError);
+                    if (ftz || relaxedMode)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsFloatResultSubnormal(fp_correct, half_ulps))
+                        {
+                            // Decode test so that -0 (0x8000) counts as zero.
+                            fail = fail && !(HTF(test) == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsHalfSubnormal(pIn[j]))
+                        {
+                            int correct5, correct6;
+                            double fp_correct3 = f->func.f_fpI(0.0, &correct5);
+                            double fp_correct4 = f->func.f_fpI(-0.0, &correct6);
+
+                            float err2 = Ulp_Error_Half(test, fp_correct3);
+                            float err3 = Ulp_Error_Half(test, fp_correct4);
+
+                            cl_long iErr2 = (cl_long)test2[j] - correct5;
+                            cl_long iErr3 = (cl_long)test2[j] - correct6;
+
+                            // Did +0 work?
+                            if (fabsf(err2) <= half_ulps
+                                && abs_cl_long(iErr2) <= maxiError)
+                            {
+                                err = err2;
+                                iErr = iErr2;
+                                fail = 0;
+                            }
+                            // Did -0 work?
+                            else if (fabsf(err3) <= half_ulps
+                                     && abs_cl_long(iErr3) <= maxiError)
+                            {
+                                err = err3;
+                                iErr = iErr3;
+                                fail = 0;
+                            }
+
+                            // retry per section 6.5.3.4
+                            if (fail
+                                && (IsFloatResultSubnormal(fp_correct3,
+                                                           half_ulps)
+                                    || IsFloatResultSubnormal(fp_correct4,
+                                                              half_ulps)))
+                            {
+                                fail = fail
+                                    && !(HTF(test) == 0.0f
+                                         && (abs_cl_long(iErr2) <= maxiError
+                                             || abs_cl_long(iErr3)
+                                                 <= maxiError));
+                                if (!fail)
+                                {
+                                    err = 0.0f;
+                                    iErr = 0;
+                                }
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = HTF(pIn[j]);
+                    }
+                    if (llabs(iErr) > maxError2)
+                    {
+                        maxError2 = llabs(iErr);
+                        maxErrorVal2 = HTF(pIn[j]);
+                    }
+
+                    if (fail)
+                    {
+                        vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: "
+                                   "*{%a, %d} vs. {%a, %d}\n",
+                                   f->name, sizeNames[k], err, (int)iErr,
+                                   HTF(pIn[j]), HTF(ref1[j]),
+                                   ((int *)gOut_Ref2)[j], HTF(test), test2[j]);
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2,
+             maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index 2d04eb4a79..c21f88c4cb 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -72,6 +72,11 @@ extern cl_device_fp_config gFloatCapabilities;
 extern cl_device_fp_config gHalfCapabilities;
 extern RoundingMode gFloatToHalfRoundingMode;
 
+extern cl_half_rounding_mode gHalfRoundingMode; // rounding mode used by HFF
+
+#define HFF(num) cl_half_from_float(num, gHalfRoundingMode) // float -> cl_half
+#define HTF(num) cl_half_to_float(num)                      // cl_half -> float
+
 #define LOWER_IS_BETTER 0
 #define HIGHER_IS_BETTER 1
 
@@ -166,6 +171,26 @@ inline int IsFloatNaN(double x)
     return ((u.u & 0x7fffffffU) > 0x7F800000U);
 }
 
+inline bool IsHalfNaN(const cl_half v)
+{
+    // Drop the sign bit; a NaN has all exponent bits set and a nonzero
+    // mantissa, i.e. a magnitude strictly above the infinity pattern.
+    constexpr uint16_t magnitude_mask = 0x7FFF;
+    constexpr uint16_t infinity_bits = 0x1F << (CL_HALF_MANT_DIG - 1);
+
+    return (v & magnitude_mask) > infinity_bits;
+}
+
+inline bool IsHalfInfinity(const cl_half v)
+{
+    // Drop the sign bit; +/-infinity is exactly the pattern with all exponent
+    // bits set and a zero mantissa.
+    constexpr uint16_t magnitude_mask = 0x7FFF;
+    constexpr uint16_t infinity_bits = 0x1F << (CL_HALF_MANT_DIG - 1);
+
+    return (v & magnitude_mask) == infinity_bits;
+}
+
 cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
 
 // Windows (since long double got deprecated) sets the x87 to 53-bit precision

From 70cef0c88120f52a9477b20b0f91bcc9bbaebbeb Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 9 May 2023 12:50:01 +0200
Subject: [PATCH 08/24] Added missing condition due to vendor's review

---
 test_conformance/math_brute_force/macro_unary_half.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp
index 36d3996efe..3d59380206 100644
--- a/test_conformance/math_brute_force/macro_unary_half.cpp
+++ b/test_conformance/math_brute_force/macro_unary_half.cpp
@@ -107,7 +107,8 @@ int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode)
     }
 
     test_info.f = f;
-    test_info.ftz = f->ftz || gForceFTZ;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
 
     test_info.tinfo.resize(test_info.threadCount);
 

From 4e9938e2e2767555dc84b2b1189724a0eaef1653 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Wed, 10 May 2023 14:15:55 +0200
Subject: [PATCH 09/24] code format correction

---
 test_conformance/math_brute_force/common.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test_conformance/math_brute_force/common.cpp b/test_conformance/math_brute_force/common.cpp
index 39b4c950a1..cd2efc7477 100644
--- a/test_conformance/math_brute_force/common.cpp
+++ b/test_conformance/math_brute_force/common.cpp
@@ -100,7 +100,6 @@ void EmitEnableExtension(std::ostringstream &kernel,
 
     if (needsFp64) kernel << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
     if (needsFp16) kernel << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
-
 }
 
 std::string GetBuildOptions(bool relaxed_mode)

From b9ae99b0a514caae20f9de1de4e821e94e299904 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Fri, 2 Jun 2023 13:32:33 +0200
Subject: [PATCH 10/24] Added check for lack of support for denormals in
 binary_half scenario

---
 .../math_brute_force/binary_half.cpp          | 131 ++++++++++++------
 .../math_brute_force/reference_math.cpp       |  15 +-
 .../math_brute_force/reference_math.h         |   2 +-
 3 files changed, 106 insertions(+), 42 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp
index 1aeb36afff..bf165542b3 100644
--- a/test_conformance/math_brute_force/binary_half.cpp
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -501,6 +501,13 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                     if (IsHalfSubnormal(
                             cl_half_from_float(correct, CL_HALF_RTE)))
                     {
+                        if (isNextafter)
+                        {
+                            correct = reference_nextafterh(s[j], s2[j], false);
+                            err = Ulp_Error_Half(q[j], correct);
+                            fail = !(fabsf(err) <= ulps);
+                        }
+
                         fail = fail && (test != 0.0f);
                         if (!fail) err = 0.0f;
                     }
@@ -510,13 +517,15 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                         double correct2, correct3;
                         float err2, err3;
                         if (isNextafter)
+                        {
                             correct2 = reference_nextafterh(0.0, s2[j]);
-                        else
-                            correct2 = ref_func(0.0, s2[j]);
-                        if (isNextafter)
                             correct3 = reference_nextafterh(-0.0, s2[j]);
+                        }
                         else
+                        {
+                            correct2 = ref_func(0.0, s2[j]);
                             correct3 = ref_func(-0.0, s2[j]);
+                        }
                         if (skipNanInf)
                         {
                             // Note: no double rounding here.  Reference
@@ -528,11 +537,14 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                 continue;
                         }
 
-                        err2 = Ulp_Error_Half(q[j], correct2);
-                        err3 = Ulp_Error_Half(q[j], correct3);
-                        fail = fail
-                            && ((!(fabsf(err2) <= ulps))
-                                && (!(fabsf(err3) <= ulps)));
+                        auto check_error = [&]() {
+                            err2 = Ulp_Error_Half(q[j], correct2);
+                            err3 = Ulp_Error_Half(q[j], correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                        };
+                        check_error();
                         if (fabsf(err2) < fabsf(err)) err = err2;
                         if (fabsf(err3) < fabsf(err)) err = err3;
 
@@ -542,6 +554,15 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                             || IsHalfSubnormal(
                                 cl_half_from_float(correct3, CL_HALF_RTE)))
                         {
+                            if (fail && isNextafter)
+                            {
+                                correct2 =
+                                    reference_nextafterh(0.0, s2[j], false);
+                                correct3 =
+                                    reference_nextafterh(-0.0, s2[j], false);
+                                check_error();
+                            }
+
                             fail = fail && (test != 0.0f);
                             if (!fail) err = 0.0f;
                         }
@@ -563,21 +584,19 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                             float err4, err5;
 
                             if (isNextafter)
+                            {
                                 correct2 = reference_nextafterh(0.0, 0.0);
-                            else
-                                correct2 = ref_func(0.0, 0.0);
-                            if (isNextafter)
                                 correct3 = reference_nextafterh(-0.0, 0.0);
-                            else
-                                correct3 = ref_func(-0.0, 0.0);
-                            if (isNextafter)
                                 correct4 = reference_nextafterh(0.0, -0.0);
-                            else
-                                correct4 = ref_func(0.0, -0.0);
-                            if (isNextafter)
                                 correct5 = reference_nextafterh(-0.0, -0.0);
+                            }
                             else
+                            {
+                                correct2 = ref_func(0.0, 0.0);
+                                correct3 = ref_func(-0.0, 0.0);
+                                correct4 = ref_func(0.0, -0.0);
                                 correct5 = ref_func(-0.0, -0.0);
+                            }
 
                             // Per section 10 paragraph 6, accept any result if
                             // an input or output is a infinity or NaN or
@@ -596,19 +615,23 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                     || IsFloatNaN(correct5))
                                     continue;
                             }
-                            err2 = Ulp_Error_Half(q[j], correct2);
-                            err3 = Ulp_Error_Half(q[j], correct3);
-                            err4 = Ulp_Error_Half(q[j], correct4);
-                            err5 = Ulp_Error_Half(q[j], correct5);
-                            fail = fail
-                                && ((!(fabsf(err2) <= ulps))
-                                    && (!(fabsf(err3) <= ulps))
-                                    && (!(fabsf(err4) <= ulps))
-                                    && (!(fabsf(err5) <= ulps)));
-                            if (fabsf(err2) < fabsf(err)) err = err2;
-                            if (fabsf(err3) < fabsf(err)) err = err3;
-                            if (fabsf(err4) < fabsf(err)) err = err4;
-                            if (fabsf(err5) < fabsf(err)) err = err5;
+
+                            auto check_error4 = [&]() {
+                                err2 = Ulp_Error_Half(q[j], correct2);
+                                err3 = Ulp_Error_Half(q[j], correct3);
+                                err4 = Ulp_Error_Half(q[j], correct4);
+                                err5 = Ulp_Error_Half(q[j], correct5);
+                                fail = fail
+                                    && ((!(fabsf(err2) <= ulps))
+                                        && (!(fabsf(err3) <= ulps))
+                                        && (!(fabsf(err4) <= ulps))
+                                        && (!(fabsf(err5) <= ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+                            };
+                            check_error4();
 
                             // retry per section 6.5.3.4
                             if (IsHalfSubnormal(
@@ -620,6 +643,19 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                 || IsHalfSubnormal(
                                     cl_half_from_float(correct5, CL_HALF_RTE)))
                             {
+                                if (fail && isNextafter)
+                                {
+                                    correct2 =
+                                        reference_nextafterh(0.0, 0.0, false);
+                                    correct3 =
+                                        reference_nextafterh(-0.0, 0.0, false);
+                                    correct4 =
+                                        reference_nextafterh(0.0, -0.0, false);
+                                    correct5 =
+                                        reference_nextafterh(-0.0, -0.0, false);
+                                    check_error4();
+                                }
+
                                 fail = fail && (test != 0.0f);
                                 if (!fail) err = 0.0f;
                             }
@@ -641,13 +677,16 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                         float err2, err3;
 
                         if (isNextafter)
+                        {
                             correct2 = reference_nextafterh(s[j], 0.0);
-                        else
-                            correct2 = ref_func(s[j], 0.0);
-                        if (isNextafter)
                             correct3 = reference_nextafterh(s[j], -0.0);
+                        }
                         else
+                        {
+                            correct2 = ref_func(s[j], 0.0);
                             correct3 = ref_func(s[j], -0.0);
+                        }
+
                         if (skipNanInf)
                         {
                             // Note: no double rounding here.  Reference
@@ -658,13 +697,16 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                 continue;
                         }
 
-                        err2 = Ulp_Error_Half(q[j], correct2);
-                        err3 = Ulp_Error_Half(q[j], correct3);
-                        fail = fail
-                            && ((!(fabsf(err2) <= ulps))
-                                && (!(fabsf(err3) <= ulps)));
-                        if (fabsf(err2) < fabsf(err)) err = err2;
-                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        auto check_error = [&]() {
+                            err2 = Ulp_Error_Half(q[j], correct2);
+                            err3 = Ulp_Error_Half(q[j], correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                        };
+                        check_error();
 
                         // retry per section 6.5.3.4
                         if (IsHalfSubnormal(
@@ -672,6 +714,15 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                             || IsHalfSubnormal(
                                 cl_half_from_float(correct3, CL_HALF_RTE)))
                         {
+                            if (fail && isNextafter)
+                            {
+                                correct2 =
+                                    reference_nextafterh(s[j], 0.0, false);
+                                correct3 =
+                                    reference_nextafterh(s[j], -0.0, false);
+                                check_error();
+                            }
+
                             fail = fail && (test != 0.0f);
                             if (!fail) err = 0.0f;
                         }
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index 5ba8bfb859..7fa0c54a32 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -4708,7 +4708,7 @@ cl_half reference_nanh(cl_ushort x)
     return h;
 }
 
-float reference_nextafterh(float xx, float yy)
+float reference_nextafterh(float xx, float yy, bool allow_denorms)
 {
     cl_half tmp_a = cl_half_from_float(xx, CL_HALF_RTE);
     cl_half tmp_b = cl_half_from_float(yy, CL_HALF_RTE);
@@ -4731,6 +4731,19 @@ float reference_nextafterh(float xx, float yy)
     a_h += (a_h < b_h) ? 1 : -1;
     a_h = (a_h < 0) ? (cl_short)0x8000 - a_h : a_h;
 
+
+    if (!allow_denorms && IsHalfSubnormal(a_h))
+    {
+        auto sgn = [](float val) { return (0.f < val) - (val < 0.f); };
+
+        bool signs = sgn(xx) == sgn(yy);
+        bool zeros = (fabs(yy) == 0.f) && (fabs(xx) == 0.f);
+        if ((fabs(yy) > fabs(xx) && signs) || (zeros && !signs))
+            a_h = (a_h & 0x8000) ? 0x8400 : 0x0400;
+        else
+            a_h = 0;
+    }
+
     return cl_half_to_float(a_h);
 }
 
diff --git a/test_conformance/math_brute_force/reference_math.h b/test_conformance/math_brute_force/reference_math.h
index b9b2e46957..175ee73120 100644
--- a/test_conformance/math_brute_force/reference_math.h
+++ b/test_conformance/math_brute_force/reference_math.h
@@ -162,7 +162,7 @@ long double reference_fractl(long double, long double*);
 long double reference_fmal(long double, long double, long double);
 long double reference_madl(long double, long double, long double);
 long double reference_nextafterl(long double, long double);
-float reference_nextafterh(float, float);
+float reference_nextafterh(float, float, bool allow_denormals = true);
 cl_half reference_nanh(cl_ushort);
 long double reference_recipl(long double);
 long double reference_rootnl(long double, int);

From 5b313bd94ce316d02e9d885a5729d7c69549b84d Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Mon, 5 Jun 2023 14:13:41 +0200
Subject: [PATCH 11/24] Corrected procedure to compute nextafter cl_half for
 flush-to-zero mode

---
 test_conformance/math_brute_force/reference_math.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index 7fa0c54a32..c31221e3ab 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -4724,6 +4724,7 @@ float reference_nextafterh(float xx, float yy, bool allow_denorms)
 
     short a_h = cl_half_from_float(x, CL_HALF_RTE);
     short b_h = cl_half_from_float(y, CL_HALF_RTE);
+    short oa_h = a_h;
 
     if (a_h & 0x8000) a_h = 0x8000 - a_h;
     if (b_h & 0x8000) b_h = 0x8000 - b_h;
@@ -4731,14 +4732,9 @@ float reference_nextafterh(float xx, float yy, bool allow_denorms)
     a_h += (a_h < b_h) ? 1 : -1;
     a_h = (a_h < 0) ? (cl_short)0x8000 - a_h : a_h;
 
-
     if (!allow_denorms && IsHalfSubnormal(a_h))
     {
-        auto sgn = [](float val) { return (0.f < val) - (val < 0.f); };
-
-        bool signs = sgn(xx) == sgn(yy);
-        bool zeros = (fabs(yy) == 0.f) && (fabs(xx) == 0.f);
-        if ((fabs(yy) > fabs(xx) && signs) || (zeros && !signs))
+        if (cl_half_to_float(0x7fff & oa_h) < cl_half_to_float(0x7fff & a_h))
             a_h = (a_h & 0x8000) ? 0x8400 : 0x0400;
         else
             a_h = 0;

From 0c937a9050b1b80d9631b734026f87819b3f7cdd Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Mon, 5 Jun 2023 14:25:30 +0200
Subject: [PATCH 12/24] Added correction for external check of reference value
 for nextafter test

---
 .../math_brute_force/binary_half.cpp          | 42 ++++++-------------
 1 file changed, 13 insertions(+), 29 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp
index bf165542b3..dffd7d0956 100644
--- a/test_conformance/math_brute_force/binary_half.cpp
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -616,22 +616,19 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                     continue;
                             }
 
-                            auto check_error4 = [&]() {
-                                err2 = Ulp_Error_Half(q[j], correct2);
-                                err3 = Ulp_Error_Half(q[j], correct3);
-                                err4 = Ulp_Error_Half(q[j], correct4);
-                                err5 = Ulp_Error_Half(q[j], correct5);
-                                fail = fail
-                                    && ((!(fabsf(err2) <= ulps))
-                                        && (!(fabsf(err3) <= ulps))
-                                        && (!(fabsf(err4) <= ulps))
-                                        && (!(fabsf(err5) <= ulps)));
-                                if (fabsf(err2) < fabsf(err)) err = err2;
-                                if (fabsf(err3) < fabsf(err)) err = err3;
-                                if (fabsf(err4) < fabsf(err)) err = err4;
-                                if (fabsf(err5) < fabsf(err)) err = err5;
-                            };
-                            check_error4();
+                            err2 = Ulp_Error_Half(q[j], correct2);
+                            err3 = Ulp_Error_Half(q[j], correct3);
+                            err4 = Ulp_Error_Half(q[j], correct4);
+                            err5 = Ulp_Error_Half(q[j], correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
 
                             // retry per section 6.5.3.4
                             if (IsHalfSubnormal(
@@ -643,19 +640,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                                 || IsHalfSubnormal(
                                     cl_half_from_float(correct5, CL_HALF_RTE)))
                             {
-                                if (fail && isNextafter)
-                                {
-                                    correct2 =
-                                        reference_nextafterh(0.0, 0.0, false);
-                                    correct3 =
-                                        reference_nextafterh(-0.0, 0.0, false);
-                                    correct4 =
-                                        reference_nextafterh(0.0, -0.0, false);
-                                    correct5 =
-                                        reference_nextafterh(-0.0, -0.0, false);
-                                    check_error4();
-                                }
-
                                 fail = fail && (test != 0.0f);
                                 if (!fail) err = 0.0f;
                             }

From 867df9f12ffc2235e580ae8f31add4a7dd2237bb Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 13 Jun 2023 09:14:06 +0200
Subject: [PATCH 13/24] Added correction due to code review request

---
 test_conformance/math_brute_force/ternary_half.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp
index 0d8bb8cf2f..3739199ac1 100644
--- a/test_conformance/math_brute_force/ternary_half.cpp
+++ b/test_conformance/math_brute_force/ternary_half.cpp
@@ -262,12 +262,12 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
         if (gSkipCorrectnessTesting) break;
 
         // Verify data
-        uint32_t *t = (uint32_t *)gOut_Ref;
+        uint16_t *t = (uint16_t *)gOut_Ref;
         for (size_t j = 0; j < half_buffer_size; j++)
         {
             for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
-                uint32_t *q = (uint32_t *)(gOut[k]);
+                uint16_t *q = (uint16_t *)(gOut[k]);
 
                 // If we aren't getting the correctly rounded result
                 if (t[j] != q[j])

From 015e3b6b960ef8e540d95db2e42adc1e04c0a505 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Thu, 22 Jun 2023 11:42:48 +0200
Subject: [PATCH 14/24] Changed number of test jobs for half in unary and
 macro_unary procedures to span the 16-bit input range

---
 test_conformance/math_brute_force/macro_unary_half.cpp | 4 +++-
 test_conformance/math_brute_force/unary_half.cpp       | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp
index 3d59380206..755b772cd6 100644
--- a/test_conformance/math_brute_force/macro_unary_half.cpp
+++ b/test_conformance/math_brute_force/macro_unary_half.cpp
@@ -103,7 +103,9 @@ int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode)
     }
     else
     {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+        test_info.jobCount =
+            std::max((cl_uint)1,
+                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
     }
 
     test_info.f = f;
diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp
index f5de28d0d5..5b0eab4c63 100644
--- a/test_conformance/math_brute_force/unary_half.cpp
+++ b/test_conformance/math_brute_force/unary_half.cpp
@@ -113,7 +113,9 @@ int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode)
     }
     else
     {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+        test_info.jobCount =
+            std::max((cl_uint)1,
+                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
     }
 
     test_info.f = f;

From 1122f310ed32ce102947e89398b75de9751b9e72 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Wed, 8 Nov 2023 23:11:43 +0100
Subject: [PATCH 15/24] Added corrections related to code review:

-added binary_operator_half.cpp and binary_two_results_i_half.cpp
-fixed address sanitizer errors
-extended list of special half values
-removed unnecessary relaxed math references in half tests
-corrected conditions to verify ulp narrowing of computation results
-several refactoring and cosmetic corrections
---
 .../math_brute_force/CMakeLists.txt           |   2 +
 .../math_brute_force/binary_half.cpp          | 376 +++++-----
 .../math_brute_force/binary_i_half.cpp        | 334 +++++----
 .../math_brute_force/binary_operator_half.cpp | 663 ++++++++++++++++++
 .../binary_two_results_i_half.cpp             | 485 +++++++++++++
 .../math_brute_force/function_list.cpp        |   4 +-
 .../math_brute_force/i_unary_half.cpp         |   8 +-
 .../math_brute_force/macro_binary_half.cpp    | 272 ++++---
 .../math_brute_force/macro_unary_half.cpp     | 221 +++---
 .../math_brute_force/mad_half.cpp             |  10 +-
 .../math_brute_force/ternary_half.cpp         |  24 +-
 .../math_brute_force/test_functions.h         |   6 +
 .../math_brute_force/unary_half.cpp           | 303 ++++----
 .../unary_two_results_half.cpp                | 124 +---
 .../unary_two_results_i_half.cpp              |  29 +-
 .../math_brute_force/unary_u_half.cpp         |  17 +-
 test_conformance/math_brute_force/utility.h   |   6 +
 17 files changed, 1962 insertions(+), 922 deletions(-)
 create mode 100644 test_conformance/math_brute_force/binary_operator_half.cpp
 create mode 100644 test_conformance/math_brute_force/binary_two_results_i_half.cpp

diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index f0fca7b4fd..d53911e433 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -9,8 +9,10 @@ set(${MODULE_NAME}_SOURCES
     binary_i_half.cpp
     binary_operator_double.cpp
     binary_operator_float.cpp
+    binary_operator_half.cpp
     binary_two_results_i_double.cpp
     binary_two_results_i_float.cpp
+    binary_two_results_i_half.cpp
     common.cpp
     common.h
     function_list.cpp
diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp
index dffd7d0956..4b495c9532 100644
--- a/test_conformance/math_brute_force/binary_half.cpp
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -27,7 +27,6 @@
 
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -40,7 +39,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 // Thread specific data for a worker thread
 struct ThreadInfo
 {
@@ -58,7 +56,6 @@ struct ThreadInfo
         tQueue; // per thread command queue to improve performance
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -76,7 +73,6 @@ struct TestInfoBase
     int isNextafter;
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -92,158 +88,24 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
-}
-
-////////////////////////////////////////////////////////////////////////////////
 // A table of more difficult cases to get right
-static const cl_half specialValuesHalf[] = {
-    0xffff,
-    0x0000,
-    0x0001,
-    0x7c00 /*INFINITY*/,
-    0xfc00 /*-INFINITY*/,
-    0x8000 /*-0*/,
-    0x7bff /*HALF_MAX*/,
-    0x0400 /*HALF_MIN*/
+const cl_half specialValuesHalf[] = {
+    0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
 };
 
-////////////////////////////////////////////////////////////////////////////////
-static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
-
-////////////////////////////////////////////////////////////////////////////////
-int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter,
-                                   bool relaxedMode)
-{
-    TestInfoBase test_info_base;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    double maxErrorVal2 = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
-    // Init test_info
-    memset(&test_info_base, 0, sizeof(test_info_base));
-    TestInfo test_info(test_info_base);
-
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_half));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->half_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
-
-    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
-    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
-    test_info.isNextafter = isNextafter;
-
-    test_info.tinfo.resize(test_info.threadCount);
-
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = { i * test_info.subBufferSize
-                                        * sizeof(cl_half),
-                                    test_info.subBufferSize * sizeof(cl_half) };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                return error;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            return error;
-        }
-        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
-    }
+size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
 
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
-                                       test_info.programs, f->nameInCode };
-        error = ThreadPool_Do(BuildKernel_HalfFn,
-                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                              &build_info);
-        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
-    }
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-            }
-        }
-
-        test_error(error, "ThreadPool_Do: TestHalf failed\n");
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
-    }
-
-    vlog("\n");
-
-    return error;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -254,7 +116,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     fptr func = job->f->func;
     int ftz = job->ftz;
     MTdata d = tinfo->d;
-    cl_uint j, k;
     cl_int error;
     const char *name = job->f->name;
 
@@ -264,6 +125,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     cl_ushort *t;
     cl_half *r;
     std::vector<float> s(0), s2(0);
+    cl_uint j = 0;
 
     RoundingMode oldRoundMode;
     cl_int copysign_test = 0;
@@ -352,12 +214,13 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint16_t pattern = 0xdead;
+        uint32_t pattern = 0xACDCACDC;
         memset_pattern4(out[j], &pattern, buffer_size);
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
         {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
             return error;
         }
 
@@ -425,24 +288,24 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     s.resize(buffer_elements);
     s2.resize(buffer_elements);
     for (j = 0; j < buffer_elements; j++)
-        for (j = 0; j < buffer_elements; j++)
-        {
-            s[j] = cl_half_to_float(p[j]);
-            s2[j] = cl_half_to_float(p2[j]);
-            if (isNextafter)
-                r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]),
-                                          CL_HALF_RTE);
-            else
-                r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE);
-        }
+    {
+        s[j] = cl_half_to_float(p[j]);
+        s2[j] = cl_half_to_float(p2[j]);
+        if (isNextafter)
+            r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]),
+                                      CL_HALF_RTE);
+        else
+            r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE);
+    }
 
     if (isFDim && ftz) RestoreFPState(&oldMode);
     // Read the data back -- no need to wait for the first N-1 buffers. This is
     // an in order queue.
-    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
         out[j] = (cl_ushort *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
             buffer_size, 0, NULL, NULL, &error);
         if (error || NULL == out[j])
         {
@@ -452,21 +315,11 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         }
     }
 
-    // Wait for the last buffer
-    out[j] = (cl_ushort *)clEnqueueMapBuffer(
-        tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0, buffer_size,
-        0, NULL, NULL, &error);
-    if (error || NULL == out[j])
-    {
-        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
-        return error;
-    }
-
     // Verify data
 
     for (j = 0; j < buffer_elements; j++)
     {
-        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
         {
             cl_ushort *q = out[k];
 
@@ -498,8 +351,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if (IsHalfSubnormal(
-                            cl_half_from_float(correct, CL_HALF_RTE)))
+                    if (IsHalfResultSubnormal(correct, ulps))
                     {
                         if (isNextafter)
                         {
@@ -549,10 +401,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                         if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if (IsHalfSubnormal(
-                                cl_half_from_float(correct2, CL_HALF_RTE))
-                            || IsHalfSubnormal(
-                                cl_half_from_float(correct3, CL_HALF_RTE)))
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
                         {
                             if (fail && isNextafter)
                             {
@@ -631,14 +481,10 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                             if (fabsf(err5) < fabsf(err)) err = err5;
 
                             // retry per section 6.5.3.4
-                            if (IsHalfSubnormal(
-                                    cl_half_from_float(correct2, CL_HALF_RTE))
-                                || IsHalfSubnormal(
-                                    cl_half_from_float(correct3, CL_HALF_RTE))
-                                || IsHalfSubnormal(
-                                    cl_half_from_float(correct4, CL_HALF_RTE))
-                                || IsHalfSubnormal(
-                                    cl_half_from_float(correct5, CL_HALF_RTE)))
+                            if (IsHalfResultSubnormal(correct2, ulps)
+                                || IsHalfResultSubnormal(correct3, ulps)
+                                || IsHalfResultSubnormal(correct4, ulps)
+                                || IsHalfResultSubnormal(correct5, ulps))
                             {
                                 fail = fail && (test != 0.0f);
                                 if (!fail) err = 0.0f;
@@ -693,10 +539,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                         check_error();
 
                         // retry per section 6.5.3.4
-                        if (IsHalfSubnormal(
-                                cl_half_from_float(correct2, CL_HALF_RTE))
-                            || IsHalfSubnormal(
-                                cl_half_from_float(correct3, CL_HALF_RTE)))
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
                         {
                             if (fail && isNextafter)
                             {
@@ -731,9 +575,9 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 }
                 if (fail)
                 {
-                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), "
-                               "%a (0x%0.4x)}\nExpected: %a  (half 0x%0.4x) "
-                               "\nActual: %a (half 0x%0.4x) at index: %d\n",
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), "
+                               "%a (0x%04x)}\nExpected: %a  (half 0x%04x) "
+                               "\nActual: %a (half 0x%04x) at index: %zu\n",
                                name, sizeNames[k], err, s[j], p[j], s2[j],
                                p2[j], cl_half_to_float(r[j]), r[j], test, q[j],
                                j);
@@ -778,13 +622,143 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     return error;
 }
 
-////////////////////////////////////////////////////////////////////////////////
+} // anonymous namespace
+
+int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter,
+                                   bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
+    test_info.isNextafter = isNextafter;
+
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error(
+                    "Error: Unable to create sub-buffer of gOutBuffer[%d] "
+                    "for region {%zd, %zd}\n",
+                    (int)j, region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return error;
+}
+
 int TestFunc_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     return TestFunc_Half_Half_Half_common(f, d, 0, relaxedMode);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata d, bool relaxedMode)
 {
     return TestFunc_Half_Half_Half_common(f, d, 1, relaxedMode);
diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp
index 571683e5da..dcfd285515 100644
--- a/test_conformance/math_brute_force/binary_i_half.cpp
+++ b/test_conformance/math_brute_force/binary_i_half.cpp
@@ -24,7 +24,6 @@
 
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -37,7 +36,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -54,7 +52,6 @@ typedef struct ThreadInfo
         tQueue; // per thread command queue to improve performance
 } ThreadInfo;
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -68,7 +65,6 @@ struct TestInfoBase
     int ftz; // non-zero if running in flush to zero mode
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -84,168 +80,29 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
 // A table of more difficult cases to get right
-static const cl_half specialValuesHalf[] = {
-    0xffff,
-    0x0000,
-    0x0001,
-    0x7c00 /*INFINITY*/,
-    0xfc00 /*-INFINITY*/,
-    0x8000 /*-0*/,
-    0x7bff /*HALF_MAX*/,
-    0x0400 /*HALF_MIN*/
+const cl_half specialValuesHalf[] = {
+    0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
 };
 
-static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
 
-static const int specialValuesInt3[] = { 0,       1,     2,      3,
-                                         1022,    1023,  1024,   INT_MIN,
-                                         INT_MAX, -1,    -2,     -3,
-                                         -1022,   -1023, -11024, -INT_MAX };
-static size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3);
+const int specialValuesInt3[] = { 0,     1,       2,       3,       1022, 1023,
+                                  1024,  INT_MIN, INT_MAX, -1,      -2,   -3,
+                                  -1022, -1023,   -11024,  -INT_MAX };
+size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3);
 
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
-
-int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfoBase test_info_base;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-    cl_int maxErrorVal2 = 0;
-
-    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
-
-    // Init test_info
-    memset(&test_info_base, 0, sizeof(test_info_base));
-    TestInfo test_info(test_info_base);
-
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_half));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->half_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
-
-    test_info.tinfo.resize(test_info.threadCount);
-
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = { i * test_info.subBufferSize
-                                        * sizeof(cl_half),
-                                    test_info.subBufferSize * sizeof(cl_half) };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-        cl_buffer_region region2 = { i * test_info.subBufferSize
-                                         * sizeof(cl_int),
-                                     test_info.subBufferSize * sizeof(cl_int) };
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                return error;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            return error;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
-                                       test_info.programs, f->nameInCode };
-        error = ThreadPool_Do(BuildKernel_HalfFn,
-                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                              &build_info);
-        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
-    }
-
-    // Run the kernels
-    if (!gSkipCorrectnessTesting)
-        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
-
-
-    // Accumulate the arithmetic errors
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        if (test_info.tinfo[i].maxError > maxError)
-        {
-            maxError = test_info.tinfo[i].maxError;
-            maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
-        }
-    }
-
-    test_error(error, "ThreadPool_Do: TestHalf failed\n");
-
-    if (!gSkipCorrectnessTesting)
-    {
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-
-        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
-    }
-
-    vlog("\n");
-
-    return error;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -348,7 +205,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint16_t pattern = 0xdead;
+        uint32_t pattern = 0xACDCACDC;
         memset_pattern4(out[j], &pattern, buffer_elements * sizeof(cl_half));
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
@@ -404,7 +261,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     for (j = 0; j < buffer_elements; j++)
     {
         s[j] = cl_half_to_float(p[j]);
-        r[j] = cl_half_from_float(func.f_fi(s[j], s2[j]), CL_HALF_RTE);
+        r[j] = HFF(func.f_fi(s[j], s2[j]));
     }
 
     // Read the data back -- no need to wait for the first N-1 buffers. This is
@@ -450,8 +307,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 if (fail && ftz)
                 {
                     // retry per section 6.5.3.2
-                    if (IsHalfSubnormal(
-                            cl_half_from_float(correct, CL_HALF_RTE)))
+                    if (IsHalfResultSubnormal(correct, ulps))
                     {
                         fail = fail && (test != 0.0f);
                         if (!fail) err = 0.0f;
@@ -473,10 +329,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                         if (fabsf(err3) < fabsf(err)) err = err3;
 
                         // retry per section 6.5.3.4
-                        if (IsHalfSubnormal(
-                                cl_half_from_float(correct2, CL_HALF_RTE))
-                            || IsHalfSubnormal(
-                                cl_half_from_float(correct3, CL_HALF_RTE)))
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
                         {
                             fail = fail && (test != 0.0f);
                             if (!fail) err = 0.0f;
@@ -492,9 +346,9 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 }
                 if (fail)
                 {
-                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%0.4x), "
-                               "%d}\nExpected: %a (half 0x%0.4x) \nActual: %a "
-                               "(half 0x%0.4x) at index: %d\n",
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), "
+                               "%d}\nExpected: %a (half 0x%04x) \nActual: %a "
+                               "(half 0x%04x) at index: %d\n",
                                name, sizeNames[k], err, s[j], p[j], s2[j],
                                cl_half_to_float(r[j]), r[j], test, q[j],
                                (cl_uint)j);
@@ -535,3 +389,139 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     }
     return error;
 }
+
+} // anonymous namespace
+
+int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region2.origin, region2.size);
+            return error;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+
+    // Accumulate the arithmetic errors
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        if (test_info.tinfo[i].maxError > maxError)
+        {
+            maxError = test_info.tinfo[i].maxError;
+            maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+        }
+    }
+
+    test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp
new file mode 100644
index 0000000000..2d31964747
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_operator_half.cpp
@@ -0,0 +1,663 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    BuildKernelInfo &info = *(BuildKernelInfo *)p;
+    auto generator = [](const std::string &kernel_name, const char *builtin,
+                        cl_uint vector_size_index) {
+        return GetBinaryKernel(kernel_name, builtin, ParameterType::Half,
+                               ParameterType::Half, ParameterType::Half,
+                               vector_size_index);
+    };
+    return BuildKernels(info, job_id, generator);
+}
+
+// Thread specific data for a worker thread
+struct ThreadInfo
+{
+    // Input and output buffers for the thread
+    clMemWrapper inBuf;
+    clMemWrapper inBuf2;
+    Buffers outBuf;
+
+    // max error value. Init to 0.
+    float maxError;
+    // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue;
+    // position of the max error value (param 2).  Init to 0.
+    double maxErrorValue2;
+    MTdataHolder d;
+
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+};
+
+struct TestInfo
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+
+    // Programs for various vector sizes.
+    Programs programs;
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    // no special fields
+};
+
+// A table of more difficult cases to get right
+const cl_half specialValuesHalf[] = {
+    0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
+};
+
+constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    TestInfo *job = (TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_half);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
+    float ulps = job->ulps;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_int error;
+
+    const char *name = job->f->name;
+    cl_half *r = 0;
+    std::vector<float> s(0), s2(0);
+    RoundingMode oldRoundMode;
+
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_half *out[VECTOR_SIZE_COUNT];
+
+    // start the map of the output arrays
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ushort *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+            buffer_size, 0, NULL, e + j, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+
+    bool divide = strcmp(name, "divide") == 0;
+
+    // Init input array
+    cl_half *p = (cl_half *)gIn + thread_id * buffer_elements;
+    cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements;
+    cl_uint idx = 0;
+    int totalSpecialValueCount =
+        specialValuesHalfCount * specialValuesHalfCount;
+    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)lastSpecialJobIndex)
+    {
+        // Insert special values
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesHalfCount;
+        y = (job_id * buffer_elements) / specialValuesHalfCount;
+
+        for (; idx < buffer_elements; idx++)
+        {
+            p[idx] = specialValuesHalf[x];
+            p2[idx] = specialValuesHalf[y];
+            if (++x >= specialValuesHalfCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesHalfCount) break;
+            }
+
+            if (divide)
+            {
+                cl_half pj = p[idx] & 0x7fff;
+                cl_half p2j = p2[idx] & 0x7fff;
+                // Replace values outside [2^-7, 2^7] with QNaN
+                if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN
+                if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00;
+            }
+        }
+    }
+
+    // Init any remaining values
+    for (; idx < buffer_elements; idx++)
+    {
+        p[idx] = (cl_half)genrand_int32(d);
+        p2[idx] = (cl_half)genrand_int32(d);
+
+        if (divide)
+        {
+            cl_half pj = p[idx] & 0x7fff;
+            cl_half p2j = p2[idx] & 0x7fff;
+            // Replace values outside [2^-7, 2^7] with QNaN
+            if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN
+            if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00;
+        }
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        // Wait for the map to finish
+        if ((error = clWaitForEvents(1, e + j)))
+        {
+            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+            return error;
+        }
+        if ((error = clReleaseEvent(e[j])))
+        {
+            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+            return error;
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xACDCACDC;
+        memset_pattern4(out[j], &pattern, buffer_size);
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
+                       error);
+            return error;
+        }
+
+        // Run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting)
+    {
+        return CL_SUCCESS;
+    }
+
+    // Calculate the correctly rounded reference result
+    FPU_mode_type oldMode;
+    memset(&oldMode, 0, sizeof(oldMode));
+    if (ftz) ForceFTZ(&oldMode);
+
+    // Set the rounding mode to match the device
+    oldRoundMode = kRoundToNearestEven;
+    if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
+
+    // Calculate the correctly rounded reference result
+    r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
+    s.resize(buffer_elements);
+    s2.resize(buffer_elements);
+
+    for (size_t j = 0; j < buffer_elements; j++)
+    {
+        s[j] = HTF(p[j]);
+        s2[j] = HTF(p2[j]);
+        r[j] = HFF(func.f_ff(s[j], s2[j]));
+    }
+
+    if (ftz) RestoreFPState(&oldMode);
+
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
+        out[j] = (cl_ushort *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Verify data
+
+    for (size_t j = 0; j < buffer_elements; j++)
+    {
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_half *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (r[j] != q[j])
+            {
+                float test = HTF(q[j]);
+                float correct = func.f_ff(s[j], s2[j]);
+
+                // Per section 10 paragraph 6, accept any result if an input or
+                // output is a infinity or NaN or overflow
+                if (!gInfNanSupport)
+                {
+                    // Note: no double rounding here.  Reference functions
+                    // calculate in single precision.
+                    if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                        || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j])
+                        || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
+                        continue;
+                }
+
+                float err = Ulp_Error_Half(q[j], correct);
+
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsHalfResultSubnormal(correct, ulps))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+
+                        correct2 = HTF(func.f_ff(0.0, s2[j]));
+                        correct3 = HTF(func.f_ff(-0.0, s2[j]));
+
+                        // Per section 10 paragraph 6, accept any result if an
+                        // input or output is a infinity or NaN or overflow
+                        if (!gInfNanSupport)
+                        {
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2)
+                                || IsFloatInfinity(correct3)
+                                || IsFloatNaN(correct3))
+                                continue;
+                        }
+
+                        err2 = Ulp_Error_Half(q[j], correct2);
+                        err3 = Ulp_Error_Half(q[j], correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsHalfSubnormal(p2[j]))
+                        {
+                            double correct4, correct5;
+                            float err4, err5;
+
+                            correct2 = HTF(func.f_ff(0.0, 0.0));
+                            correct3 = HTF(func.f_ff(-0.0, 0.0));
+                            correct4 = HTF(func.f_ff(0.0, -0.0));
+                            correct5 = HTF(func.f_ff(-0.0, -0.0));
+
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is a infinity or NaN or
+                            // overflow
+                            if (!gInfNanSupport)
+                            {
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3)
+                                    || IsFloatInfinity(correct4)
+                                    || IsFloatNaN(correct4)
+                                    || IsFloatInfinity(correct5)
+                                    || IsFloatNaN(correct5))
+                                    continue;
+                            }
+
+                            err2 = Ulp_Error_Half(q[j], correct2);
+                            err3 = Ulp_Error_Half(q[j], correct3);
+                            err4 = Ulp_Error_Half(q[j], correct4);
+                            err5 = Ulp_Error_Half(q[j], correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+
+                            // retry per section 6.5.3.4
+                            if (IsHalfResultSubnormal(correct2, ulps)
+                                || IsHalfResultSubnormal(correct3, ulps)
+                                || IsHalfResultSubnormal(correct4, ulps)
+                                || IsHalfResultSubnormal(correct5, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsHalfSubnormal(p2[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+
+                        correct2 = HTF(func.f_ff(s[j], 0.0));
+                        correct3 = HTF(func.f_ff(s[j], -0.0));
+
+                        // Per section 10 paragraph 6, accept any result if an
+                        // input or output is a infinity or NaN or overflow
+                        if (!gInfNanSupport)
+                        {
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                                || IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2))
+                                continue;
+                        }
+
+                        err2 = Ulp_Error_Half(q[j], correct2);
+                        err3 = Ulp_Error_Half(q[j], correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), "
+                               "%a (0x%04x)}\nExpected: %a  (half 0x%04x) "
+                               "\nActual: %a (half 0x%04x) at index: %zu\n",
+                               name, sizeNames[k], err, s[j], p[j], s2[j],
+                               p2[j], HTF(r[j]), r[j], test, q[j], j);
+                    return -1;
+                }
+            }
+        }
+    }
+
+    if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
+
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+    return CL_SUCCESS;
+}
+
+} // anonymous namespace
+
+int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata d, bool relaxedMode)
+{
+    // Brute-force test of the half-precision binary operator described by f:
+    // builds the kernels, runs the TestHalf jobs on the thread pool and logs
+    // the largest error seen. d is only used to draw per-thread generator
+    // values. Returns 0 on success or the error code of the failing step.
+    TestInfo test_info{};
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    // A step that overflowed cl_uint collapses the test into a single job.
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    const bool stepOverflowed =
+        test_info.step / test_info.subBufferSize != test_info.scale;
+    test_info.jobCount =
+        stepOverflowed ? 1 : (cl_uint)((1ULL << 32) / test_info.step);
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    // Every worker thread gets its own sub-buffers, queue and generator.
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size); // both size_t
+            return error;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size); // both size_t
+            return error;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zu, %zu}\n",
+                           (int)j, region.origin, region.size); // size_t pair
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+        // Per-thread generator built from a value drawn from the caller's d.
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+
+    // Init the kernels: one build job per tested vector size index
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+    // Run the kernels unless correctness testing is skipped
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors: keep the worst per-thread maximum
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_two_results_i_half.cpp b/test_conformance/math_brute_force/binary_two_results_i_half.cpp
new file mode 100644
index 0000000000..3900e62d5a
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_two_results_i_half.cpp
@@ -0,0 +1,485 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cinttypes>
+#include <climits>
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    // Build-job callback: p is the BuildKernelInfo handed to ThreadPool_Do and
+    // the lambda returns what GetBinaryKernel produces for this signature.
+    auto source = [](const std::string &name, const char *fn, cl_uint idx) {
+        return GetBinaryKernel(name, fn, ParameterType::Half,
+                               ParameterType::Int, ParameterType::Half,
+                               ParameterType::Half, idx);
+    };
+    return BuildKernels(*static_cast<BuildKernelInfo *>(p), job_id, source);
+}
+
+struct ComputeReferenceInfoF // shared argument block read by ReferenceF jobs
+{
+    const cl_half *x; // first input operands
+    const cl_half *y; // second input operands
+    cl_half *r; // receives the reference half results
+    int32_t *i; // receives the reference integer results
+    double (*f_ffpI)(double, double, int *); // reference function to evaluate
+    cl_uint lim; // total number of elements to process
+    cl_uint count; // number of elements handled by each job
+};
+
+cl_int ReferenceF(cl_uint jid, cl_uint tid UNUSED, void *userInfo)
+{
+    // Thread-pool job jid: computes the reference half result and integer
+    // output for the slice [jid * count, jid * count + count), clipped to lim.
+    ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
+    double (*f)(double, double, int *) = cri->f_ffpI;
+    const cl_uint lim = cri->lim;
+    const cl_uint off = jid * cri->count;
+
+    // A slice starting at or past lim is empty. Returning first keeps the
+    // unsigned "lim - off" below from wrapping and overrunning the buffers.
+    if (off >= lim) return CL_SUCCESS;
+    const cl_uint end = (cri->count < lim - off) ? off + cri->count : lim;
+
+    for (cl_uint j = off; j < end; ++j)
+        cri->r[j] = HFF((float)f((double)HTF(cri->x[j]),
+                                 (double)HTF(cri->y[j]), cri->i + j));
+    return CL_SUCCESS;
+}
+
+} // anonymous namespace
+
+int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    Programs programs;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    KernelMatrix kernels;
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    int64_t maxError2 = 0;
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+
+    // use larger type of output data to prevent overflowing buffer size
+    constexpr size_t buffer_size = BUFFER_SIZE / sizeof(int32_t);
+
+    cl_uint threadCount = GetThreadCount();
+
+    float half_ulps = f->half_ulps;
+
+    int testingRemquo = !strcmp(f->name, "remquo");
+
+    // Init the kernels
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
+    if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+        return error;
+
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_half *p = (cl_half *)gIn;
+        cl_half *p2 = (cl_half *)gIn2;
+        for (size_t j = 0; j < buffer_size; j++)
+        {
+            p[j] = (cl_half)genrand_int32(d);
+            p2[j] = (cl_half)genrand_int32(d);
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          buffer_size * sizeof(cl_half), gIn, 0,
+                                          NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          buffer_size * sizeof(cl_half), gIn2,
+                                          0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+        // Write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+
+                memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut2[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+
+                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 BUFFER_SIZE, 0, NULL, NULL)))
+                {
+                    vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n",
+                               error);
+                    return error;
+                }
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            // align working group size with the bigger output type
+            size_t vectorSize = sizeValues[j] * sizeof(int32_t);
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error =
+                     clSetKernelArg(kernels[j][thread_id], 1,
+                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        if (threadCount > 1)
+        {
+            ComputeReferenceInfoF cri;
+            cri.x = p;
+            cri.y = p2;
+            cri.r = (cl_half *)gOut_Ref;
+            cri.i = (int32_t *)gOut_Ref2;
+            cri.f_ffpI = f->func.f_ffpI;
+            cri.lim = buffer_size;
+            cri.count = (cri.lim + threadCount - 1) / threadCount;
+            ThreadPool_Do(ReferenceF, threadCount, &cri);
+        }
+        else
+        {
+            cl_half *r = (cl_half *)gOut_Ref;
+            int32_t *r2 = (int32_t *)gOut_Ref2;
+            for (size_t j = 0; j < buffer_size; j++)
+                r[j] =
+                    HFF((float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), r2 + j));
+        }
+
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            cl_bool blocking =
+                (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0,
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0,
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        cl_half *t = (cl_half *)gOut_Ref;
+        int32_t *t2 = (int32_t *)gOut_Ref2;
+        for (size_t j = 0; j < buffer_size; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                cl_half *q = (cl_half *)(gOut[k]);
+                int32_t *q2 = (int32_t *)gOut2[k];
+
+                // Check for exact match to correctly rounded result
+                if (t[j] == q[j] && t2[j] == q2[j]) continue;
+
+                // Check for paired NaNs
+                if (IsHalfNaN(t[j]) && IsHalfNaN(q[j]) && t2[j] == q2[j])
+                    continue;
+
+                cl_half test = ((cl_half *)q)[j];
+                int correct2 = INT_MIN;
+                float correct =
+                    (float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), &correct2);
+                float err = Ulp_Error_Half(test, correct);
+                int64_t iErr;
+
+                // in case of remquo, we only care about the sign and last
+                // seven bits of integer as per the spec.
+                if (testingRemquo)
+                    iErr = (long long)(q2[j] & 0x0000007f)
+                        - (long long)(correct2 & 0x0000007f);
+                else
+                    iErr = (long long)q2[j] - (long long)correct2;
+
+                // For remquo, if y = 0, x is infinite, or either is NaN
+                // then the standard either neglects to say what is returned
+                // in iptr or leaves it undefined or implementation defined.
+                int iptrUndefined = IsHalfInfinity(p[j]) || (HTF(p2[j]) == 0.0f)
+                    || IsHalfNaN(p2[j]) || IsHalfNaN(p[j]);
+                if (iptrUndefined) iErr = 0;
+
+                int fail = !(fabsf(err) <= half_ulps && iErr == 0);
+                if (ftz && fail)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsHalfResultSubnormal(correct, half_ulps))
+                    {
+                        fail = fail && !(test == 0.0f && iErr == 0);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        int correct3i, correct4i;
+                        float correct3 =
+                            (float)f->func.f_ffpI(0.0, HTF(p2[j]), &correct3i);
+                        float correct4 =
+                            (float)f->func.f_ffpI(-0.0, HTF(p2[j]), &correct4i);
+                        float err2 = Ulp_Error_Half(test, correct3);
+                        float err3 = Ulp_Error_Half(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= half_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= half_ulps && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, half_ulps)
+                            || IsHalfResultSubnormal(correct3, half_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsHalfSubnormal(p2[j]))
+                        {
+                            int correct7i, correct8i;
+                            correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
+                            correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i);
+                            double correct7 =
+                                f->func.f_ffpI(0.0, -0.0, &correct7i);
+                            double correct8 =
+                                f->func.f_ffpI(-0.0, -0.0, &correct8i);
+                            err2 = Ulp_Error_Half(test, correct3);
+                            err3 = Ulp_Error_Half(test, correct4);
+                            float err4 = Ulp_Error_Half(test, correct7);
+                            float err5 = Ulp_Error_Half(test, correct8);
+                            iErr3 = (long long)q2[j] - (long long)correct3i;
+                            iErr4 = (long long)q2[j] - (long long)correct4i;
+                            int64_t iErr7 =
+                                (long long)q2[j] - (long long)correct7i;
+                            int64_t iErr8 =
+                                (long long)q2[j] - (long long)correct8i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= half_ulps && iErr3 == 0))
+                                    && (!(fabsf(err3) <= half_ulps
+                                          && iErr4 == 0))
+                                    && (!(fabsf(err4) <= half_ulps
+                                          && iErr7 == 0))
+                                    && (!(fabsf(err5) <= half_ulps
+                                          && iErr8 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
+
+                            // retry per section 6.5.3.4
+                            if (IsHalfResultSubnormal(correct3, half_ulps)
+                                || IsHalfResultSubnormal(correct4, half_ulps)
+                                || IsHalfResultSubnormal(correct7, half_ulps)
+                                || IsHalfResultSubnormal(correct8, half_ulps))
+                            {
+                                fail = fail
+                                    && !(test == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0
+                                             || iErr7 == 0 || iErr8 == 0));
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsHalfSubnormal(p2[j]))
+                    {
+                        int correct3i, correct4i;
+                        double correct3 =
+                            f->func.f_ffpI(HTF(p[j]), 0.0, &correct3i);
+                        double correct4 =
+                            f->func.f_ffpI(HTF(p[j]), -0.0, &correct4i);
+                        float err2 = Ulp_Error_Half(test, correct3);
+                        float err3 = Ulp_Error_Half(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= half_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= half_ulps && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, half_ulps)
+                            || IsHalfResultSubnormal(correct3, half_ulps))
+                        {
+                            fail = fail
+                                && !(test == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+                if (fabsf(err) > maxError)
+                {
+                    maxError = fabsf(err);
+                    maxErrorVal = HTF(p[j]);
+                }
+                if (llabs(iErr) > maxError2)
+                {
+                    maxError2 = llabs(iErr);
+                    maxErrorVal2 = HTF(p[j]);
+                }
+
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: {%f, %" PRId64
+                               "} ulp error at {%a, %a} "
+                               "({0x%04x, 0x%04x}): *{%a, %d} ({0x%04x, "
+                               "0x%8.8x}) vs. {%a, %d} ({0x%04x, 0x%8.8x})\n",
+                               f->name, sizeNames[k], err, iErr, HTF(p[j]),
+                               HTF(p2[j]), p[j], p2[j], HTF(t[j]), t2[j], t[j],
+                               t2[j], HTF(test), q2[j], test, q2[j]);
+                    return -1;
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2,
+             maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
index 67ed0d8ac1..b2f3de82ef 100644
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -164,7 +164,7 @@ static constexpr vtbl _binary_operator = {
     "binaryOperator",
     TestFunc_Float_Float_Float_Operator,
     TestFunc_Double_Double_Double_Operator,
-    NULL,
+    TestFunc_Half_Half_Half_Operator,
 };
 
 static constexpr vtbl _binary_i = {
@@ -206,7 +206,7 @@ static constexpr vtbl _binary_two_results_i = {
     "binary_two_results_i",
     TestFunc_FloatI_Float_Float,
     TestFunc_DoubleI_Double_Double,
-    NULL,
+    TestFunc_HalfI_Half_Half,
 };
 
 static constexpr vtbl _mad_tbl = {
diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp
index c78c03a494..ada2aa89a0 100644
--- a/test_conformance/math_brute_force/i_unary_half.cpp
+++ b/test_conformance/math_brute_force/i_unary_half.cpp
@@ -23,7 +23,8 @@
 #include <memory>
 #include <cinttypes>
 
-////////////////////////////////////////////////////////////////////////////////
+namespace {
+
 static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                  void *p)
 {
@@ -36,7 +37,8 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
+} // anonymous namespace
+
 int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
@@ -174,7 +176,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
 
                     uint32_t err = t[j] - q[j];
                     if (q[j] > t[j]) err = q[j] - t[j];
-                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%0.4x): "
+                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%04x): "
                                "*%d vs. %d\n",
                                f->name, sizeNames[k], err, s[j], p[j], t[j],
                                q[j]);
diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp
index 8af034c437..6157a9ebb6 100644
--- a/test_conformance/math_brute_force/macro_binary_half.cpp
+++ b/test_conformance/math_brute_force/macro_binary_half.cpp
@@ -21,10 +21,8 @@
 
 #include <cstring>
 
-
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -37,7 +35,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 struct ThreadInfo
 {
     clMemWrapper inBuf; // input buffer for the thread
@@ -48,7 +45,6 @@ struct ThreadInfo
         tQueue; // per thread command queue to improve performance
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -61,7 +57,6 @@ struct TestInfoBase
     int ftz; // non-zero if running in flush to zero mode
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -77,139 +72,24 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
-}
-
-////////////////////////////////////////////////////////////////////////////////
 // A table of more difficult cases to get right
-static const cl_half specialValuesHalf[] = {
-    0xffff,
-    0x0000,
-    0x0001,
-    0x7c00 /*INFINITY*/,
-    0xfc00 /*-INFINITY*/,
-    0x8000 /*-0*/,
-    0x7bff /*HALF_MAX*/,
-    0x0400 /*HALF_MIN*/
+const cl_half specialValuesHalf[] = {
+    0xffff, 0x0000, 0x0001, 0x7c00, /* NaN, +0, smallest denormal, INFINITY */
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
+};
 
-////////////////////////////////////////////////////////////////////////////////
-static size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
-
-////////////////////////////////////////////////////////////////////////////////
-int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfoBase test_info_base;
-    cl_int error;
-    size_t i, j;
-
-    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
-
-    // Init test_info
-    memset(&test_info_base, 0, sizeof(test_info_base));
-    TestInfo test_info(test_info_base);
-
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_half));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
-    }
-
-    test_info.f = f;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
-
-    test_info.tinfo.resize(test_info.threadCount);
-
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = { i * test_info.subBufferSize
-                                        * sizeof(cl_half),
-                                    test_info.subBufferSize * sizeof(cl_half) };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-        test_info.tinfo[i].inBuf2 =
-            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                return error;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            return error;
-        }
-
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
-                                       test_info.programs, f->nameInCode };
-        error = ThreadPool_Do(BuildKernel_HalfFn,
-                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                              &build_info);
-        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
-
-        test_error(error, "ThreadPool_Do: TestHalf failed\n");
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    vlog("\n");
-
-    return error;
-}
+size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
 
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -310,7 +190,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint16_t pattern = 0xdead;
+        uint32_t pattern = 0xACDCACDC;
         memset_pattern4(out[j], &pattern, buffer_size);
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
@@ -370,7 +250,6 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         r[j] = (short)func.i_ff(s[j], s2[j]);
     }
 
-
     // Read the data back -- no need to wait for the first N-1 buffers. This is
     // an in order queue.
     for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
@@ -437,8 +316,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
             short err = t[j] - q[j];
             if (q[j] > t[j]) err = q[j] - t[j];
             vlog_error(
-                "\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a "
-                "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x (index: %d)\n",
+                "\nERROR: %s: %d ulp error at {%a (0x%04x), %a "
+                "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x (index: %d)\n",
                 name, err, s[j], p[j], s2[j], p2[j], t[j], q[j], j);
             error = -1;
             return error;
@@ -484,8 +363,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
                 cl_ushort err = -t[j] - q[j];
                 if (q[j] > -t[j]) err = q[j] + t[j];
-                vlog_error("\nERROR: %s: %d ulp error at {%a (0x%0.4x), %a "
-                           "(0x%0.4x)}\nExpected: 0x%0.4x \nActual: 0x%0.4x "
+                vlog_error("\nERROR: %s: %d ulp error at {%a (0x%04x), %a "
+                           "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x "
                            "(index: %d)\n",
                            name, err, s[j], p[j], s2[j], p2[j], -t[j], q[j], j);
                 error = -1;
@@ -526,3 +405,116 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     return error;
 }
+
+} // anonymous namespace
+
+// Driver for two-input half macros: creates per-thread sub-buffers and queues,
+// builds the kernels, then runs TestHalf unless gSkipCorrectnessTesting is set.
+int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    // Init test_info from a zero-filled base
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the multiplication overflowed: run everything as a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+
+        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+    }
+
+    // Build the kernels: one build job per tested vector size
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp
index 755b772cd6..ae359b3e57 100644
--- a/test_conformance/math_brute_force/macro_unary_half.cpp
+++ b/test_conformance/math_brute_force/macro_unary_half.cpp
@@ -23,7 +23,6 @@
 
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -35,7 +34,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 // Thread specific data for a worker thread
 struct ThreadInfo
 {
@@ -45,7 +43,6 @@ struct ThreadInfo
         tQueue; // per thread command queue to improve performance
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -57,7 +54,6 @@ struct TestInfoBase
     int ftz; // non-zero if running in flush to zero mode
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -73,114 +69,7 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
-
-////////////////////////////////////////////////////////////////////////////////
-int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfoBase test_info_base;
-    cl_int error;
-    size_t i, j;
-
-    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
-    // Init test_info
-    memset(&test_info_base, 0, sizeof(test_info_base));
-    TestInfo test_info(test_info_base);
-
-    test_info.threadCount = GetThreadCount();
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_half));
-
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount =
-            std::max((cl_uint)1,
-                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
-    }
-
-    test_info.f = f;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
-
-    test_info.tinfo.resize(test_info.threadCount);
-
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = { i * test_info.subBufferSize
-                                        * sizeof(cl_half),
-                                    test_info.subBufferSize * sizeof(cl_half) };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                return error;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            return error;
-        }
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
-                                       test_info.programs, f->nameInCode };
-        error = ThreadPool_Do(BuildKernel_HalfFn,
-                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                              &build_info);
-        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
-
-        test_error(error, "ThreadPool_Do: TestHalf failed\n");
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    vlog("\n");
-
-    return error;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -246,7 +135,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint16_t pattern = 0xdead;
+        uint32_t pattern = 0xACDCACDC;
         memset_pattern4(out[j], &pattern, buffer_size);
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
@@ -353,7 +242,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
             short err = t[j] - q[j];
             if (q[j] > t[j]) err = q[j] - t[j];
-            vlog_error("\nERROR: %s: %d ulp error at %a (0x%0.4x)\nExpected: "
+            vlog_error("\nERROR: %s: %d ulp error at %a (0x%04x)\nExpected: "
                        "%d vs. %d\n",
                        name, err, s[j], p[j], t[j], q[j]);
             error = -1;
@@ -381,7 +270,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 short err = -t[j] - q[j];
                 if (q[j] > -t[j]) err = q[j] + t[j];
                 vlog_error("\nERROR: %s%s: %d ulp error at %a "
-                           "(0x%0.4x)\nExpected: %d \nActual: %d\n",
+                           "(0x%04x)\nExpected: %d \nActual: %d\n",
                            name, sizeNames[k], err, s[j], p[j], -t[j], q[j]);
                 error = -1;
                 return error;
@@ -419,3 +308,105 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     }
     return error;
 }
+
+} // anonymous namespace
+
+// Driver for one-input half macros: creates per-thread sub-buffers and queues,
+// builds the kernels, then runs TestHalf unless gSkipCorrectnessTesting is set.
+int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    // Init test_info from a zero-filled base
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the multiplication overflowed: run everything as a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount =
+            std::max((cl_uint)1,
+                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
+    }
+
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+    }
+
+    // Build the kernels: one build job per tested vector size
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp
index ef6f2b776c..5cb73d4b1f 100644
--- a/test_conformance/math_brute_force/mad_half.cpp
+++ b/test_conformance/math_brute_force/mad_half.cpp
@@ -21,7 +21,8 @@
 
 #include <cstring>
 
-////////////////////////////////////////////////////////////////////////////////
+namespace {
+
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -34,7 +35,8 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
+} // anonymous namespace
+
 int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
@@ -42,7 +44,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
     KernelMatrix kernels;
     const unsigned thread_id = 0; // Test is currently not multithreaded.
     float maxError = 0.0f;
-    //    int ftz = f->ftz || gForceFTZ;
+
     float maxErrorVal = 0.0f;
     float maxErrorVal2 = 0.0f;
     float maxErrorVal3 = 0.0f;
@@ -96,7 +98,7 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
         // write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint16_t pattern = 0xdead;
+            uint32_t pattern = 0xACDCACDC;
             memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp
index 3739199ac1..93dc612f7c 100644
--- a/test_conformance/math_brute_force/ternary_half.cpp
+++ b/test_conformance/math_brute_force/ternary_half.cpp
@@ -41,14 +41,17 @@ cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 
 // A table of more difficult cases to get right
 static const cl_half specialValuesHalf[] = {
-    0xffff,
-    0x0000,
-    0x0001,
-    0x7c00 /*INFINITY*/,
-    0xfc00 /*-INFINITY*/,
-    0x8000 /*-0*/,
-    0x7bff /*HALF_MAX*/,
-    0x0400 /*HALF_MIN*/
+    0xffff, 0x0000, 0x0001, 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
 };
 
 constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
@@ -78,8 +81,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
     logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
 
     // Init the kernels
-    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
-                                relaxedMode };
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
     if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
@@ -294,7 +296,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
                         test != correct ? Ulp_Error_Half(test, ref1) : 0.f;
                     fail = !(fabsf(err) <= half_ulps);
 
-                    if (fail && (ftz || relaxedMode))
+                    if (fail && ftz)
                     {
                         // retry per section 6.5.3.2  with flushing on
                         if (0.0f == test
diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h
index 16f57013ce..16b361d53a 100644
--- a/test_conformance/math_brute_force/test_functions.h
+++ b/test_conformance/math_brute_force/test_functions.h
@@ -87,6 +87,9 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata,
 int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata,
                                            bool relaxedMode);
 
+// half op half
+int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata, bool relaxedMode);
+
 // float foo(float, int)
 int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode);
 
@@ -135,6 +138,9 @@ int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode);
 // double foo(double, double, int*)
 int TestFunc_DoubleI_Double_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(half, half, int*)
+int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode);
+
 // Special handling for mad.
 // float mad(float, float, float)
 int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode);
diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp
index 5b0eab4c63..f6e914c8af 100644
--- a/test_conformance/math_brute_force/unary_half.cpp
+++ b/test_conformance/math_brute_force/unary_half.cpp
@@ -23,7 +23,6 @@
 
 namespace {
 
-////////////////////////////////////////////////////////////////////////////////
 cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
 {
     BuildKernelInfo &info = *(BuildKernelInfo *)p;
@@ -35,7 +34,6 @@ cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
 // Thread specific data for a worker thread
 typedef struct ThreadInfo
 {
@@ -47,7 +45,6 @@ typedef struct ThreadInfo
         tQueue; // per thread command queue to improve performance
 } ThreadInfo;
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfoBase
 {
     size_t subBufferSize; // Size of the sub-buffer in elements
@@ -64,7 +61,6 @@ struct TestInfoBase
     float half_sin_cos_tan_limit;
 };
 
-////////////////////////////////////////////////////////////////////////////////
 struct TestInfo : public TestInfoBase
 {
     TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
@@ -80,147 +76,7 @@ struct TestInfo : public TestInfoBase
     KernelMatrix k;
 };
 
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *p);
-
-////////////////////////////////////////////////////////////////////////////////
-int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode)
-{
-    TestInfoBase test_info_base;
-    cl_int error;
-    size_t i, j;
-    float maxError = 0.0f;
-    double maxErrorVal = 0.0;
-
-    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
-
-    // Init test_info
-    memset(&test_info_base, 0, sizeof(test_info_base));
-    TestInfo test_info(test_info_base);
-
-    test_info.threadCount = GetThreadCount();
-
-    test_info.subBufferSize = BUFFER_SIZE
-        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
-    test_info.scale = getTestScale(sizeof(cl_half));
-    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
-    if (test_info.step / test_info.subBufferSize != test_info.scale)
-    {
-        // there was overflow
-        test_info.jobCount = 1;
-    }
-    else
-    {
-        test_info.jobCount =
-            std::max((cl_uint)1,
-                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
-    }
-
-    test_info.f = f;
-    test_info.ulps = f->half_ulps;
-    test_info.ftz =
-        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
-
-    test_info.tinfo.resize(test_info.threadCount);
-
-    for (i = 0; i < test_info.threadCount; i++)
-    {
-        cl_buffer_region region = { i * test_info.subBufferSize
-                                        * sizeof(cl_half),
-                                    test_info.subBufferSize * sizeof(cl_half) };
-        test_info.tinfo[i].inBuf =
-            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
-                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
-        {
-            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
-                       "region {%zd, %zd}\n",
-                       region.origin, region.size);
-            return error;
-        }
-
-        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
-        {
-            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
-                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
-                &region, &error);
-            if (error || NULL == test_info.tinfo[i].outBuf[j])
-            {
-                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
-                           "for region {%zd, %zd}\n",
-                           region.origin, region.size);
-                return error;
-            }
-        }
-        test_info.tinfo[i].tQueue =
-            clCreateCommandQueue(gContext, gDevice, 0, &error);
-        if (NULL == test_info.tinfo[i].tQueue || error)
-        {
-            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
-            return error;
-        }
-    }
-
-    // Check for special cases for unary float
-    test_info.isRangeLimited = 0;
-    test_info.half_sin_cos_tan_limit = 0;
-    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
-    {
-        test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit = 1.0f
-            + test_info.ulps
-                * (FLT_EPSILON / 2.0f); // out of range results from finite
-                                        // inputs must be in [-1,1]
-    }
-    else if (0 == strcmp(f->name, "half_tan"))
-    {
-        test_info.isRangeLimited = 1;
-        test_info.half_sin_cos_tan_limit =
-            INFINITY; // out of range resut from finite inputs must be numeric
-    }
-
-    // Init the kernels
-    {
-        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
-                                       test_info.programs, f->nameInCode };
-        error = ThreadPool_Do(BuildKernel_HalfFn,
-                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
-                              &build_info);
-        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
-    }
-
-    if (!gSkipCorrectnessTesting)
-    {
-        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
-
-        // Accumulate the arithmetic errors
-        for (i = 0; i < test_info.threadCount; i++)
-        {
-            if (test_info.tinfo[i].maxError > maxError)
-            {
-                maxError = test_info.tinfo[i].maxError;
-                maxErrorVal = test_info.tinfo[i].maxErrorValue;
-            }
-        }
-
-        test_error(error, "ThreadPool_Do: TestHalf failed\n");
-
-        if (gWimpyMode)
-            vlog("Wimp pass");
-        else
-            vlog("passed");
-    }
-
-    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
-    vlog("\n");
-
-    return error;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
     size_t buffer_elements = job->subBufferSize;
@@ -288,7 +144,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint16_t pattern = 0xdead;
+        uint32_t pattern = 0xACDCACDC;
         memset_pattern4(out[j], &pattern, buffer_size);
         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                              out[j], 0, NULL, NULL)))
@@ -333,12 +189,11 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     // Calculate the correctly rounded reference result
     cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
-    cl_ushort *t = (cl_ushort *)r;
     s.resize(buffer_elements);
     for (j = 0; j < buffer_elements; j++)
     {
         s[j] = (float)cl_half_to_float(p[j]);
-        r[j] = cl_half_from_float(func.f_f(s[j]), CL_HALF_RTE);
+        r[j] = HFF(func.f_f(s[j]));
     }
 
     // Read the data back -- no need to wait for the first N-1 buffers. This is
@@ -373,7 +228,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
             cl_ushort *q = out[k];
 
             // If we aren't getting the correctly rounded result
-            if (t[j] != q[j])
+            if (r[j] != q[j])
             {
                 float test = cl_half_to_float(q[j]);
                 double correct = func.f_f(s[j]);
@@ -397,8 +252,7 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                     if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if (IsHalfSubnormal(
-                                cl_half_from_float(correct, CL_HALF_RTE)))
+                        if (IsHalfResultSubnormal(correct, ulps))
                         {
                             fail = fail && (test != 0.0f);
                             if (!fail) err = 0.0f;
@@ -418,10 +272,8 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                             if (fabsf(err3) < fabsf(err)) err = err3;
 
                             // retry per section 6.5.3.4
-                            if (IsHalfSubnormal(
-                                    cl_half_from_float(correct2, CL_HALF_RTE))
-                                || IsHalfSubnormal(
-                                    cl_half_from_float(correct3, CL_HALF_RTE)))
+                            if (IsHalfResultSubnormal(correct2, ulps)
+                                || IsHalfResultSubnormal(correct3, ulps))
                             {
                                 fail = fail && (test != 0.0f);
                                 if (!fail) err = 0.0f;
@@ -437,10 +289,10 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 if (fail)
                 {
                     vlog_error("\nERROR: %s%s: %f ulp error at %a "
-                               "(0x%0.4x)\nExpected: %a (half 0x%0.4x) "
-                               "\nActual: %a (half 0x%0.4x)\n",
+                               "(half 0x%04x)\nExpected: %a (half 0x%04x) "
+                               "\nActual: %a (half 0x%04x)\n",
                                job->f->name, sizeNames[k], err, s[j], p[j],
-                               cl_half_to_float(r[j]), t[j], test, q[j]);
+                               cl_half_to_float(r[j]), r[j], test, q[j]);
                     error = -1;
                     return error;
                 }
@@ -480,3 +332,138 @@ static cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     return error;
 }
+
+} // anonymous namespace
+
+int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount =
+            std::max((cl_uint)1,
+                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+    }
+
+    // Check for special cases for unary float
+    test_info.isRangeLimited = 0;
+    test_info.half_sin_cos_tan_limit = 0;
+    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit = 1.0f
+            + test_info.ulps
+                * (FLT_EPSILON / 2.0f); // out of range results from finite
+                                        // inputs must be in [-1,1]
+    }
+    else if (0 == strcmp(f->name, "half_tan"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit =
+            INFINITY; // out of range result from finite inputs must be numeric
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            }
+        }
+
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp
index 3f8d71168d..18d4dadd0e 100644
--- a/test_conformance/math_brute_force/unary_two_results_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_half.cpp
@@ -62,8 +62,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
     float half_ulps = f->half_ulps;
 
     // Init the kernels
-    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
-                                relaxedMode };
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
     if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
@@ -77,22 +76,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
             const unsigned m_size = 0x1ff;
             const unsigned e_size = 0xf;
             const unsigned s_size = 0x2;
-            const unsigned sclamp = 0xffff;
 
             for (size_t j = 0; j < half_buffer_size; j++)
             {
                 unsigned ind = j % (s_size * e_size * m_size);
                 unsigned val = (((ind / (e_size * m_size)) << 15)
                                 | (((ind / m_size) % e_size + 1) << 10)
-                                | (ind % m_size + 1))
-                    & sclamp;
+                                | (ind % m_size + 1));
                 pIn[j] = val;
-
-                if (relaxedMode && strcmp(f->name, "sincos") == 0)
-                {
-                    float pj = HTF(pIn[j]);
-                    if (fabs(pj) > M_PI) pIn[j] = 0x7e00; // HALF_NAN
-                }
             }
         }
 
@@ -106,7 +97,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
         // Write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint32_t pattern = 0xffffdead;
+            uint32_t pattern = 0xacdcacdc;
             if (gHostFill)
             {
                 memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -200,7 +191,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
         {
             // Calculate the correctly rounded reference result
             memset(&oldMode, 0, sizeof(oldMode));
-            if (ftz || relaxedMode) ForceFTZ(&oldMode);
+            if (ftz) ForceFTZ(&oldMode);
 
             // Set the rounding mode to match the device
             if (gIsInRTZMode)
@@ -218,11 +209,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                 double dd;
                 feclearexcept(FE_OVERFLOW);
 
-                if (relaxedMode)
-                    ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd));
-                else
-                    ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
-
+                ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
                 ref2[j] = HFF((float)dd);
                 overflow[j] =
                     FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
@@ -233,11 +220,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
             for (size_t j = 0; j < half_buffer_size; j++)
             {
                 double dd;
-                if (relaxedMode)
-                    ref1[j] = HFF((float)f->rfunc.f_fpf(HTF(pIn[j]), &dd));
-                else
-                    ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
-
+                ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
                 ref2[j] = HFF((float)dd);
             }
         }
@@ -283,17 +266,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                     double fp_correct1 = 0, fp_correct2 = 0;
                     float err = 0, err2 = 0;
 
-                    if (relaxedMode)
-                        fp_correct1 = f->rfunc.f_fpf(HTF(pIn[j]), &fp_correct2);
-                    else
-                        fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2);
+                    fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2);
 
                     cl_half correct1 = HFF(fp_correct1);
                     cl_half correct2 = HFF(fp_correct2);
 
                     // Per section 10 paragraph 6, accept any result if an input
                     // or output is a infinity or NaN or overflow
-                    if (relaxedMode || skipNanInf)
+                    if (skipNanInf)
                     {
                         if (skipNanInf && overflow[j]) continue;
                         // Note: no double rounding here.  Reference functions
@@ -304,35 +284,18 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                             continue;
                     }
 
-                    // If we are in fast relaxed math, we
-                    // have a different calculation for the
-                    // subnormal threshold.
-                    typedef int (*CheckForSubnormal)(double, float);
-                    CheckForSubnormal isFloatResultSubnormalPtr;
-                    if (relaxedMode)
-                    {
-                        err = Abs_Error(HTF(test1[j]), fp_correct1);
-                        err2 = Abs_Error(HTF(test2[j]), fp_correct2);
-                        isFloatResultSubnormalPtr =
-                            &IsFloatResultSubnormalAbsError;
-                    }
-                    else
-                    {
-                        err = Ulp_Error_Half(test1[j], fp_correct1);
-                        err2 = Ulp_Error_Half(test2[j], fp_correct2);
-                        isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
-                    }
+                    err = Ulp_Error_Half(test1[j], fp_correct1);
+                    err2 = Ulp_Error_Half(test2[j], fp_correct2);
+
                     int fail =
                         !(fabsf(err) <= half_ulps && fabsf(err2) <= half_ulps);
 
-                    if (ftz || relaxedMode)
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if ((*isFloatResultSubnormalPtr)(fp_correct1,
-                                                         half_ulps))
+                        if (IsHalfResultSubnormal(fp_correct1, half_ulps))
                         {
-                            if ((*isFloatResultSubnormalPtr)(fp_correct2,
-                                                             half_ulps))
+                            if (IsHalfResultSubnormal(fp_correct2, half_ulps))
                             {
                                 fail = fail
                                     && !(HTF(test1[j]) == 0.0f
@@ -351,8 +314,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                                 if (!fail) err = 0.0f;
                             }
                         }
-                        else if ((*isFloatResultSubnormalPtr)(fp_correct2,
-                                                              half_ulps))
+                        else if (IsHalfResultSubnormal(fp_correct2, half_ulps))
                         {
                             fail = fail
                                 && !(HTF(test2[j]) == 0.0f
@@ -369,19 +331,8 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                             float errp, err2p, errn, err2n;
 
                             if (skipNanInf) feclearexcept(FE_OVERFLOW);
-                            if (relaxedMode)
-                            {
-                                fp_correctp =
-                                    f->rfunc.f_fpf(0.0, &fp_correct2p);
-                                fp_correctn =
-                                    f->rfunc.f_fpf(-0.0, &fp_correct2n);
-                            }
-                            else
-                            {
-                                fp_correctp = f->func.f_fpf(0.0, &fp_correct2p);
-                                fp_correctn =
-                                    f->func.f_fpf(-0.0, &fp_correct2n);
-                            }
+                            fp_correctp = f->func.f_fpf(0.0, &fp_correct2p);
+                            fp_correctn = f->func.f_fpf(-0.0, &fp_correct2n);
 
                             cl_half correctp = HFF(fp_correctp);
                             cl_half correctn = HFF(fp_correctn);
@@ -408,20 +359,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                                     continue;
                             }
 
-                            if (relaxedMode)
-                            {
-                                errp = Abs_Error(HTF(test1[j]), fp_correctp);
-                                err2p = Abs_Error(HTF(test1[j]), fp_correct2p);
-                                errn = Abs_Error(HTF(test1[j]), fp_correctn);
-                                err2n = Abs_Error(HTF(test1[j]), fp_correct2n);
-                            }
-                            else
-                            {
-                                errp = Ulp_Error_Half(test1[j], fp_correctp);
-                                err2p = Ulp_Error_Half(test1[j], fp_correct2p);
-                                errn = Ulp_Error_Half(test1[j], fp_correctn);
-                                err2n = Ulp_Error_Half(test1[j], fp_correct2n);
-                            }
+                            errp = Ulp_Error_Half(test1[j], fp_correctp);
+                            err2p = Ulp_Error_Half(test1[j], fp_correct2p);
+                            errn = Ulp_Error_Half(test1[j], fp_correctn);
+                            err2n = Ulp_Error_Half(test1[j], fp_correct2n);
 
                             fail = fail
                                 && ((!(fabsf(errp) <= half_ulps))
@@ -434,15 +375,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                             if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
 
                             // retry per section 6.5.3.4
-                            if ((*isFloatResultSubnormalPtr)(fp_correctp,
-                                                             half_ulps)
-                                || (*isFloatResultSubnormalPtr)(fp_correctn,
-                                                                half_ulps))
+                            if (IsHalfResultSubnormal(fp_correctp, half_ulps)
+                                || IsHalfResultSubnormal(fp_correctn,
+                                                         half_ulps))
                             {
-                                if ((*isFloatResultSubnormalPtr)(fp_correct2p,
-                                                                 half_ulps)
-                                    || (*isFloatResultSubnormalPtr)(
-                                        fp_correct2n, half_ulps))
+                                if (IsHalfResultSubnormal(fp_correct2p,
+                                                          half_ulps)
+                                    || IsHalfResultSubnormal(fp_correct2n,
+                                                             half_ulps))
                                 {
                                     fail = fail
                                         && !(HTF(test1[j]) == 0.0f
@@ -457,10 +397,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                                     if (!fail) err = 0.0f;
                                 }
                             }
-                            else if ((*isFloatResultSubnormalPtr)(fp_correct2p,
-                                                                  half_ulps)
-                                     || (*isFloatResultSubnormalPtr)(
-                                         fp_correct2n, half_ulps))
+                            else if (IsHalfResultSubnormal(fp_correct2p,
+                                                           half_ulps)
+                                     || IsHalfResultSubnormal(fp_correct2n,
+                                                              half_ulps))
                             {
                                 fail = fail
                                     && !(HTF(test2[j]) == 0.0f
diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
index 241377ddac..9a769447f6 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
@@ -72,8 +72,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
     maxiError = half_ulps == INFINITY ? CL_ULONG_MAX : 0;
 
     // Init the kernels
-    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
-                                relaxedMode };
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
     if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
                                &build_info)))
@@ -88,15 +87,13 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
             const unsigned m_size = 0x1ff;
             const unsigned e_size = 0xf;
             const unsigned s_size = 0x2;
-            const unsigned sclamp = 0xffff;
 
             for (size_t j = 0; j < half_buffer_size; j++)
             {
                 unsigned ind = j % (s_size * e_size * m_size);
                 unsigned val = (((ind / (e_size * m_size)) << 15)
                                 | (((ind / m_size) % e_size + 1) << 10)
-                                | (ind % m_size + 1))
-                    & sclamp;
+                                | (ind % m_size + 1));
                 pIn[j] = val;
             }
         }
@@ -111,7 +108,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
         // Write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint32_t pattern = 0xffffdead;
+            uint32_t pattern = 0xacdcacdc;
             if (gHostFill)
             {
                 memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -161,9 +158,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
         // Run the kernels
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            // sizeof(cl_half) < sizeof (int32_t)
-            // to prevent overflowing gOut_Ref2 it is necessary to use
-            // bigger type as denominator for buffer size calculation
+            // align working group size with the bigger output type
             size_t vectorSize = sizeValues[j] * sizeof(int32_t);
             size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
             if ((error = clSetKernelArg(kernels[j][thread_id], 0,
@@ -211,15 +206,17 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
         // Read the data back
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
+            cl_bool blocking =
+                (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
             if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0,
                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 return error;
             }
             if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0,
                                          BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray2 failed %d\n", error);
@@ -251,10 +248,10 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
                     cl_long iErr = (int64_t)test2[j] - (int64_t)correct2;
                     int fail = !(fabsf(err) <= half_ulps
                                  && abs_cl_long(iErr) <= maxiError);
-                    if (ftz || relaxedMode)
+                    if (ftz)
                     {
                         // retry per section 6.5.3.2
-                        if (IsFloatResultSubnormal(fp_correct, half_ulps))
+                        if (IsHalfResultSubnormal(fp_correct, half_ulps))
                         {
                             fail = fail && !(test == 0.0f && iErr == 0);
                             if (!fail) err = 0.0f;
@@ -294,9 +291,9 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
 
                             // retry per section 6.5.3.4
                             if (fail
-                                && (IsFloatResultSubnormal(correct2, half_ulps)
-                                    || IsFloatResultSubnormal(fp_correct3,
-                                                              half_ulps)))
+                                && (IsHalfResultSubnormal(correct2, half_ulps)
+                                    || IsHalfResultSubnormal(fp_correct3,
+                                                             half_ulps)))
                             {
                                 fail = fail
                                     && !(test == 0.0f
diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp
index 842e85a9b0..e2ff937051 100644
--- a/test_conformance/math_brute_force/unary_u_half.cpp
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -23,7 +23,8 @@
 #include <cstring>
 #include <cinttypes>
 
-////////////////////////////////////////////////////////////////////////////////
+namespace {
+
 static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
                                  void *p)
 {
@@ -36,7 +37,8 @@ static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
     return BuildKernels(info, job_id, generator);
 }
 
-////////////////////////////////////////////////////////////////////////////////
+} // anonymous namespace
+
 int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
 {
     int error;
@@ -90,7 +92,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
         // write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint16_t pattern = 0xdead;
+            uint32_t pattern = 0xACDCACDC;
             memset_pattern4(gOut[j], &pattern, bufferSize);
             if ((error =
                      clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
@@ -139,7 +141,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
             if (!strcmp(name, "nan"))
                 r[j] = reference_nanh(p[j]);
             else
-                r[j] = cl_half_from_float(f->func.f_u(p[j]), CL_HALF_RTE);
+                r[j] = HFF(f->func.f_u(p[j]));
         }
         // Read the data back
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
@@ -181,8 +183,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
                         if (ftz)
                         {
                             // retry per section 6.5.3.2
-                            if (IsHalfSubnormal(
-                                    cl_half_from_float(correct, CL_HALF_RTE)))
+                            if (IsHalfResultSubnormal(correct, half_ulps))
                             {
                                 fail = fail && (test != 0.0f);
                                 if (!fail) err = 0.0f;
@@ -197,8 +198,8 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
                     if (fail)
                     {
                         vlog_error(
-                            "\n%s%s: %f ulp error at 0x%0.4x \nExpected: %a "
-                            "(0x%0.4x) \nActual: %a (0x%0.4x)\n",
+                            "\n%s%s: %f ulp error at 0x%04x \nExpected: %a "
+                            "(0x%04x) \nActual: %a (0x%04x)\n",
                             f->name, sizeNames[k], err, p[j],
                             cl_half_to_float(r[j]), r[j], test, q[j]);
                         return -1;
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index d11ce6f368..264fc7a435 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -126,6 +126,12 @@ inline int IsFloatResultSubnormal(double x, float ulps)
     return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126);
 }
 
+inline int IsHalfResultSubnormal(float x, float ulps)
+{ // Yields 1 when |x| minus the ulps tolerance is below 2^-14, otherwise 0.
+    x = fabs(x) - MAKE_HEX_FLOAT(0x1.0p-24, 0x1, -24) * ulps; // tolerance: ulps * 2^-24, the smallest binary16 subnormal (assumes MAKE_HEX_FLOAT yields its hex-float argument -- confirm)
+    return x < MAKE_HEX_FLOAT(0x1.0p-14, 0x1, -14); // 2^-14 is the smallest normal binary16 magnitude
+}
+
 inline int IsFloatResultSubnormalAbsError(double x, float abs_err)
 {
     x = x - abs_err;

From 9133686b4a30e0ab64c01cd43b2a5438b1bbc5f9 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Wed, 8 Nov 2023 23:29:17 +0100
Subject: [PATCH 16/24] Print format correction due to failed CI check

---
 test_conformance/math_brute_force/binary_half.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp
index 4b495c9532..fdf54268da 100644
--- a/test_conformance/math_brute_force/binary_half.cpp
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -577,7 +577,7 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 {
                     vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), "
                                "%a (0x%04x)}\nExpected: %a  (half 0x%04x) "
-                               "\nActual: %a (half 0x%04x) at index: %zu\n",
+                               "\nActual: %a (half 0x%04x) at index: %u\n",
                                name, sizeNames[k], err, s[j], p[j], s2[j],
                                p2[j], cl_half_to_float(r[j]), r[j], test, q[j],
                                j);

From 11e45a793f70ae57211a2887c1eadc77a8cc13f4 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Thu, 9 Nov 2023 15:16:04 +0100
Subject: [PATCH 17/24] Corrected bug found in code review (fp16 bruteforce)

---
 test_conformance/math_brute_force/binary_half.cpp       | 2 +-
 test_conformance/math_brute_force/binary_i_half.cpp     | 2 +-
 test_conformance/math_brute_force/macro_binary_half.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp
index fdf54268da..3ddc64b89d 100644
--- a/test_conformance/math_brute_force/binary_half.cpp
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -683,7 +683,7 @@ int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter,
         test_info.tinfo[i].inBuf2 =
             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
                        "region {%zd, %zd}\n",
diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp
index dcfd285515..97692c142a 100644
--- a/test_conformance/math_brute_force/binary_i_half.cpp
+++ b/test_conformance/math_brute_force/binary_i_half.cpp
@@ -450,7 +450,7 @@ int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode)
         test_info.tinfo[i].inBuf2 =
             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
                               CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
                        "region {%zd, %zd}\n",
diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp
index 6157a9ebb6..842ef61f83 100644
--- a/test_conformance/math_brute_force/macro_binary_half.cpp
+++ b/test_conformance/math_brute_force/macro_binary_half.cpp
@@ -460,7 +460,7 @@ int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
         test_info.tinfo[i].inBuf2 =
             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
-        if (error || NULL == test_info.tinfo[i].inBuf)
+        if (error || NULL == test_info.tinfo[i].inBuf2)
         {
             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
                        "region {%zd, %zd}\n",

From b5ed4f081bd09d8e563fde30793fb145f85e42f3 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Fri, 17 Nov 2023 08:36:55 +0100
Subject: [PATCH 18/24] Corrections related to code review (cl_khr_fp16 support
 according to #142)

- added missing gHostFill support
- extended the array of special half values
- cosmetic cleanup and unification
---
 .../math_brute_force/binary_half.cpp          | 75 +++++++++++-------
 .../math_brute_force/binary_i_half.cpp        | 79 ++++++++++++-------
 .../math_brute_force/binary_operator_half.cpp | 73 ++++++++++-------
 .../binary_two_results_i_half.cpp             | 26 +++---
 .../math_brute_force/i_unary_half.cpp         | 32 +++++---
 .../math_brute_force/macro_binary_half.cpp    | 78 +++++++++++-------
 .../math_brute_force/macro_unary_half.cpp     | 67 ++++++++++------
 .../math_brute_force/mad_half.cpp             | 27 +++++--
 .../math_brute_force/ternary_half.cpp         | 35 ++++----
 .../math_brute_force/unary_half.cpp           | 68 +++++++++-------
 .../unary_two_results_half.cpp                | 20 ++---
 .../unary_two_results_i_half.cpp              | 26 +++---
 .../math_brute_force/unary_u_half.cpp         | 29 ++++---
 13 files changed, 375 insertions(+), 260 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp
index 3ddc64b89d..f80a085370 100644
--- a/test_conformance/math_brute_force/binary_half.cpp
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -101,9 +101,14 @@ const cl_half specialValuesHalf[] = {
     0x3555, /*nearest value to 1/3*/
     0x3bff, /*largest number less than one*/
     0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* Largest negative fraction */
 };
 
-size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
 
 cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
@@ -133,21 +138,26 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_ushort *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+
+    if (gHostFill)
     {
-        out[j] = (cl_ushort *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
+        // start the map of the output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
+            out[j] = (cl_ushort *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_size, 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
         }
-    }
 
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
 
     // Init input array
     cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
@@ -200,28 +210,37 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            return error;
-        }
-        if ((error = clReleaseEvent(e[j])))
+        if (gHostFill)
         {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            return error;
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
         }
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint32_t pattern = 0xACDCACDC;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
         {
-            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
-                       error);
-            return error;
+            memset_pattern4(out[j], &pattern, buffer_size);
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                        &pattern, sizeof(pattern), 0,
+                                        buffer_size, 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
         }
 
         // run the kernel
diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp
index 97692c142a..001e2b4f54 100644
--- a/test_conformance/math_brute_force/binary_i_half.cpp
+++ b/test_conformance/math_brute_force/binary_i_half.cpp
@@ -47,7 +47,7 @@ typedef struct ThreadInfo
         maxErrorValue; // position of the max error value (param 1).  Init to 0.
     cl_int maxErrorValue2; // position of the max error value (param 2).  Init
                            // to 0.
-    MTdata d;
+    MTdataHolder d;
     clCommandQueueWrapper
         tQueue; // per thread command queue to improve performance
 } ThreadInfo;
@@ -93,9 +93,14 @@ const cl_half specialValuesHalf[] = {
     0x3555, /*nearest value to 1/3*/
     0x3bff, /*largest number less than one*/
     0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* Largest negative fraction */
 };
 
-size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
 
 const int specialValuesInt3[] = { 0,     1,       2,       3,       1022, 1023,
                                   1024,  INT_MIN, INT_MAX, -1,      -2,   -3,
@@ -123,21 +128,26 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_ushort *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+
+    if (gHostFill)
     {
-        out[j] = (cl_ushort *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_elements * sizeof(cl_ushort), 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
+        // start the map of the output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
+            out[j] = (cl_ushort *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_elements * sizeof(cl_ushort), 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
         }
-    }
 
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
 
     // Init input array
     cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
@@ -191,27 +201,38 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            return error;
-        }
-        if ((error = clReleaseEvent(e[j])))
+        if (gHostFill)
         {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            return error;
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
         }
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint32_t pattern = 0xACDCACDC;
-        memset_pattern4(out[j], &pattern, buffer_elements * sizeof(cl_half));
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
         {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            return error;
+            memset_pattern4(out[j], &pattern,
+                            buffer_elements * sizeof(cl_half));
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], &pattern, sizeof(pattern), 0,
+                buffer_elements * sizeof(cl_half), 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
         }
 
         // run the kernel
@@ -479,7 +500,7 @@ int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode)
             return error;
         }
 
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
     }
 
 
diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp
index 2d31964747..e7f53af871 100644
--- a/test_conformance/math_brute_force/binary_operator_half.cpp
+++ b/test_conformance/math_brute_force/binary_operator_half.cpp
@@ -93,6 +93,11 @@ const cl_half specialValuesHalf[] = {
     0x3555, /*nearest value to 1/3*/
     0x3bff, /*largest number less than one*/
     0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* Largest negative fraction */
 };
 
 constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
@@ -118,22 +123,25 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     cl_event e[VECTOR_SIZE_COUNT];
     cl_half *out[VECTOR_SIZE_COUNT];
 
-    // start the map of the output arrays
-    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    if (gHostFill)
     {
-        out[j] = (cl_ushort *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
+        // start the map of the output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
+            out[j] = (cl_ushort *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_size, 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
         }
-    }
 
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
 
     bool divide = strcmp(name, "divide") == 0;
 
@@ -207,28 +215,37 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            return error;
-        }
-        if ((error = clReleaseEvent(e[j])))
+        if (gHostFill)
         {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            return error;
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
         }
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint32_t pattern = 0xACDCACDC;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
         {
-            vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
-                       error);
-            return error;
+            memset_pattern4(out[j], &pattern, buffer_size);
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                        &pattern, sizeof(pattern), 0,
+                                        buffer_size, 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
         }
 
         // Run the kernel
diff --git a/test_conformance/math_brute_force/binary_two_results_i_half.cpp b/test_conformance/math_brute_force/binary_two_results_i_half.cpp
index 3900e62d5a..bc2519e95b 100644
--- a/test_conformance/math_brute_force/binary_two_results_i_half.cpp
+++ b/test_conformance/math_brute_force/binary_two_results_i_half.cpp
@@ -159,23 +159,15 @@ int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode)
             }
             else
             {
-                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
-                                                 &pattern, sizeof(pattern), 0,
-                                                 BUFFER_SIZE, 0, NULL, NULL)))
-                {
-                    vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n",
-                               error);
-                    return error;
-                }
-
-                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j],
-                                                 &pattern, sizeof(pattern), 0,
-                                                 BUFFER_SIZE, 0, NULL, NULL)))
-                {
-                    vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n",
-                               error);
-                    return error;
-                }
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 1 failed!\n");
+
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 2 failed!\n");
             }
         }
 
diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp
index ada2aa89a0..d51f5ddb3f 100644
--- a/test_conformance/math_brute_force/i_unary_half.cpp
+++ b/test_conformance/math_brute_force/i_unary_half.cpp
@@ -48,8 +48,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
     int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ;
     size_t bufferSize = BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
-    uint64_t bufferElements = bufferSize / sizeof(cl_int);
-    std::vector<float> s(0);
+    size_t bufferElements = bufferSize / sizeof(cl_int);
 
     int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1);
 
@@ -69,7 +68,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
                                    &build_info)))
             return error;
     }
-    s.resize(bufferElements);
+    std::vector<float> s(bufferElements);
 
     for (uint64_t i = 0; i < (1ULL << 16); i += step)
     {
@@ -94,15 +93,26 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
         // write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint32_t pattern = 0xffffdead;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
             {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                return error;
+                memset_pattern4(gOut[j], &pattern, bufferSize);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, bufferSize,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, bufferSize, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer failed!\n");
             }
         }
 
diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp
index 842ef61f83..bcda06e4e9 100644
--- a/test_conformance/math_brute_force/macro_binary_half.cpp
+++ b/test_conformance/math_brute_force/macro_binary_half.cpp
@@ -40,7 +40,7 @@ struct ThreadInfo
     clMemWrapper inBuf; // input buffer for the thread
     clMemWrapper inBuf2; // input buffer for the thread
     clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
-    MTdata d;
+    MTdataHolder d;
     clCommandQueueWrapper
         tQueue; // per thread command queue to improve performance
 };
@@ -85,9 +85,14 @@ const cl_half specialValuesHalf[] = {
     0x3555, /*nearest value to 1/3*/
     0x3bff, /*largest number less than one*/
     0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* Largest negative fraction */
 };
 
-size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
 
 cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 {
@@ -108,21 +113,26 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_short *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+
+    if (gHostFill)
     {
-        out[j] = (cl_short *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
+        // start the map of the output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
+            out[j] = (cl_short *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_size, 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
         }
-    }
 
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
 
     // Init input array
     cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
@@ -176,27 +186,37 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            return error;
-        }
-        if ((error = clReleaseEvent(e[j])))
+        if (gHostFill)
         {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            return error;
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
         }
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint32_t pattern = 0xACDCACDC;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
         {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            return error;
+            memset_pattern4(out[j], &pattern, buffer_size);
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                             &pattern, sizeof(pattern), 0,
+                                             buffer_size, 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
         }
 
         // run the kernel
@@ -489,7 +509,7 @@ int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
             return error;
         }
 
-        test_info.tinfo[i].d = init_genrand(genrand_int32(d));
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
     }
 
     // Init the kernels
diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp
index ae359b3e57..a755ddb15a 100644
--- a/test_conformance/math_brute_force/macro_unary_half.cpp
+++ b/test_conformance/math_brute_force/macro_unary_half.cpp
@@ -92,21 +92,26 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_short *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+
+    if (gHostFill)
     {
-        out[j] = (cl_short *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
+        // start the map of the output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
+            out[j] = (cl_short *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_size, 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
         }
-    }
 
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
 
     // Write the new values to the input array
     cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
@@ -121,27 +126,37 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            return error;
-        }
-        if ((error = clReleaseEvent(e[j])))
+        if (gHostFill)
         {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            return error;
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
         }
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint32_t pattern = 0xACDCACDC;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
         {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            return error;
+            memset_pattern4(out[j], &pattern, buffer_size);
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                        &pattern, sizeof(pattern), 0,
+                                        buffer_size, 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
         }
 
         // run the kernel
diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp
index 5cb73d4b1f..4545c93eab 100644
--- a/test_conformance/math_brute_force/mad_half.cpp
+++ b/test_conformance/math_brute_force/mad_half.cpp
@@ -98,15 +98,26 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
         // write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint32_t pattern = 0xACDCACDC;
-            memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
             {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                return error;
+                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer failed!\n");
             }
         }
 
diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp
index 93dc612f7c..ba6dd4d480 100644
--- a/test_conformance/math_brute_force/ternary_half.cpp
+++ b/test_conformance/math_brute_force/ternary_half.cpp
@@ -52,6 +52,11 @@ static const cl_half specialValuesHalf[] = {
     0x3555, /*nearest value to 1/3*/
     0x3bff, /*largest number less than one*/
     0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* Largest negative fraction */
 };
 
 constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
@@ -72,9 +77,9 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal3 = 0.0f;
     uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
 
-    constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(cl_half);
+    constexpr size_t bufferElements = BUFFER_SIZE / sizeof(cl_half);
 
-    cl_uchar overflow[half_buffer_size];
+    cl_uchar overflow[bufferElements];
     float half_ulps = f->half_ulps;
     int skipNanInf = (0 == strcmp("fma", f->nameInCode));
 
@@ -99,7 +104,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
         { // test edge cases
             uint32_t x, y, z;
             x = y = z = 0;
-            for (; idx < half_buffer_size; idx++)
+            for (; idx < bufferElements; idx++)
             {
                 hp0[idx] = specialValuesHalf[x];
                 hp1[idx] = specialValuesHalf[y];
@@ -115,7 +120,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
                     }
                 }
             }
-            if (idx == half_buffer_size)
+            if (idx == bufferElements)
                 vlog_error("Test Error: not all special cases tested!\n");
         }
 
@@ -124,7 +129,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
             return HFF((1.0f - t) * CL_HALF_MIN + t * CL_HALF_MAX);
         };
 
-        for (; idx < half_buffer_size; idx++)
+        for (; idx < bufferElements; idx++)
         {
             hp0[idx] = any_value();
             hp1[idx] = any_value();
@@ -155,7 +160,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
         // Write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint32_t pattern = 0xffffdead;
+            uint32_t pattern = 0xacdcacdc;
             if (gHostFill)
             {
                 memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
@@ -171,14 +176,10 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
             }
             else
             {
-                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
-                                                 &pattern, sizeof(pattern), 0,
-                                                 BUFFER_SIZE, 0, NULL, NULL)))
-                {
-                    vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
-                               error);
-                    return error;
-                }
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer failed!\n");
             }
         }
 
@@ -233,7 +234,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
         cl_half *res = (cl_half *)gOut_Ref;
         if (skipNanInf)
         {
-            for (size_t j = 0; j < half_buffer_size; j++)
+            for (size_t j = 0; j < bufferElements; j++)
             {
                 feclearexcept(FE_OVERFLOW);
                 res[j] = HFF((float)f->func.f_fma(
@@ -244,7 +245,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
         }
         else
         {
-            for (size_t j = 0; j < half_buffer_size; j++)
+            for (size_t j = 0; j < bufferElements; j++)
                 res[j] = HFF((float)f->func.f_fma(
                     HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED));
         }
@@ -265,7 +266,7 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
 
         // Verify data
         uint16_t *t = (uint16_t *)gOut_Ref;
-        for (size_t j = 0; j < half_buffer_size; j++)
+        for (size_t j = 0; j < bufferElements; j++)
         {
             for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp
index f6e914c8af..0ac71df164 100644
--- a/test_conformance/math_brute_force/unary_half.cpp
+++ b/test_conformance/math_brute_force/unary_half.cpp
@@ -95,24 +95,28 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     std::vector<float> s(0);
 
-    // start the map of the output arrays
     cl_event e[VECTOR_SIZE_COUNT];
     cl_ushort *out[VECTOR_SIZE_COUNT];
-    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+
+    if (gHostFill)
     {
-        out[j] = (uint16_t *)clEnqueueMapBuffer(
-            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
-            buffer_size, 0, NULL, e + j, &error);
-        if (error || NULL == out[j])
+        // start the map of the output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
-                       error);
-            return error;
+            out[j] = (uint16_t *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_size, 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
         }
-    }
 
-    // Get that moving
-    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
 
     // Write the new values to the input array
     cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
@@ -130,27 +134,37 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
 
     for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
     {
-        // Wait for the map to finish
-        if ((error = clWaitForEvents(1, e + j)))
-        {
-            vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
-            return error;
-        }
-        if ((error = clReleaseEvent(e[j])))
+        if (gHostFill)
         {
-            vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
-            return error;
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
         }
 
         // Fill the result buffer with garbage, so that old results don't carry
         // over
-        uint32_t pattern = 0xACDCACDC;
-        memset_pattern4(out[j], &pattern, buffer_size);
-        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
-                                             out[j], 0, NULL, NULL)))
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
         {
-            vlog_error("Error: clEnqueueMapBuffer failed! err: %d\n", error);
-            return error;
+            memset_pattern4(out[j], &pattern, buffer_size);
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                             &pattern, sizeof(pattern), 0,
+                                             buffer_size, 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
         }
 
         // run the kernel
diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp
index 18d4dadd0e..86a1a3f0ab 100644
--- a/test_conformance/math_brute_force/unary_two_results_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_half.cpp
@@ -124,23 +124,15 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
             }
             else
             {
-                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
                                                  &pattern, sizeof(pattern), 0,
-                                                 BUFFER_SIZE, 0, NULL, NULL)))
-                {
-                    vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n",
-                               error);
-                    return error;
-                }
+                                                 BUFFER_SIZE, 0, NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 1 failed!\n");
 
-                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
                                                  &pattern, sizeof(pattern), 0,
-                                                 BUFFER_SIZE, 0, NULL, NULL)))
-                {
-                    vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n",
-                               error);
-                    return error;
-                }
+                                                 BUFFER_SIZE, 0, NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 2 failed!\n");
             }
         }
 
diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
index 9a769447f6..ee6c5dd350 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
@@ -135,23 +135,15 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
             }
             else
             {
-                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
-                                                 &pattern, sizeof(pattern), 0,
-                                                 BUFFER_SIZE, 0, NULL, NULL)))
-                {
-                    vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n",
-                               error);
-                    return error;
-                }
-
-                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j],
-                                                 &pattern, sizeof(pattern), 0,
-                                                 BUFFER_SIZE, 0, NULL, NULL)))
-                {
-                    vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n",
-                               error);
-                    return error;
-                }
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 1 failed!\n");
+
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 2 failed!\n");
             }
         }
 
diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp
index e2ff937051..083ab94dce 100644
--- a/test_conformance/math_brute_force/unary_u_half.cpp
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -92,15 +92,26 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
         // write garbage into output arrays
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            uint32_t pattern = 0xACDCACDC;
-            memset_pattern4(gOut[j], &pattern, bufferSize);
-            if ((error =
-                     clEnqueueWriteBuffer(gQueue, gOutBuffer[j], CL_FALSE, 0,
-                                          bufferSize, gOut[j], 0, NULL, NULL)))
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
             {
-                vlog_error("\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
-                           error, j);
-                return error;
+                memset_pattern4(gOut[j], &pattern, bufferSize);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, bufferSize,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
+                                                 &pattern, sizeof(pattern), 0,
+                                                 bufferSize, 0, NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer failed!\n");
             }
         }
 
@@ -126,7 +137,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
                                                 1, NULL, &localCount, NULL, 0,
                                                 NULL, NULL)))
             {
-                vlog_error("FAILURE -- could not execute kernel\n");
+                vlog_error("FAILED -- could not execute kernel\n");
                 return error;
             }
         }

From f51a0b5d612075c0a55efef4cf4feb547eb0fbc9 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Fri, 17 Nov 2023 08:37:46 +0100
Subject: [PATCH 19/24] clang format applied

---
 .../math_brute_force/macro_binary_half.cpp           |  4 ++--
 test_conformance/math_brute_force/unary_half.cpp     |  4 ++--
 .../math_brute_force/unary_two_results_half.cpp      | 12 ++++++------
 test_conformance/math_brute_force/unary_u_half.cpp   |  6 +++---
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp
index bcda06e4e9..d25342dda5 100644
--- a/test_conformance/math_brute_force/macro_binary_half.cpp
+++ b/test_conformance/math_brute_force/macro_binary_half.cpp
@@ -214,8 +214,8 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         else
         {
             error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                             &pattern, sizeof(pattern), 0,
-                                             buffer_size, 0, NULL, NULL);
+                                        &pattern, sizeof(pattern), 0,
+                                        buffer_size, 0, NULL, NULL);
             test_error(error, "clEnqueueFillBuffer failed!\n");
         }
 
diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp
index 0ac71df164..9b230f96bc 100644
--- a/test_conformance/math_brute_force/unary_half.cpp
+++ b/test_conformance/math_brute_force/unary_half.cpp
@@ -162,8 +162,8 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         else
         {
             error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
-                                             &pattern, sizeof(pattern), 0,
-                                             buffer_size, 0, NULL, NULL);
+                                        &pattern, sizeof(pattern), 0,
+                                        buffer_size, 0, NULL, NULL);
             test_error(error, "clEnqueueFillBuffer failed!\n");
         }
 
diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp
index 86a1a3f0ab..23889c66b7 100644
--- a/test_conformance/math_brute_force/unary_two_results_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_half.cpp
@@ -124,14 +124,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
             }
             else
             {
-                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
-                                                 &pattern, sizeof(pattern), 0,
-                                                 BUFFER_SIZE, 0, NULL, NULL);
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
                 test_error(error, "clEnqueueFillBuffer 1 failed!\n");
 
-                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
-                                                 &pattern, sizeof(pattern), 0,
-                                                 BUFFER_SIZE, 0, NULL, NULL);
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
                 test_error(error, "clEnqueueFillBuffer 2 failed!\n");
             }
         }
diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp
index 083ab94dce..388dadd4ba 100644
--- a/test_conformance/math_brute_force/unary_u_half.cpp
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -108,9 +108,9 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
             }
             else
             {
-                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
-                                                 &pattern, sizeof(pattern), 0,
-                                                 bufferSize, 0, NULL, NULL);
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, bufferSize, 0,
+                                            NULL, NULL);
                 test_error(error, "clEnqueueFillBuffer failed!\n");
             }
         }

From 207b7587a8ba1e35ff4f1cc2329f95b81bee874b Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Fri, 17 Nov 2023 11:50:28 +0100
Subject: [PATCH 20/24] consistency correction

---
 test_conformance/math_brute_force/mad_half.cpp     | 7 ++-----
 test_conformance/math_brute_force/unary_u_half.cpp | 4 ----
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp
index 4545c93eab..d8aefde386 100644
--- a/test_conformance/math_brute_force/mad_half.cpp
+++ b/test_conformance/math_brute_force/mad_half.cpp
@@ -51,11 +51,8 @@ int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
     size_t bufferSize = BUFFER_SIZE;
 
     logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
-    uint64_t step = bufferSize / sizeof(cl_half);
-    if (gWimpyMode)
-    {
-        step = (1ULL << 32) * gWimpyReductionFactor / (512);
-    }
+    uint64_t step = getTestStep(sizeof(cl_half), bufferSize);
+
     // Init the kernels
     {
         BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode };
diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp
index 388dadd4ba..04b2b16b2b 100644
--- a/test_conformance/math_brute_force/unary_u_half.cpp
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -55,10 +55,6 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
     logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
     const char *name = f->name;
     float half_ulps = f->half_ulps;
-    if (gWimpyMode)
-    {
-        step = (1ULL << 32) * gWimpyReductionFactor / (512);
-    }
 
     // Init the kernels
     BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode };

From c49e8259d27695bfcb44344773840390ff851119 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Sun, 19 Nov 2023 14:02:56 +0100
Subject: [PATCH 21/24] more consistency corrections for cl_fp16_khr supported
 tests

---
 .../unary_two_results_half.cpp                | 34 +++++++++----------
 .../unary_two_results_i_half.cpp              | 30 +++++++---------
 2 files changed, 29 insertions(+), 35 deletions(-)

diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp
index 23889c66b7..ae3a4a733a 100644
--- a/test_conformance/math_brute_force/unary_two_results_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_half.cpp
@@ -51,9 +51,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal1 = 0.0f;
     uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
 
-    constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(cl_half);
+    constexpr size_t bufferElements = BUFFER_SIZE / sizeof(cl_half);
+    int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1);
 
-    cl_uchar overflow[half_buffer_size];
+    cl_uchar overflow[bufferElements];
     int isFract = 0 == strcmp("fract", f->nameInCode);
     int skipNanInf = isFract;
 
@@ -68,23 +69,19 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                                &build_info)))
         return error;
 
-    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 16); i += step)
     {
         // Init input array
         cl_half *pIn = (cl_half *)gIn;
+        if (gWimpyMode)
         {
-            const unsigned m_size = 0x1ff;
-            const unsigned e_size = 0xf;
-            const unsigned s_size = 0x2;
-
-            for (size_t j = 0; j < half_buffer_size; j++)
-            {
-                unsigned ind = j % (s_size * e_size * m_size);
-                unsigned val = (((ind / (e_size * m_size)) << 15)
-                                | (((ind / m_size) % e_size + 1) << 10)
-                                | (ind % m_size + 1));
-                pIn[j] = val;
-            }
+            for (size_t j = 0; j < bufferElements; j++)
+                pIn[j] = (cl_ushort)i + j * scale;
+        }
+        else
+        {
+            for (size_t j = 0; j < bufferElements; j++)
+                pIn[j] = (cl_ushort)i + j;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
@@ -196,20 +193,21 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
 
         if (skipNanInf)
         {
-            for (size_t j = 0; j < half_buffer_size; j++)
+            for (size_t j = 0; j < bufferElements; j++)
             {
                 double dd;
                 feclearexcept(FE_OVERFLOW);
 
                 ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
                 ref2[j] = HFF((float)dd);
+
                 overflow[j] =
                     FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
             }
         }
         else
         {
-            for (size_t j = 0; j < half_buffer_size; j++)
+            for (size_t j = 0; j < bufferElements; j++)
             {
                 double dd;
                 ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
@@ -245,7 +243,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
         }
 
         // Verify data
-        for (size_t j = 0; j < half_buffer_size; j++)
+        for (size_t j = 0; j < bufferElements; j++)
         {
             for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {
diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
index ee6c5dd350..007f169686 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
@@ -61,7 +61,8 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
     // sizeof(cl_half) < sizeof (int32_t)
     // to prevent overflowing gOut_Ref2 it is necessary to use
     // bigger type as denominator for buffer size calculation
-    constexpr size_t half_buffer_size = BUFFER_SIZE / sizeof(int32_t);
+    constexpr size_t bufferElements = BUFFER_SIZE / sizeof(int32_t);
+    int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1);
 
     cl_ulong maxiError = 0;
 
@@ -78,24 +79,19 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
                                &build_info)))
         return error;
 
-    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    for (uint64_t i = 0; i < (1ULL << 16); i += step)
     {
         // Init input array
         cl_half *pIn = (cl_half *)gIn;
-
+        if (gWimpyMode)
         {
-            const unsigned m_size = 0x1ff;
-            const unsigned e_size = 0xf;
-            const unsigned s_size = 0x2;
-
-            for (size_t j = 0; j < half_buffer_size; j++)
-            {
-                unsigned ind = j % (s_size * e_size * m_size);
-                unsigned val = (((ind / (e_size * m_size)) << 15)
-                                | (((ind / m_size) % e_size + 1) << 10)
-                                | (ind % m_size + 1));
-                pIn[j] = val;
-            }
+            for (size_t j = 0; j < bufferElements; j++)
+                pIn[j] = (cl_ushort)i + j * scale;
+        }
+        else
+        {
+            for (size_t j = 0; j < bufferElements; j++)
+                pIn[j] = (cl_ushort)i + j;
         }
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
@@ -192,7 +188,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
         // Calculate the correctly rounded reference result
         cl_half *ref1 = (cl_half *)gOut_Ref;
         int32_t *ref2 = (int32_t *)gOut_Ref2;
-        for (size_t j = 0; j < half_buffer_size; j++)
+        for (size_t j = 0; j < bufferElements; j++)
             ref1[j] = HFF((float)f->func.f_fpI(HTF(pIn[j]), ref2 + j));
 
         // Read the data back
@@ -219,7 +215,7 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
         if (gSkipCorrectnessTesting) break;
 
         // Verify data
-        for (size_t j = 0; j < half_buffer_size; j++)
+        for (size_t j = 0; j < bufferElements; j++)
         {
             for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
             {

From 6f859068dbec99baae2552ffc96112c89e29d5cf Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 28 Nov 2023 11:52:58 +0100
Subject: [PATCH 22/24] Corrections related to code review (bruteforce #142)

---
 .../math_brute_force/i_unary_half.cpp          | 18 ++++++------------
 .../unary_two_results_half.cpp                 |  3 +++
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp
index d51f5ddb3f..22971bfc0a 100644
--- a/test_conformance/math_brute_force/i_unary_half.cpp
+++ b/test_conformance/math_brute_force/i_unary_half.cpp
@@ -19,6 +19,7 @@
 #include "test_functions.h"
 #include "utility.h"
 
+#include <algorithm>
 #include <cstring>
 #include <memory>
 #include <cinttypes>
@@ -48,9 +49,8 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
     int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ;
     size_t bufferSize = BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
-    size_t bufferElements = bufferSize / sizeof(cl_int);
-
-    int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1);
+    size_t bufferElements = std::min(bufferSize / sizeof(cl_int),
+                                     size_t(1ULL << (sizeof(cl_half) * 8)));
 
     logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
     // This test is not using ThreadPool so we need to disable FTZ here
@@ -74,15 +74,9 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
     {
         // Init input array
         cl_ushort *p = (cl_ushort *)gIn;
-        if (gWimpyMode)
-        {
-            for (size_t j = 0; j < bufferElements; j++)
-                p[j] = (cl_ushort)i + j * scale;
-        }
-        else
-        {
-            for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j;
-        }
+
+        for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j;
+
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))
         {
diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp
index ae3a4a733a..9284fbd760 100644
--- a/test_conformance/math_brute_force/unary_two_results_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_half.cpp
@@ -201,6 +201,9 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                 ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
                 ref2[j] = HFF((float)dd);
 
+                // keep the rounded fract reference strictly below 1
+                if (isFract && HTF(ref1[j]) >= 1.f) ref1[j] = 0x3bff;
+
                 overflow[j] =
                     FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
             }

From 64db7f52b224d9c59e74cebf13ab9bdbdd26f0d0 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 28 Nov 2023 15:14:49 +0100
Subject: [PATCH 23/24] Correction for i_unary_half test capacity

---
 .../math_brute_force/i_unary_half.cpp         | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp
index 22971bfc0a..baff3ee20d 100644
--- a/test_conformance/math_brute_force/i_unary_half.cpp
+++ b/test_conformance/math_brute_force/i_unary_half.cpp
@@ -47,10 +47,11 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
     KernelMatrix kernels;
     const unsigned thread_id = 0; // Test is currently not multithreaded.
     int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ;
-    size_t bufferSize = BUFFER_SIZE;
     uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
-    size_t bufferElements = std::min(bufferSize / sizeof(cl_int),
+    size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_int),
                                      size_t(1ULL << (sizeof(cl_half) * 8)));
+    size_t bufferSizeIn = bufferElements * sizeof(cl_half);
+    size_t bufferSizeOut = bufferElements * sizeof(cl_int);
 
     logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
     // This test is not using ThreadPool so we need to disable FTZ here
@@ -78,7 +79,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
         for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j;
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          bufferSize, gIn, 0, NULL, NULL)))
+                                          bufferSizeIn, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
@@ -90,9 +91,9 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
             uint32_t pattern = 0xacdcacdc;
             if (gHostFill)
             {
-                memset_pattern4(gOut[j], &pattern, bufferSize);
+                memset_pattern4(gOut[j], &pattern, bufferSizeOut);
                 if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
-                                                  CL_FALSE, 0, bufferSize,
+                                                  CL_FALSE, 0, bufferSizeOut,
                                                   gOut[j], 0, NULL, NULL)))
                 {
                     vlog_error(
@@ -104,8 +105,8 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
             else
             {
                 error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
-                                            sizeof(pattern), 0, bufferSize, 0,
-                                            NULL, NULL);
+                                            sizeof(pattern), 0, bufferSizeOut,
+                                            0, NULL, NULL);
                 test_error(error, "clEnqueueFillBuffer failed!\n");
             }
         }
@@ -114,7 +115,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_int);
-            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            size_t localCount = (bufferSizeOut + vectorSize - 1) / vectorSize;
             if ((error = clSetKernelArg(kernels[j][thread_id], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -144,15 +145,15 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
         int *r = (int *)gOut_Ref;
         for (size_t j = 0; j < bufferElements; j++)
         {
-            s[j] = cl_half_to_float(p[j]);
+            s[j] = HTF(p[j]);
             r[j] = f->func.i_f(s[j]);
         }
         // Read the data back
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                             bufferSizeOut, gOut[j], 0, NULL,
+                                             NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 return error;
@@ -195,7 +196,7 @@ int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
             {
                 vlog("base:%14" PRIu64 " step:%10" PRIu64
                      "  bufferSize:%10zd \n",
-                     i, step, bufferSize);
+                     i, step, bufferSizeOut);
             }
             else
             {

From 29449119f66cd2797abc38b88523f8570015f0bc Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 28 Nov 2023 18:24:30 +0100
Subject: [PATCH 24/24] Corrections related to capacity of cl_khr_fp16 tests in
 bruteforce (#142)

---
 .../unary_two_results_half.cpp                | 42 +++++++---------
 .../unary_two_results_i_half.cpp              | 48 ++++++++-----------
 .../math_brute_force/unary_u_half.cpp         | 15 ++----
 3 files changed, 42 insertions(+), 63 deletions(-)

diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp
index 9284fbd760..70a9f4c79e 100644
--- a/test_conformance/math_brute_force/unary_two_results_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_half.cpp
@@ -51,10 +51,11 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
     float maxErrorVal1 = 0.0f;
     uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
 
-    constexpr size_t bufferElements = BUFFER_SIZE / sizeof(cl_half);
-    int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1);
+    size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_half),
+                                     size_t(1ULL << (sizeof(cl_half) * 8)));
+    size_t bufferSize = bufferElements * sizeof(cl_half);
 
-    cl_uchar overflow[bufferElements];
+    std::vector<cl_uchar> overflow(bufferElements);
     int isFract = 0 == strcmp("fract", f->nameInCode);
     int skipNanInf = isFract;
 
@@ -73,19 +74,10 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
     {
         // Init input array
         cl_half *pIn = (cl_half *)gIn;
-        if (gWimpyMode)
-        {
-            for (size_t j = 0; j < bufferElements; j++)
-                pIn[j] = (cl_ushort)i + j * scale;
-        }
-        else
-        {
-            for (size_t j = 0; j < bufferElements; j++)
-                pIn[j] = (cl_ushort)i + j;
-        }
+        for (size_t j = 0; j < bufferElements; j++) pIn[j] = (cl_ushort)i + j;
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+                                          bufferSize, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
@@ -97,9 +89,9 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
             uint32_t pattern = 0xacdcacdc;
             if (gHostFill)
             {
-                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                memset_pattern4(gOut[j], &pattern, bufferSize);
                 if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
-                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  CL_FALSE, 0, bufferSize,
                                                   gOut[j], 0, NULL, NULL)))
                 {
                     vlog_error(
@@ -108,9 +100,9 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
                     return error;
                 }
 
-                memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+                memset_pattern4(gOut2[j], &pattern, bufferSize);
                 if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
-                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  CL_FALSE, 0, bufferSize,
                                                   gOut2[j], 0, NULL, NULL)))
                 {
                     vlog_error(
@@ -122,12 +114,12 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
             else
             {
                 error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
-                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            sizeof(pattern), 0, bufferSize, 0,
                                             NULL, NULL);
                 test_error(error, "clEnqueueFillBuffer 1 failed!\n");
 
                 error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
-                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            sizeof(pattern), 0, bufferSize, 0,
                                             NULL, NULL);
                 test_error(error, "clEnqueueFillBuffer 2 failed!\n");
             }
@@ -137,7 +129,7 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             size_t vectorSize = sizeValues[j] * sizeof(cl_half);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
             if ((error = clSetKernelArg(kernels[j][thread_id], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -225,14 +217,14 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
         {
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
-                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 return error;
             }
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
-                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray2 failed %d\n", error);
                 return error;
@@ -432,8 +424,8 @@ int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14" PRIu64 " step:%10" PRIu64
-                     "  bufferSize:%10d \n",
-                     i, step, BUFFER_SIZE);
+                     "  bufferSize:%10zu \n",
+                     i, step, bufferSize);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
index 007f169686..5906c2837a 100644
--- a/test_conformance/math_brute_force/unary_two_results_i_half.cpp
+++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
@@ -61,8 +61,11 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
     // sizeof(cl_half) < sizeof (int32_t)
     // to prevent overflowing gOut_Ref2 it is necessary to use
     // bigger type as denominator for buffer size calculation
-    constexpr size_t bufferElements = BUFFER_SIZE / sizeof(int32_t);
-    int scale = (int)((1ULL << 16) / (16 * bufferElements) + 1);
+    size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_int),
+                                     size_t(1ULL << (sizeof(cl_half) * 8)));
+
+    size_t bufferSizeLo = bufferElements * sizeof(cl_half);
+    size_t bufferSizeHi = bufferElements * sizeof(cl_int);
 
     cl_ulong maxiError = 0;
 
@@ -83,19 +86,10 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
     {
         // Init input array
         cl_half *pIn = (cl_half *)gIn;
-        if (gWimpyMode)
-        {
-            for (size_t j = 0; j < bufferElements; j++)
-                pIn[j] = (cl_ushort)i + j * scale;
-        }
-        else
-        {
-            for (size_t j = 0; j < bufferElements; j++)
-                pIn[j] = (cl_ushort)i + j;
-        }
+        for (size_t j = 0; j < bufferElements; j++) pIn[j] = (cl_ushort)i + j;
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
-                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+                                          bufferSizeLo, gIn, 0, NULL, NULL)))
         {
             vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
             return error;
@@ -107,9 +101,9 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
             uint32_t pattern = 0xacdcacdc;
             if (gHostFill)
             {
-                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                memset_pattern4(gOut[j], &pattern, bufferSizeLo);
                 if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
-                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  CL_FALSE, 0, bufferSizeLo,
                                                   gOut[j], 0, NULL, NULL)))
                 {
                     vlog_error(
@@ -118,9 +112,9 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
                     return error;
                 }
 
-                memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+                memset_pattern4(gOut2[j], &pattern, bufferSizeHi);
                 if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
-                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  CL_FALSE, 0, bufferSizeHi,
                                                   gOut2[j], 0, NULL, NULL)))
                 {
                     vlog_error(
@@ -132,12 +126,12 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
             else
             {
                 error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
-                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            sizeof(pattern), 0, bufferSizeLo, 0,
                                             NULL, NULL);
                 test_error(error, "clEnqueueFillBuffer 1 failed!\n");
 
                 error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern,
-                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            sizeof(pattern), 0, bufferSizeHi, 0,
                                             NULL, NULL);
                 test_error(error, "clEnqueueFillBuffer 2 failed!\n");
             }
@@ -147,8 +141,8 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
         {
             // align working group size with the bigger output type
-            size_t vectorSize = sizeValues[j] * sizeof(int32_t);
-            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            size_t vectorSize = sizeValues[j] * sizeof(cl_int);
+            size_t localCount = (bufferSizeHi + vectorSize - 1) / vectorSize;
             if ((error = clSetKernelArg(kernels[j][thread_id], 0,
                                         sizeof(gOutBuffer[j]), &gOutBuffer[j])))
             {
@@ -198,14 +192,14 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
                 (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
             if ((error =
                      clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0,
-                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+                                         bufferSizeLo, gOut[j], 0, NULL, NULL)))
             {
                 vlog_error("ReadArray failed %d\n", error);
                 return error;
             }
-            if ((error =
-                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0,
-                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
+            if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking,
+                                             0, bufferSizeHi, gOut2[j], 0, NULL,
+                                             NULL)))
             {
                 vlog_error("ReadArray2 failed %d\n", error);
                 return error;
@@ -325,8 +319,8 @@ int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
             if (gVerboseBruteForce)
             {
                 vlog("base:%14" PRIu64 " step:%10" PRIu64
-                     "  bufferSize:%10d \n",
-                     i, step, BUFFER_SIZE);
+                     "  bufferSize:%10zu \n",
+                     i, step, bufferSizeHi);
             }
             else
             {
diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp
index 04b2b16b2b..6f21ef3eee 100644
--- a/test_conformance/math_brute_force/unary_u_half.cpp
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -48,10 +48,10 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
     float maxError = 0.0f;
     int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
     float maxErrorVal = 0.0f;
-    size_t bufferSize = BUFFER_SIZE;
-    size_t bufferElements = bufferSize / sizeof(cl_half);
     uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
-    int scale = (int)((1ULL << 32) / (16 * bufferElements) + 1);
+    size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_half),
+                                     size_t(1ULL << (sizeof(cl_half) * 8)));
+    size_t bufferSize = bufferElements * sizeof(cl_half);
     logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
     const char *name = f->name;
     float half_ulps = f->half_ulps;
@@ -69,14 +69,7 @@ int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
     {
         // Init input array
         cl_ushort *p = (cl_ushort *)gIn;
-        if (gWimpyMode)
-        {
-            for (size_t j = 0; j < bufferElements; j++) p[j] = i + j * scale;
-        }
-        else
-        {
-            for (size_t j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j;
-        }
+        for (size_t j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j;
 
         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                           bufferSize, gIn, 0, NULL, NULL)))