From 4216c5323daaa1e908acc99e7468c369807bb05e Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Mon, 18 Dec 2023 19:14:33 +0100
Subject: [PATCH 1/7] Fp16 conversions staging (#1864)

* Added unification of existing conversions test as preparation for cl_khr_fp16 adaptation

* Unified initialization procedures for conversions test.

* Completed unification of data structures to handle cl_khr_fp16

* Added support for selective launch of the test

* Added half support for test_conversions, work in progress (issue #142, conversions)

* Added more work on half support for conversions test (issue #142, conversions)

* Added cosmetic corrections

* Added more cosmetic corrections before opening draft PR

* Added corrections related to pre-submit windows build

* Added more pre-build related corrections

* Added pre-submit ubuntu build related correction

* Added more pre-submit related corrections

* Divided structures into separate source files (issue #142, conversions)

* Added more corrections related to presubmit check

* Removed redeclarations due to presubmit check

* Added more corrections related to presubmit check arm build

* Added cosmetic correction

* Adapted modifications from related PR #1719 to avoid merging conflicts

* fixed clang format

* Added corrections related to code review (cl_khr_fp16 support according to issue #142)

* Corrections related to macos CI check fail

* fix for unclear clang format discrepancy

* More corrections related to code review (cl_khr_fp16 for conversions #142)

---------

Co-authored-by: Ewan Crawford <ewan@codeplay.com>
---
 test_common/harness/rounding_mode.h           |   9 +-
 .../conversions/basic_test_conversions.cpp    | 389 ++++++++++++------
 .../conversions/basic_test_conversions.h      |  46 ++-
 .../conversions/conversions_data_info.h       | 228 +++++++---
 .../conversions/test_conversions.cpp          |  69 +++-
 5 files changed, 542 insertions(+), 199 deletions(-)
diff --git a/test_common/harness/rounding_mode.h b/test_common/harness/rounding_mode.h
index 6f52f0a00b..bdc6bb98e1 100644
--- a/test_common/harness/rounding_mode.h
+++ b/test_common/harness/rounding_mode.h
@@ -42,10 +42,11 @@ typedef enum
     kshort = 3,
     kuint = 4,
     kint = 5,
-    kfloat = 6,
-    kdouble = 7,
-    kulong = 8,
-    klong = 9,
+    khalf = 6,
+    kfloat = 7,
+    kdouble = 8,
+    kulong = 9,
+    klong = 10,
 
     // This goes last
     kTypeCount
diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp
index 1020638af9..b5f59deab8 100644
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -48,6 +48,7 @@
 
 #include <vector>
 #include <type_traits>
+#include <cmath>
 
 #include "basic_test_conversions.h"
 
@@ -86,9 +87,13 @@ int gWimpyReductionFactor = 128;
 int gSkipTesting = 0;
 int gForceFTZ = 0;
 int gIsRTZ = 0;
+int gForceHalfFTZ = 0;
+int gIsHalfRTZ = 0;
 uint32_t gSimdSize = 1;
 int gHasDouble = 0;
 int gTestDouble = 1;
+int gHasHalfs = 0;
+int gTestHalfs = 1;
 const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
 int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
 int gMinVectorSize = 0;
@@ -100,6 +105,8 @@ int argCount = 0;
 
 double SubtractTime(uint64_t endTime, uint64_t startTime);
 
+cl_half_rounding_mode DataInitInfo::halfRoundingMode = CL_HALF_RTE;
+cl_half_rounding_mode ConversionsTest::defaultHalfRoundingMode = CL_HALF_RTE;
 
 // clang-format off
 // for readability sake keep this section unformatted
@@ -256,8 +263,30 @@ std::vector<double> DataInitInfo::specialValuesDouble = {
     MAKE_HEX_DOUBLE(0x1.fffffffefffffp62, 0x1fffffffefffffLL, 10), MAKE_HEX_DOUBLE(0x1.ffffffffp62, 0x1ffffffffLL, 30),
     MAKE_HEX_DOUBLE(0x1.ffffffff00001p62, 0x1ffffffff00001LL, 10),
 };
-// clang-format on
 
+// A table of more difficult cases to get right
+std::vector<cl_half> DataInitInfo::specialValuesHalf = {
+    0xffff,
+    0x0000,
+    0x0001,
+    0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* Largest negative fraction */
+};
+// clang-format on
 
 // Windows (since long double got deprecated) sets the x87 to 53-bit precision
 // (that's x87 default state).  This causes problems with the tests that
@@ -282,15 +311,32 @@ static inline void Force64BitFPUPrecision(void)
 #endif
 }
 
-
-template <typename InType, typename OutType>
-int CalcRefValsPat<InType, OutType>::check_result(void *test, uint32_t count,
-                                                  int vectorSize)
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+int CalcRefValsPat<InType, OutType, InFP, OutFP>::check_result(void *test,
+                                                               uint32_t count,
+                                                               int vectorSize)
 {
     const cl_uchar *a = (const cl_uchar *)gAllowZ;
 
-    if (std::is_integral<OutType>::value)
-    { // char/uchar/short/ushort/int/uint/long/ulong
+    if (is_half<OutType, OutFP>())
+    {
+        const cl_half *t = (const cl_half *)test;
+        const cl_half *c = (const cl_half *)gRef;
+
+        for (uint32_t i = 0; i < count; i++)
+            if (t[i] != c[i] &&
+                // Allow nan's to be binary different
+                !((t[i] & 0x7fff) > 0x7C00 && (c[i] & 0x7fff) > 0x7C00)
+                && !(a[i] != (cl_uchar)0 && t[i] == (c[i] & 0x8000)))
+            {
+                vlog(
+                    "\nError for vector size %d found at 0x%8.8x:  *%a vs %a\n",
+                    vectorSize, i, HTF(c[i]), HTF(t[i]));
+                return i + 1;
+            }
+    }
+    else if (std::is_integral<OutType>::value)
+    { // char/uchar/short/ushort/half/int/uint/long/ulong
         const OutType *t = (const OutType *)test;
         const OutType *c = (const OutType *)gRef;
         for (uint32_t i = 0; i < count; i++)
@@ -388,6 +434,20 @@ cl_int CustomConversionsTest::Run()
             continue;
         }
 
+        // skip half if we don't have it
+        if (!gTestHalfs && (inType == khalf || outType == khalf))
+        {
+            if (gHasHalfs)
+            {
+                vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
+                           gTypeNames[outType], gSaturationNames[sat],
+                           gRoundingModeNames[round], gTypeNames[inType]);
+                vlog("\t\tcl_khr_fp16 enabled, but half testing turned "
+                     "off.\n");
+            }
+            continue;
+        }
+
         // skip longs on embedded
         if (!gHasLong
             && (inType == klong || outType == klong || inType == kulong
@@ -427,8 +487,8 @@ ConversionsTest::ConversionsTest(cl_device_id device, cl_context context,
                                  cl_command_queue queue)
     : context(context), device(device), queue(queue), num_elements(0),
       typeIterator({ cl_uchar(0), cl_char(0), cl_ushort(0), cl_short(0),
-                     cl_uint(0), cl_int(0), cl_float(0), cl_double(0),
-                     cl_ulong(0), cl_long(0) })
+                     cl_uint(0), cl_int(0), cl_half(0), cl_float(0),
+                     cl_double(0), cl_ulong(0), cl_long(0) })
 {}
 
 
@@ -445,11 +505,31 @@ cl_int ConversionsTest::Run()
 cl_int ConversionsTest::SetUp(int elements)
 {
     num_elements = elements;
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        const cl_device_fp_config fpConfigHalf =
+            get_default_rounding_mode(device, CL_DEVICE_HALF_FP_CONFIG);
+        if ((fpConfigHalf & CL_FP_ROUND_TO_NEAREST) != 0)
+        {
+            DataInitInfo::halfRoundingMode = CL_HALF_RTE;
+            ConversionsTest::defaultHalfRoundingMode = CL_HALF_RTE;
+        }
+        else if ((fpConfigHalf & CL_FP_ROUND_TO_ZERO) != 0)
+        {
+            DataInitInfo::halfRoundingMode = CL_HALF_RTZ;
+            ConversionsTest::defaultHalfRoundingMode = CL_HALF_RTZ;
+        }
+        else
+        {
+            log_error("Error while acquiring half rounding mode");
+            return TEST_FAIL;
+        }
+    }
+
     return CL_SUCCESS;
 }
 
-
-template <typename InType, typename OutType>
+template <typename InType, typename OutType, bool InFP, bool OutFP>
 void ConversionsTest::TestTypesConversion(const Type &inType,
                                           const Type &outType, int &testNumber,
                                           int startMinVectorSize)
@@ -470,7 +550,8 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
          sat = (SaturationMode)(sat + 1))
     {
         // skip illegal saturated conversions to float type
-        if (kSaturated == sat && (outType == kfloat || outType == kdouble))
+        if (kSaturated == sat
+            && (outType == kfloat || outType == kdouble || outType == khalf))
         {
             continue;
         }
@@ -507,6 +588,20 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
                 continue;
             }
 
+            // skip half if we don't have it
+            if (!gTestHalfs && (inType == khalf || outType == khalf))
+            {
+                if (gHasHalfs)
+                {
+                    vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
+                               gTypeNames[outType], gSaturationNames[sat],
+                               gRoundingModeNames[round], gTypeNames[inType]);
+                    vlog("\t\tcl_khr_fp16 enabled, but half testing turned "
+                         "off.\n");
+                }
+                continue;
+            }
+
             // Skip the implicit converts if the rounding mode is
             // not default or test is saturated
             if (0 == startMinVectorSize)
@@ -517,7 +612,8 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
                     gMinVectorSize = 0;
             }
 
-            if ((error = DoTest<InType, OutType>(outType, inType, sat, round)))
+            if ((error = DoTest<InType, OutType, InFP, OutFP>(outType, inType,
+                                                              sat, round)))
             {
                 vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
                            "FAILED ** \n",
@@ -529,8 +625,7 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
     }
 }
 
-
-template <typename InType, typename OutType>
+template <typename InType, typename OutType, bool InFP, bool OutFP>
 int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
                             RoundingMode round)
 {
@@ -541,7 +636,7 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
     cl_uint threads = GetThreadCount();
 
     DataInitInfo info = { 0, 0, outType, inType, sat, round, threads };
-    DataInfoSpec<InType, OutType> init_info(info);
+    DataInfoSpec<InType, OutType, InFP, OutFP> init_info(info);
     WriteInputBufferInfo writeInputBufferInfo;
     int vectorSize;
     int error = 0;
@@ -564,7 +659,7 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
     for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
     {
         writeInputBufferInfo.calcInfo[vectorSize].reset(
-            new CalcRefValsPat<InType, OutType>());
+            new CalcRefValsPat<InType, OutType, InFP, OutFP>());
         writeInputBufferInfo.calcInfo[vectorSize]->program =
             conv_test::MakeProgram(
                 outType, inType, sat, round, vectorSize,
@@ -597,6 +692,11 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
         if (round == kDefaultRoundingMode && gIsRTZ)
             init_info.round = round = kRoundTowardZero;
     }
+    else if (std::is_same<OutType, cl_half>::value && OutFP)
+    {
+        if (round == kDefaultRoundingMode && gIsHalfRTZ)
+            init_info.round = round = kRoundTowardZero;
+    }
 
     // Figure out how many elements are in a work block
     // we handle 64-bit types a bit differently.
@@ -764,6 +864,10 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
                         vlog("Input value: 0x%8.8x ",
                              ((unsigned int *)gIn)[error - 1]);
                         break;
+                    case khalf:
+                        vlog("Input value: %a ",
+                             HTF(((cl_half *)gIn)[error - 1]));
+                        break;
                     case kfloat:
                         vlog("Input value: %a ", ((float *)gIn)[error - 1]);
                         break;
@@ -901,8 +1005,6 @@ double SubtractTime(uint64_t endTime, uint64_t startTime)
 }
 #endif
 
-////////////////////////////////////////////////////////////////////////////////
-
 static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
 {
     cl_uint i;
@@ -951,6 +1053,112 @@ void MapResultValuesComplete(const std::unique_ptr<CalcRefValsBase> &info)
     // destroyed automatically soon after we exit.
 }
 
+template <typename T> static bool isnan_fp(const T &v)
+{
+    if (std::is_same<T, cl_half>::value)
+    {
+        uint16_t h_exp = (((cl_half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
+        uint16_t h_mant = ((cl_half)v) & 0x3FF;
+        return (h_exp == 0x1F && h_mant != 0);
+    }
+    else
+    {
+#if !defined(_WIN32)
+        return std::isnan(v);
+#else
+        return _isnan(v);
+#endif
+    }
+}
+
+template <typename InType>
+void ZeroNanToIntCases(cl_uint count, void *mapped, Type outType)
+{
+    InType *inp = (InType *)gIn;
+    for (auto j = 0; j < count; j++)
+    {
+        if (isnan_fp<InType>(inp[j]))
+            memset((char *)mapped + j * gTypeSizes[outType], 0,
+                   gTypeSizes[outType]);
+    }
+}
+
+template <typename InType, typename OutType>
+void FixNanToFltConversions(InType *inp, OutType *outp, cl_uint count)
+{
+    if (std::is_same<OutType, cl_half>::value)
+    {
+        for (auto j = 0; j < count; j++)
+            if (isnan_fp(inp[j]) && isnan_fp(outp[j]))
+                outp[j] = 0x7e00; // HALF_NAN
+    }
+    else
+    {
+        for (auto j = 0; j < count; j++)
+            if (isnan_fp(inp[j]) && isnan_fp(outp[j])) outp[j] = NAN;
+    }
+}
+
+void FixNanConversions(Type outType, Type inType, void *d, cl_uint count)
+{
+    if (outType != kfloat && outType != kdouble && outType != khalf)
+    {
+        if (inType == kfloat)
+            ZeroNanToIntCases<float>(count, d, outType);
+        else if (inType == kdouble)
+            ZeroNanToIntCases<double>(count, d, outType);
+        else if (inType == khalf)
+            ZeroNanToIntCases<cl_half>(count, d, outType);
+    }
+    else if (inType == kfloat || inType == kdouble || inType == khalf)
+    {
+        // outtype and intype is float or double or half.  NaN conversions for
+        // float/double/half could be any NaN
+        if (inType == kfloat)
+        {
+            float *inp = (float *)gIn;
+            if (outType == kdouble)
+            {
+                double *outp = (double *)d;
+                FixNanToFltConversions(inp, outp, count);
+            }
+            else if (outType == khalf)
+            {
+                cl_half *outp = (cl_half *)d;
+                FixNanToFltConversions(inp, outp, count);
+            }
+        }
+        else if (inType == kdouble)
+        {
+            double *inp = (double *)gIn;
+            if (outType == kfloat)
+            {
+                float *outp = (float *)d;
+                FixNanToFltConversions(inp, outp, count);
+            }
+            else if (outType == khalf)
+            {
+                cl_half *outp = (cl_half *)d;
+                FixNanToFltConversions(inp, outp, count);
+            }
+        }
+        else if (inType == khalf)
+        {
+            cl_half *inp = (cl_half *)gIn;
+            if (outType == kfloat)
+            {
+                float *outp = (float *)d;
+                FixNanToFltConversions(inp, outp, count);
+            }
+            else if (outType == kdouble)
+            {
+                double *outp = (double *)d;
+                FixNanToFltConversions(inp, outp, count);
+            }
+        }
+    }
+}
+
 
 void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
                                              void *data)
@@ -963,7 +1171,6 @@ void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
     Type outType =
         info->parent->outType; // the data type of the conversion result
     Type inType = info->parent->inType; // the data type of the conversion input
-    size_t j;
     cl_int error;
     cl_event doneBarrier = info->parent->doneBarrier;
 
@@ -985,51 +1192,7 @@ void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
 
     // Patch up NaNs conversions to integer to zero -- these can be converted to
     // any integer
-    if (outType != kfloat && outType != kdouble)
-    {
-        if (inType == kfloat)
-        {
-            float *inp = (float *)gIn;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)mapped + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-        if (inType == kdouble)
-        {
-            double *inp = (double *)gIn;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)mapped + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-    }
-    else if (inType == kfloat || inType == kdouble)
-    { // outtype and intype is float or double.  NaN conversions for float <->
-      // double can be any NaN
-        if (inType == kfloat && outType == kdouble)
-        {
-            float *inp = (float *)gIn;
-            double *outp = (double *)mapped;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
-            }
-        }
-        if (inType == kdouble && outType == kfloat)
-        {
-            double *inp = (double *)gIn;
-            float *outp = (float *)mapped;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
-            }
-        }
-    }
+    FixNanConversions(outType, inType, mapped, count);
 
     if (memcmp(mapped, gRef, count * gTypeSizes[outType]))
         info->result =
@@ -1077,12 +1240,8 @@ void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
     // CalcReferenceValuesComplete exit.
 }
 
-//
-
 namespace conv_test {
 
-////////////////////////////////////////////////////////////////////////////////
-
 cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
 {
     DataInitBase *info = (DataInitBase *)p;
@@ -1092,8 +1251,6 @@ cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
     return CL_SUCCESS;
 }
 
-////////////////////////////////////////////////////////////////////////////////
-
 cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
 {
     DataInitBase *info = (DataInitBase *)p;
@@ -1102,7 +1259,6 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
     Type inType = info->inType;
     Type outType = info->outType;
     RoundingMode round = info->round;
-    size_t j;
 
     Force64BitFPUPrecision();
 
@@ -1110,7 +1266,6 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
     void *a = (cl_uchar *)gAllowZ + job_id * count;
     void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType];
 
-
     if (outType != inType)
     {
         // create the reference while we wait
@@ -1144,7 +1299,33 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
         qcom_sat = info->sat;
 #endif
 
-        RoundingMode oldRound = set_round(round, outType);
+        RoundingMode oldRound;
+        if (outType == khalf)
+        {
+            oldRound = set_round(kRoundToNearestEven, kfloat);
+            switch (round)
+            {
+                default:
+                case kDefaultRoundingMode:
+                    DataInitInfo::halfRoundingMode =
+                        ConversionsTest::defaultHalfRoundingMode;
+                    break;
+                case kRoundToNearestEven:
+                    DataInitInfo::halfRoundingMode = CL_HALF_RTE;
+                    break;
+                case kRoundUp:
+                    DataInitInfo::halfRoundingMode = CL_HALF_RTP;
+                    break;
+                case kRoundDown:
+                    DataInitInfo::halfRoundingMode = CL_HALF_RTN;
+                    break;
+                case kRoundTowardZero:
+                    DataInitInfo::halfRoundingMode = CL_HALF_RTZ;
+                    break;
+            }
+        }
+        else
+            oldRound = set_round(round, outType);
 
         if (info->sat)
             info->conv_array_sat(d, s, count);
@@ -1161,6 +1342,11 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
             if (inType == kfloat || outType == kfloat)
                 setAllowZ((uint8_t *)a, (uint32_t *)s, count);
         }
+        if (gForceHalfFTZ)
+        {
+            if (inType == khalf || outType == khalf)
+                setAllowZ((uint8_t *)a, (uint32_t *)s, count);
+        }
     }
     else
     {
@@ -1170,55 +1356,11 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
 
     // Patch up NaNs conversions to integer to zero -- these can be converted to
     // any integer
-    if (info->outType != kfloat && info->outType != kdouble)
-    {
-        if (inType == kfloat)
-        {
-            float *inp = (float *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)d + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-        if (inType == kdouble)
-        {
-            double *inp = (double *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)d + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-    }
-    else if (inType == kfloat || inType == kdouble)
-    { // outtype and intype is float or double.  NaN conversions for float <->
-      // double can be any NaN
-        if (inType == kfloat && outType == kdouble)
-        {
-            float *inp = (float *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j])) ((double *)d)[j] = NAN;
-            }
-        }
-        if (inType == kdouble && outType == kfloat)
-        {
-            double *inp = (double *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j])) ((float *)d)[j] = NAN;
-            }
-        }
-    }
+    FixNanConversions(outType, inType, d, count);
 
     return CL_SUCCESS;
 }
 
-////////////////////////////////////////////////////////////////////////////////
-
 uint64_t GetTime(void)
 {
 #if defined(__APPLE__)
@@ -1233,8 +1375,6 @@ uint64_t GetTime(void)
 #endif
 }
 
-////////////////////////////////////////////////////////////////////////////////
-
 // Note: not called reentrantly
 void WriteInputBufferComplete(void *data)
 {
@@ -1295,8 +1435,6 @@ void WriteInputBufferComplete(void *data)
     // automatically soon after we exit.
 }
 
-////////////////////////////////////////////////////////////////////////////////
-
 cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
                        RoundingMode round, int vectorSize, cl_kernel *outKernel)
 {
@@ -1308,6 +1446,9 @@ cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
     if (outType == kdouble || inType == kdouble)
         source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
 
+    if (outType == khalf || inType == khalf)
+        source << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+
     // Create the program. This is a bit complicated because we are trying to
     // avoid byte and short stores.
     if (0 == vectorSize)
@@ -1408,7 +1549,7 @@ cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
     *outKernel = NULL;
 
     const char *flags = NULL;
-    if (gForceFTZ) flags = "-cl-denorms-are-zero";
+    if (gForceFTZ || gForceHalfFTZ) flags = "-cl-denorms-are-zero";
 
     // build it
     std::string sourceString = source.str();
diff --git a/test_conformance/conversions/basic_test_conversions.h b/test_conformance/conversions/basic_test_conversions.h
index 2314ee748b..6fe88461db 100644
--- a/test_conformance/conversions/basic_test_conversions.h
+++ b/test_conformance/conversions/basic_test_conversions.h
@@ -30,6 +30,8 @@
     #include <CL/opencl.h>
 #endif
 
+#include <CL/cl_half.h>
+
 #include "harness/mt19937.h"
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
@@ -76,6 +78,8 @@ extern cl_mem gInBuffer;
 extern cl_mem gOutBuffers[];
 extern int gHasDouble;
 extern int gTestDouble;
+extern int gHasHalfs;
+extern int gTestHalfs;
 extern int gWimpyMode;
 extern int gWimpyReductionFactor;
 extern int gSkipTesting;
@@ -87,6 +91,8 @@ extern int gReportAverageTimes;
 extern int gStartTestNumber;
 extern int gEndTestNumber;
 extern int gIsRTZ;
+extern int gForceHalfFTZ;
+extern int gIsHalfRTZ;
 extern void *gIn;
 extern void *gRef;
 extern void *gAllowZ;
@@ -135,7 +141,7 @@ struct CalcRefValsBase
     cl_int result;
 };
 
-template <typename InType, typename OutType>
+template <typename InType, typename OutType, bool InFP, bool OutFP>
 struct CalcRefValsPat : CalcRefValsBase
 {
     int check_result(void *, uint32_t, int) override;
@@ -162,8 +168,12 @@ struct WriteInputBufferInfo
 };
 
 // Must be aligned with Type enums!
-using TypeIter = std::tuple<cl_uchar, cl_char, cl_ushort, cl_short, cl_uint,
-                            cl_int, cl_float, cl_double, cl_ulong, cl_long>;
+using TypeIter =
+    std::tuple<cl_uchar, cl_char, cl_ushort, cl_short, cl_uint, cl_int, cl_half,
+               cl_float, cl_double, cl_ulong, cl_long>;
+
+// hardcoded solution needed due to typeid confusing cl_ushort/cl_half
+constexpr bool isTypeFp[] = { 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0 };
 
 // Helper test fixture for constructing OpenCL objects used in testing
 // a variety of simple command-buffer enqueue scenarios.
@@ -179,13 +189,13 @@ struct ConversionsTest
     // Test body returning an OpenCL error code
     cl_int Run();
 
-    template <typename InType, typename OutType>
+    template <typename InType, typename OutType, bool InFP, bool OutFP>
     int DoTest(Type outType, Type inType, SaturationMode sat,
                RoundingMode round);
 
-    template <typename InType, typename OutType>
+    template <typename InType, typename OutType, bool InFP, bool OutFP>
     void TestTypesConversion(const Type &inType, const Type &outType, int &tn,
-                             const int smvs);
+                             int startMinVectorSize);
 
 protected:
     cl_context context;
@@ -195,6 +205,9 @@ struct ConversionsTest
     size_t num_elements;
 
     TypeIter typeIterator;
+
+public:
+    static cl_half_rounding_mode defaultHalfRoundingMode;
 };
 
 struct CustomConversionsTest : ConversionsTest
@@ -221,17 +234,18 @@ int MakeAndRunTest(cl_device_id device, cl_context context,
 
 struct TestType
 {
-    template <typename T> bool testType(Type in)
+    template <typename T, bool FP> bool testType(Type in)
     {
         switch (in)
         {
             default: return false;
             case kuchar: return std::is_same<cl_uchar, T>::value;
             case kchar: return std::is_same<cl_char, T>::value;
-            case kushort: return std::is_same<cl_ushort, T>::value;
+            case kushort: return std::is_same<cl_ushort, T>::value && !FP;
             case kshort: return std::is_same<cl_short, T>::value;
             case kuint: return std::is_same<cl_uint, T>::value;
             case kint: return std::is_same<cl_int, T>::value;
+            case khalf: return std::is_same<cl_half, T>::value && FP;
             case kfloat: return std::is_same<cl_float, T>::value;
             case kdouble: return std::is_same<cl_double, T>::value;
             case kulong: return std::is_same<cl_ulong, T>::value;
@@ -263,13 +277,15 @@ struct IterOverTypes : public TestType
               typename InType>
     void iterate_in_type(const InType &t)
     {
-        if (!testType<InType>(inType)) vlog_error("Unexpected data type!\n");
+        if (!testType<InType, isTypeFp[In]>(inType))
+            vlog_error("Unexpected data type!\n");
 
-        if (!testType<OutType>(outType)) vlog_error("Unexpected data type!\n");
+        if (!testType<OutType, isTypeFp[Out]>(outType))
+            vlog_error("Unexpected data type!\n");
 
         // run the conversions
-        test.TestTypesConversion<InType, OutType>(inType, outType, testNumber,
-                                                  startMinVectorSize);
+        test.TestTypesConversion<InType, OutType, isTypeFp[In], isTypeFp[Out]>(
+            inType, outType, testNumber, startMinVectorSize);
         inType = (Type)(inType + 1);
     }
 
@@ -337,11 +353,13 @@ struct IterOverSelectedTypes : public TestType
               typename InType>
     void iterate_in_type(const InType &t)
     {
-        if (testType<InType>(inType) && testType<OutType>(outType))
+        if (testType<InType, isTypeFp[In]>(inType)
+            && testType<OutType, isTypeFp[Out]>(outType))
         {
             // run selected conversion
             // testing of the result will happen afterwards
-            test.DoTest<InType, OutType>(outType, inType, saturation, rounding);
+            test.DoTest<InType, OutType, isTypeFp[In], isTypeFp[Out]>(
+                outType, inType, saturation, rounding);
         }
     }
 
diff --git a/test_conformance/conversions/conversions_data_info.h b/test_conformance/conversions/conversions_data_info.h
index c62d11ae75..043c509d1f 100644
--- a/test_conformance/conversions/conversions_data_info.h
+++ b/test_conformance/conversions/conversions_data_info.h
@@ -28,8 +28,11 @@ extern bool qcom_sat;
 extern roundingMode qcom_rm;
 #endif
 
+#include <CL/cl_half.h>
+
 #include "harness/mt19937.h"
 #include "harness/rounding_mode.h"
+#include "harness/typeWrappers.h"
 
 #include <vector>
 
@@ -60,11 +63,17 @@ struct DataInitInfo
     RoundingMode round;
     cl_uint threads;
 
+    static cl_half_rounding_mode halfRoundingMode;
     static std::vector<uint32_t> specialValuesUInt;
     static std::vector<float> specialValuesFloat;
     static std::vector<double> specialValuesDouble;
+    static std::vector<cl_half> specialValuesHalf;
 };
 
+#define HFF(num) cl_half_from_float(num, DataInitInfo::halfRoundingMode)
+#define HTF(num) cl_half_to_float(num)
+#define HFD(num) cl_half_from_double(num, DataInitInfo::halfRoundingMode)
+
 struct DataInitBase : public DataInitInfo
 {
     virtual ~DataInitBase() = default;
@@ -75,7 +84,7 @@ struct DataInitBase : public DataInitInfo
     virtual void init(const cl_uint &, const cl_uint &) {}
 };
 
-template <typename InType, typename OutType>
+template <typename InType, typename OutType, bool InFP, bool OutFP>
 struct DataInfoSpec : public DataInitBase
 {
     explicit DataInfoSpec(const DataInitInfo &agg);
@@ -98,6 +107,16 @@ struct DataInfoSpec : public DataInitBase
 
     std::vector<MTdataHolder> mdv;
 
+    constexpr bool is_in_half() const
+    { // True only when InType is cl_half and the InFP flag is set.
+        return (std::is_same<InType, cl_half>::value && InFP);
+    } // NOTE(review): InFP presumably separates half from ushort - confirm.
+
+    constexpr bool is_out_half() const
+    { // True only when OutType is cl_half and the OutFP flag is set.
+        return (std::is_same<OutType, cl_half>::value && OutFP);
+    } // NOTE(review): OutFP presumably separates half from ushort - confirm.
+
     void conv_array(void *out, void *in, size_t n) override
     {
         for (size_t i = 0; i < n; i++)
@@ -125,19 +144,22 @@ struct DataInfoSpec : public DataInitBase
     }
 };
 
-template <typename InType, typename OutType>
-DataInfoSpec<InType, OutType>::DataInfoSpec(const DataInitInfo &agg)
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+DataInfoSpec<InType, OutType, InFP, OutFP>::DataInfoSpec(
+    const DataInitInfo &agg)
     : DataInitBase(agg), mdv(0)
 {
     if (std::is_same<cl_float, OutType>::value)
         ranges = std::make_pair(CL_FLT_MIN, CL_FLT_MAX);
     else if (std::is_same<cl_double, OutType>::value)
         ranges = std::make_pair(CL_DBL_MIN, CL_DBL_MAX);
+    else if (std::is_same<cl_half, OutType>::value && OutFP)
+        ranges = std::make_pair(HFF(CL_HALF_MIN), HFF(CL_HALF_MAX));
     else if (std::is_same<cl_uchar, OutType>::value)
         ranges = std::make_pair(0, CL_UCHAR_MAX);
     else if (std::is_same<cl_char, OutType>::value)
         ranges = std::make_pair(CL_CHAR_MIN, CL_CHAR_MAX);
-    else if (std::is_same<cl_ushort, OutType>::value)
+    else if (std::is_same<cl_ushort, OutType>::value && !OutFP)
         ranges = std::make_pair(0, CL_USHRT_MAX);
     else if (std::is_same<cl_short, OutType>::value)
         ranges = std::make_pair(CL_SHRT_MIN, CL_SHRT_MAX);
@@ -158,12 +180,12 @@ DataInfoSpec<InType, OutType>::DataInfoSpec(const DataInitInfo &agg)
         InType outMax = static_cast<InType>(ranges.second);
 
         InType eps = std::is_same<InType, cl_float>::value ? (InType) FLT_EPSILON : (InType) DBL_EPSILON;
-        if (std::is_integral<OutType>::value)
+        if (std::is_integral<OutType>::value && !OutFP)
         { // to char/uchar/short/ushort/int/uint/long/ulong
             if (sizeof(OutType)<=sizeof(cl_short))
             { // to char/uchar/short/ushort
                 clamp_ranges=
-                {{outMin-0.5f, outMax + 0.5f - outMax * 0.5f * eps},
+                 {{outMin-0.5f, outMax + 0.5f - outMax * 0.5f * eps},
                   {outMin-0.5f, outMax + 0.5f - outMax * 0.5f * eps},
                   {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, outMax-1.f},
                   {outMin-0.0f, outMax - outMax * 0.5f * eps },
@@ -249,11 +271,55 @@ DataInfoSpec<InType, OutType>::DataInfoSpec(const DataInitInfo &agg)
             }
         }
     }
+    else if (is_in_half())
+    {
+        float outMin = static_cast<float>(ranges.first);
+        float outMax = static_cast<float>(ranges.second);
+        float eps = CL_HALF_EPSILON;
+        cl_half_rounding_mode prev_half_round = DataInitInfo::halfRoundingMode;
+        DataInitInfo::halfRoundingMode = CL_HALF_RTZ;
+
+        if (std::is_integral<OutType>::value)
+        { // to char/uchar/short/ushort/int/uint/long/ulong
+            if (sizeof(OutType)<=sizeof(cl_char) || std::is_same<OutType, cl_short>::value)
+            { // to char/uchar/short
+                clamp_ranges=
+                 {{HFF(outMin-0.5f), HFF(outMax + 0.5f - outMax * 0.5f * eps)},
+                  {HFF(outMin-0.5f), HFF(outMax + 0.5f - outMax * 0.5f * eps)},
+                  {HFF(outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps), HFF(outMax-1.f)},
+                  {HFF(outMin-0.0f), HFF(outMax - outMax * 0.5f * eps) },
+                  {HFF(outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps), HFF(outMax - outMax * 0.5f * eps)}};
+            }
+            else
+            { // to ushort/int/uint/long/ulong
+                if (std::is_signed<OutType>::value)
+                {
+                    clamp_ranges=
+                    { {HFF(-CL_HALF_MAX), HFF(CL_HALF_MAX)},
+                      {HFF(-CL_HALF_MAX), HFF(CL_HALF_MAX)},
+                      {HFF(-CL_HALF_MAX), HFF(CL_HALF_MAX)},
+                      {HFF(-CL_HALF_MAX), HFF(CL_HALF_MAX)},
+                      {HFF(-CL_HALF_MAX), HFF(CL_HALF_MAX)}};
+                }
+                else
+                {
+                    clamp_ranges=
+                    { {HFF(outMin), HFF(CL_HALF_MAX)},
+                      {HFF(outMin), HFF(CL_HALF_MAX)},
+                      {HFF(outMin), HFF(CL_HALF_MAX)},
+                      {HFF(outMin), HFF(CL_HALF_MAX)},
+                      {HFF(outMin), HFF(CL_HALF_MAX)}};
+                }
+            }
+        }
+
+        DataInitInfo::halfRoundingMode = prev_half_round;
+    }
     // clang-format on
 }
 
-template <typename InType, typename OutType>
-float DataInfoSpec<InType, OutType>::round_to_int(float f)
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+float DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int(float f)
 {
     static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23),
                                     -MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23) };
@@ -281,8 +347,9 @@ float DataInfoSpec<InType, OutType>::round_to_int(float f)
     return f;
 }
 
-template <typename InType, typename OutType>
-long long DataInfoSpec<InType, OutType>::round_to_int_and_clamp(double f)
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+long long
+DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int_and_clamp(double f)
 {
     static const double magic[2] = { MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52),
                                      MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52) };
@@ -313,8 +380,8 @@ long long DataInfoSpec<InType, OutType>::round_to_int_and_clamp(double f)
     return (long long)f;
 }
 
-template <typename InType, typename OutType>
-OutType DataInfoSpec<InType, OutType>::absolute(const OutType &x)
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+OutType DataInfoSpec<InType, OutType, InFP, OutFP>::absolute(const OutType &x)
 {
     union {
         cl_uint u;
@@ -331,17 +398,30 @@ OutType DataInfoSpec<InType, OutType>::absolute(const OutType &x)
     return u.f;
 }
 
-template <typename InType, typename OutType>
-void DataInfoSpec<InType, OutType>::conv(OutType *out, InType *in)
+template <typename T, bool fp> constexpr bool is_half()
+{ // Compile-time check: true only when T is cl_half and fp is true.
+    return (std::is_same<cl_half, T>::value && fp);
+}
+
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
 {
-    if (std::is_same<cl_float, InType>::value)
+    if (std::is_same<cl_float, InType>::value || is_in_half())
     {
         cl_float inVal = *in;
+        if (std::is_same<cl_half, InType>::value)
+        {
+            inVal = HTF(*in);
+        }
 
         if (std::is_floating_point<OutType>::value)
         {
             *out = (OutType)inVal;
         }
+        else if (is_out_half())
+        {
+            *out = HFF(*in);
+        }
         else if (std::is_same<cl_ulong, OutType>::value)
         {
 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
@@ -376,6 +456,8 @@ void DataInfoSpec<InType, OutType>::conv(OutType *out, InType *in)
     {
         if (std::is_same<cl_float, OutType>::value)
             *out = (OutType)*in;
+        else if (is_out_half())
+            *out = static_cast<OutType>(HFD(*in));
         else
             *out = rint(*in);
     }
@@ -417,7 +499,7 @@ void DataInfoSpec<InType, OutType>::conv(OutType *out, InType *in)
             *out = (vi == 0 ? 0.0 : static_cast<OutType>(vi));
 #endif
         }
-        else if (std::is_same<cl_float, OutType>::value)
+        else if (std::is_same<cl_float, OutType>::value || is_out_half())
         {
             cl_float outVal = 0.f;
 
@@ -463,7 +545,9 @@ void DataInfoSpec<InType, OutType>::conv(OutType *out, InType *in)
 #endif
 #endif
 
-            *out = outVal;
+            *out = std::is_same<cl_half, OutType>::value
+                ? static_cast<OutType>(HFF(outVal))
+                : outVal;
         }
         else
         {
@@ -484,6 +568,8 @@ void DataInfoSpec<InType, OutType>::conv(OutType *out, InType *in)
             // Per IEEE-754-2008 5.4.1, 0 always converts to +0.0
             *out = (*in == 0 ? 0.0 : *in);
         }
+        else if (is_out_half())
+            *out = static_cast<OutType>(HFF(*in == 0 ? 0.f : *in));
         else
         {
             *out = (OutType)*in;
@@ -494,19 +580,26 @@ void DataInfoSpec<InType, OutType>::conv(OutType *out, InType *in)
 #define CLAMP(_lo, _x, _hi)                                                    \
     ((_x) < (_lo) ? (_lo) : ((_x) > (_hi) ? (_hi) : (_x)))
 
-template <typename InType, typename OutType>
-void DataInfoSpec<InType, OutType>::conv_sat(OutType *out, InType *in)
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+void DataInfoSpec<InType, OutType, InFP, OutFP>::conv_sat(OutType *out,
+                                                          InType *in)
 {
-    if (std::is_floating_point<InType>::value)
+    if (std::is_floating_point<InType>::value || is_in_half())
     {
-        if (std::is_floating_point<OutType>::value)
-        { // in float/double, out float/double
-            *out = (OutType)(*in);
+        cl_float inVal = *in;
+        if (is_in_half()) inVal = HTF(*in);
+
+        if (std::is_floating_point<OutType>::value || is_out_half())
+        { // in half/float/double, out half/float/double
+            if (is_out_half())
+                *out = static_cast<OutType>(HFF(inVal));
+            else
+                *out = (OutType)(is_in_half() ? inVal : *in);
         }
-        else if ((std::is_same<InType, cl_float>::value)
+        else if ((std::is_same<InType, cl_float>::value || is_in_half())
                  && std::is_same<cl_ulong, OutType>::value)
         {
-            cl_float x = round_to_int(*in);
+            cl_float x = round_to_int(is_in_half() ? HTF(*in) : *in);
 
 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
             // VS2005 (at least) on x86 uses fistp to store the float as a
@@ -534,47 +627,57 @@ void DataInfoSpec<InType, OutType>::conv_sat(OutType *out, InType *in)
             }
 #else
             *out = x >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64)
-                ? 0xFFFFFFFFFFFFFFFFULL
-                : x < 0 ? 0 : (OutType)x;
+                ? (OutType)0xFFFFFFFFFFFFFFFFULL
+                : x < 0 ? 0
+                        : (OutType)x;
 #endif
         }
-        else if ((std::is_same<InType, cl_float>::value)
+        else if ((std::is_same<InType, cl_float>::value || is_in_half())
                  && std::is_same<cl_long, OutType>::value)
         {
-            cl_float f = round_to_int(*in);
+            cl_float f = round_to_int(is_in_half() ? HTF(*in) : *in);
             *out = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63)
-                ? 0x7FFFFFFFFFFFFFFFULL
+                ? (OutType)0x7FFFFFFFFFFFFFFFULL
                 : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63)
-                    ? 0x8000000000000000LL
-                    : (OutType)f;
+                ? (OutType)0x8000000000000000LL
+                : (OutType)f;
         }
         else if (std::is_same<InType, cl_double>::value
                  && std::is_same<cl_ulong, OutType>::value)
         {
             InType f = rint(*in);
             *out = f >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64)
-                ? 0xFFFFFFFFFFFFFFFFULL
-                : f < 0 ? 0 : (OutType)f;
+                ? (OutType)0xFFFFFFFFFFFFFFFFULL
+                : f < 0 ? 0
+                        : (OutType)f;
         }
         else if (std::is_same<InType, cl_double>::value
                  && std::is_same<cl_long, OutType>::value)
         {
             InType f = rint(*in);
             *out = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63)
-                ? 0x7FFFFFFFFFFFFFFFULL
+                ? (OutType)0x7FFFFFFFFFFFFFFFULL
                 : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63)
-                    ? 0x8000000000000000LL
-                    : (OutType)f;
+                ? (OutType)0x8000000000000000LL
+                : (OutType)f;
         }
         else
-        { // in float/double, out char/uchar/short/ushort/int/uint
-            *out =
-                CLAMP(ranges.first, round_to_int_and_clamp(*in), ranges.second);
+        { // in half/float/double, out char/uchar/short/ushort/int/uint
+            *out = CLAMP(ranges.first,
+                         round_to_int_and_clamp(is_in_half() ? inVal : *in),
+                         ranges.second);
         }
     }
     else if (std::is_integral<InType>::value
              && std::is_integral<OutType>::value)
     {
+        if (is_out_half())
+        {
+            *out = std::is_signed<InType>::value
+                ? static_cast<OutType>(HFF((cl_float)*in))
+                : absolute(static_cast<OutType>(HFF((cl_float)*in)));
+        }
+        else
         {
             if ((std::is_signed<InType>::value
                  && std::is_signed<OutType>::value)
@@ -612,14 +715,40 @@ void DataInfoSpec<InType, OutType>::conv_sat(OutType *out, InType *in)
     }
 }
 
-template <typename InType, typename OutType>
-void DataInfoSpec<InType, OutType>::init(const cl_uint &job_id,
-                                         const cl_uint &thread_id)
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+void DataInfoSpec<InType, OutType, InFP, OutFP>::init(const cl_uint &job_id,
+                                                      const cl_uint &thread_id)
 {
     uint64_t ulStart = start;
     void *pIn = (char *)gIn + job_id * size * gTypeSizes[inType];
 
-    if (std::is_integral<InType>::value)
+    if (is_in_half())
+    {
+        cl_half *o = (cl_half *)pIn;
+        int i;
+
+        if (gIsEmbedded)
+            for (i = 0; i < size; i++)
+                o[i] = (cl_half)genrand_int32(mdv[thread_id]);
+        else
+            for (i = 0; i < size; i++) o[i] = (cl_half)((i + ulStart) % 0xffff);
+
+        if (0 == ulStart)
+        {
+            size_t tableSize = specialValuesHalf.size()
+                * sizeof(decltype(specialValuesHalf)::value_type);
+            if (sizeof(InType) * size < tableSize)
+                tableSize = sizeof(InType) * size;
+            memcpy((char *)(o + i) - tableSize, &specialValuesHalf.front(),
+                   tableSize);
+        }
+
+        if (kUnsaturated == sat)
+        {
+            for (i = 0; i < size; i++) o[i] = clamp(o[i]);
+        }
+    }
+    else if (std::is_integral<InType>::value)
     {
         InType *o = (InType *)pIn;
         if (sizeof(InType) <= sizeof(cl_short))
@@ -774,10 +903,10 @@ void DataInfoSpec<InType, OutType>::init(const cl_uint &job_id,
     }
 }
 
-template <typename InType, typename OutType>
-InType DataInfoSpec<InType, OutType>::clamp(const InType &in)
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+InType DataInfoSpec<InType, OutType, InFP, OutFP>::clamp(const InType &in)
 {
-    if (std::is_integral<OutType>::value)
+    if (std::is_integral<OutType>::value && !OutFP)
     {
         if (std::is_same<InType, cl_float>::value)
         {
@@ -789,6 +918,11 @@ InType DataInfoSpec<InType, OutType>::clamp(const InType &in)
             return dclamp(clamp_ranges[round].first, in,
                           clamp_ranges[round].second);
         }
+        else if (std::is_same<InType, cl_half>::value && InFP)
+        {
+            return HFF(fclamp(HTF(clamp_ranges[round].first), HTF(in),
+                              HTF(clamp_ranges[round].second)));
+        }
     }
     return in;
 }
diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp
index dab61dc500..b7d6b07156 100644
--- a/test_conformance/conversions/test_conversions.cpp
+++ b/test_conformance/conversions/test_conversions.cpp
@@ -73,9 +73,9 @@ static void PrintUsage(void);
 test_status InitCL(cl_device_id device);
 
 
-const char *gTypeNames[kTypeCount] = { "uchar", "char", "ushort", "short",
-                                       "uint",  "int",  "float",  "double",
-                                       "ulong", "long" };
+const char *gTypeNames[kTypeCount] = { "uchar",  "char",  "ushort", "short",
+                                       "uint",   "int",   "half",   "float",
+                                       "double", "ulong", "long" };
 
 const char *gRoundingModeNames[kRoundingModeCount] = { "", "_rte", "_rtp",
                                                        "_rtn", "_rtz" };
@@ -83,9 +83,9 @@ const char *gRoundingModeNames[kRoundingModeCount] = { "", "_rte", "_rtp",
 const char *gSaturationNames[2] = { "", "_sat" };
 
 size_t gTypeSizes[kTypeCount] = {
-    sizeof(cl_uchar), sizeof(cl_char), sizeof(cl_ushort), sizeof(cl_short),
-    sizeof(cl_uint),  sizeof(cl_int),  sizeof(cl_float),  sizeof(cl_double),
-    sizeof(cl_ulong), sizeof(cl_long),
+    sizeof(cl_uchar),  sizeof(cl_char),  sizeof(cl_ushort), sizeof(cl_short),
+    sizeof(cl_uint),   sizeof(cl_int),   sizeof(cl_half),   sizeof(cl_float),
+    sizeof(cl_double), sizeof(cl_ulong), sizeof(cl_long),
 };
 
 char appName[64] = "ctest";
@@ -221,13 +221,17 @@ static int ParseArgs(int argc, const char **argv)
                 switch (*arg)
                 {
                     case 'd': gTestDouble ^= 1; break;
+                    case 'h': gTestHalfs ^= 1; break;
                     case 'l': gSkipTesting ^= 1; break;
                     case 'm': gMultithread ^= 1; break;
                     case 'w': gWimpyMode ^= 1; break;
                     case '[':
                         parseWimpyReductionFactor(arg, gWimpyReductionFactor);
                         break;
-                    case 'z': gForceFTZ ^= 1; break;
+                    case 'z':
+                        gForceFTZ ^= 1;
+                        gForceHalfFTZ ^= 1;
+                        break;
                     case 't': gTimeResults ^= 1; break;
                     case 'a': gReportAverageTimes ^= 1; break;
                     case '1':
@@ -355,7 +359,6 @@ static void PrintUsage(void)
 }
 
 
-
 test_status InitCL(cl_device_id device)
 {
     int error, i;
@@ -412,6 +415,50 @@ test_status InitCL(cl_device_id device)
     }
     gTestDouble &= gHasDouble;
 
+    if (is_extension_available(device, "cl_khr_fp16"))
+    {
+        gHasHalfs = 1;
+
+        cl_device_fp_config floatCapabilities = 0;
+        if ((error = clGetDeviceInfo(device, CL_DEVICE_HALF_FP_CONFIG,
+                                     sizeof(floatCapabilities),
+                                     &floatCapabilities, NULL)))
+            floatCapabilities = 0;
+
+        if (0 == (CL_FP_DENORM & floatCapabilities)) gForceHalfFTZ ^= 1;
+
+        if (0 == (floatCapabilities & CL_FP_ROUND_TO_NEAREST))
+        {
+            char profileStr[128] = "";
+            // Verify that we are an embedded profile device
+            if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE,
+                                         sizeof(profileStr), profileStr, NULL)))
+            {
+                vlog_error("FAILURE: Could not get device profile: error %d\n",
+                           error);
+                return TEST_FAIL;
+            }
+
+            if (strcmp(profileStr, "EMBEDDED_PROFILE"))
+            {
+                vlog_error(
+                    "FAILURE: non-embedded profile device does not support "
+                    "CL_FP_ROUND_TO_NEAREST\n");
+                return TEST_FAIL;
+            }
+
+            if (0 == (floatCapabilities & CL_FP_ROUND_TO_ZERO))
+            {
+                vlog_error("FAILURE: embedded profile device supports neither "
+                           "CL_FP_ROUND_TO_NEAREST or CL_FP_ROUND_TO_ZERO\n");
+                return TEST_FAIL;
+            }
+
+            gIsHalfRTZ = 1;
+        }
+    }
+    gTestHalfs &= gHasHalfs;
+
     // detect whether profile of the device is embedded
     char profile[1024] = "";
     if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile),
@@ -492,8 +539,12 @@ test_status InitCL(cl_device_id device)
     vlog("\tSubnormal values supported for floats? %s\n",
          no_yes[0 != (CL_FP_DENORM & floatCapabilities)]);
     vlog("\tTesting with FTZ mode ON for floats? %s\n", no_yes[0 != gForceFTZ]);
+    vlog("\tTesting with FTZ mode ON for halfs? %s\n",
+         no_yes[0 != gForceHalfFTZ]);
     vlog("\tTesting with default RTZ mode for floats? %s\n",
          no_yes[0 != gIsRTZ]);
+    vlog("\tTesting with default RTZ mode for halfs? %s\n",
+         no_yes[0 != gIsHalfRTZ]);
     vlog("\tHas Double? %s\n", no_yes[0 != gHasDouble]);
     if (gHasDouble) vlog("\tTest Double? %s\n", no_yes[0 != gTestDouble]);
     vlog("\tHas Long? %s\n", no_yes[0 != gHasLong]);
@@ -503,5 +554,3 @@ test_status InitCL(cl_device_id device)
     vlog("\n");
     return TEST_PASS;
 }
-
-

From 87dc09c66ff3b41ed10cebefdf1b84fc17e46af3 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Mon, 18 Dec 2023 19:15:31 +0100
Subject: [PATCH 2/7] Fp16 math bruteforce staging (#1863)

* Enable fp16 in math bruteforce

* Added modernization of remaining half tests for consistency (issue #142, bruteforce)

* Added kernel types related corrections

* Added more fixes and general cleanup

* Corrected ULP values for half tests (issue #142, bruteforce)

* Corrected presubmit check for clang format

* Added support for ternary, unary_two_result and unary_two_result_i tests for cl_half (issue #142, bruteforce)

* Added missing condition due to vendor's review

* code format correction

* Added check for lack of support for denormals in binary_half scenario

* Corrected procedure to compute nextafter cl_half for flush-to-zero mode

* Added correction for external check of reference value for nextafter test

* Added correction due to code review request

* Changed quantity of tests performed for half in unary and macro_unary procedures from basic

* Added corrections related to code review:

-added binary_operator_half.cpp and binary_two_results_i_half.cpp
-address sanitizer errors fixed
-extending list of special half values
-removed unnecessary relaxed math references in half tests
-corrected conditions to verify ulp narrowing of computation results
-several refactoring and cosmetics corrections

* Print format correction due to failed CI check

* Corrected bug found in code review (fp16 bruteforce)

* Corrections related to code review (cl_khr_fp16 support according to #142)

-gHostFill missing support added
-special half values array extended
-cosmetics and unifying

* clang format applied

* consistency correction

* more consistency corrections for cl_fp16_khr supported tests

* Corrections related to code review (bruteforce #142)

* Correction for i_unary_half test capacity

* Corrections related to capacity of cl_khr_fp16 tests in bruteforce (#142)

---------

Co-authored-by: Wawiorko, Grzegorz <grzegorz.wawiorko@intel.com>
---
 .../math_brute_force/CMakeLists.txt           |  13 +
 .../math_brute_force/binary_half.cpp          | 784 ++++++++++++++++++
 .../math_brute_force/binary_i_double.cpp      |   6 +-
 .../math_brute_force/binary_i_float.cpp       |   7 +-
 .../math_brute_force/binary_i_half.cpp        | 548 ++++++++++++
 .../binary_operator_double.cpp                |   3 +-
 .../binary_operator_float.cpp                 |   3 +-
 .../math_brute_force/binary_operator_half.cpp | 680 +++++++++++++++
 .../binary_two_results_i_half.cpp             | 477 +++++++++++
 test_conformance/math_brute_force/common.cpp  |  13 +-
 test_conformance/math_brute_force/common.h    |   4 +
 .../math_brute_force/function_list.cpp        | 274 +++---
 .../math_brute_force/function_list.h          |   4 +
 .../math_brute_force/i_unary_half.cpp         | 220 +++++
 .../math_brute_force/macro_binary_double.cpp  |   3 +-
 .../math_brute_force/macro_binary_float.cpp   |   3 +-
 .../math_brute_force/macro_binary_half.cpp    | 540 ++++++++++++
 .../math_brute_force/macro_unary_half.cpp     | 427 ++++++++++
 .../math_brute_force/mad_half.cpp             | 201 +++++
 test_conformance/math_brute_force/main.cpp    |  77 +-
 .../math_brute_force/reference_math.cpp       |  43 +
 .../math_brute_force/reference_math.h         |   4 +
 .../math_brute_force/ternary_double.cpp       |   3 +-
 .../math_brute_force/ternary_float.cpp        |   3 +-
 .../math_brute_force/ternary_half.cpp         | 777 +++++++++++++++++
 .../math_brute_force/test_functions.h         |  51 +-
 .../math_brute_force/unary_half.cpp           | 483 +++++++++++
 .../unary_two_results_half.cpp                | 452 ++++++++++
 .../unary_two_results_i_half.cpp              | 347 ++++++++
 .../math_brute_force/unary_u_half.cpp         | 239 ++++++
 test_conformance/math_brute_force/utility.h   |  37 +
 31 files changed, 6581 insertions(+), 145 deletions(-)
 create mode 100644 test_conformance/math_brute_force/binary_half.cpp
 create mode 100644 test_conformance/math_brute_force/binary_i_half.cpp
 create mode 100644 test_conformance/math_brute_force/binary_operator_half.cpp
 create mode 100644 test_conformance/math_brute_force/binary_two_results_i_half.cpp
 create mode 100644 test_conformance/math_brute_force/i_unary_half.cpp
 create mode 100644 test_conformance/math_brute_force/macro_binary_half.cpp
 create mode 100644 test_conformance/math_brute_force/macro_unary_half.cpp
 create mode 100644 test_conformance/math_brute_force/mad_half.cpp
 create mode 100644 test_conformance/math_brute_force/ternary_half.cpp
 create mode 100644 test_conformance/math_brute_force/unary_half.cpp
 create mode 100644 test_conformance/math_brute_force/unary_two_results_half.cpp
 create mode 100644 test_conformance/math_brute_force/unary_two_results_i_half.cpp
 create mode 100644 test_conformance/math_brute_force/unary_u_half.cpp

diff --git a/test_conformance/math_brute_force/CMakeLists.txt b/test_conformance/math_brute_force/CMakeLists.txt
index a221f05add..d53911e433 100644
--- a/test_conformance/math_brute_force/CMakeLists.txt
+++ b/test_conformance/math_brute_force/CMakeLists.txt
@@ -3,24 +3,32 @@ set(MODULE_NAME BRUTEFORCE)
 set(${MODULE_NAME}_SOURCES
     binary_double.cpp
     binary_float.cpp
+    binary_half.cpp
     binary_i_double.cpp
     binary_i_float.cpp
+    binary_i_half.cpp
     binary_operator_double.cpp
     binary_operator_float.cpp
+    binary_operator_half.cpp
     binary_two_results_i_double.cpp
     binary_two_results_i_float.cpp
+    binary_two_results_i_half.cpp
     common.cpp
     common.h
     function_list.cpp
     function_list.h
     i_unary_double.cpp
     i_unary_float.cpp
+    i_unary_half.cpp
     macro_binary_double.cpp
     macro_binary_float.cpp
+    macro_binary_half.cpp
     macro_unary_double.cpp
     macro_unary_float.cpp
+    macro_unary_half.cpp
     mad_double.cpp
     mad_float.cpp
+    mad_half.cpp
     main.cpp
     reference_math.cpp
     reference_math.h
@@ -28,15 +36,20 @@ set(${MODULE_NAME}_SOURCES
     sleep.h
     ternary_double.cpp
     ternary_float.cpp
+    ternary_half.cpp
     test_functions.h
     unary_double.cpp
     unary_float.cpp
+    unary_half.cpp
     unary_two_results_double.cpp
     unary_two_results_float.cpp
+    unary_two_results_half.cpp
     unary_two_results_i_double.cpp
     unary_two_results_i_float.cpp
+    unary_two_results_i_half.cpp
     unary_u_double.cpp
     unary_u_float.cpp
+    unary_u_half.cpp
     utility.cpp
     utility.h
 )
diff --git a/test_conformance/math_brute_force/binary_half.cpp b/test_conformance/math_brute_force/binary_half.cpp
new file mode 100644
index 0000000000..f80a085370
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_half.cpp
@@ -0,0 +1,784 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "harness/errorHelpers.h"
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+#include "reference_math.h"
+
+#include <cstring>
+#include <algorithm>
+
+namespace {
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    // Source generator for a kernel whose three parameter types are all half.
+    auto gen_source = [](const std::string &kernel_name, const char *builtin,
+                         cl_uint vector_size_index) {
+        return GetBinaryKernel(kernel_name, builtin, ParameterType::Half,
+                               ParameterType::Half, ParameterType::Half,
+                               vector_size_index);
+    };
+    return BuildKernels(*static_cast<BuildKernelInfo *>(p), job_id, gen_source);
+}
+
+// Thread specific data for a worker thread
+struct ThreadInfo
+{
+    clMemWrapper inBuf; // sub-buffer of gInBuffer holding the first operand
+    clMemWrapper inBuf2; // sub-buffer of gInBuffer2 holding the second operand
+    clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output sub-buffer per vector size
+    float maxError; // largest absolute ulp error seen so far. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    double maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdataHolder d; // random generator used to fill this thread's inputs
+
+    clCommandQueueWrapper
+        tQueue; // per thread command queue to improve performance
+};
+
+// Plain test parameters shared by all jobs (zeroed with memset before set up).
+struct TestInfoBase
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // step between each chunk and the next.
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    int isFDim; // non-zero when the tested builtin is fdim
+    int skipNanInf; // non-zero to accept any result involving Inf or NaN
+    int isNextafter; // non-zero to use the nextafter reference function
+};
+
+// Shared test state: base parameters plus thread data, programs and kernels.
+struct TestInfo : public TestInfoBase
+{
+    TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
+    // Array of thread specific information, indexed by thread_id
+    std::vector<ThreadInfo> tinfo;
+
+    // Programs for various vector sizes, indexed by vector size index.
+    Programs programs;
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+};
+
+// A table of more difficult cases to get right (raw half bit patterns)
+const cl_half specialValuesHalf[] = {
+    0xffff /*NaN*/, 0x0000 /*+0*/, 0x0001 /*min denormal*/, 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* -(largest number less than one) */
+};
+
+constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+
+// ThreadPool job: fills this thread's input sub-buffers with special (first
+// jobs) or random half bit patterns, runs the kernel for every enabled vector
+// size and compares each result against the host reference within job->ulps.
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    TestInfo *job = (TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_half);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
+    float ulps = job->ulps;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_int error;
+    const char *name = job->f->name;
+
+    int isFDim = job->isFDim;
+    int skipNanInf = job->skipNanInf;
+    int isNextafter = job->isNextafter;
+    std::vector<float> s(0), s2(0);
+    cl_uint j = 0;
+
+    RoundingMode oldRoundMode = kRoundToNearestEven;
+    const cl_int copysign_test = !strcmp(name, "copysign");
+
+    // events and host pointers for the mapped output arrays
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ushort *out[VECTOR_SIZE_COUNT];
+
+    if (gHostFill)
+    {
+        // start the map of the output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            out[j] = (cl_ushort *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_size, 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
+
+    // Init input array
+    cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
+    cl_ushort *p2 = (cl_ushort *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+    int totalSpecialValueCount =
+        specialValuesHalfCount * specialValuesHalfCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesHalfCount;
+        y = (job_id * buffer_elements) / specialValuesHalfCount;
+
+        for (; j < buffer_elements; j++)
+        {
+            p[j] = specialValuesHalf[x];
+            p2[j] = specialValuesHalf[y];
+            if (++x >= specialValuesHalfCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesHalfCount) break;
+            }
+        }
+    }
+
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = (cl_ushort)genrand_int32(d);
+        p2[j] = (cl_ushort)genrand_int32(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if (gHostFill)
+        {
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
+        }
+
+        // Fill the result buffer with garbage so old results don't carry over
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
+        {
+            memset_pattern4(out[j], &pattern, buffer_size);
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                        &pattern, sizeof(pattern), 0,
+                                        buffer_size, 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // per-thread cl_kernel copy
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    FPU_mode_type oldMode;
+    if (isFDim)
+    {
+        // Calculate the correctly rounded reference result
+        memset(&oldMode, 0, sizeof(oldMode));
+        if (ftz) ForceFTZ(&oldMode);
+
+        // Set the rounding mode to match the device
+        if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
+    }
+
+    // The host rounding mode must be put back on every exit path from here on.
+    auto restore_rounding = [&]() {
+        if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
+    };
+
+#define ref_func(s, s2) (copysign_test ? func.f_ff_f(s, s2) : func.f_ff(s, s2))
+
+    // Calculate the correctly rounded reference result
+    cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
+    cl_ushort *t = (cl_ushort *)r;
+    s.resize(buffer_elements);
+    s2.resize(buffer_elements);
+    for (j = 0; j < buffer_elements; j++)
+    {
+        s[j] = cl_half_to_float(p[j]);
+        s2[j] = cl_half_to_float(p2[j]);
+        if (isNextafter)
+            r[j] = cl_half_from_float(reference_nextafterh(s[j], s2[j]),
+                                      CL_HALF_RTE);
+        else
+            r[j] = cl_half_from_float(ref_func(s[j], s2[j]), CL_HALF_RTE);
+    }
+
+    if (isFDim && ftz) RestoreFPState(&oldMode);
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
+        out[j] = (cl_ushort *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            restore_rounding();
+            return error;
+        }
+    }
+
+    // Verify data
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ushort *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                double correct;
+                if (isNextafter)
+                    correct = reference_nextafterh(s[j], s2[j]);
+                else
+                    correct = ref_func(s[j], s2[j]);
+
+                float test = cl_half_to_float(q[j]);
+
+                // Per section 10 paragraph 6, accept any result if an input or
+                // output is a infinity or NaN or overflow
+                if (skipNanInf)
+                {
+                    // Note: no double rounding here.  Reference functions
+                    // calculate in single precision.
+                    if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                        || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j])
+                        || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
+                        continue;
+                }
+                float err = Ulp_Error_Half(q[j], correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsHalfResultSubnormal(correct, ulps))
+                    {
+                        if (isNextafter)
+                        {
+                            correct = reference_nextafterh(s[j], s2[j], false);
+                            err = Ulp_Error_Half(q[j], correct);
+                            fail = !(fabsf(err) <= ulps);
+                        }
+
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+                        if (isNextafter)
+                        {
+                            correct2 = reference_nextafterh(0.0, s2[j]);
+                            correct3 = reference_nextafterh(-0.0, s2[j]);
+                        }
+                        else
+                        {
+                            correct2 = ref_func(0.0, s2[j]);
+                            correct3 = ref_func(-0.0, s2[j]);
+                        }
+                        if (skipNanInf)
+                        {
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2)
+                                || IsFloatInfinity(correct3)
+                                || IsFloatNaN(correct3))
+                                continue;
+                        }
+
+                        auto check_error = [&]() {
+                            err2 = Ulp_Error_Half(q[j], correct2);
+                            err3 = Ulp_Error_Half(q[j], correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                        };
+                        check_error();
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
+                        {
+                            if (fail && isNextafter)
+                            {
+                                correct2 =
+                                    reference_nextafterh(0.0, s2[j], false);
+                                correct3 =
+                                    reference_nextafterh(-0.0, s2[j], false);
+                                check_error();
+                            }
+
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // allow to omit denorm values for platforms with no
+                        // denorm support for nextafter
+                        if (fail && (isNextafter)
+                            && (correct <= cl_half_to_float(0x3FF))
+                            && (correct >= cl_half_to_float(0x83FF)))
+                        {
+                            fail = fail && (q[j] != p[j]);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsHalfSubnormal(p2[j]))
+                        {
+                            double correct4, correct5;
+                            float err4, err5;
+
+                            if (isNextafter)
+                            {
+                                correct2 = reference_nextafterh(0.0, 0.0);
+                                correct3 = reference_nextafterh(-0.0, 0.0);
+                                correct4 = reference_nextafterh(0.0, -0.0);
+                                correct5 = reference_nextafterh(-0.0, -0.0);
+                            }
+                            else
+                            {
+                                correct2 = ref_func(0.0, 0.0);
+                                correct3 = ref_func(-0.0, 0.0);
+                                correct4 = ref_func(0.0, -0.0);
+                                correct5 = ref_func(-0.0, -0.0);
+                            }
+
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is a infinity or NaN or
+                            // overflow
+                            if (skipNanInf)
+                            {
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3)
+                                    || IsFloatInfinity(correct4)
+                                    || IsFloatNaN(correct4)
+                                    || IsFloatInfinity(correct5)
+                                    || IsFloatNaN(correct5))
+                                    continue;
+                            }
+
+                            err2 = Ulp_Error_Half(q[j], correct2);
+                            err3 = Ulp_Error_Half(q[j], correct3);
+                            err4 = Ulp_Error_Half(q[j], correct4);
+                            err5 = Ulp_Error_Half(q[j], correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+
+                            // retry per section 6.5.3.4
+                            if (IsHalfResultSubnormal(correct2, ulps)
+                                || IsHalfResultSubnormal(correct3, ulps)
+                                || IsHalfResultSubnormal(correct4, ulps)
+                                || IsHalfResultSubnormal(correct5, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+
+                            // allow to omit denorm values for platforms with no
+                            // denorm support for nextafter
+                            if (fail && (isNextafter)
+                                && (correct <= cl_half_to_float(0x3FF))
+                                && (correct >= cl_half_to_float(0x83FF)))
+                            {
+                                fail = fail && (q[j] != p2[j]);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsHalfSubnormal(p2[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+
+                        if (isNextafter)
+                        {
+                            correct2 = reference_nextafterh(s[j], 0.0);
+                            correct3 = reference_nextafterh(s[j], -0.0);
+                        }
+                        else
+                        {
+                            correct2 = ref_func(s[j], 0.0);
+                            correct3 = ref_func(s[j], -0.0);
+                        }
+
+                        if (skipNanInf)
+                        {
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                                || IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2)
+                                || IsFloatInfinity(correct3)
+                                || IsFloatNaN(correct3))
+                                continue;
+                        }
+
+                        auto check_error = [&]() {
+                            err2 = Ulp_Error_Half(q[j], correct2);
+                            err3 = Ulp_Error_Half(q[j], correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                        };
+                        check_error();
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
+                        {
+                            if (fail && isNextafter)
+                            {
+                                correct2 =
+                                    reference_nextafterh(s[j], 0.0, false);
+                                correct3 =
+                                    reference_nextafterh(s[j], -0.0, false);
+                                check_error();
+                            }
+
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // allow to omit denorm values for platforms with no
+                        // denorm support for nextafter
+                        if (fail && (isNextafter)
+                            && (correct <= cl_half_to_float(0x3FF))
+                            && (correct >= cl_half_to_float(0x83FF)))
+                        {
+                            fail = fail && (q[j] != p2[j]);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), "
+                               "%a (0x%04x)}\nExpected: %a  (half 0x%04x) "
+                               "\nActual: %a (half 0x%04x) at index: %u\n",
+                               name, sizeNames[k], err, s[j], p[j], s2[j],
+                               p2[j], cl_half_to_float(r[j]), r[j], test, q[j],
+                               j);
+                    restore_rounding();
+                    return -1;
+                }
+            }
+        }
+    }
+
+    restore_rounding();
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+    return error;
+}
+
+} // anonymous namespace
+
+// Runs the half-precision brute-force test of a two-operand builtin: creates
+// the sub-buffers and a command queue for each worker thread, builds the
+// kernels, runs TestHalf jobs on the thread pool and logs the largest error.
+// isNextafter is non-zero when the nextafter reference function must be used.
+int TestFunc_Half_Half_Half_common(const Func *f, MTdata d, int isNextafter,
+                                   bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    // there was overflow if dividing back does not give the scale: use one job
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+        test_info.jobCount = 1;
+    else
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
+    test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
+    test_info.isNextafter = isNextafter;
+
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            return error;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error(
+                    "Error: Unable to create sub-buffer of gOutBuffer[%d] "
+                    "for region {%zu, %zu}\n",
+                    (int)j, region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return error;
+}
+
+int TestFunc_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    return TestFunc_Half_Half_Half_common(f, d, /*isNextafter=*/0, relaxedMode);
+}
+
+int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata d, bool relaxedMode)
+{
+    return TestFunc_Half_Half_Half_common(f, d, /*isNextafter=*/1, relaxedMode);
+}
diff --git a/test_conformance/math_brute_force/binary_i_double.cpp b/test_conformance/math_brute_force/binary_i_double.cpp
index a6c2855735..a0561422e9 100644
--- a/test_conformance/math_brute_force/binary_i_double.cpp
+++ b/test_conformance/math_brute_force/binary_i_double.cpp
@@ -193,16 +193,14 @@ const double specialValues[] = {
     +0.0,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 const int specialValuesInt[] = {
     0,       1,  2,  3,  1022,  1023,  1024,   INT_MIN,
     INT_MAX, -1, -2, -3, -1022, -1023, -11024, -INT_MAX,
 };
 
-constexpr size_t specialValuesIntCount =
-    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
+constexpr size_t specialValuesIntCount = ARRAY_SIZE(specialValuesInt);
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
diff --git a/test_conformance/math_brute_force/binary_i_float.cpp b/test_conformance/math_brute_force/binary_i_float.cpp
index dfe25efc69..f9e13abaaf 100644
--- a/test_conformance/math_brute_force/binary_i_float.cpp
+++ b/test_conformance/math_brute_force/binary_i_float.cpp
@@ -184,8 +184,7 @@ const float specialValues[] = {
     +0.0f,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 const int specialValuesInt[] = {
     0,           1,           2,           3,          126,        127,
@@ -194,9 +193,7 @@ const int specialValuesInt[] = {
     -0x04000001, -1465264071, -1488522147,
 };
 
-constexpr size_t specialValuesIntCount =
-    sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
-
+constexpr size_t specialValuesIntCount = ARRAY_SIZE(specialValuesInt);
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
     TestInfo *job = (TestInfo *)data;
diff --git a/test_conformance/math_brute_force/binary_i_half.cpp b/test_conformance/math_brute_force/binary_i_half.cpp
new file mode 100644
index 0000000000..001e2b4f54
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_i_half.cpp
@@ -0,0 +1,548 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <climits>
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    auto &build_info = *static_cast<BuildKernelInfo *>(p);
+    auto source = [](const std::string &kernel_name, const char *builtin,
+                     cl_uint vector_size_index) {
+        return GetBinaryKernel(kernel_name, builtin, ParameterType::Half,
+                               ParameterType::Half, ParameterType::Int,
+                               vector_size_index);
+    };
+    return BuildKernels(build_info, job_id, source);
+}
+
+// Thread specific data for a worker thread
+typedef struct ThreadInfo
+{
+    clMemWrapper inBuf; // input buffer for the thread
+    clMemWrapper inBuf2; // input buffer for the thread
+    clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    float maxError; // max error value. Init to 0.
+    double
+        maxErrorValue; // position of the max error value (param 1).  Init to 0.
+    cl_int maxErrorValue2; // position of the max error value (param 2).  Init
+                           // to 0.
+    MTdataHolder d;
+    clCommandQueueWrapper
+        tQueue; // per thread command queue to improve performance
+} ThreadInfo;
+
+struct TestInfoBase
+{
+    size_t subBufferSize; // Size of each per-thread sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    // Plain settings only: the test driver zeroes this struct with memset.
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // stride of the base value between consecutive jobs
+    cl_uint scale; // stride between individual test values
+    float ulps; // maximum allowed error, in ulps
+    int ftz; // non-zero if running in flush to zero mode
+};
+
+// Job description shared by all worker threads (settings + per-thread state).
+struct TestInfo : public TestInfoBase
+{
+    // explicit: a TestInfoBase must never convert to a TestInfo silently.
+    explicit TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
+
+    // Array of thread specific information
+    std::vector<ThreadInfo> tinfo;
+    // Programs for various vector sizes.
+    Programs programs;
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+};
+
+// A table of more difficult cases to get right (raw half bit patterns)
+const cl_half specialValuesHalf[] = {
+    0xffff, 0x0000, 0x0001, 0x7c00, /* NaN, +0, smallest denormal, INFINITY */
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* Negative of the largest number less than one */
+};
+constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+
+// Integer operands paired with every half value above, incl. INT_MIN/INT_MAX.
+const int specialValuesInt3[] = { 0,     1,       2,       3,       1022, 1023,
+                                  1024,  INT_MIN, INT_MAX, -1,      -2,   -3,
+                                  -1022, -1023,   -11024,  -INT_MAX };
+constexpr size_t specialValuesInt3Count = ARRAY_SIZE(specialValuesInt3);
+
+// Worker job: checks one sub-buffer of (half, int) inputs per vector size.
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    TestInfo *job = (TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
+    float ulps = job->ulps;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_ushort *t;
+    cl_half *r;
+    std::vector<float> s;
+    cl_int *s2;
+
+    // Map events and mapped host pointers, indexed by vector size
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ushort *out[VECTOR_SIZE_COUNT];
+
+    if (gHostFill)
+    {
+        // start the map of the output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            out[j] = (cl_ushort *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_elements * sizeof(cl_ushort), 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
+
+    // Init input array
+    cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
+    cl_int *p2 = (cl_int *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+    int totalSpecialValueCount =
+        specialValuesHalfCount * specialValuesInt3Count;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesHalfCount;
+        y = (job_id * buffer_elements) / specialValuesHalfCount;
+
+        // Stop after the last pair; j then points past it so it is kept
+        for (; j < buffer_elements && y < specialValuesInt3Count; j++)
+        {
+            p[j] = specialValuesHalf[x];
+            p2[j] = specialValuesInt3[y];
+            if (++x >= specialValuesHalfCount)
+            {
+                x = 0;
+                y++;
+            }
+        }
+    }
+
+    // Init any remaining values with random bit patterns.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = (cl_ushort)genrand_int32(d);
+        p2[j] = genrand_int32(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_elements * sizeof(cl_half), p, 0,
+                                      NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_elements * sizeof(cl_int), p2, 0,
+                                      NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if (gHostFill)
+        {
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
+        {
+            memset_pattern4(out[j], &pattern,
+                            buffer_elements * sizeof(cl_half));
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], &pattern, sizeof(pattern), 0,
+                buffer_elements * sizeof(cl_half), 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
+    t = (cl_ushort *)r;
+    s.resize(buffer_elements);
+    s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        s[j] = cl_half_to_float(p[j]);
+        r[j] = HFF(func.f_fi(s[j], s2[j]));
+    }
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_ushort *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_elements * sizeof(cl_ushort), 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_ushort *)clEnqueueMapBuffer(
+        tinfo->tQueue, tinfo->outBuf[j], CL_TRUE, CL_MAP_READ, 0,
+        buffer_elements * sizeof(cl_ushort), 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        return error;
+    }
+
+    // Verify data
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ushort *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (t[j] != q[j])
+            {
+                float test = cl_half_to_float(q[j]);
+                double correct = func.f_fi(s[j], s2[j]);
+                float err = Ulp_Error_Half(q[j], correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsHalfResultSubnormal(correct, ulps))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+                        correct2 = func.f_fi(0.0, s2[j]);
+                        correct3 = func.f_fi(-0.0, s2[j]);
+                        err2 = Ulp_Error_Half(q[j], correct2);
+                        err3 = Ulp_Error_Half(q[j], correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), "
+                               "%d}\nExpected: %a (half 0x%04x) \nActual: %a "
+                               "(half 0x%04x) at index: %d\n",
+                               name, sizeNames[k], err, s[j], p[j], s2[j],
+                               cl_half_to_float(r[j]), r[j], test, q[j],
+                               (cl_uint)j);
+                    return -1;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+    return error;
+}
+
+} // anonymous namespace
+
+// Tests a half = f(half, int) builtin: creates per-thread sub-buffers and
+// queues, builds the kernels, runs the TestHalf jobs and logs the max error.
+int TestFunc_Half_Half_Int(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    cl_int maxErrorVal2 = 0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_int) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // there was overflow
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+        cl_buffer_region region2 = { i * test_info.subBufferSize
+                                         * sizeof(cl_int),
+                                     test_info.subBufferSize * sizeof(cl_int) };
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region2.origin, region2.size);
+            return error;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+    // Accumulate the arithmetic errors
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        if (test_info.tinfo[i].maxError > maxError)
+        {
+            maxError = test_info.tinfo[i].maxError;
+            maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+        }
+    }
+
+    test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_operator_double.cpp b/test_conformance/math_brute_force/binary_operator_double.cpp
index 7600ab16a3..517188030b 100644
--- a/test_conformance/math_brute_force/binary_operator_double.cpp
+++ b/test_conformance/math_brute_force/binary_operator_double.cpp
@@ -192,8 +192,7 @@ const double specialValues[] = {
     +0.0,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
diff --git a/test_conformance/math_brute_force/binary_operator_float.cpp b/test_conformance/math_brute_force/binary_operator_float.cpp
index 741c396ca8..3eb1041834 100644
--- a/test_conformance/math_brute_force/binary_operator_float.cpp
+++ b/test_conformance/math_brute_force/binary_operator_float.cpp
@@ -184,8 +184,7 @@ const float specialValues[] = {
     +0.0f,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp
new file mode 100644
index 0000000000..e7f53af871
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_operator_half.cpp
@@ -0,0 +1,680 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    auto &build_info = *static_cast<BuildKernelInfo *>(p);
+    auto source = [](const std::string &kernel_name, const char *builtin,
+                     cl_uint vector_size_index) {
+        return GetBinaryKernel(kernel_name, builtin, ParameterType::Half,
+                               ParameterType::Half, ParameterType::Half,
+                               vector_size_index);
+    };
+    return BuildKernels(build_info, job_id, source);
+}
+
+// Thread specific data for a worker thread
+struct ThreadInfo
+{
+    // Input and output buffers for the thread
+    clMemWrapper inBuf; // first operand, written from gIn
+    clMemWrapper inBuf2; // second operand, written from gIn2
+    Buffers outBuf; // indexed by vector size
+
+    // Largest error seen by this thread. Init to 0.
+    float maxError;
+    // Value of parameter 1 at the max error.  Init to 0.
+    double maxErrorValue;
+    // Value of parameter 2 at the max error.  Init to 0.
+    double maxErrorValue2;
+    MTdataHolder d; // random source used to generate this thread's inputs
+
+    // Per thread command queue to improve performance
+    clCommandQueueWrapper tQueue;
+};
+
+struct TestInfo
+{
+    size_t subBufferSize; // Size of each per-thread sub-buffer in elements
+    const Func *f; // A pointer to the function info
+
+    // Programs for various vector sizes.
+    Programs programs;
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+
+    // Array of thread specific information, indexed by thread id
+    std::vector<ThreadInfo> tinfo;
+
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // stride of the base value between consecutive jobs
+    cl_uint scale; // stride between individual test values
+    float ulps; // maximum allowed error, in ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    // no special fields
+};
+
+// A table of more difficult cases to get right (raw half bit patterns)
+const cl_half specialValuesHalf[] = {
+    0xffff, 0x0000, 0x0001, 0x7c00, /* NaN, +0, smallest denormal, INFINITY */
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* Negative of the largest number less than one */
+};
+
+constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    TestInfo *job = (TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_half);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
+    float ulps = job->ulps;
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_int error;
+
+    const char *name = job->f->name;
+    cl_half *r = 0;
+    std::vector<float> s(0), s2(0);
+    RoundingMode oldRoundMode;
+
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_half *out[VECTOR_SIZE_COUNT];
+
+    if (gHostFill)
+    {
+        // start the map of the output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            out[j] = (cl_ushort *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_size, 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
+
+    bool divide = strcmp(name, "divide") == 0;
+
+    // Init input array
+    cl_half *p = (cl_half *)gIn + thread_id * buffer_elements;
+    cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements;
+    cl_uint idx = 0;
+    int totalSpecialValueCount =
+        specialValuesHalfCount * specialValuesHalfCount;
+    int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)lastSpecialJobIndex)
+    {
+        // Insert special values
+        uint32_t x, y;
+
+        x = (job_id * buffer_elements) % specialValuesHalfCount;
+        y = (job_id * buffer_elements) / specialValuesHalfCount;
+
+        for (; idx < buffer_elements; idx++)
+        {
+            p[idx] = specialValuesHalf[x];
+            p2[idx] = specialValuesHalf[y];
+            if (++x >= specialValuesHalfCount)
+            {
+                x = 0;
+                y++;
+                if (y >= specialValuesHalfCount) break;
+            }
+
+            if (divide)
+            {
+                cl_half pj = p[idx] & 0x7fff;
+                cl_half p2j = p2[idx] & 0x7fff;
+                // Replace values outside [2^-7, 2^7] with QNaN
+                if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN
+                if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00;
+            }
+        }
+    }
+
+    // Init any remaining values
+    for (; idx < buffer_elements; idx++)
+    {
+        p[idx] = (cl_half)genrand_int32(d);
+        p2[idx] = (cl_half)genrand_int32(d);
+
+        if (divide)
+        {
+            cl_half pj = p[idx] & 0x7fff;
+            cl_half p2j = p2[idx] & 0x7fff;
+            // Replace values outside [2^-7, 2^7] with QNaN
+            if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN
+            if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00;
+        }
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if (gHostFill)
+        {
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
+        {
+            memset_pattern4(out[j], &pattern, buffer_size);
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                        &pattern, sizeof(pattern), 0,
+                                        buffer_size, 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
+        }
+
+        // Run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting)
+    {
+        return CL_SUCCESS;
+    }
+
+    // Calculate the correctly rounded reference result
+    FPU_mode_type oldMode;
+    memset(&oldMode, 0, sizeof(oldMode));
+    if (ftz) ForceFTZ(&oldMode);
+
+    // Set the rounding mode to match the device
+    oldRoundMode = kRoundToNearestEven;
+    if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
+
+    // Calculate the correctly rounded reference result
+    r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
+    s.resize(buffer_elements);
+    s2.resize(buffer_elements);
+
+    for (size_t j = 0; j < buffer_elements; j++)
+    {
+        s[j] = HTF(p[j]);
+        s2[j] = HTF(p2[j]);
+        r[j] = HFF(func.f_ff(s[j], s2[j]));
+    }
+
+    if (ftz) RestoreFPState(&oldMode);
+
+    // Read the data back -- no need to wait for the first N-1 buffers but wait
+    // for the last buffer. This is an in order queue.
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
+        out[j] = (cl_ushort *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Verify data
+
+    for (size_t j = 0; j < buffer_elements; j++)
+    {
+        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_half *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (r[j] != q[j])
+            {
+                float test = HTF(q[j]);
+                float correct = func.f_ff(s[j], s2[j]);
+
+                // Per section 10 paragraph 6, accept any result if an input or
+                // output is a infinity or NaN or overflow
+                if (!gInfNanSupport)
+                {
+                    // Note: no double rounding here.  Reference functions
+                    // calculate in single precision.
+                    if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                        || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j])
+                        || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
+                        continue;
+                }
+
+                float err = Ulp_Error_Half(q[j], correct);
+
+                int fail = !(fabsf(err) <= ulps);
+
+                if (fail && ftz)
+                {
+                    // retry per section 6.5.3.2
+                    if (IsHalfResultSubnormal(correct, ulps))
+                    {
+                        fail = fail && (test != 0.0f);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+
+                        correct2 = HTF(func.f_ff(0.0, s2[j]));
+                        correct3 = HTF(func.f_ff(-0.0, s2[j]));
+
+                        // Per section 10 paragraph 6, accept any result if an
+                        // input or output is a infinity or NaN or overflow
+                        if (!gInfNanSupport)
+                        {
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2)
+                                || IsFloatInfinity(correct3)
+                                || IsFloatNaN(correct3))
+                                continue;
+                        }
+
+                        err2 = Ulp_Error_Half(q[j], correct2);
+                        err3 = Ulp_Error_Half(q[j], correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsHalfSubnormal(p2[j]))
+                        {
+                            double correct4, correct5;
+                            float err4, err5;
+
+                            correct2 = HTF(func.f_ff(0.0, 0.0));
+                            correct3 = HTF(func.f_ff(-0.0, 0.0));
+                            correct4 = HTF(func.f_ff(0.0, -0.0));
+                            correct5 = HTF(func.f_ff(-0.0, -0.0));
+
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is a infinity or NaN or
+                            // overflow
+                            if (!gInfNanSupport)
+                            {
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsFloatInfinity(correct2)
+                                    || IsFloatNaN(correct2)
+                                    || IsFloatInfinity(correct3)
+                                    || IsFloatNaN(correct3)
+                                    || IsFloatInfinity(correct4)
+                                    || IsFloatNaN(correct4)
+                                    || IsFloatInfinity(correct5)
+                                    || IsFloatNaN(correct5))
+                                    continue;
+                            }
+
+                            err2 = Ulp_Error_Half(q[j], correct2);
+                            err3 = Ulp_Error_Half(q[j], correct3);
+                            err4 = Ulp_Error_Half(q[j], correct4);
+                            err5 = Ulp_Error_Half(q[j], correct5);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps))
+                                    && (!(fabsf(err4) <= ulps))
+                                    && (!(fabsf(err5) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+
+                            // retry per section 6.5.3.4
+                            if (IsHalfResultSubnormal(correct2, ulps)
+                                || IsHalfResultSubnormal(correct3, ulps)
+                                || IsHalfResultSubnormal(correct4, ulps)
+                                || IsHalfResultSubnormal(correct5, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsHalfSubnormal(p2[j]))
+                    {
+                        double correct2, correct3;
+                        float err2, err3;
+
+                        correct2 = HTF(func.f_ff(s[j], 0.0));
+                        correct3 = HTF(func.f_ff(s[j], -0.0));
+
+                        // Per section 10 paragraph 6, accept any result if an
+                        // input or output is a infinity or NaN or overflow
+                        if (!gInfNanSupport)
+                        {
+                            // Note: no double rounding here.  Reference
+                            // functions calculate in single precision.
+                            if (IsFloatInfinity(correct) || IsFloatNaN(correct)
+                                || IsFloatInfinity(correct2)
+                                || IsFloatNaN(correct2))
+                                continue;
+                        }
+
+                        err2 = Ulp_Error_Half(q[j], correct2);
+                        err3 = Ulp_Error_Half(q[j], correct3);
+                        fail = fail
+                            && ((!(fabsf(err2) <= ulps))
+                                && (!(fabsf(err3) <= ulps)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+
+                        // retry per section 6.5.3.4
+                        if (IsHalfResultSubnormal(correct2, ulps)
+                            || IsHalfResultSubnormal(correct3, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                    tinfo->maxErrorValue2 = s2[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at {%a (0x%04x), "
+                               "%a (0x%04x)}\nExpected: %a  (half 0x%04x) "
+                               "\nActual: %a (half 0x%04x) at index: %zu\n",
+                               name, sizeNames[k], err, s[j], p[j], s2[j],
+                               p2[j], HTF(r[j]), r[j], test, q[j], j);
+                    return -1;
+                }
+            }
+        }
+    }
+
+    if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
+
+    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+    return CL_SUCCESS;
+}
+
+} // anonymous namespace
+
+int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfo test_info{};
+    cl_int error;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    double maxErrorVal2 = 0.0;
+    // Overview: per-thread sub-buffers and queues, kernel build, TestHalf jobs.
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info: every thread gets an equal slice of the global buffers
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the cl_uint product overflowed: fall back to a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+    for (cl_uint i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            return error;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zu, %zu}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of "
+                           "gOutBuffer[%d] for region {%zu, %zu}\n",
+                           (int)j, region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+
+    // Build the kernels: one pool job per tested vector size
+    {
+        BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
+                                    test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+    // Run the kernels
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        // Keep the largest error, and its two inputs, seen by any thread
+        for (cl_uint i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+                maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
+            }
+        }
+
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/binary_two_results_i_half.cpp b/test_conformance/math_brute_force/binary_two_results_i_half.cpp
new file mode 100644
index 0000000000..bc2519e95b
--- /dev/null
+++ b/test_conformance/math_brute_force/binary_two_results_i_half.cpp
@@ -0,0 +1,477 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cinttypes>
+#include <climits>
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    // Thread-pool build job: p points to the caller's BuildKernelInfo.
+    auto codegen = [](const std::string &kernel_name, const char *builtin,
+                      cl_uint vector_size_index) {
+        return GetBinaryKernel(kernel_name, builtin, ParameterType::Half,
+                               ParameterType::Int, ParameterType::Half,
+                               ParameterType::Half, vector_size_index);
+    };
+    return BuildKernels(*static_cast<BuildKernelInfo *>(p), job_id, codegen);
+}
+
+struct ComputeReferenceInfoF // work description read by every ReferenceF job
+{
+    const cl_half *x; // first input array
+    const cl_half *y; // second input array
+    cl_half *r; // reference results, stored as half
+    int32_t *i; // integer results written through f_ffpI's int * argument
+    double (*f_ffpI)(double, double, int *); // host reference function
+    cl_uint lim; // total number of elements to process
+    cl_uint count; // number of elements assigned to each job
+};
+
+cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
+{
+    // Pool job jid: reference results for elements [off, off + count) of x/y.
+    ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
+    cl_uint lim = cri->lim;
+    cl_uint count = cri->count;
+    cl_uint off = jid * count;
+    // Jobs starting at or past lim have no work; lim - off would wrap around.
+    if (off >= lim) return CL_SUCCESS;
+    if (count > lim - off) count = lim - off;
+    const cl_half *x = cri->x + off;
+    const cl_half *y = cri->y + off;
+    cl_half *r = cri->r + off;
+    int32_t *i = cri->i + off;
+    double (*f)(double, double, int *) = cri->f_ffpI;
+    for (cl_uint j = 0; j < count; ++j)
+        r[j] = HFF((float)f((double)HTF(x[j]), (double)HTF(y[j]), i + j));
+    return CL_SUCCESS;
+}
+
+} // anonymous namespace
+
+int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+    // Overview: brute-force check of a builtin returning a half and an int.
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    Programs programs;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    KernelMatrix kernels;
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    int64_t maxError2 = 0;
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+
+    // use larger type of output data to prevent overflowing buffer size
+    constexpr size_t buffer_size = BUFFER_SIZE / sizeof(int32_t);
+
+    cl_uint threadCount = GetThreadCount();
+
+    float half_ulps = f->half_ulps;
+
+    int testingRemquo = !strcmp(f->name, "remquo");
+
+    // Init the kernels
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
+    if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+        return error;
+
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_half *p = (cl_half *)gIn;
+        cl_half *p2 = (cl_half *)gIn2;
+        for (size_t j = 0; j < buffer_size; j++)
+        {
+            p[j] = (cl_half)genrand_int32(d);
+            p2[j] = (cl_half)genrand_int32(d);
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          buffer_size * sizeof(cl_half), gIn, 0,
+                                          NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          buffer_size * sizeof(cl_half), gIn2,
+                                          0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+        // Write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+
+                memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut2[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 1 failed!\n");
+
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 2 failed!\n");
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            // align working group size with the bigger output type
+            size_t vectorSize = sizeValues[j] * sizeof(int32_t);
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error =
+                     clSetKernelArg(kernels[j][thread_id], 1,
+                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        if (threadCount > 1)
+        {
+            ComputeReferenceInfoF cri;
+            cri.x = p;
+            cri.y = p2;
+            cri.r = (cl_half *)gOut_Ref;
+            cri.i = (int32_t *)gOut_Ref2;
+            cri.f_ffpI = f->func.f_ffpI;
+            cri.lim = buffer_size;
+            cri.count = (cri.lim + threadCount - 1) / threadCount;
+            ThreadPool_Do(ReferenceF, threadCount, &cri);
+        }
+        else
+        {
+            cl_half *r = (cl_half *)gOut_Ref;
+            int32_t *r2 = (int32_t *)gOut_Ref2;
+            for (size_t j = 0; j < buffer_size; j++)
+                r[j] =
+                    HFF((float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), r2 + j));
+        }
+
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            cl_bool blocking =
+                (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0,
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking, 0,
+                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        cl_half *t = (cl_half *)gOut_Ref;
+        int32_t *t2 = (int32_t *)gOut_Ref2;
+        for (size_t j = 0; j < buffer_size; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                cl_half *q = (cl_half *)(gOut[k]);
+                int32_t *q2 = (int32_t *)gOut2[k];
+
+                // Check for exact match to correctly rounded result
+                if (t[j] == q[j] && t2[j] == q2[j]) continue;
+
+                // Check for paired NaNs
+                if (IsHalfNaN(t[j]) && IsHalfNaN(q[j]) && t2[j] == q2[j])
+                    continue;
+
+                cl_half test = ((cl_half *)q)[j];
+                int correct2 = INT_MIN;
+                float correct =
+                    (float)f->func.f_ffpI(HTF(p[j]), HTF(p2[j]), &correct2);
+                float err = Ulp_Error_Half(test, correct);
+                int64_t iErr;
+
+                // in case of remquo, we only care about the sign and last
+                // seven bits of integer as per the spec.
+                if (testingRemquo)
+                    iErr = (long long)(q2[j] & 0x0000007f)
+                        - (long long)(correct2 & 0x0000007f);
+                else
+                    iErr = (long long)q2[j] - (long long)correct2;
+
+                // For remquo, if y = 0, x is infinite, or either is NaN
+                // then the standard either neglects to say what is returned
+                // in iptr or leaves it undefined or implementation defined.
+                int iptrUndefined = IsHalfInfinity(p[j]) || (HTF(p2[j]) == 0.0f)
+                    || IsHalfNaN(p2[j]) || IsHalfNaN(p[j]);
+                if (iptrUndefined) iErr = 0;
+
+                int fail = !(fabsf(err) <= half_ulps && iErr == 0);
+                if (ftz && fail)
+                {
+                    // retry per section 6.5.3.2 (accept a flush to +0 or -0)
+                    if (IsHalfResultSubnormal(correct, half_ulps))
+                    {
+                        fail = fail && !(HTF(test) == 0.0f && iErr == 0);
+                        if (!fail) err = 0.0f;
+                    }
+
+                    // retry per section 6.5.3.3
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        int correct3i, correct4i;
+                        float correct3 =
+                            (float)f->func.f_ffpI(0.0, HTF(p2[j]), &correct3i);
+                        float correct4 =
+                            (float)f->func.f_ffpI(-0.0, HTF(p2[j]), &correct4i);
+                        float err2 = Ulp_Error_Half(test, correct3);
+                        float err3 = Ulp_Error_Half(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= half_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= half_ulps && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4 (either zero-input result)
+                        if (IsHalfResultSubnormal(correct3, half_ulps)
+                            || IsHalfResultSubnormal(correct4, half_ulps))
+                        {
+                            fail = fail
+                                && !(HTF(test) == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // try with both args as zero
+                        if (IsHalfSubnormal(p2[j]))
+                        {
+                            int correct7i, correct8i;
+                            correct3 = f->func.f_ffpI(0.0, 0.0, &correct3i);
+                            correct4 = f->func.f_ffpI(-0.0, 0.0, &correct4i);
+                            double correct7 =
+                                f->func.f_ffpI(0.0, -0.0, &correct7i);
+                            double correct8 =
+                                f->func.f_ffpI(-0.0, -0.0, &correct8i);
+                            err2 = Ulp_Error_Half(test, correct3);
+                            err3 = Ulp_Error_Half(test, correct4);
+                            float err4 = Ulp_Error_Half(test, correct7);
+                            float err5 = Ulp_Error_Half(test, correct8);
+                            iErr3 = (long long)q2[j] - (long long)correct3i;
+                            iErr4 = (long long)q2[j] - (long long)correct4i;
+                            int64_t iErr7 =
+                                (long long)q2[j] - (long long)correct7i;
+                            int64_t iErr8 =
+                                (long long)q2[j] - (long long)correct8i;
+                            fail = fail
+                                && ((!(fabsf(err2) <= half_ulps && iErr3 == 0))
+                                    && (!(fabsf(err3) <= half_ulps
+                                          && iErr4 == 0))
+                                    && (!(fabsf(err4) <= half_ulps
+                                          && iErr7 == 0))
+                                    && (!(fabsf(err5) <= half_ulps
+                                          && iErr8 == 0)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+                            if (fabsf(err4) < fabsf(err)) err = err4;
+                            if (fabsf(err5) < fabsf(err)) err = err5;
+                            if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                            if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+                            if (llabs(iErr7) < llabs(iErr)) iErr = iErr7;
+                            if (llabs(iErr8) < llabs(iErr)) iErr = iErr8;
+
+                            // retry per section 6.5.3.4
+                            if (IsHalfResultSubnormal(correct3, half_ulps)
+                                || IsHalfResultSubnormal(correct4, half_ulps)
+                                || IsHalfResultSubnormal(correct7, half_ulps)
+                                || IsHalfResultSubnormal(correct8, half_ulps))
+                            {
+                                fail = fail
+                                    && !(HTF(test) == 0.0f
+                                         && (iErr3 == 0 || iErr4 == 0
+                                             || iErr7 == 0 || iErr8 == 0));
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    else if (IsHalfSubnormal(p2[j]))
+                    {
+                        int correct3i, correct4i;
+                        double correct3 =
+                            f->func.f_ffpI(HTF(p[j]), 0.0, &correct3i);
+                        double correct4 =
+                            f->func.f_ffpI(HTF(p[j]), -0.0, &correct4i);
+                        float err2 = Ulp_Error_Half(test, correct3);
+                        float err3 = Ulp_Error_Half(test, correct4);
+                        int64_t iErr3 = (long long)q2[j] - (long long)correct3i;
+                        int64_t iErr4 = (long long)q2[j] - (long long)correct4i;
+                        fail = fail
+                            && ((!(fabsf(err2) <= half_ulps && iErr3 == 0))
+                                && (!(fabsf(err3) <= half_ulps && iErr4 == 0)));
+                        if (fabsf(err2) < fabsf(err)) err = err2;
+                        if (fabsf(err3) < fabsf(err)) err = err3;
+                        if (llabs(iErr3) < llabs(iErr)) iErr = iErr3;
+                        if (llabs(iErr4) < llabs(iErr)) iErr = iErr4;
+
+                        // retry per section 6.5.3.4 (either zero-input result)
+                        if (IsHalfResultSubnormal(correct3, half_ulps)
+                            || IsHalfResultSubnormal(correct4, half_ulps))
+                        {
+                            fail = fail
+                                && !(HTF(test) == 0.0f
+                                     && (iErr3 == 0 || iErr4 == 0));
+                            if (!fail) err = 0.0f;
+                        }
+                    }
+                }
+                if (fabsf(err) > maxError)
+                {
+                    maxError = fabsf(err);
+                    maxErrorVal = HTF(p[j]);
+                }
+                if (llabs(iErr) > maxError2)
+                {
+                    maxError2 = llabs(iErr);
+                    maxErrorVal2 = HTF(p[j]);
+                }
+
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: {%f, %" PRId64
+                               "} ulp error at {%a, %a} "
+                               "({0x%04x, 0x%04x}): *{%a, %d} ({0x%04x, "
+                               "0x%8.8x}) vs. {%a, %d} ({0x%04x, 0x%8.8x})\n",
+                               f->name, sizeNames[k], err, iErr, HTF(p[j]),
+                               HTF(p2[j]), p[j], p2[j], HTF(t[j]), t2[j], t[j],
+                               t2[j], HTF(test), q2[j], test, q2[j]);
+                    return -1;
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2,
+             maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/common.cpp b/test_conformance/math_brute_force/common.cpp
index 47f493e7a6..3771a6fb00 100644
--- a/test_conformance/math_brute_force/common.cpp
+++ b/test_conformance/math_brute_force/common.cpp
@@ -27,8 +27,11 @@ const char *GetTypeName(ParameterType type)
 {
     switch (type)
     {
+        case ParameterType::Half: return "half";
         case ParameterType::Float: return "float";
         case ParameterType::Double: return "double";
+        case ParameterType::Short: return "short";
+        case ParameterType::UShort: return "ushort";
         case ParameterType::Int: return "int";
         case ParameterType::UInt: return "uint";
         case ParameterType::Long: return "long";
@@ -41,9 +44,13 @@ const char *GetUndefValue(ParameterType type)
 {
     switch (type)
     {
+        case ParameterType::Half:
         case ParameterType::Float:
         case ParameterType::Double: return "NAN";
 
+        case ParameterType::Short:
+        case ParameterType::UShort: return "0x5678";
+
         case ParameterType::Int:
         case ParameterType::UInt: return "0x12345678";
 
@@ -71,14 +78,17 @@ void EmitEnableExtension(std::ostringstream &kernel,
                          const std::initializer_list<ParameterType> &types)
 {
     bool needsFp64 = false;
+    bool needsFp16 = false;
 
     for (const auto &type : types)
     {
         switch (type)
         {
             case ParameterType::Double: needsFp64 = true; break;
-
+            case ParameterType::Half: needsFp16 = true; break;
             case ParameterType::Float:
+            case ParameterType::Short:
+            case ParameterType::UShort:
             case ParameterType::Int:
             case ParameterType::UInt:
             case ParameterType::Long:
@@ -89,6 +99,7 @@ void EmitEnableExtension(std::ostringstream &kernel,
     }
 
     if (needsFp64) kernel << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
+    if (needsFp16) kernel << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
 }
 
 std::string GetBuildOptions(bool relaxed_mode)
diff --git a/test_conformance/math_brute_force/common.h b/test_conformance/math_brute_force/common.h
index 481b3b2a29..793a00fe92 100644
--- a/test_conformance/math_brute_force/common.h
+++ b/test_conformance/math_brute_force/common.h
@@ -36,8 +36,11 @@ using Buffers = std::array<clMemWrapper, VECTOR_SIZE_COUNT>;
 // Types supported for kernel code generation.
 enum class ParameterType
 {
+    Half,
     Float,
     Double,
+    Short,
+    UShort,
     Int,
     UInt,
     Long,
@@ -91,4 +94,5 @@ using SourceGenerator = std::string (*)(const std::string &kernel_name,
 cl_int BuildKernels(BuildKernelInfo &info, cl_uint job_id,
                     SourceGenerator generator);
 
+
 #endif /* COMMON_H */
diff --git a/test_conformance/math_brute_force/function_list.cpp b/test_conformance/math_brute_force/function_list.cpp
index 917362852c..b2f3de82ef 100644
--- a/test_conformance/math_brute_force/function_list.cpp
+++ b/test_conformance/math_brute_force/function_list.cpp
@@ -29,36 +29,41 @@
 // Only use ulps information in spir test
 #ifdef FUNCTION_LIST_ULPS_ONLY
 
-#define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                       \
+#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)            \
     {                                                                          \
         STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
-            _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,             \
+            _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,  \
             RELAXED_OFF, _type                                                 \
     }
-#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type,     \
-                  _relaxed_embedded_ulp)                                       \
+#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \
+                  _type, _relaxed_embedded_ulp)                                \
     {                                                                          \
         STRINGIFY(_name), STRINGIFY(_name), { NULL }, { NULL }, { NULL },      \
-            _ulp, _ulp, _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp,    \
-            _rmode, RELAXED_ON, _type                                          \
+            _ulp, _ulp, _half_ulp, _embedded_ulp, _relaxed_ulp,                \
+            _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type                   \
     }
 #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                  \
     {                                                                          \
         "half_" STRINGIFY(_name), "half_" STRINGIFY(_name), { NULL },          \
-            { NULL }, { NULL }, _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, \
-            _rmode, RELAXED_OFF, _type                                         \
+            { NULL }, { NULL }, _ulp, _ulp, _ulp, _embedded_ulp, INFINITY,     \
+            INFINITY, _rmode, RELAXED_OFF, _type                               \
     }
-#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type)   \
+#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp,       \
+                       _rmode, _type)                                          \
     {                                                                          \
         STRINGIFY(_name), _operator, { NULL }, { NULL }, { NULL }, _ulp, _ulp, \
-            _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
+            _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, \
+            _type                                                              \
     }
 
 #define unaryF NULL
+#define unaryOF NULL
 #define i_unaryF NULL
 #define unaryF_u NULL
 #define macro_unaryF NULL
 #define binaryF NULL
+#define binaryOF NULL
+#define binaryF_nextafter NULL
 #define binaryOperatorF NULL
 #define binaryF_i NULL
 #define macro_binaryF NULL
@@ -76,31 +81,34 @@
 
 #else // FUNCTION_LIST_ULPS_ONLY
 
-#define ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                       \
+#define ENTRY(_name, _ulp, _embedded_ulp, _half_ulp, _rmode, _type)            \
     {                                                                          \
         STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
             { (void*)reference_##_name##l }, { (void*)reference_##_name },     \
-            _ulp, _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,             \
+            _ulp, _ulp, _half_ulp, _embedded_ulp, INFINITY, INFINITY, _rmode,  \
             RELAXED_OFF, _type                                                 \
     }
-#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _relaxed_ulp, _rmode, _type,     \
-                  _relaxed_embedded_ulp)                                       \
+#define ENTRY_EXT(_name, _ulp, _embedded_ulp, _half_ulp, _relaxed_ulp, _rmode, \
+                  _type, _relaxed_embedded_ulp)                                \
     {                                                                          \
         STRINGIFY(_name), STRINGIFY(_name), { (void*)reference_##_name },      \
             { (void*)reference_##_name##l },                                   \
-            { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _embedded_ulp, \
-            _relaxed_ulp, _relaxed_embedded_ulp, _rmode, RELAXED_ON, _type     \
+            { (void*)reference_##relaxed_##_name }, _ulp, _ulp, _half_ulp,     \
+            _embedded_ulp, _relaxed_ulp, _relaxed_embedded_ulp, _rmode,        \
+            RELAXED_ON, _type                                                  \
     }
 #define HALF_ENTRY(_name, _ulp, _embedded_ulp, _rmode, _type)                  \
     {                                                                          \
         "half_" STRINGIFY(_name), "half_" STRINGIFY(_name),                    \
             { (void*)reference_##_name }, { NULL }, { NULL }, _ulp, _ulp,      \
-            _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
+            _ulp, _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF,      \
+            _type                                                              \
     }
-#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _rmode, _type)   \
+#define OPERATOR_ENTRY(_name, _operator, _ulp, _embedded_ulp, _half_ulp,       \
+                       _rmode, _type)                                          \
     {                                                                          \
         STRINGIFY(_name), _operator, { (void*)reference_##_name },             \
-            { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp,             \
+            { (void*)reference_##_name##l }, { NULL }, _ulp, _ulp, _half_ulp,  \
             _embedded_ulp, INFINITY, INFINITY, _rmode, RELAXED_OFF, _type      \
     }
 
@@ -108,85 +116,114 @@ static constexpr vtbl _unary = {
     "unary",
     TestFunc_Float_Float,
     TestFunc_Double_Double,
+    TestFunc_Half_Half,
 };
 
+static constexpr vtbl _unaryof = { "unaryof", TestFunc_Float_Float, NULL,
+                                   NULL }; // no double or half test function
+
 static constexpr vtbl _i_unary = {
     "i_unary",
     TestFunc_Int_Float,
     TestFunc_Int_Double,
+    TestFunc_Int_Half,
 };
 
 static constexpr vtbl _unary_u = {
     "unary_u",
     TestFunc_Float_UInt,
     TestFunc_Double_ULong,
+    TestFunc_Half_UShort,
 };
 
 static constexpr vtbl _macro_unary = {
     "macro_unary",
     TestMacro_Int_Float,
     TestMacro_Int_Double,
+    TestMacro_Int_Half,
 };
 
 static constexpr vtbl _binary = {
     "binary",
     TestFunc_Float_Float_Float,
     TestFunc_Double_Double_Double,
+    TestFunc_Half_Half_Half,
 };
 
+static constexpr vtbl _binary_nextafter = {
+    "binary", // same table name as _binary; only the half tester differs
+    TestFunc_Float_Float_Float,
+    TestFunc_Double_Double_Double,
+    TestFunc_Half_Half_Half_nextafter,
+};
+
+static constexpr vtbl _binaryof = { "binaryof", TestFunc_Float_Float_Float,
+                                    NULL, NULL }; // no double or half tester
+
 static constexpr vtbl _binary_operator = {
     "binaryOperator",
     TestFunc_Float_Float_Float_Operator,
     TestFunc_Double_Double_Double_Operator,
+    TestFunc_Half_Half_Half_Operator,
 };
 
 static constexpr vtbl _binary_i = {
     "binary_i",
     TestFunc_Float_Float_Int,
     TestFunc_Double_Double_Int,
+    TestFunc_Half_Half_Int,
 };
 
 static constexpr vtbl _macro_binary = {
     "macro_binary",
     TestMacro_Int_Float_Float,
     TestMacro_Int_Double_Double,
+    TestMacro_Int_Half_Half,
 };
 
 static constexpr vtbl _ternary = {
     "ternary",
     TestFunc_Float_Float_Float_Float,
     TestFunc_Double_Double_Double_Double,
+    TestFunc_Half_Half_Half_Half,
 };
 
 static constexpr vtbl _unary_two_results = {
     "unary_two_results",
     TestFunc_Float2_Float,
     TestFunc_Double2_Double,
+    TestFunc_Half2_Half,
 };
 
 static constexpr vtbl _unary_two_results_i = {
     "unary_two_results_i",
     TestFunc_FloatI_Float,
     TestFunc_DoubleI_Double,
+    TestFunc_HalfI_Half,
 };
 
 static constexpr vtbl _binary_two_results_i = {
     "binary_two_results_i",
     TestFunc_FloatI_Float_Float,
     TestFunc_DoubleI_Double_Double,
+    TestFunc_HalfI_Half_Half,
 };
 
 static constexpr vtbl _mad_tbl = {
     "ternary",
     TestFunc_mad_Float,
     TestFunc_mad_Double,
+    TestFunc_mad_Half,
 };
 
 #define unaryF &_unary
+#define unaryOF &_unaryof
 #define i_unaryF &_i_unary
 #define unaryF_u &_unary_u
 #define macro_unaryF &_macro_unary
 #define binaryF &_binary
+#define binaryF_nextafter &_binary_nextafter
+#define binaryOF &_binaryof
 #define binaryOperatorF &_binary_operator
 #define binaryF_i &_binary_i
 #define macro_binaryF &_macro_binary
@@ -199,24 +236,24 @@ static constexpr vtbl _mad_tbl = {
 #endif // FUNCTION_LIST_ULPS_ONLY
 
 const Func functionList[] = {
-    ENTRY_EXT(acos, 4.0f, 4.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
-    ENTRY(acosh, 4.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(acospi, 5.0f, 5.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(asin, 4.0f, 4.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
-    ENTRY(asinh, 4.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(asinpi, 5.0f, 5.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(atan, 5.0f, 5.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
-    ENTRY(atanh, 5.0f, 5.0f, FTZ_OFF, unaryF),
-    ENTRY(atanpi, 5.0f, 5.0f, FTZ_OFF, unaryF),
-    ENTRY(atan2, 6.0f, 6.0f, FTZ_OFF, binaryF),
-    ENTRY(atan2pi, 6.0f, 6.0f, FTZ_OFF, binaryF),
-    ENTRY(cbrt, 2.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(ceil, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(copysign, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY_EXT(cos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY_EXT(acos, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
+    ENTRY(acosh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(acospi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(asin, 4.0f, 4.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
+    ENTRY(asinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(asinpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(atan, 5.0f, 5.0f, 2.0f, 4096.0f, FTZ_OFF, unaryF, 4096.0f),
+    ENTRY(atanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(atanpi, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(atan2, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF),
+    ENTRY(atan2pi, 6.0f, 6.0f, 2.0f, FTZ_OFF, binaryF),
+    ENTRY(cbrt, 2.0f, 4.0f, 2.f, FTZ_OFF, unaryF),
+    ENTRY(ceil, 0.0f, 0.0f, 0.f, FTZ_OFF, unaryF),
+    ENTRY(copysign, 0.0f, 0.0f, 0.f, FTZ_OFF, binaryF),
+    ENTRY_EXT(cos, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF,
               0.00048828125f), // relaxed ulp 2^-11
-    ENTRY(cosh, 4.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(cospi, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY(cosh, 4.0f, 4.0f, 2.f, FTZ_OFF, unaryF),
+    ENTRY_EXT(cospi, 4.0f, 4.0f, 2.f, 0.00048828125f, FTZ_OFF, unaryF,
               0.00048828125f), // relaxed ulp 2^-11
     //                                  ENTRY( erfc,                  16.0f,
     //                                  16.0f,         FTZ_OFF,     unaryF),
@@ -225,81 +262,84 @@ const Func functionList[] = {
     //                                  16.0f,         16.0f,         FTZ_OFF,
     //                                  unaryF), //disabled for 1.0 due to lack
     //                                  of reference implementation
-    ENTRY_EXT(exp, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF,
+    ENTRY_EXT(exp, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF,
               4.0f), // relaxed error is actually overwritten in unary.c as it
                      // is 3+floor(fabs(2*x))
-    ENTRY_EXT(exp2, 3.0f, 4.0f, 3.0f, FTZ_OFF, unaryF,
+    ENTRY_EXT(exp2, 3.0f, 4.0f, 2.f, 3.0f, FTZ_OFF, unaryF,
               4.0f), // relaxed error is actually overwritten in unary.c as it
                      // is 3+floor(fabs(2*x))
-    ENTRY_EXT(exp10, 3.0f, 4.0f, 8192.0f, FTZ_OFF, unaryF,
+    ENTRY_EXT(exp10, 3.0f, 4.0f, 2.f, 8192.0f, FTZ_OFF, unaryF,
               8192.0f), // relaxed error is actually overwritten in unary.c as
                         // it is 3+floor(fabs(2*x)) in derived mode,
     // in non-derived mode it uses the ulp error for half_exp10.
-    ENTRY(expm1, 3.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(fabs, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(fdim, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(floor, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(fma, 0.0f, 0.0f, FTZ_OFF, ternaryF),
-    ENTRY(fmax, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(fmin, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(fmod, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(fract, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
-    ENTRY(frexp, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i),
-    ENTRY(hypot, 4.0f, 4.0f, FTZ_OFF, binaryF),
-    ENTRY(ilogb, 0.0f, 0.0f, FTZ_OFF, i_unaryF),
-    ENTRY(isequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isfinite, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isgreaterequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isinf, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isless, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(islessequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(islessgreater, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isnan, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isnormal, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY(isnotequal, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(isunordered, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
-    ENTRY(ldexp, 0.0f, 0.0f, FTZ_OFF, binaryF_i),
-    ENTRY(lgamma, INFINITY, INFINITY, FTZ_OFF, unaryF),
-    ENTRY(lgamma_r, INFINITY, INFINITY, FTZ_OFF, unaryF_two_results_i),
-    ENTRY_EXT(log, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
+    ENTRY(expm1, 3.0f, 4.0f, 2.f, FTZ_OFF, unaryF),
+    ENTRY(fabs, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(fdim, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(floor, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(fma, 0.0f, 0.0f, 0.0f, FTZ_OFF, ternaryF),
+    ENTRY(fmax, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(fmin, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(fmod, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(fract, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
+    ENTRY(frexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results_i),
+    ENTRY(hypot, 4.0f, 4.0f, 2.0f, FTZ_OFF, binaryF),
+    ENTRY(ilogb, 0.0f, 0.0f, 0.0f, FTZ_OFF, i_unaryF),
+    ENTRY(isequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isfinite, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isgreaterequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isinf, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isless, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(islessequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(islessgreater, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isnan, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isnormal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY(isnotequal, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(isunordered, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_binaryF),
+    ENTRY(ldexp, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_i),
+    ENTRY(lgamma, INFINITY, INFINITY, INFINITY, FTZ_OFF, unaryF),
+    ENTRY(lgamma_r, INFINITY, INFINITY, INFINITY, FTZ_OFF,
+          unaryF_two_results_i),
+    ENTRY_EXT(log, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
               4.76837158203125e-7f), // relaxed ulp 2^-21
-    ENTRY_EXT(log2, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
+    ENTRY_EXT(log2, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
               4.76837158203125e-7f), // relaxed ulp 2^-21
-    ENTRY_EXT(log10, 3.0f, 4.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
+    ENTRY_EXT(log10, 3.0f, 4.0f, 2.0f, 4.76837158203125e-7f, FTZ_OFF, unaryF,
               4.76837158203125e-7f), // relaxed ulp 2^-21
-    ENTRY(log1p, 2.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(logb, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, FTZ_OFF, mad_function,
+    ENTRY(log1p, 2.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(logb, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(mad, INFINITY, INFINITY, INFINITY, INFINITY, FTZ_OFF,
+              mad_function,
               INFINITY), // in fast-relaxed-math mode it has to be either
                          // exactly rounded fma or exactly rounded a*b+c
-    ENTRY(maxmag, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(minmag, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(modf, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
-    ENTRY(nan, 0.0f, 0.0f, FTZ_OFF, unaryF_u),
-    ENTRY(nextafter, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY_EXT(pow, 16.0f, 16.0f, 8192.0f, FTZ_OFF, binaryF,
+    ENTRY(maxmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(minmag, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(modf, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_two_results),
+    ENTRY(nan, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF_u),
+    ENTRY(nextafter, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_nextafter),
+    ENTRY_EXT(pow, 16.0f, 16.0f, 4.0f, 8192.0f, FTZ_OFF, binaryF,
               8192.0f), // in derived mode the ulp error is calculated as
                         // exp2(y*log2(x)) and in non-derived it is the same as
                         // half_pow
-    ENTRY(pown, 16.0f, 16.0f, FTZ_OFF, binaryF_i),
-    ENTRY(powr, 16.0f, 16.0f, FTZ_OFF, binaryF),
+    ENTRY(pown, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i),
+    ENTRY(powr, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF),
     //                                  ENTRY( reciprocal,            1.0f,
     //                                  1.0f,         FTZ_OFF,     unaryF),
-    ENTRY(remainder, 0.0f, 0.0f, FTZ_OFF, binaryF),
-    ENTRY(remquo, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i),
-    ENTRY(rint, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(rootn, 16.0f, 16.0f, FTZ_OFF, binaryF_i),
-    ENTRY(round, 0.0f, 0.0f, FTZ_OFF, unaryF),
-    ENTRY(rsqrt, 2.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY(signbit, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
-    ENTRY_EXT(sin, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY(remainder, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF),
+    ENTRY(remquo, 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryF_two_results_i),
+    ENTRY(rint, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(rootn, 16.0f, 16.0f, 4.0f, FTZ_OFF, binaryF_i),
+    ENTRY(round, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(rsqrt, 2.0f, 4.0f, 1.0f, FTZ_OFF, unaryF),
+    ENTRY(signbit, 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    ENTRY_EXT(sin, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,
               0.00048828125f), // relaxed ulp 2^-11
-    ENTRY_EXT(sincos, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF_two_results,
+    ENTRY_EXT(sincos, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF,
+              unaryF_two_results,
               0.00048828125f), // relaxed ulp 2^-11
-    ENTRY(sinh, 4.0f, 4.0f, FTZ_OFF, unaryF),
-    ENTRY_EXT(sinpi, 4.0f, 4.0f, 0.00048828125f, FTZ_OFF, unaryF,
+    ENTRY(sinh, 4.0f, 4.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY_EXT(sinpi, 4.0f, 4.0f, 2.0f, 0.00048828125f, FTZ_OFF, unaryF,
               0.00048828125f), // relaxed ulp 2^-11
     { "sqrt",
       "sqrt",
@@ -308,6 +348,7 @@ const Func functionList[] = {
       { NULL },
       3.0f,
       0.0f,
+      0.0f,
       4.0f,
       INFINITY,
       INFINITY,
@@ -322,41 +363,42 @@ const Func functionList[] = {
       0.0f,
       0.0f,
       0.0f,
+      0.0f,
       INFINITY,
       INFINITY,
       FTZ_OFF,
       RELAXED_OFF,
       unaryF },
     ENTRY_EXT(
-        tan, 5.0f, 5.0f, 8192.0f, FTZ_OFF, unaryF,
+        tan, 5.0f, 5.0f, 2.0f, 8192.0f, FTZ_OFF, unaryF,
         8192.0f), // in derived mode it the ulp error is calculated as sin/cos
                   // and in non-derived mode it is the same as half_tan.
-    ENTRY(tanh, 5.0f, 5.0f, FTZ_OFF, unaryF),
-    ENTRY(tanpi, 6.0f, 6.0f, FTZ_OFF, unaryF),
+    ENTRY(tanh, 5.0f, 5.0f, 2.0f, FTZ_OFF, unaryF),
+    ENTRY(tanpi, 6.0f, 6.0f, 2.0f, FTZ_OFF, unaryF),
     //                                    ENTRY( tgamma,                 16.0f,
     //                                    16.0f,         FTZ_OFF,     unaryF),
     //                                    // Commented this out until we can be
     //                                    sure this requirement is realistic
-    ENTRY(trunc, 0.0f, 0.0f, FTZ_OFF, unaryF),
+    ENTRY(trunc, 0.0f, 0.0f, 0.0f, FTZ_OFF, unaryF),
 
-    HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryF),
-    HALF_ENTRY(exp, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(exp2, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(exp10, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(log, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(log2, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(log10, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(powr, 8192.0f, 8192.0f, FTZ_ON, binaryF),
-    HALF_ENTRY(recip, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(sin, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryF),
-    HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryF),
+    HALF_ENTRY(cos, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(divide, 8192.0f, 8192.0f, FTZ_ON, binaryOF),
+    HALF_ENTRY(exp, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(exp2, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(exp10, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(log, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(log2, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(log10, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(powr, 8192.0f, 8192.0f, FTZ_ON, binaryOF),
+    HALF_ENTRY(recip, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(rsqrt, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(sin, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(sqrt, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
+    HALF_ENTRY(tan, 8192.0f, 8192.0f, FTZ_ON, unaryOF),
 
     // basic operations
-    OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
-    OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
+    OPERATOR_ENTRY(add, "+", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
+    OPERATOR_ENTRY(subtract, "-", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
     { "divide",
       "/",
       { (void*)reference_divide },
@@ -364,6 +406,7 @@ const Func functionList[] = {
       { (void*)reference_relaxed_divide },
       2.5f,
       0.0f,
+      0.0f,
       3.0f,
       2.5f,
       INFINITY,
@@ -378,15 +421,16 @@ const Func functionList[] = {
       0.0f,
       0.0f,
       0.0f,
+      0.0f,
       0.f,
       INFINITY,
       FTZ_OFF,
       RELAXED_OFF,
       binaryOperatorF },
-    OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
-    OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, FTZ_OFF,
+    OPERATOR_ENTRY(multiply, "*", 0.0f, 0.0f, 0.0f, FTZ_OFF, binaryOperatorF),
+    OPERATOR_ENTRY(assignment, "", 0.0f, 0.0f, 0.0f, FTZ_OFF,
                    unaryF), // A simple copy operation
-    OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
+    OPERATOR_ENTRY(not, "!", 0.0f, 0.0f, 0.0f, FTZ_OFF, macro_unaryF),
 };
 
 const size_t functionListCount = sizeof(functionList) / sizeof(functionList[0]);
diff --git a/test_conformance/math_brute_force/function_list.h b/test_conformance/math_brute_force/function_list.h
index 95a2945932..6ea0fa9e2b 100644
--- a/test_conformance/math_brute_force/function_list.h
+++ b/test_conformance/math_brute_force/function_list.h
@@ -70,6 +70,9 @@ struct vtbl
     int (*DoubleTestFunc)(
         const struct Func *, MTdata,
         bool); // may be NULL if function is single precision only
+    int (*HalfTestFunc)(
+        const struct Func *, MTdata,
+        bool); // may be NULL if function has no half precision variant
 };
 
 struct Func
@@ -82,6 +85,7 @@ struct Func
     fptr rfunc;
     float float_ulps;
     float double_ulps;
+    float half_ulps;
     float float_embedded_ulps;
     float relaxed_error;
     float relaxed_embedded_error;
diff --git a/test_conformance/math_brute_force/i_unary_half.cpp b/test_conformance/math_brute_force/i_unary_half.cpp
new file mode 100644
index 0000000000..baff3ee20d
--- /dev/null
+++ b/test_conformance/math_brute_force/i_unary_half.cpp
@@ -0,0 +1,220 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <algorithm>
+#include <cstring>
+#include <memory>
+#include <cinttypes>
+
+namespace {
+// ThreadPool_Do job: generates Int/Half unary kernels and builds them.
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p)
+{
+    BuildKernelInfo &info = *(BuildKernelInfo *)p;
+    auto generator = [](const std::string &kernel_name, const char *builtin,
+                        cl_uint vector_size_index) {
+        return GetUnaryKernel(kernel_name, builtin, ParameterType::Int,
+                              ParameterType::Half, vector_size_index);
+    };
+    return BuildKernels(info, job_id, generator);
+}
+
+} // anonymous namespace
+// Compares device results of int-returning half builtin f with f->func.i_f.
+int TestFunc_Int_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+    Programs programs;
+    KernelMatrix kernels;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    int ftz = f->ftz || 0 == (gHalfCapabilities & CL_FP_DENORM) || gForceFTZ;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+    size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_int),
+                                     size_t(1ULL << (sizeof(cl_half) * 8)));
+    size_t bufferSizeIn = bufferElements * sizeof(cl_half);
+    size_t bufferSizeOut = bufferElements * sizeof(cl_int);
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    // This test is not using ThreadPool so we need to disable FTZ here
+    // for reference computations; at_scope_exit restores the FPU state.
+    FPU_mode_type oldMode;
+    DisableFTZ(&oldMode);
+    std::shared_ptr<int> at_scope_exit(
+        nullptr, [&oldMode](int *) { RestoreFPState(&oldMode); });
+
+    // Init the kernels
+    {
+        BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode };
+        if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+    std::vector<float> s(bufferElements);
+
+    for (uint64_t i = 0; i < (1ULL << 16); i += step)
+    {
+        // Init input array with consecutive half bit patterns from i
+        cl_ushort *p = (cl_ushort *)gIn;
+
+        for (size_t j = 0; j < bufferElements; j++) p[j] = (cl_ushort)i + j;
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSizeIn, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        // write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, bufferSizeOut);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, bufferSizeOut,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, bufferSizeOut,
+                                            0, NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer failed!\n");
+            }
+        }
+
+        // Run the kernels, one launch per tested vector size
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_int);
+            size_t localCount = (bufferSizeOut + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        int *r = (int *)gOut_Ref;
+        for (size_t j = 0; j < bufferElements; j++)
+        {
+            s[j] = HTF(p[j]);
+            r[j] = f->func.i_f(s[j]);
+        }
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                             bufferSizeOut, gOut[j], 0, NULL,
+                                             NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data: results must match the reference bit for bit
+        uint32_t *t = (uint32_t *)gOut_Ref;
+        for (size_t j = 0; j < bufferElements; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint32_t *q = (uint32_t *)(gOut[k]);
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    if (ftz && IsHalfSubnormal(p[j]))
+                    {
+                        unsigned int correct0 = f->func.i_f(0.0);
+                        unsigned int correct1 = f->func.i_f(-0.0);
+                        if (q[j] == correct0 || q[j] == correct1) continue;
+                    }
+
+                    uint32_t err = t[j] - q[j];
+                    if (q[j] > t[j]) err = q[j] - t[j];
+                    vlog_error("\nERROR: %s%s: %d ulp error at %a (0x%04x): "
+                               "*%d vs. %d\n",
+                               f->name, sizeNames[k], err, s[j], p[j], t[j],
+                               q[j]);
+                    return -1;
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10zu \n",
+                     i, step, bufferSizeOut);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/macro_binary_double.cpp b/test_conformance/math_brute_force/macro_binary_double.cpp
index 51d5b64b39..9c8a61ed34 100644
--- a/test_conformance/math_brute_force/macro_binary_double.cpp
+++ b/test_conformance/math_brute_force/macro_binary_double.cpp
@@ -185,8 +185,7 @@ const double specialValues[] = {
     +0.0,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
diff --git a/test_conformance/math_brute_force/macro_binary_float.cpp b/test_conformance/math_brute_force/macro_binary_float.cpp
index b00a29ff82..8e73acad8c 100644
--- a/test_conformance/math_brute_force/macro_binary_float.cpp
+++ b/test_conformance/math_brute_force/macro_binary_float.cpp
@@ -176,8 +176,7 @@ const float specialValues[] = {
     +0.0f,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 {
diff --git a/test_conformance/math_brute_force/macro_binary_half.cpp b/test_conformance/math_brute_force/macro_binary_half.cpp
new file mode 100644
index 0000000000..d25342dda5
--- /dev/null
+++ b/test_conformance/math_brute_force/macro_binary_half.cpp
@@ -0,0 +1,540 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+
+namespace {
+
+// Thread-pool callback: passes job_id and a source generator to BuildKernels.
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    auto &build_info = *static_cast<BuildKernelInfo *>(p);
+    auto gen = [](const std::string &name, const char *fn, cl_uint size_idx) {
+        return GetBinaryKernel(name, fn, ParameterType::Short,
+                               ParameterType::Half, ParameterType::Half,
+                               size_idx);
+    };
+    return BuildKernels(build_info, job_id, gen);
+}
+
+struct ThreadInfo
+{
+    clMemWrapper inBuf; // first input buffer for the thread (kernel arg 1)
+    clMemWrapper inBuf2; // second input buffer for the thread (kernel arg 2)
+    clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    MTdataHolder d; // random state used to generate this thread's inputs
+    clCommandQueueWrapper
+        tQueue; // per thread command queue to improve performance
+};
+
+// Plain settings read by every job; zero-filled with memset before use.
+struct TestInfoBase
+{
+    size_t subBufferSize; // Size of each per-thread sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs passed to ThreadPool_Do
+    cl_uint step; // subBufferSize * scale: step between consecutive jobs
+    cl_uint scale; // stride between individual test values
+    int ftz; // non-zero if running in flush to zero mode
+};
+
+struct TestInfo : public TestInfoBase
+{
+    TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
+
+    // Array of thread specific information, indexed by thread_id
+    std::vector<ThreadInfo> tinfo;
+
+    // Programs for various vector sizes, indexed by vector size index.
+    Programs programs;
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+};
+
+// A table of more difficult cases to get right (raw half bit patterns)
+const cl_half specialValuesHalf[] = {
+    0xffff, /*NaN*/ 0x0000, /*+0*/ 0x0001, /*min denorm*/ 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* -(largest number less than one) */
+};
+
+constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+
+// Thread-pool job: fills this thread's two input sub-buffers (special-value
+// pairs first, then random bit patterns), runs the kernel for every vector
+// size and compares each device result with the host reference func.i_ff.
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    TestInfo *job = (TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_half);
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    MTdata d = tinfo->d;
+    cl_uint j, k;
+    cl_int error;
+    const char *name = job->f->name;
+    cl_short *t, *r;
+    std::vector<float> s(0), s2(0);
+
+    // Map events and mapped host pointers, one slot per vector size
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_short *out[VECTOR_SIZE_COUNT];
+
+    if (gHostFill)
+    {
+        // start the map of the output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            out[j] = (cl_short *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_size, 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
+
+    // Init input array
+    cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
+    cl_ushort *p2 = (cl_ushort *)gIn2 + thread_id * buffer_elements;
+    j = 0;
+    int totalSpecialValueCount =
+        specialValuesHalfCount * specialValuesHalfCount;
+    int indx = (totalSpecialValueCount - 1) / buffer_elements;
+
+    if (job_id <= (cl_uint)indx)
+    { // test edge cases
+        uint32_t x = (job_id * buffer_elements) % specialValuesHalfCount;
+        uint32_t y = (job_id * buffer_elements) / specialValuesHalfCount;
+
+        // Walk the (x, y) grid of special-value pairs. The loop condition,
+        // not a break, ends the walk so that j finishes one past the final
+        // pair and the random fill below cannot overwrite it.
+        for (; j < buffer_elements && y < specialValuesHalfCount; j++)
+        {
+            p[j] = specialValuesHalf[x];
+            p2[j] = specialValuesHalf[y];
+            if (++x >= specialValuesHalfCount)
+            {
+                x = 0;
+                y++;
+            }
+        }
+    }
+
+    // Init any remaining values.
+    for (; j < buffer_elements; j++)
+    {
+        p[j] = (cl_ushort)genrand_int32(d);
+        p2[j] = (cl_ushort)genrand_int32(d);
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
+                                      buffer_size, p2, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if (gHostFill)
+        {
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
+        {
+            memset_pattern4(out[j], &pattern, buffer_size);
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                        &pattern, sizeof(pattern), 0,
+                                        buffer_size, 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
+                                    &tinfo->inBuf2)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    r = (cl_short *)gOut_Ref + thread_id * buffer_elements;
+    t = (cl_short *)r;
+    s.resize(buffer_elements);
+    s2.resize(buffer_elements);
+    for (j = 0; j < buffer_elements; j++)
+    {
+        s[j] = cl_half_to_float(p[j]);
+        s2[j] = cl_half_to_float(p2[j]);
+        r[j] = (short)func.i_ff(s[j], s2[j]);
+    }
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_short *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+
+    // Wait for the last buffer
+    out[j] = (cl_short *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        return error;
+    }
+
+    // Verify data
+    for (j = 0; j < buffer_elements; j++)
+    {
+        cl_short *q = (cl_short *)out[0];
+
+        // If we aren't getting the correctly rounded result
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
+        {
+            if (ftz)
+            {
+                if (IsHalfSubnormal(p[j]))
+                {
+                    if (IsHalfSubnormal(p2[j]))
+                    {
+                        short correct = (short)func.i_ff(0.0f, 0.0f);
+                        short correct2 = (short)func.i_ff(0.0f, -0.0f);
+                        short correct3 = (short)func.i_ff(-0.0f, 0.0f);
+                        short correct4 = (short)func.i_ff(-0.0f, -0.0f);
+
+                        if (correct == q[j] || correct2 == q[j]
+                            || correct3 == q[j] || correct4 == q[j])
+                            continue;
+                    }
+                    else
+                    {
+                        short correct = (short)func.i_ff(0.0f, s2[j]);
+                        short correct2 = (short)func.i_ff(-0.0f, s2[j]);
+                        if (correct == q[j] || correct2 == q[j]) continue;
+                    }
+                }
+                else if (IsHalfSubnormal(p2[j]))
+                {
+                    short correct = (short)func.i_ff(s[j], 0.0f);
+                    short correct2 = (short)func.i_ff(s[j], -0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
+                }
+            }
+
+            short err = t[j] - q[j];
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error(
+                "\nERROR: %s: %d ulp error at {%a (0x%04x), %a "
+                "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x (index: %d)\n",
+                name, err, s[j], p[j], s2[j], p2[j], t[j], q[j], j);
+            error = -1;
+            return error;
+        }
+
+        for (k = std::max(1U, gMinVectorSizeIndex); k < gMaxVectorSizeIndex;
+             k++)
+        {
+            q = out[k];
+            // Vector results are checked against the negated reference, -t[j]
+            if (-t[j] != q[j])
+            {
+                if (ftz)
+                {
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        if (IsHalfSubnormal(p2[j]))
+                        {
+                            short correct = (short)-func.i_ff(0.0f, 0.0f);
+                            short correct2 = (short)-func.i_ff(0.0f, -0.0f);
+                            short correct3 = (short)-func.i_ff(-0.0f, 0.0f);
+                            short correct4 = (short)-func.i_ff(-0.0f, -0.0f);
+
+                            if (correct == q[j] || correct2 == q[j]
+                                || correct3 == q[j] || correct4 == q[j])
+                                continue;
+                        }
+                        else
+                        {
+                            short correct = (short)-func.i_ff(0.0f, s2[j]);
+                            short correct2 = (short)-func.i_ff(-0.0f, s2[j]);
+                            if (correct == q[j] || correct2 == q[j]) continue;
+                        }
+                    }
+                    else if (IsHalfSubnormal(p2[j]))
+                    {
+                        short correct = (short)-func.i_ff(s[j], 0.0f);
+                        short correct2 = (short)-func.i_ff(s[j], -0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
+                    }
+                }
+
+                cl_ushort err = -t[j] - q[j];
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %s: %d ulp error at {%a (0x%04x), %a "
+                           "(0x%04x)}\nExpected: 0x%04x \nActual: 0x%04x "
+                           "(index: %d)\n",
+                           name, err, s[j], p[j], s2[j], p2[j], -t[j], q[j], j);
+                error = -1;
+                return error;
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+    return error;
+}
+
+} // anonymous namespace
+
+// Runs the test for builtin f on half inputs (two operands, cl_short result):
+// sets up per-thread buffers/queues, builds the kernels and runs the jobs.
+int TestMacro_Int_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the 32-bit product overflowed; fall back to a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
+    }
+
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+    // Give each thread its own sub-buffers, command queue and RNG state
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+        test_info.tinfo[i].inBuf2 =
+            clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf2)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+
+        test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
+    }
+
+    // Init the kernels: one build job per tested vector size
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+    return error;
+}
diff --git a/test_conformance/math_brute_force/macro_unary_half.cpp b/test_conformance/math_brute_force/macro_unary_half.cpp
new file mode 100644
index 0000000000..a755ddb15a
--- /dev/null
+++ b/test_conformance/math_brute_force/macro_unary_half.cpp
@@ -0,0 +1,427 @@
+//
+// Copyright (c) 2017 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+
+namespace {
+
+// Thread-pool callback: passes job_id and a source generator to BuildKernels.
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    auto &build_info = *static_cast<BuildKernelInfo *>(p);
+    auto gen = [](const std::string &name, const char *fn, cl_uint size_idx) {
+        return GetUnaryKernel(name, fn, ParameterType::Short,
+                              ParameterType::Half, size_idx);
+    };
+    return BuildKernels(build_info, job_id, gen);
+}
+
+// Thread specific data for a worker thread (one entry per thread in tinfo)
+struct ThreadInfo
+{
+    clMemWrapper inBuf; // input sub-buffer (of gInBuffer) for the thread
+    clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffers for the thread
+    clCommandQueueWrapper
+        tQueue; // per thread command queue to improve performance
+};
+
+struct TestInfoBase
+{
+    size_t subBufferSize; // Size of each per-thread sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs passed to ThreadPool_Do
+    cl_uint step; // subBufferSize * scale: input step between jobs
+    cl_uint scale; // stride between individual test values
+    int ftz; // non-zero if running in flush to zero mode
+};
+
+struct TestInfo : public TestInfoBase
+{
+    TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
+
+    // Array of thread specific information, indexed by thread_id
+    std::vector<ThreadInfo> tinfo;
+
+    // Programs for various vector sizes, indexed by vector size index.
+    Programs programs;
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size][thread_id]
+    KernelMatrix k;
+};
+
+// Thread-pool job: writes the inputs base + j * scale, runs the kernel for
+// every vector size and checks each result against the host reference.
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    TestInfo *job = (TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_half);
+    cl_uint scale = job->scale;
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
+    fptr func = job->f->func;
+    int ftz = job->ftz;
+    cl_uint j, k;
+    cl_int error = CL_SUCCESS;
+    const char *name = job->f->name;
+    std::vector<float> s(0);
+
+    // Resolve the name-based special cases once instead of once per element
+    const int signbit_test = !strcmp(name, "signbit");
+    const bool isnormal_test = !strcmp(name, "isnormal");
+#define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s))
+
+    // Map events and mapped host pointers, one slot per vector size
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_short *out[VECTOR_SIZE_COUNT];
+
+    if (gHostFill)
+    {
+        // start the map of the output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            out[j] = (cl_short *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_size, 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
+
+    // Write the new values to the input array
+    cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++) p[j] = base + j * scale;
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if (gHostFill)
+        {
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
+        {
+            memset_pattern4(out[j], &pattern, buffer_size);
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                        &pattern, sizeof(pattern), 0,
+                                        buffer_size, 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    cl_short *r = (cl_short *)gOut_Ref + thread_id * buffer_elements;
+    cl_short *t = (cl_short *)r;
+    s.resize(buffer_elements);
+    for (j = 0; j < buffer_elements; j++)
+    {
+        s[j] = cl_half_to_float(p[j]);
+        if (isnormal_test)
+        {
+            if ((IsHalfSubnormal(p[j]) == 0) && !((p[j] & 0x7fffU) >= 0x7c00U)
+                && ((p[j] & 0x7fffU) != 0x0000U))
+                r[j] = 1;
+            else
+                r[j] = 0;
+        }
+        else
+            r[j] = (short)ref_func(s[j]);
+    }
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (cl_short *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+    // Wait for the last buffer
+    out[j] = (cl_short *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        return error;
+    }
+
+    // Verify data
+    for (j = 0; j < buffer_elements; j++)
+    {
+        cl_short *q = out[0];
+
+        // If we aren't getting the correctly rounded result
+        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
+        {
+            if (ftz)
+            {
+                if (IsHalfSubnormal(p[j]))
+                {
+                    short correct = (short)ref_func(+0.0f);
+                    short correct2 = (short)ref_func(-0.0f);
+                    if (correct == q[j] || correct2 == q[j]) continue;
+                }
+            }
+
+            short err = t[j] - q[j];
+            if (q[j] > t[j]) err = q[j] - t[j];
+            vlog_error("\nERROR: %s: %d ulp error at %a (0x%04x)\nExpected: "
+                       "%d vs. %d\n",
+                       name, err, s[j], p[j], t[j], q[j]);
+            error = -1;
+            return error;
+        }
+
+        for (k = std::max(1U, gMinVectorSizeIndex); k < gMaxVectorSizeIndex;
+             k++)
+        {
+            q = out[k];
+            // Vector results are checked against the negated reference, -t[j]
+            if (-t[j] != q[j])
+            {
+                if (ftz)
+                {
+                    if (IsHalfSubnormal(p[j]))
+                    {
+                        short correct = (short)-ref_func(+0.0f);
+                        short correct2 = (short)-ref_func(-0.0f);
+                        if (correct == q[j] || correct2 == q[j]) continue;
+                    }
+                }
+
+                short err = -t[j] - q[j];
+                if (q[j] > -t[j]) err = q[j] + t[j];
+                vlog_error("\nERROR: %s%s: %d ulp error at %a "
+                           "(0x%04x)\nExpected: %d \nActual: %d\n",
+                           name, sizeNames[k], err, s[j], p[j], -t[j], q[j]);
+                error = -1;
+                return error;
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+#undef ref_func
+    return error;
+}
+
+} // anonymous namespace
+
+// Runs the test for builtin f on half inputs (one operand, cl_short result):
+// sets up per-thread buffers/queues, builds the kernels and runs the jobs.
+int TestMacro_Int_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    // Init test_info
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the 32-bit product overflowed; fall back to a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount =
+            std::max((cl_uint)1,
+                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
+    }
+
+    test_info.f = f;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+    // Give each thread its own sub-buffers and command queue
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error;
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+    }
+
+    // Init the kernels: one build job per tested vector size
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    vlog("\n");
+    return error;
+}
diff --git a/test_conformance/math_brute_force/mad_half.cpp b/test_conformance/math_brute_force/mad_half.cpp
new file mode 100644
index 0000000000..d8aefde386
--- /dev/null
+++ b/test_conformance/math_brute_force/mad_half.cpp
@@ -0,0 +1,201 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    // Source factory: a ternary kernel with every ParameterType set to Half.
+    const auto make_src = [](const std::string &name, const char *fn,
+                             cl_uint vec_idx) {
+        const auto half_t = ParameterType::Half;
+        return GetTernaryKernel(name, fn, half_t, half_t, half_t, half_t,
+                                vec_idx);
+    };
+    return BuildKernels(*static_cast<BuildKernelInfo *>(p), job_id, make_src);
+}
+
+} // anonymous namespace
+// Runs the half mad kernels over random inputs; outputs are never verified.
+int TestFunc_mad_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+    Programs programs;
+    KernelMatrix kernels;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    float maxError = 0.0f;
+    // maxError and the maxErrorVal* trackers are never updated in this test.
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    float maxErrorVal3 = 0.0f;
+    size_t bufferSize = BUFFER_SIZE;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    uint64_t step = getTestStep(sizeof(cl_half), bufferSize);
+
+    // Build the kernels: one job per vector-size index, thread count of 1
+    {
+        BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode };
+        if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                                   gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                                   &build_info)))
+            return error;
+    }
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    {
+        // Fill all three input arrays with random 16-bit patterns
+        cl_ushort *p = (cl_ushort *)gIn;
+        cl_ushort *p2 = (cl_ushort *)gIn2;
+        cl_ushort *p3 = (cl_ushort *)gIn3;
+        for (size_t j = 0; j < bufferSize / sizeof(cl_ushort); j++)
+        {
+            p[j] = (cl_ushort)genrand_int32(d);
+            p2[j] = (cl_ushort)genrand_int32(d);
+            p3[j] = (cl_ushort)genrand_int32(d);
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          bufferSize, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          bufferSize, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            return error;
+        }
+
+        // Pre-fill every output buffer with the 0xacdcacdc garbage pattern
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer failed!\n");
+            }
+        }
+
+        // Run the kernels: args are (out, in, in2, in3), one launch per size
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_half) * sizeValues[j];
+            size_t localCount = (bufferSize + vectorSize - 1)
+                / vectorSize; // bufferSize / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
+                                        sizeof(gInBuffer3), &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving; a clFlush failure is logged but does not abort
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Read the data back with blocking reads (CL_TRUE), one per size
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data - no verification possible. MAD is a random number
+        // generator, so this test only checks that the kernels build and run.
+        // Print a progress dot whenever the low 28 bits of i are all zero.
+        if (0 == (i & 0x0fffffff))
+        {
+            vlog(".");
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("pass");
+        // The values below are always zero: nothing above updates them.
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    }
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 74dd5c47d7..8d8acb1b19 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -49,6 +49,8 @@
 #include "harness/testHarness.h"
 
 #define kPageSize 4096
+#define HALF_REQUIRED_FEATURES_1 (CL_FP_ROUND_TO_ZERO)
+#define HALF_REQUIRED_FEATURES_2 (CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN)
 #define DOUBLE_REQUIRED_FEATURES                                               \
     (CL_FP_FMA | CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO                  \
      | CL_FP_ROUND_TO_INF | CL_FP_INF_NAN | CL_FP_DENORM)
@@ -81,6 +83,8 @@ static int gTestFastRelaxed = 1;
 */
 int gFastRelaxedDerived = 1;
 static int gToggleCorrectlyRoundedDivideSqrt = 0;
+int gHasHalf = 0;
+cl_device_fp_config gHalfCapabilities = 0;
 int gDeviceILogb0 = 1;
 int gDeviceILogbNaN = 1;
 int gCheckTininessBeforeRounding = 1;
@@ -104,6 +108,8 @@ cl_device_fp_config gFloatCapabilities = 0;
 int gWimpyReductionFactor = 32;
 int gVerboseBruteForce = 0;
 
+cl_half_rounding_mode gHalfRoundingMode = CL_HALF_RTE;
+
 static int ParseArgs(int argc, const char **argv);
 static void PrintUsage(void);
 static void PrintFunctions(void);
@@ -167,7 +173,6 @@ static int doTest(const char *name)
             return 0;
         }
     }
-
     {
         if (0 == strcmp("ilogb", func_data->name))
         {
@@ -236,6 +241,23 @@ static int doTest(const char *name)
                 }
             }
         }
+
+        if (gHasHalf && NULL != func_data->vtbl_ptr->HalfTestFunc)
+        {
+            gTestCount++;
+            vlog("%3d: ", gTestCount);
+            if (func_data->vtbl_ptr->HalfTestFunc(func_data, gMTdata,
+                                                  false /* relaxed mode*/))
+            {
+                gFailCount++;
+                error++;
+                if (gStopOnError)
+                {
+                    gSkipRestOfTests = true;
+                    return error;
+                }
+            }
+        }
     }
 
     return error;
@@ -408,6 +430,8 @@ static int ParseArgs(int argc, const char **argv)
 
                     case 'm': singleThreaded ^= 1; break;
 
+                    case 'g': gHasHalf ^= 1; break;
+
                     case 'r': gTestFastRelaxed ^= 1; break;
 
                     case 's': gStopOnError ^= 1; break;
@@ -540,6 +564,8 @@ static void PrintUsage(void)
     vlog("\t\t-d\tToggle double precision testing. (Default: on iff khr_fp_64 "
          "on)\n");
     vlog("\t\t-f\tToggle float precision testing. (Default: on)\n");
+    vlog("\t\t-g\tToggle half precision testing. (Default: on if khr_fp_16 "
+         "on)\n");
     vlog("\t\t-r\tToggle fast relaxed math precision testing. (Default: on)\n");
     vlog("\t\t-e\tToggle test as derived implementations for fast relaxed math "
          "precision. (Default: on)\n");
@@ -640,6 +666,54 @@ test_status InitCL(cl_device_id device)
 #endif
     }
 
+    gFloatToHalfRoundingMode = kRoundToNearestEven;
+    if (is_extension_available(gDevice, "cl_khr_fp16"))
+    {
+        gHasHalf ^= 1;
+#if defined(CL_DEVICE_HALF_FP_CONFIG)
+        if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_HALF_FP_CONFIG,
+                                     sizeof(gHalfCapabilities),
+                                     &gHalfCapabilities, NULL)))
+        {
+            vlog_error(
+                "ERROR: Unable to get device CL_DEVICE_HALF_FP_CONFIG. (%d)\n",
+                error);
+            return TEST_FAIL;
+        }
+        if (HALF_REQUIRED_FEATURES_1
+                != (gHalfCapabilities & HALF_REQUIRED_FEATURES_1)
+            && HALF_REQUIRED_FEATURES_2
+                != (gHalfCapabilities & HALF_REQUIRED_FEATURES_2))
+        {
+            char list[300] = "";
+            if (0 == (gHalfCapabilities & CL_FP_ROUND_TO_NEAREST))
+                strncat(list, "CL_FP_ROUND_TO_NEAREST, ", sizeof(list) - 1);
+            if (0 == (gHalfCapabilities & CL_FP_ROUND_TO_ZERO))
+                strncat(list, "CL_FP_ROUND_TO_ZERO, ", sizeof(list) - 1);
+            if (0 == (gHalfCapabilities & CL_FP_INF_NAN))
+                strncat(list, "CL_FP_INF_NAN, ", sizeof(list) - 1);
+            vlog_error("ERROR: required half features are missing: %s\n", list);
+
+            return TEST_FAIL;
+        }
+
+        if ((gHalfCapabilities & CL_FP_ROUND_TO_NEAREST) != 0)
+        {
+            gHalfRoundingMode = CL_HALF_RTE;
+        }
+        else // due to above condition it must be RTZ
+        {
+            gHalfRoundingMode = CL_HALF_RTZ;
+        }
+
+#else
+        vlog_error("FAIL: device says it supports cl_khr_fp16 but "
+                   "CL_DEVICE_HALF_FP_CONFIG is not in the headers!\n");
+        return TEST_FAIL;
+#endif
+    }
+
+
     uint32_t deviceFrequency = 0;
     size_t configSize = sizeof(deviceFrequency);
     if ((error = clGetDeviceInfo(gDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY,
@@ -828,6 +902,7 @@ test_status InitCL(cl_device_id device)
              "Bruteforce_Ulp_Error_Double() for more details.\n\n");
     }
 
+    vlog("\tTesting half precision? %s\n", no_yes[0 != gHasHalf]);
     vlog("\tIs Embedded? %s\n", no_yes[0 != gIsEmbedded]);
     if (gIsEmbedded)
         vlog("\tRunning in RTZ mode? %s\n", no_yes[0 != gIsInRTZMode]);
diff --git a/test_conformance/math_brute_force/reference_math.cpp b/test_conformance/math_brute_force/reference_math.cpp
index afa072f8e0..c31221e3ab 100644
--- a/test_conformance/math_brute_force/reference_math.cpp
+++ b/test_conformance/math_brute_force/reference_math.cpp
@@ -4699,6 +4699,49 @@ double reference_nextafter(double xx, double yy)
     return a.f;
 }
 
+cl_half reference_nanh(cl_ushort x)
+{
+    // Force exponent (0x7c00) and top mantissa bit (0x0200) on, keeping x.
+    const cl_ushort nan_bits = static_cast<cl_ushort>(x | 0x7e00U);
+    cl_half result;
+    memcpy(&result, &nan_bits, sizeof(result));
+    return result;
+}
+// nextafter for half: returns, as float, the half value next to xx toward yy.
+float reference_nextafterh(float xx, float yy, bool allow_denorms)
+{
+    cl_half tmp_a = cl_half_from_float(xx, CL_HALF_RTE);
+    cl_half tmp_b = cl_half_from_float(yy, CL_HALF_RTE);
+    float x = cl_half_to_float(tmp_a);
+    float y = cl_half_to_float(tmp_b);
+    // x and y are the inputs after an RTE round trip through half.
+    // NaN inputs are returned as-is, x taking priority over y.
+    if (x != x) return x;
+
+    if (y != y) return y;
+    // Equal operands (+0 and -0 compare equal too) yield y.
+    if (x == y) return y;
+
+    short a_h = cl_half_from_float(x, CL_HALF_RTE);
+    short b_h = cl_half_from_float(y, CL_HALF_RTE);
+    short oa_h = a_h;
+    // Sign-magnitude -> signed order: sign-bit-set patterns become 0x8000 - bits.
+    if (a_h & 0x8000) a_h = 0x8000 - a_h;
+    if (b_h & 0x8000) b_h = 0x8000 - b_h;
+    // Step one representable value toward b_h, then undo the remapping.
+    a_h += (a_h < b_h) ? 1 : -1;
+    a_h = (a_h < 0) ? (cl_short)0x8000 - a_h : a_h;
+    // Denormals disallowed: snap to the smallest normal if |value| grew, else 0.
+    if (!allow_denorms && IsHalfSubnormal(a_h))
+    {
+        if (cl_half_to_float(0x7fff & oa_h) < cl_half_to_float(0x7fff & a_h))
+            a_h = (a_h & 0x8000) ? 0x8400 : 0x0400;
+        else
+            a_h = 0;
+    }
+
+    return cl_half_to_float(a_h);
+}
 
 long double reference_nextafterl(long double xx, long double yy)
 {
diff --git a/test_conformance/math_brute_force/reference_math.h b/test_conformance/math_brute_force/reference_math.h
index 78b245105e..175ee73120 100644
--- a/test_conformance/math_brute_force/reference_math.h
+++ b/test_conformance/math_brute_force/reference_math.h
@@ -18,8 +18,10 @@
 
 #if defined(__APPLE__)
 #include <OpenCL/opencl.h>
+
 #else
 #include <CL/cl.h>
+#include "CL/cl_half.h"
 #endif
 
 // --  for testing float --
@@ -160,6 +162,8 @@ long double reference_fractl(long double, long double*);
 long double reference_fmal(long double, long double, long double);
 long double reference_madl(long double, long double, long double);
 long double reference_nextafterl(long double, long double);
+float reference_nextafterh(float, float, bool allow_denormals = true);
+cl_half reference_nanh(cl_ushort);
 long double reference_recipl(long double);
 long double reference_rootnl(long double, int);
 long double reference_rsqrtl(long double);
diff --git a/test_conformance/math_brute_force/ternary_double.cpp b/test_conformance/math_brute_force/ternary_double.cpp
index 2ae65424f8..7de115b294 100644
--- a/test_conformance/math_brute_force/ternary_double.cpp
+++ b/test_conformance/math_brute_force/ternary_double.cpp
@@ -108,8 +108,7 @@ const double specialValues[] = {
     +0.0,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 } // anonymous namespace
 
diff --git a/test_conformance/math_brute_force/ternary_float.cpp b/test_conformance/math_brute_force/ternary_float.cpp
index d11f4ba3b9..c597d240bb 100644
--- a/test_conformance/math_brute_force/ternary_float.cpp
+++ b/test_conformance/math_brute_force/ternary_float.cpp
@@ -118,8 +118,7 @@ const float specialValues[] = {
     +0.0f,
 };
 
-constexpr size_t specialValuesCount =
-    sizeof(specialValues) / sizeof(specialValues[0]);
+constexpr size_t specialValuesCount = ARRAY_SIZE(specialValues);
 
 } // anonymous namespace
 
diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp
new file mode 100644
index 0000000000..ba6dd4d480
--- /dev/null
+++ b/test_conformance/math_brute_force/ternary_half.cpp
@@ -0,0 +1,777 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cinttypes>
+#include <cstring>
+
+#define CORRECTLY_ROUNDED 0
+#define FLUSHED 1
+
+namespace {
+
+cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    // Source factory: a ternary kernel with every ParameterType set to Half.
+    const auto make_src = [](const std::string &name, const char *fn,
+                             cl_uint vec_idx) {
+        const auto half_t = ParameterType::Half;
+        return GetTernaryKernel(name, fn, half_t, half_t, half_t, half_t,
+                                vec_idx);
+    };
+    return BuildKernels(*static_cast<BuildKernelInfo *>(p), job_id, make_src);
+}
+
+// Edge-case half bit patterns, cycled through as (x, y, z) input triples.
+static const cl_half specialValuesHalf[] = {
+    0xffff /*NaN*/, 0x0000 /*+0*/, 0x0001 /*min denormal*/, 0x7c00, /*INFINITY*/
+    0xfc00, /*-INFINITY*/
+    0x8000, /*-0*/
+    0x7bff, /*HALF_MAX*/
+    0x0400, /*HALF_MIN*/
+    0x03ff, /* Largest denormal */
+    0x3c00, /* 1 */
+    0xbc00, /* -1 */
+    0x3555, /*nearest value to 1/3*/
+    0x3bff, /*largest number less than one*/
+    0xc000, /* -2 */
+    0xfbff, /* -HALF_MAX */
+    0x8400, /* -HALF_MIN */
+    0x4248, /* M_PI_H */
+    0xc248, /* -M_PI_H */
+    0xbbff, /* Largest negative fraction */
+};
+
+constexpr size_t specialValuesHalfCount = ARRAY_SIZE(specialValuesHalf);
+
+} // anonymous namespace
+
+int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+
+    Programs programs;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    KernelMatrix kernels;
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    float maxErrorVal3 = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+
+    constexpr size_t bufferElements = BUFFER_SIZE / sizeof(cl_half);
+
+    cl_uchar overflow[bufferElements];
+    float half_ulps = f->half_ulps;
+    int skipNanInf = (0 == strcmp("fma", f->nameInCode));
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init the kernels
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
+    if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+        return error;
+
+    for (uint64_t i = 0; i < (1ULL << 32); i += step)
+    {
+        // Init input array
+        cl_half *hp0 = (cl_half *)gIn;
+        cl_half *hp1 = (cl_half *)gIn2;
+        cl_half *hp2 = (cl_half *)gIn3;
+        size_t idx = 0;
+
+        if (i == 0)
+        { // test edge cases
+            uint32_t x, y, z;
+            x = y = z = 0;
+            for (; idx < bufferElements; idx++)
+            {
+                hp0[idx] = specialValuesHalf[x];
+                hp1[idx] = specialValuesHalf[y];
+                hp2[idx] = specialValuesHalf[z];
+
+                if (++x >= specialValuesHalfCount)
+                {
+                    x = 0;
+                    if (++y >= specialValuesHalfCount)
+                    {
+                        y = 0;
+                        if (++z >= specialValuesHalfCount) break;
+                    }
+                }
+            }
+            if (idx == bufferElements)
+                vlog_error("Test Error: not all special cases tested!\n");
+        }
+
+        auto any_value = [&d]() {
+            float t = (float)((double)genrand_int32(d) / (double)0xFFFFFFFF);
+            return HFF((1.0f - t) * CL_HALF_MIN + t * CL_HALF_MAX);
+        };
+
+        for (; idx < bufferElements; idx++)
+        {
+            hp0[idx] = any_value();
+            hp1[idx] = any_value();
+            hp2[idx] = any_value();
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer2, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn2, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer2 ***\n", error);
+            return error;
+        }
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer3, CL_FALSE, 0,
+                                          BUFFER_SIZE, gIn3, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer3 ***\n", error);
+            return error;
+        }
+
+        // Write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, BUFFER_SIZE,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, BUFFER_SIZE, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer failed!\n");
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeof(cl_half) * sizeValues[j];
+            size_t localCount = (BUFFER_SIZE + vectorSize - 1)
+                / vectorSize; // BUFFER_SIZE / vectorSize  rounded up
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer2), &gInBuffer2)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 3,
+                                        sizeof(gInBuffer3), &gInBuffer3)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue)))
+        {
+            vlog("clFlush failed\n");
+            return error;
+        }
+
+        // Calculate the correctly rounded reference result
+        cl_half *res = (cl_half *)gOut_Ref;
+        if (skipNanInf)
+        {
+            for (size_t j = 0; j < bufferElements; j++)
+            {
+                feclearexcept(FE_OVERFLOW);
+                res[j] = HFF((float)f->func.f_fma(
+                    HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED));
+                overflow[j] =
+                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+            }
+        }
+        else
+        {
+            for (size_t j = 0; j < bufferElements; j++)
+                res[j] = HFF((float)f->func.f_fma(
+                    HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED));
+        }
+
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        uint16_t *t = (uint16_t *)gOut_Ref;
+        for (size_t j = 0; j < bufferElements; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                uint16_t *q = (uint16_t *)(gOut[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    int fail;
+                    cl_half test = ((cl_half *)q)[j];
+                    float ref1 = f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]),
+                                               HTF(hp2[j]), CORRECTLY_ROUNDED);
+                    cl_half correct = HFF(ref1);
+
+                    // Per section 10 paragraph 6, accept any result if an input
+                    // or output is a infinity or NaN or overflow
+                    if (skipNanInf)
+                    {
+                        if (overflow[j] || IsHalfInfinity(correct)
+                            || IsHalfNaN(correct) || IsHalfInfinity(hp0[j])
+                            || IsHalfNaN(hp0[j]) || IsHalfInfinity(hp1[j])
+                            || IsHalfNaN(hp1[j]) || IsHalfInfinity(hp2[j])
+                            || IsHalfNaN(hp2[j]))
+                            continue;
+                    }
+
+                    float err =
+                        test != correct ? Ulp_Error_Half(test, ref1) : 0.f;
+                    fail = !(fabsf(err) <= half_ulps);
+
+                    if (fail && ftz)
+                    {
+                        // retry per section 6.5.3.2  with flushing on
+                        if (0.0f == test
+                            && 0.0f
+                                == f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]),
+                                                 HTF(hp2[j]), FLUSHED))
+                        {
+                            fail = 0;
+                            err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (fail && IsHalfSubnormal(hp0[j]))
+                        { // look at me,
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                            float ref2 =
+                                f->func.f_fma(0.0f, HTF(hp1[j]), HTF(hp2[j]),
+                                              CORRECTLY_ROUNDED);
+                            cl_half correct2 = HFF(ref2);
+                            float ref3 =
+                                f->func.f_fma(-0.0f, HTF(hp1[j]), HTF(hp2[j]),
+                                              CORRECTLY_ROUNDED);
+                            cl_half correct3 = HFF(ref3);
+
+                            if (skipNanInf)
+                            {
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsHalfInfinity(correct2)
+                                    || IsHalfNaN(correct2)
+                                    || IsHalfInfinity(correct3)
+                                    || IsHalfNaN(correct3))
+                                    continue;
+                            }
+
+                            float err2 = test != correct2
+                                ? Ulp_Error_Half(test, ref2)
+                                : 0.f;
+                            float err3 = test != correct3
+                                ? Ulp_Error_Half(test, ref3)
+                                : 0.f;
+                            fail = fail
+                                && ((!(fabsf(err2) <= half_ulps))
+                                    && (!(fabsf(err3) <= half_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(0.0f, HTF(hp1[j]),
+                                                         HTF(hp2[j]), FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(-0.0f, HTF(hp1[j]),
+                                                         HTF(hp2[j]), FLUSHED)))
+                            {
+                                fail = 0;
+                                err = 0.0f;
+                            }
+
+                            // try with first two args as zero
+                            if (IsHalfSubnormal(hp1[j]))
+                            { // its fun to have fun,
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                                ref2 = f->func.f_fma(0.0f, 0.0f, HTF(hp2[j]),
+                                                     CORRECTLY_ROUNDED);
+                                correct2 = HFF(ref2);
+                                ref3 = f->func.f_fma(-0.0f, 0.0f, HTF(hp2[j]),
+                                                     CORRECTLY_ROUNDED);
+                                correct3 = HFF(ref3);
+                                float ref4 =
+                                    f->func.f_fma(0.0f, -0.0f, HTF(hp2[j]),
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct4 = HFF(ref4);
+                                float ref5 =
+                                    f->func.f_fma(-0.0f, -0.0f, HTF(hp2[j]),
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct5 = HFF(ref5);
+
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is a infinity or NaN or
+                                // overflow
+                                if (!gInfNanSupport)
+                                {
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsHalfInfinity(correct2)
+                                        || IsHalfNaN(correct2)
+                                        || IsHalfInfinity(correct3)
+                                        || IsHalfNaN(correct3)
+                                        || IsHalfInfinity(correct4)
+                                        || IsHalfNaN(correct4)
+                                        || IsHalfInfinity(correct5)
+                                        || IsHalfNaN(correct5))
+                                        continue;
+                                }
+
+                                err2 = test != correct2
+                                    ? Ulp_Error_Half(test, ref2)
+                                    : 0.f;
+                                err3 = test != correct3
+                                    ? Ulp_Error_Half(test, ref3)
+                                    : 0.f;
+                                float err4 = test != correct4
+                                    ? Ulp_Error_Half(test, ref4)
+                                    : 0.f;
+                                float err5 = test != correct5
+                                    ? Ulp_Error_Half(test, ref5)
+                                    : 0.f;
+                                fail = fail
+                                    && ((!(fabsf(err2) <= half_ulps))
+                                        && (!(fabsf(err3) <= half_ulps))
+                                        && (!(fabsf(err4) <= half_ulps))
+                                        && (!(fabsf(err5) <= half_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(0.0f, 0.0f,
+                                                             HTF(hp2[j]),
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, 0.0f,
+                                                             HTF(hp2[j]),
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(0.0f, -0.0f,
+                                                             HTF(hp2[j]),
+                                                             FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, -0.0f,
+                                                             HTF(hp2[j]),
+                                                             FLUSHED)))
+                                {
+                                    fail = 0;
+                                    err = 0.0f;
+                                }
+
+                                if (IsHalfSubnormal(hp2[j]))
+                                {
+                                    if (test == 0.0f) // 0*0+0 is 0
+                                    {
+                                        fail = 0;
+                                        err = 0.0f;
+                                    }
+                                }
+                            }
+                            else if (IsHalfSubnormal(hp2[j]))
+                            {
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                                ref2 = f->func.f_fma(0.0f, HTF(hp1[j]), 0.0f,
+                                                     CORRECTLY_ROUNDED);
+                                correct2 = HFF(ref2);
+                                ref3 = f->func.f_fma(-0.0f, HTF(hp1[j]), 0.0f,
+                                                     CORRECTLY_ROUNDED);
+                                correct3 = HFF(ref3);
+                                float ref4 =
+                                    f->func.f_fma(0.0f, HTF(hp1[j]), -0.0f,
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct4 = HFF(ref4);
+                                float ref5 =
+                                    f->func.f_fma(-0.0f, HTF(hp1[j]), -0.0f,
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct5 = HFF(ref5);
+
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is a infinity or NaN or
+                                // overflow
+                                if (!gInfNanSupport)
+                                {
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsHalfInfinity(correct2)
+                                        || IsHalfNaN(correct2)
+                                        || IsHalfInfinity(correct3)
+                                        || IsHalfNaN(correct3)
+                                        || IsHalfInfinity(correct4)
+                                        || IsHalfNaN(correct4)
+                                        || IsHalfInfinity(correct5)
+                                        || IsHalfNaN(correct5))
+                                        continue;
+                                }
+
+                                err2 = test != correct2
+                                    ? Ulp_Error_Half(test, ref2)
+                                    : 0.f;
+                                err3 = test != correct3
+                                    ? Ulp_Error_Half(test, ref3)
+                                    : 0.f;
+                                float err4 = test != correct4
+                                    ? Ulp_Error_Half(test, ref4)
+                                    : 0.f;
+                                float err5 = test != correct5
+                                    ? Ulp_Error_Half(test, ref5)
+                                    : 0.f;
+                                fail = fail
+                                    && ((!(fabsf(err2) <= half_ulps))
+                                        && (!(fabsf(err3) <= half_ulps))
+                                        && (!(fabsf(err4) <= half_ulps))
+                                        && (!(fabsf(err5) <= half_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(0.0f, HTF(hp1[j]),
+                                                             0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, HTF(hp1[j]),
+                                                             0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(0.0f, HTF(hp1[j]),
+                                                             -0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(-0.0f, HTF(hp1[j]),
+                                                             -0.0f, FLUSHED)))
+                                {
+                                    fail = 0;
+                                    err = 0.0f;
+                                }
+                            }
+                        }
+                        else if (fail && IsHalfSubnormal(hp1[j]))
+                        {
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                            float ref2 =
+                                f->func.f_fma(HTF(hp0[j]), 0.0f, HTF(hp2[j]),
+                                              CORRECTLY_ROUNDED);
+                            cl_half correct2 = HFF(ref2);
+                            float ref3 =
+                                f->func.f_fma(HTF(hp0[j]), -0.0f, HTF(hp2[j]),
+                                              CORRECTLY_ROUNDED);
+                            cl_half correct3 = HFF(ref3);
+
+                            if (skipNanInf)
+                            {
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsHalfInfinity(correct2)
+                                    || IsHalfNaN(correct2)
+                                    || IsHalfInfinity(correct3)
+                                    || IsHalfNaN(correct3))
+                                    continue;
+                            }
+
+                            float err2 = test != correct2
+                                ? Ulp_Error_Half(test, ref2)
+                                : 0.f;
+                            float err3 = test != correct3
+                                ? Ulp_Error_Half(test, ref3)
+                                : 0.f;
+                            fail = fail
+                                && ((!(fabsf(err2) <= half_ulps))
+                                    && (!(fabsf(err3) <= half_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(HTF(hp0[j]), 0.0f,
+                                                         HTF(hp2[j]), FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(HTF(hp0[j]), -0.0f,
+                                                         HTF(hp2[j]), FLUSHED)))
+                            {
+                                fail = 0;
+                                err = 0.0f;
+                            }
+
+                            // try with second two args as zero
+                            if (IsHalfSubnormal(hp2[j]))
+                            {
+                                if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                                ref2 = f->func.f_fma(HTF(hp0[j]), 0.0f, 0.0f,
+                                                     CORRECTLY_ROUNDED);
+                                correct2 = HFF(ref2);
+                                ref3 = f->func.f_fma(HTF(hp0[j]), -0.0f, 0.0f,
+                                                     CORRECTLY_ROUNDED);
+                                correct3 = HFF(ref3);
+                                float ref4 =
+                                    f->func.f_fma(HTF(hp0[j]), 0.0f, -0.0f,
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct4 = HFF(ref4);
+                                float ref5 =
+                                    f->func.f_fma(HTF(hp0[j]), -0.0f, -0.0f,
+                                                  CORRECTLY_ROUNDED);
+                                cl_half correct5 = HFF(ref5);
+
+                                // Per section 10 paragraph 6, accept any result
+                                // if an input or output is a infinity or NaN or
+                                // overflow
+                                if (!gInfNanSupport)
+                                {
+                                    if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                    // Note: no double rounding here.  Reference
+                                    // functions calculate in single precision.
+                                    if (IsHalfInfinity(correct2)
+                                        || IsHalfNaN(correct2)
+                                        || IsHalfInfinity(correct3)
+                                        || IsHalfNaN(correct3)
+                                        || IsHalfInfinity(correct4)
+                                        || IsHalfNaN(correct4)
+                                        || IsHalfInfinity(correct5)
+                                        || IsHalfNaN(correct5))
+                                        continue;
+                                }
+
+                                err2 = test != correct2
+                                    ? Ulp_Error_Half(test, ref2)
+                                    : 0.f;
+                                err3 = test != correct3
+                                    ? Ulp_Error_Half(test, ref3)
+                                    : 0.f;
+                                float err4 = test != correct4
+                                    ? Ulp_Error_Half(test, ref4)
+                                    : 0.f;
+                                float err5 = test != correct5
+                                    ? Ulp_Error_Half(test, ref5)
+                                    : 0.f;
+                                fail = fail
+                                    && ((!(fabsf(err2) <= half_ulps))
+                                        && (!(fabsf(err3) <= half_ulps))
+                                        && (!(fabsf(err4) <= half_ulps))
+                                        && (!(fabsf(err5) <= half_ulps)));
+                                if (fabsf(err2) < fabsf(err)) err = err2;
+                                if (fabsf(err3) < fabsf(err)) err = err3;
+                                if (fabsf(err4) < fabsf(err)) err = err4;
+                                if (fabsf(err5) < fabsf(err)) err = err5;
+
+                                // retry per section 6.5.3.4
+                                if (0.0f == test
+                                    && (0.0f
+                                            == f->func.f_fma(HTF(hp0[j]), 0.0f,
+                                                             0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(HTF(hp0[j]), -0.0f,
+                                                             0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(HTF(hp0[j]), 0.0f,
+                                                             -0.0f, FLUSHED)
+                                        || 0.0f
+                                            == f->func.f_fma(HTF(hp0[j]), -0.0f,
+                                                             -0.0f, FLUSHED)))
+                                {
+                                    fail = 0;
+                                    err = 0.0f;
+                                }
+                            }
+                        }
+                        else if (fail && IsHalfSubnormal(hp2[j]))
+                        {
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
+
+                            float ref2 = f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]),
+                                                       0.0f, CORRECTLY_ROUNDED);
+                            cl_half correct2 = HFF(ref2);
+                            float ref3 =
+                                f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]), -0.0f,
+                                              CORRECTLY_ROUNDED);
+                            cl_half correct3 = HFF(ref3);
+
+                            if (skipNanInf)
+                            {
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsHalfInfinity(correct2)
+                                    || IsHalfNaN(correct2)
+                                    || IsHalfInfinity(correct3)
+                                    || IsHalfNaN(correct3))
+                                    continue;
+                            }
+
+                            float err2 = test != correct2
+                                ? Ulp_Error_Half(test, correct2)
+                                : 0.f;
+                            float err3 = test != correct3
+                                ? Ulp_Error_Half(test, correct3)
+                                : 0.f;
+                            fail = fail
+                                && ((!(fabsf(err2) <= half_ulps))
+                                    && (!(fabsf(err3) <= half_ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (0.0f == test
+                                && (0.0f
+                                        == f->func.f_fma(HTF(hp0[j]),
+                                                         HTF(hp1[j]), 0.0f,
+                                                         FLUSHED)
+                                    || 0.0f
+                                        == f->func.f_fma(HTF(hp0[j]),
+                                                         HTF(hp1[j]), -0.0f,
+                                                         FLUSHED)))
+                            {
+                                fail = 0;
+                                err = 0.0f;
+                            }
+                        }
+                    }
+
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = HTF(hp0[j]);
+                        maxErrorVal2 = HTF(hp1[j]);
+                        maxErrorVal3 = HTF(hp2[j]);
+                    }
+
+                    if (fail)
+                    {
+                        vlog_error(
+                            "\nERROR: %s%s: %f ulp error at {%a, %a, %a} "
+                            "({0x%4.4x, 0x%4.4x, 0x%4.4x}): *%a vs. %a\n",
+                            f->name, sizeNames[k], err, HTF(hp0[j]),
+                            HTF(hp1[j]), HTF(hp2[j]), hp0[j], hp1[j], hp2[j],
+                            HTF(res[j]), HTF(test));
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64 " bufferSize:%10d \n",
+                     i, step, BUFFER_SIZE);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t%8.2f @ {%a, %a, %a}", maxError, maxErrorVal, maxErrorVal2,
+             maxErrorVal3);
+    }
+
+    vlog("\n");
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/test_functions.h b/test_conformance/math_brute_force/test_functions.h
index 78aef9c9a6..16b361d53a 100644
--- a/test_conformance/math_brute_force/test_functions.h
+++ b/test_conformance/math_brute_force/test_functions.h
@@ -24,6 +24,9 @@ int TestFunc_Float_Float(const Func *f, MTdata, bool relaxedMode);
 // double foo(double)
 int TestFunc_Double_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(half)
+int TestFunc_Half_Half(const Func *f, MTdata, bool relaxedMode);
+
 // int foo(float)
 int TestFunc_Int_Float(const Func *f, MTdata, bool relaxedMode);
 
@@ -36,6 +39,9 @@ int TestFunc_Float_UInt(const Func *f, MTdata, bool relaxedMode);
 // double foo(ulong)
 int TestFunc_Double_ULong(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(ushort)
+int TestFunc_Half_UShort(const Func *f, MTdata, bool relaxedMode);
+
 // Returns {0, 1} for scalar and {0, -1} for vector.
 // int foo(float)
 int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode);
@@ -44,21 +50,34 @@ int TestMacro_Int_Float(const Func *f, MTdata, bool relaxedMode);
 // int foo(double)
 int TestMacro_Int_Double(const Func *f, MTdata, bool relaxedMode);
 
+// int foo(half, half)
+int TestMacro_Int_Half_Half(const Func *f, MTdata, bool relaxedMode);
+
+// int foo(half)
+int TestMacro_Int_Half(const Func *f, MTdata, bool relaxedMode);
+
+// int foo(half)
+int TestFunc_Int_Half(const Func *f, MTdata, bool relaxedMode);
+
 // float foo(float, float)
 int TestFunc_Float_Float_Float(const Func *f, MTdata, bool relaxedMode);
 
 // double foo(double, double)
 int TestFunc_Double_Double_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(half, half)
+int TestFunc_Half_Half_Half(const Func *f, MTdata, bool relaxedMode);
 // Special handling for nextafter.
-// float foo(float, float)
-int TestFunc_Float_Float_Float_nextafter(const Func *f, MTdata,
-                                         bool relaxedMode);
+// half foo(half, half)
+int TestFunc_Half_Half_Half_nextafter(const Func *f, MTdata, bool relaxedMode);
+
+// half foo(half, half)
+int TestFunc_Half_Half_Half_common(const Func *f, MTdata, int isNextafter,
+                                   bool relaxedMode);
+
+// half foo(half, int)
+int TestFunc_Half_Half_Int(const Func *f, MTdata, bool relaxedMode);
 
-// Special handling for nextafter.
-// double foo(double, double)
-int TestFunc_Double_Double_Double_nextafter(const Func *f, MTdata,
-                                            bool relaxedMode);
 
 // float op float
 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata,
@@ -68,6 +87,9 @@ int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata,
 int TestFunc_Double_Double_Double_Operator(const Func *f, MTdata,
                                            bool relaxedMode);
 
+// half op half
+int TestFunc_Half_Half_Half_Operator(const Func *f, MTdata, bool relaxedMode);
+
 // float foo(float, int)
 int TestFunc_Float_Float_Int(const Func *f, MTdata, bool relaxedMode);
 
@@ -89,24 +111,36 @@ int TestFunc_Float_Float_Float_Float(const Func *f, MTdata, bool relaxedMode);
 int TestFunc_Double_Double_Double_Double(const Func *f, MTdata,
                                          bool relaxedMode);
 
+// half foo(half, half, half)
+int TestFunc_Half_Half_Half_Half(const Func *f, MTdata, bool relaxedMode);
+
 // float foo(float, float*)
 int TestFunc_Float2_Float(const Func *f, MTdata, bool relaxedMode);
 
 // double foo(double, double*)
 int TestFunc_Double2_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(half, half*)
+int TestFunc_Half2_Half(const Func *f, MTdata, bool relaxedMode);
+
 // float foo(float, int*)
 int TestFunc_FloatI_Float(const Func *f, MTdata, bool relaxedMode);
 
 // double foo(double, int*)
 int TestFunc_DoubleI_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(half, int*)
+int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode);
+
 // float foo(float, float, int*)
 int TestFunc_FloatI_Float_Float(const Func *f, MTdata, bool relaxedMode);
 
 // double foo(double, double, int*)
 int TestFunc_DoubleI_Double_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half foo(half, half, int*)
+int TestFunc_HalfI_Half_Half(const Func *f, MTdata d, bool relaxedMode);
+
 // Special handling for mad.
 // float mad(float, float, float)
 int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode);
@@ -115,4 +149,7 @@ int TestFunc_mad_Float(const Func *f, MTdata, bool relaxedMode);
 // double mad(double, double, double)
 int TestFunc_mad_Double(const Func *f, MTdata, bool relaxedMode);
 
+// half mad(half, half, half)
+int TestFunc_mad_Half(const Func *f, MTdata, bool relaxedMode);
+
 #endif
diff --git a/test_conformance/math_brute_force/unary_half.cpp b/test_conformance/math_brute_force/unary_half.cpp
new file mode 100644
index 0000000000..9b230f96bc
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_half.cpp
@@ -0,0 +1,483 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    // Generates the unary kernel source with Half for both type parameters.
+    auto make_source = [](const std::string &name, const char *builtin_fn,
+                          cl_uint size_index) {
+        return GetUnaryKernel(name, builtin_fn, ParameterType::Half,
+                              ParameterType::Half, size_index);
+    };
+    return BuildKernels(*(BuildKernelInfo *)p, job_id, make_source);
+}
+
+// Thread specific data for a worker thread (indexed by thread_id)
+struct ThreadInfo
+{
+    clMemWrapper inBuf; // input buffer for the thread
+    clMemWrapper outBuf[VECTOR_SIZE_COUNT]; // output buffer per vector size
+    float maxError; // max error value. Init to 0.
+    double maxErrorValue; // position of the max error value.  Init to 0.
+    clCommandQueueWrapper
+        tQueue; // per thread command queue to improve performance
+};
+
+struct TestInfoBase // plain parameter set; TestInfo is constructed from it
+{
+    size_t subBufferSize; // Size of the sub-buffer in elements
+    const Func *f; // A pointer to the function info
+    cl_uint threadCount; // Number of worker threads
+    cl_uint jobCount; // Number of jobs
+    cl_uint step; // offset between the first input of consecutive jobs
+    cl_uint scale; // stride between individual test values
+    float ulps; // max_allowed ulps
+    int ftz; // non-zero if running in flush to zero mode
+
+    int isRangeLimited; // 1 if the function is only to be evaluated over a
+                        // range
+    float half_sin_cos_tan_limit; // TODO confirm: bound when isRangeLimited
+};
+
+struct TestInfo : public TestInfoBase
+{
+    TestInfo(const TestInfoBase &base): TestInfoBase(base) {}
+
+    // Per-thread state; worker threads index it with their thread_id
+    std::vector<ThreadInfo> tinfo;
+
+    // Programs for various vector sizes.
+    Programs programs;
+
+    // Thread-specific kernels for each vector size:
+    // k[vector_size_index][thread_id]
+    KernelMatrix k;
+};
+
+cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
+{
+    TestInfo *job = (TestInfo *)data;
+    size_t buffer_elements = job->subBufferSize;
+    size_t buffer_size = buffer_elements * sizeof(cl_half);
+    cl_uint scale = job->scale;
+    cl_uint base = job_id * (cl_uint)job->step;
+    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
+    float ulps = job->ulps;
+    fptr func = job->f->func;
+    cl_uint j, k;
+    cl_int error = CL_SUCCESS;
+
+    int isRangeLimited = job->isRangeLimited;
+    float half_sin_cos_tan_limit = job->half_sin_cos_tan_limit;
+    int ftz = job->ftz;
+
+    std::vector<float> s(0);
+
+    cl_event e[VECTOR_SIZE_COUNT];
+    cl_ushort *out[VECTOR_SIZE_COUNT];
+
+    if (gHostFill)
+    {
+        // start the map of the output arrays
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            out[j] = (uint16_t *)clEnqueueMapBuffer(
+                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
+                buffer_size, 0, NULL, e + j, &error);
+            if (error || NULL == out[j])
+            {
+                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                           error);
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
+    }
+
+    // Write the new values to the input array
+    cl_ushort *p = (cl_ushort *)gIn + thread_id * buffer_elements;
+    for (j = 0; j < buffer_elements; j++)
+    {
+        p[j] = base + j * scale;
+    }
+
+    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
+                                      buffer_size, p, 0, NULL, NULL)))
+    {
+        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
+        return error;
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if (gHostFill)
+        {
+            // Wait for the map to finish
+            if ((error = clWaitForEvents(1, e + j)))
+            {
+                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
+                return error;
+            }
+            if ((error = clReleaseEvent(e[j])))
+            {
+                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
+                return error;
+            }
+        }
+
+        // Fill the result buffer with garbage, so that old results don't carry
+        // over
+        uint32_t pattern = 0xacdcacdc;
+        if (gHostFill)
+        {
+            memset_pattern4(out[j], &pattern, buffer_size);
+            error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                            out[j], 0, NULL, NULL);
+            test_error(error, "clEnqueueUnmapMemObject failed!\n");
+        }
+        else
+        {
+            error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                        &pattern, sizeof(pattern), 0,
+                                        buffer_size, 0, NULL, NULL);
+            test_error(error, "clEnqueueFillBuffer failed!\n");
+        }
+
+        // run the kernel
+        size_t vectorCount =
+            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
+        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
+                                                 // own copy of the cl_kernel
+        cl_program program = job->programs[j];
+
+        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
+                                    &tinfo->outBuf[j])))
+        {
+            LogBuildError(program);
+            return error;
+        }
+        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
+                                    &tinfo->inBuf)))
+        {
+            LogBuildError(program);
+            return error;
+        }
+
+        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
+                                            &vectorCount, NULL, 0, NULL, NULL)))
+        {
+            vlog_error("FAILED -- could not execute kernel\n");
+            return error;
+        }
+    }
+
+
+    // Get that moving
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
+
+    if (gSkipCorrectnessTesting) return CL_SUCCESS;
+
+    // Calculate the correctly rounded reference result
+    cl_half *r = (cl_half *)gOut_Ref + thread_id * buffer_elements;
+    s.resize(buffer_elements);
+    for (j = 0; j < buffer_elements; j++)
+    {
+        s[j] = (float)cl_half_to_float(p[j]);
+        r[j] = HFF(func.f_f(s[j]));
+    }
+
+    // Read the data back -- no need to wait for the first N-1 buffers. This is
+    // an in order queue.
+    for (j = gMinVectorSizeIndex; j + 1 < gMaxVectorSizeIndex; j++)
+    {
+        out[j] = (uint16_t *)clEnqueueMapBuffer(
+            tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_READ, 0,
+            buffer_size, 0, NULL, NULL, &error);
+        if (error || NULL == out[j])
+        {
+            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
+                       error);
+            return error;
+        }
+    }
+    // Wait for the last buffer
+    out[j] = (uint16_t *)clEnqueueMapBuffer(tinfo->tQueue, tinfo->outBuf[j],
+                                            CL_TRUE, CL_MAP_READ, 0,
+                                            buffer_size, 0, NULL, NULL, &error);
+    if (error || NULL == out[j])
+    {
+        vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j, error);
+        return error;
+    }
+
+    // Verify data
+    for (j = 0; j < buffer_elements; j++)
+    {
+        for (k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+        {
+            cl_ushort *q = out[k];
+
+            // If we aren't getting the correctly rounded result
+            if (r[j] != q[j])
+            {
+                float test = cl_half_to_float(q[j]);
+                double correct = func.f_f(s[j]);
+                float err = Ulp_Error_Half(q[j], correct);
+                int fail = !(fabsf(err) <= ulps);
+
+                // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
+                if (isRangeLimited
+                    && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
+                    && fabsf(s[j]) < INFINITY)
+                {
+                    if (fabsf(test) <= half_sin_cos_tan_limit)
+                    {
+                        err = 0;
+                        fail = 0;
+                    }
+                }
+
+                if (fail)
+                {
+                    if (ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsHalfResultSubnormal(correct, ulps))
+                        {
+                            fail = fail && (test != 0.0f);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsHalfSubnormal(p[j]))
+                        {
+                            double correct2 = func.f_f(0.0);
+                            double correct3 = func.f_f(-0.0);
+                            float err2 = Ulp_Error_Half(q[j], correct2);
+                            float err3 = Ulp_Error_Half(q[j], correct3);
+                            fail = fail
+                                && ((!(fabsf(err2) <= ulps))
+                                    && (!(fabsf(err3) <= ulps)));
+                            if (fabsf(err2) < fabsf(err)) err = err2;
+                            if (fabsf(err3) < fabsf(err)) err = err3;
+
+                            // retry per section 6.5.3.4
+                            if (IsHalfResultSubnormal(correct2, ulps)
+                                || IsHalfResultSubnormal(correct3, ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                }
+                if (fabsf(err) > tinfo->maxError)
+                {
+                    tinfo->maxError = fabsf(err);
+                    tinfo->maxErrorValue = s[j];
+                }
+                if (fail)
+                {
+                    vlog_error("\nERROR: %s%s: %f ulp error at %a "
+                               "(half 0x%04x)\nExpected: %a (half 0x%04x) "
+                               "\nActual: %a (half 0x%04x)\n",
+                               job->f->name, sizeNames[k], err, s[j], p[j],
+                               cl_half_to_float(r[j]), r[j], test, q[j]);
+                    error = -1;
+                    return error;
+                }
+            }
+        }
+    }
+
+    for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+    {
+        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
+                                             out[j], 0, NULL, NULL)))
+        {
+            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
+                       j, error);
+            return error;
+        }
+    }
+
+    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
+
+
+    if (0 == (base & 0x0fffffff))
+    {
+        if (gVerboseBruteForce)
+        {
+            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
+                 "ThreadCount:%2u\n",
+                 base, job->step, job->scale, buffer_elements, job->ulps,
+                 job->threadCount);
+        }
+        else
+        {
+            vlog(".");
+        }
+        fflush(stdout);
+    }
+
+    return error;
+}
+
+} // anonymous namespace
+
+int TestFunc_Half_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    TestInfoBase test_info_base;
+    cl_int error;
+    size_t i, j;
+    float maxError = 0.0f;
+    double maxErrorVal = 0.0;
+    // Driver: build kernels for f, run TestHalf jobs, log the max ulp error.
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    // Init test_info from a zero-filled TestInfoBase
+    memset(&test_info_base, 0, sizeof(test_info_base));
+    TestInfo test_info(test_info_base);
+
+    test_info.threadCount = GetThreadCount();
+    // Elements per thread: BUFFER_SIZE bytes split across the (rounded) threads
+    test_info.subBufferSize = BUFFER_SIZE
+        / (sizeof(cl_half) * RoundUpToNextPowerOfTwo(test_info.threadCount));
+    test_info.scale = getTestScale(sizeof(cl_half));
+    test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
+    if (test_info.step / test_info.subBufferSize != test_info.scale)
+    {
+        // the multiplication overflowed cl_uint: fall back to a single job
+        test_info.jobCount = 1;
+    }
+    else
+    {
+        test_info.jobCount =
+            std::max((cl_uint)1,
+                     (cl_uint)((1ULL << sizeof(cl_half) * 8) / test_info.step));
+    }
+
+    test_info.f = f;
+    test_info.ulps = f->half_ulps;
+    test_info.ftz =
+        f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+
+    test_info.tinfo.resize(test_info.threadCount);
+    // Give every thread its own input/output sub-buffers and command queue
+    for (i = 0; i < test_info.threadCount; i++)
+    {
+        cl_buffer_region region = { i * test_info.subBufferSize
+                                        * sizeof(cl_half),
+                                    test_info.subBufferSize * sizeof(cl_half) };
+        test_info.tinfo[i].inBuf =
+            clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
+                              CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
+        if (error || NULL == test_info.tinfo[i].inBuf)
+        {
+            vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
+                       "region {%zd, %zd}\n",
+                       region.origin, region.size);
+            return error; // NOTE(review): confirm tinfo handles are released
+        }
+
+        for (j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
+                gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
+                &region, &error);
+            if (error || NULL == test_info.tinfo[i].outBuf[j])
+            {
+                vlog_error("Error: Unable to create sub-buffer of gOutBuffer "
+                           "for region {%zd, %zd}\n",
+                           region.origin, region.size);
+                return error;
+            }
+        }
+        test_info.tinfo[i].tQueue =
+            clCreateCommandQueue(gContext, gDevice, 0, &error);
+        if (NULL == test_info.tinfo[i].tQueue || error)
+        {
+            vlog_error("clCreateCommandQueue failed. (%d)\n", error);
+            return error;
+        }
+    }
+
+    // Check for special cases: half_sin/half_cos/half_tan are range limited
+    test_info.isRangeLimited = 0;
+    test_info.half_sin_cos_tan_limit = 0;
+    if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit = 1.0f
+            + test_info.ulps
+                * (FLT_EPSILON / 2.0f); // out of range results from finite
+                                        // inputs must be in [-1,1]
+    }
+    else if (0 == strcmp(f->name, "half_tan"))
+    {
+        test_info.isRangeLimited = 1;
+        test_info.half_sin_cos_tan_limit =
+            INFINITY; // out of range result from finite inputs must be numeric
+    }
+
+    // Init the kernels: one ThreadPool job per tested vector size
+    {
+        BuildKernelInfo build_info = { test_info.threadCount, test_info.k,
+                                       test_info.programs, f->nameInCode };
+        error = ThreadPool_Do(BuildKernel_HalfFn,
+                              gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                              &build_info);
+        test_error(error, "ThreadPool_Do: BuildKernel_HalfFn failed\n");
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        error = ThreadPool_Do(TestHalf, test_info.jobCount, &test_info);
+
+        // Accumulate the arithmetic errors: keep the largest per-thread value
+        for (i = 0; i < test_info.threadCount; i++)
+        {
+            if (test_info.tinfo[i].maxError > maxError)
+            {
+                maxError = test_info.tinfo[i].maxError;
+                maxErrorVal = test_info.tinfo[i].maxErrorValue;
+            }
+        }
+        // The job status is only checked once the statistics are gathered
+        test_error(error, "ThreadPool_Do: TestHalf failed\n");
+
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/unary_two_results_half.cpp b/test_conformance/math_brute_force/unary_two_results_half.cpp
new file mode 100644
index 0000000000..70a9f4c79e
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_two_results_half.cpp
@@ -0,0 +1,452 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cinttypes>
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    // p carries the BuildKernelInfo; job_id is forwarded to BuildKernels.
+    auto *build_info = static_cast<BuildKernelInfo *>(p);
+    auto make_src = [](const std::string &name, const char *fn, cl_uint vsi) {
+        return GetUnaryKernel(name, fn, ParameterType::Half,
+                              ParameterType::Half, ParameterType::Half,
+                              vsi);
+    };
+    return BuildKernels(*build_info, job_id, make_src);
+}
+
+} // anonymous namespace
+
+int TestFunc_Half2_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+    Programs programs;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    KernelMatrix kernels;
+    float maxError0 = 0.0f;
+    float maxError1 = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    float maxErrorVal0 = 0.0f;
+    float maxErrorVal1 = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+
+    size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_half),
+                                     size_t(1ULL << (sizeof(cl_half) * 8)));
+    size_t bufferSize = bufferElements * sizeof(cl_half);
+
+    std::vector<cl_uchar> overflow(bufferElements);
+    int isFract = 0 == strcmp("fract", f->nameInCode);
+    int skipNanInf = isFract;
+    // Single-threaded test of half builtins that produce two half results.
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    float half_ulps = f->half_ulps;
+
+    // Init the kernels
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
+    if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+        return error;
+
+    for (uint64_t i = 0; i < (1ULL << 16); i += step)
+    {
+        // Init input array with consecutive half bit patterns starting at i
+        cl_half *pIn = (cl_half *)gIn;
+        for (size_t j = 0; j < bufferElements; j++) pIn[j] = (cl_ushort)i + j;
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        // Write garbage into both output arrays so stale results can't pass
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, bufferSize);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, bufferSize,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+
+                memset_pattern4(gOut2[j], &pattern, bufferSize);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
+                                                  CL_FALSE, 0, bufferSize,
+                                                  gOut2[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, bufferSize, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 1 failed!\n");
+                // The second result buffer has to be scrubbed as well
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern,
+                                            sizeof(pattern), 0, bufferSize, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 2 failed!\n");
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_half);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error =
+                     clSetKernelArg(kernels[j][thread_id], 1,
+                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue)))
+        {
+            vlog_error("clFlush failed\n");
+            return error;
+        }
+
+        FPU_mode_type oldMode;
+        RoundingMode oldRoundMode = kRoundToNearestEven;
+        if (isFract)
+        {
+            // Calculate the correctly rounded reference result
+            memset(&oldMode, 0, sizeof(oldMode));
+            if (ftz) ForceFTZ(&oldMode);
+
+            // Set the rounding mode to match the device
+            if (gIsInRTZMode)
+                oldRoundMode = set_round(kRoundTowardZero, kfloat);
+        }
+
+        // Calculate the correctly rounded reference result
+        cl_half *ref1 = (cl_half *)gOut_Ref;
+        cl_half *ref2 = (cl_half *)gOut_Ref2;
+
+        if (skipNanInf)
+        {
+            for (size_t j = 0; j < bufferElements; j++)
+            {
+                double dd;
+                feclearexcept(FE_OVERFLOW);
+
+                ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
+                ref2[j] = HFF((float)dd);
+
+                // ensure correct rounding of fract result is not reaching 1
+                if (isFract && HTF(ref1[j]) >= 1.f) ref1[j] = 0x3bff;
+
+                overflow[j] =
+                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
+            }
+        }
+        else
+        {
+            for (size_t j = 0; j < bufferElements; j++)
+            {
+                double dd;
+                ref1[j] = HFF((float)f->func.f_fpf(HTF(pIn[j]), &dd));
+                ref2[j] = HFF((float)dd);
+            }
+        }
+
+        if (isFract && ftz) RestoreFPState(&oldMode);
+
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
+                                         bufferSize, gOut2[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting)
+        {
+            if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
+            break;
+        }
+
+        // Verify data
+        for (size_t j = 0; j < bufferElements; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                cl_half *test1 = (cl_half *)gOut[k];
+                cl_half *test2 = (cl_half *)gOut2[k];
+
+                // If we aren't getting the correctly rounded result
+                if (ref1[j] != test1[j] || ref2[j] != test2[j])
+                {
+                    double fp_correct1 = 0, fp_correct2 = 0;
+                    float err = 0, err2 = 0;
+
+                    fp_correct1 = f->func.f_fpf(HTF(pIn[j]), &fp_correct2);
+
+                    cl_half correct1 = HFF(fp_correct1);
+                    cl_half correct2 = HFF(fp_correct2);
+
+                    // Per section 10 paragraph 6, accept any result if an input
+                    // or output is a infinity or NaN or overflow
+                    if (skipNanInf)
+                    {
+                        if (skipNanInf && overflow[j]) continue;
+                        // Note: no double rounding here.  Reference functions
+                        // calculate in single precision.
+                        if (IsHalfInfinity(correct1) || IsHalfNaN(correct1)
+                            || IsHalfInfinity(correct2) || IsHalfNaN(correct2)
+                            || IsHalfInfinity(pIn[j]) || IsHalfNaN(pIn[j]))
+                            continue;
+                    }
+
+                    err = Ulp_Error_Half(test1[j], fp_correct1);
+                    err2 = Ulp_Error_Half(test2[j], fp_correct2);
+
+                    int fail =
+                        !(fabsf(err) <= half_ulps && fabsf(err2) <= half_ulps);
+
+                    if (ftz)
+                    {
+                        // retry per section 6.5.3.2
+                        if (IsHalfResultSubnormal(fp_correct1, half_ulps))
+                        {
+                            if (IsHalfResultSubnormal(fp_correct2, half_ulps))
+                            {
+                                fail = fail
+                                    && !(HTF(test1[j]) == 0.0f
+                                         && HTF(test2[j]) == 0.0f);
+                                if (!fail)
+                                {
+                                    err = 0.0f;
+                                    err2 = 0.0f;
+                                }
+                            }
+                            else
+                            {
+                                fail = fail
+                                    && !(HTF(test1[j]) == 0.0f
+                                         && fabsf(err2) <= half_ulps);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                        else if (IsHalfResultSubnormal(fp_correct2, half_ulps))
+                        {
+                            fail = fail
+                                && !(HTF(test2[j]) == 0.0f
+                                     && fabsf(err) <= half_ulps);
+                            if (!fail) err2 = 0.0f;
+                        }
+
+
+                        // retry per section 6.5.3.3
+                        if (IsHalfSubnormal(pIn[j]))
+                        {
+                            double fp_correctp, fp_correctn;
+                            double fp_correct2p, fp_correct2n;
+                            float errp, err2p, errn, err2n;
+
+                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
+                            fp_correctp = f->func.f_fpf(0.0, &fp_correct2p);
+                            fp_correctn = f->func.f_fpf(-0.0, &fp_correct2n);
+
+                            cl_half correctp = HFF(fp_correctp);
+                            cl_half correctn = HFF(fp_correctn);
+                            cl_half correct2p = HFF(fp_correct2p);
+                            cl_half correct2n = HFF(fp_correct2n);
+
+                            // Per section 10 paragraph 6, accept any result if
+                            // an input or output is a infinity or NaN or
+                            // overflow
+                            if (skipNanInf)
+                            {
+                                if (fetestexcept(FE_OVERFLOW)) continue;
+
+                                // Note: no double rounding here.  Reference
+                                // functions calculate in single precision.
+                                if (IsHalfInfinity(correctp)
+                                    || IsHalfNaN(correctp)
+                                    || IsHalfInfinity(correctn)
+                                    || IsHalfNaN(correctn)
+                                    || IsHalfInfinity(correct2p)
+                                    || IsHalfNaN(correct2p)
+                                    || IsHalfInfinity(correct2n)
+                                    || IsHalfNaN(correct2n))
+                                    continue;
+                            }
+                            // NOTE(review): should err2p/err2n use test2?
+                            errp = Ulp_Error_Half(test1[j], fp_correctp);
+                            err2p = Ulp_Error_Half(test1[j], fp_correct2p);
+                            errn = Ulp_Error_Half(test1[j], fp_correctn);
+                            err2n = Ulp_Error_Half(test1[j], fp_correct2n);
+
+                            fail = fail
+                                && ((!(fabsf(errp) <= half_ulps))
+                                    && (!(fabsf(err2p) <= half_ulps))
+                                    && ((!(fabsf(errn) <= half_ulps))
+                                        && (!(fabsf(err2n) <= half_ulps))));
+                            if (fabsf(errp) < fabsf(err)) err = errp;
+                            if (fabsf(errn) < fabsf(err)) err = errn;
+                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
+                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;
+
+                            // retry per section 6.5.3.4
+                            if (IsHalfResultSubnormal(fp_correctp, half_ulps)
+                                || IsHalfResultSubnormal(fp_correctn,
+                                                         half_ulps))
+                            {
+                                if (IsHalfResultSubnormal(fp_correct2p,
+                                                          half_ulps)
+                                    || IsHalfResultSubnormal(fp_correct2n,
+                                                             half_ulps))
+                                {
+                                    fail = fail
+                                        && !(HTF(test1[j]) == 0.0f
+                                             && HTF(test2[j]) == 0.0f);
+                                    if (!fail) err = err2 = 0.0f;
+                                }
+                                else
+                                {
+                                    fail = fail
+                                        && !(HTF(test1[j]) == 0.0f
+                                             && fabsf(err2) <= half_ulps);
+                                    if (!fail) err = 0.0f;
+                                }
+                            }
+                            else if (IsHalfResultSubnormal(fp_correct2p,
+                                                           half_ulps)
+                                     || IsHalfResultSubnormal(fp_correct2n,
+                                                              half_ulps))
+                            {
+                                fail = fail
+                                    && !(HTF(test2[j]) == 0.0f
+                                         && (fabsf(err) <= half_ulps));
+                                if (!fail) err2 = 0.0f;
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError0)
+                    {
+                        maxError0 = fabsf(err);
+                        maxErrorVal0 = HTF(pIn[j]);
+                    }
+                    if (fabsf(err2) > maxError1)
+                    {
+                        maxError1 = fabsf(err2);
+                        maxErrorVal1 = HTF(pIn[j]);
+                    }
+                    if (fail)
+                    {
+                        vlog_error("\nERROR: %s%s: {%f, %f} ulp error at %a: "
+                                   "*{%a, %a} vs. {%a, %a}\n",
+                                   f->name, sizeNames[k], err, err2,
+                                   HTF(pIn[j]), HTF(ref1[j]), HTF(ref2[j]),
+                                   HTF(test1[j]), HTF(test2[j]));
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10zu \n",
+                     i, step, bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
+             maxErrorVal1);
+    }
+
+    vlog("\n");
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/unary_two_results_i_half.cpp b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
new file mode 100644
index 0000000000..5906c2837a
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_two_results_i_half.cpp
@@ -0,0 +1,347 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+
+#include <cinttypes>
+#include <climits>
+#include <cstring>
+
+namespace {
+
+cl_int BuildKernelFn_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
+{
+    // p is the BuildKernelInfo the caller handed to ThreadPool_Do.
+    auto &build_info = *static_cast<BuildKernelInfo *>(p);
+    auto make_src = [](const std::string &name, const char *fn, cl_uint vsi) {
+        return GetUnaryKernel(name, fn, ParameterType::Half,
+                              ParameterType::Int, ParameterType::Half,
+                              vsi);
+    };
+    return BuildKernels(build_info, job_id, make_src);
+}
+
+cl_ulong abs_cl_long(cl_long i)
+{
+    cl_ulong mask = i < 0 ? ~(cl_ulong)0 : 0; // all ones when i is negative
+    return ((cl_ulong)i ^ mask) - mask; // two's-complement |i|, no signed UB
+}
+
+} // anonymous namespace
+
+int TestFunc_HalfI_Half(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+    Programs programs;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    KernelMatrix kernels;
+    float maxError = 0.0f;
+    int64_t maxError2 = 0;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    float maxErrorVal = 0.0f;
+    float maxErrorVal2 = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+
+    // sizeof(cl_half) < sizeof (int32_t)
+    // to prevent overflowing gOut_Ref2 it is necessary to use
+    // bigger type as denominator for buffer size calculation
+    size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_int),
+                                     size_t(1ULL << (sizeof(cl_half) * 8)));
+
+    size_t bufferSizeLo = bufferElements * sizeof(cl_half);
+    size_t bufferSizeHi = bufferElements * sizeof(cl_int);
+
+    cl_ulong maxiError = 0;
+
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+
+    float half_ulps = f->half_ulps;
+    // An infinite ulp budget also leaves the integer result unchecked.
+    maxiError = half_ulps == INFINITY ? CL_ULONG_MAX : 0;
+
+    // Init the kernels
+    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode };
+    if ((error = ThreadPool_Do(BuildKernelFn_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+        return error;
+
+    for (uint64_t i = 0; i < (1ULL << 16); i += step)
+    {
+        // Init input array
+        cl_half *pIn = (cl_half *)gIn;
+        for (size_t j = 0; j < bufferElements; j++) pIn[j] = (cl_ushort)i + j;
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSizeLo, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        // Write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, bufferSizeLo);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, bufferSizeLo,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+
+                memset_pattern4(gOut2[j], &pattern, bufferSizeHi);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
+                                                  CL_FALSE, 0, bufferSizeHi,
+                                                  gOut2[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, bufferSizeLo, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 1 failed!\n");
+
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j], &pattern,
+                                            sizeof(pattern), 0, bufferSizeHi, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer 2 failed!\n");
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            // align working group size with the bigger output type
+            size_t vectorSize = sizeValues[j] * sizeof(cl_int);
+            size_t localCount = (bufferSizeHi + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error =
+                     clSetKernelArg(kernels[j][thread_id], 1,
+                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue)))
+        {
+            vlog_error("clFlush failed\n");
+            return error;
+        }
+
+        // Calculate the correctly rounded reference result
+        cl_half *ref1 = (cl_half *)gOut_Ref;
+        int32_t *ref2 = (int32_t *)gOut_Ref2;
+        for (size_t j = 0; j < bufferElements; j++)
+            ref1[j] = HFF((float)f->func.f_fpI(HTF(pIn[j]), ref2 + j));
+
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            cl_bool blocking =
+                (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], blocking, 0,
+                                         bufferSizeLo, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+            if ((error = clEnqueueReadBuffer(gQueue, gOutBuffer2[j], blocking,
+                                             0, bufferSizeHi, gOut2[j], 0, NULL,
+                                             NULL)))
+            {
+                vlog_error("ReadArray2 failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        for (size_t j = 0; j < bufferElements; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                cl_half *test1 = (cl_half *)(gOut[k]);
+                int32_t *test2 = (int32_t *)(gOut2[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (ref1[j] != test1[j] || ref2[j] != test2[j])
+                {
+                    cl_half test = ((cl_half *)test1)[j];
+                    int correct2 = INT_MIN;
+                    float fp_correct =
+                        (float)f->func.f_fpI(HTF(pIn[j]), &correct2);
+                    cl_half correct = HFF(fp_correct);
+                    float err = correct != test
+                        ? Ulp_Error_Half(test, fp_correct)
+                        : 0.f;
+                    cl_long iErr = (int64_t)test2[j] - (int64_t)correct2;
+                    int fail = !(fabsf(err) <= half_ulps
+                                 && abs_cl_long(iErr) <= maxiError);
+                    if (ftz)
+                    {
+                        // retry per section 6.5.3.2 (value compare: accepts -0)
+                        if (IsHalfResultSubnormal(fp_correct, half_ulps))
+                        {
+                            fail = fail && !(HTF(test) == 0.0f && iErr == 0);
+                            if (!fail) err = 0.0f;
+                        }
+
+                        // retry per section 6.5.3.3
+                        if (IsHalfSubnormal(pIn[j]))
+                        {
+                            int correct5, correct6;
+                            double fp_correct3 = f->func.f_fpI(0.0, &correct5);
+                            double fp_correct4 = f->func.f_fpI(-0.0, &correct6);
+
+                            float err2 = Ulp_Error_Half(test, fp_correct3);
+                            float err3 = Ulp_Error_Half(test, fp_correct4);
+
+                            cl_long iErr2 =
+                                (long long)test2[j] - (long long)correct5;
+                            cl_long iErr3 =
+                                (long long)test2[j] - (long long)correct6;
+
+                            // Did +0 work?
+                            if (fabsf(err2) <= half_ulps
+                                && abs_cl_long(iErr2) <= maxiError)
+                            {
+                                err = err2;
+                                iErr = iErr2;
+                                fail = 0;
+                            }
+                            // Did -0 work?
+                            else if (fabsf(err3) <= half_ulps
+                                     && abs_cl_long(iErr3) <= maxiError)
+                            {
+                                err = err3;
+                                iErr = iErr3;
+                                fail = 0;
+                            }
+
+                            // retry per section 6.5.3.4
+                            if (fail
+                                && (IsHalfResultSubnormal(correct2, half_ulps)
+                                    || IsHalfResultSubnormal(fp_correct3,
+                                                             half_ulps)))
+                            {
+                                fail = fail
+                                    && !(HTF(test) == 0.0f
+                                         && (abs_cl_long(iErr2) <= maxiError
+                                             || abs_cl_long(iErr3)
+                                                 <= maxiError));
+                                if (!fail)
+                                {
+                                    err = 0.0f;
+                                    iErr = 0;
+                                }
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = HTF(pIn[j]); // report value, not bits
+                    }
+                    if (llabs(iErr) > maxError2)
+                    {
+                        maxError2 = llabs(iErr);
+                        maxErrorVal2 = HTF(pIn[j]); // report value, not bits
+                    }
+
+                    if (fail)
+                    {
+                        vlog_error("\nERROR: %s%s: {%f, %d} ulp error at %a: "
+                                   "*{%a, %d} vs. {%a, %d}\n",
+                                   f->name, sizeNames[k], err, (int)iErr,
+                                   HTF(pIn[j]), HTF(ref1[j]),
+                                   ((int *)gOut_Ref2)[j], HTF(test), test2[j]);
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10zu \n",
+                     i, step, bufferSizeHi);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+
+        vlog("\t{%8.2f, %" PRId64 "} @ {%a, %a}", maxError, maxError2,
+             maxErrorVal, maxErrorVal2);
+    }
+
+    vlog("\n");
+
+    return CL_SUCCESS;
+}
diff --git a/test_conformance/math_brute_force/unary_u_half.cpp b/test_conformance/math_brute_force/unary_u_half.cpp
new file mode 100644
index 0000000000..6f21ef3eee
--- /dev/null
+++ b/test_conformance/math_brute_force/unary_u_half.cpp
@@ -0,0 +1,239 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "common.h"
+#include "function_list.h"
+#include "test_functions.h"
+#include "utility.h"
+#include "reference_math.h"
+
+#include <cstring>
+#include <cinttypes>
+
+namespace {
+
+static cl_int BuildKernel_HalfFn(cl_uint job_id, cl_uint thread_id UNUSED,
+                                 void *p)
+{
+    // p is the BuildKernelInfo the caller handed to ThreadPool_Do.
+    auto &build_info = *static_cast<BuildKernelInfo *>(p);
+    auto make_src = [](const std::string &name, const char *fn, cl_uint vsi) {
+        return GetUnaryKernel(name, fn, ParameterType::Half,
+                              ParameterType::UShort, vsi);
+    };
+    return BuildKernels(build_info, job_id, make_src);
+}
+
+} // anonymous namespace
+
+int TestFunc_Half_UShort(const Func *f, MTdata d, bool relaxedMode)
+{
+    int error;
+    Programs programs;
+    KernelMatrix kernels;
+    const unsigned thread_id = 0; // Test is currently not multithreaded.
+    float maxError = 0.0f;
+    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gHalfCapabilities);
+    float maxErrorVal = 0.0f;
+    uint64_t step = getTestStep(sizeof(cl_half), BUFFER_SIZE);
+    size_t bufferElements = std::min(BUFFER_SIZE / sizeof(cl_half),
+                                     size_t(1ULL << (sizeof(cl_half) * 8)));
+    size_t bufferSize = bufferElements * sizeof(cl_half);
+    logFunctionInfo(f->name, sizeof(cl_half), relaxedMode);
+    const char *name = f->name;
+    float half_ulps = f->half_ulps;
+
+    // Init the kernels
+    BuildKernelInfo build_info = { 1, kernels, programs, f->nameInCode };
+    if ((error = ThreadPool_Do(BuildKernel_HalfFn,
+                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
+                               &build_info)))
+    {
+        return error;
+    }
+    // Inputs are 16-bit, so a single sweep over 2^16 values is exhaustive.
+    for (uint64_t i = 0; i < (1ULL << 16); i += step)
+    {
+        // Init input array
+        cl_ushort *p = (cl_ushort *)gIn;
+        for (size_t j = 0; j < bufferElements; j++) p[j] = (uint16_t)i + j;
+
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
+                                          bufferSize, gIn, 0, NULL, NULL)))
+        {
+            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
+            return error;
+        }
+
+        // write garbage into output arrays
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            uint32_t pattern = 0xacdcacdc;
+            if (gHostFill)
+            {
+                memset_pattern4(gOut[j], &pattern, bufferSize);
+                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
+                                                  CL_FALSE, 0, bufferSize,
+                                                  gOut[j], 0, NULL, NULL)))
+                {
+                    vlog_error(
+                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
+                        error, j);
+                    return error;
+                }
+            }
+            else
+            {
+                error = clEnqueueFillBuffer(gQueue, gOutBuffer[j], &pattern,
+                                            sizeof(pattern), 0, bufferSize, 0,
+                                            NULL, NULL);
+                test_error(error, "clEnqueueFillBuffer failed!\n");
+            }
+        }
+
+        // Run the kernels
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            size_t vectorSize = sizeValues[j] * sizeof(cl_half);
+            size_t localCount = (bufferSize + vectorSize - 1) / vectorSize;
+            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
+                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+            if ((error = clSetKernelArg(kernels[j][thread_id], 1,
+                                        sizeof(gInBuffer), &gInBuffer)))
+            {
+                LogBuildError(programs[j]);
+                return error;
+            }
+
+            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
+                                                1, NULL, &localCount, NULL, 0,
+                                                NULL, NULL)))
+            {
+                vlog_error("FAILED -- could not execute kernel\n");
+                return error;
+            }
+        }
+
+        // Get that moving
+        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");
+
+        // Calculate the correctly rounded reference result
+        cl_half *r = (cl_half *)gOut_Ref;
+        for (size_t j = 0; j < bufferElements; j++)
+        {
+            if (!strcmp(name, "nan"))
+                r[j] = reference_nanh(p[j]);
+            else
+                r[j] = HFF(f->func.f_u(p[j]));
+        }
+        // Read the data back
+        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
+        {
+            if ((error =
+                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
+                                         bufferSize, gOut[j], 0, NULL, NULL)))
+            {
+                vlog_error("ReadArray failed %d\n", error);
+                return error;
+            }
+        }
+
+        if (gSkipCorrectnessTesting) break;
+
+        // Verify data
+        cl_ushort *t = (cl_ushort *)gOut_Ref;
+        for (size_t j = 0; j < bufferElements; j++)
+        {
+            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
+            {
+                cl_ushort *q = (cl_ushort *)(gOut[k]);
+
+                // If we aren't getting the correctly rounded result
+                if (t[j] != q[j])
+                {
+                    double test = cl_half_to_float(q[j]);
+                    double correct;
+                    if (!strcmp(name, "nan"))
+                        correct = cl_half_to_float(reference_nanh(p[j]));
+                    else
+                        correct = f->func.f_u(p[j]);
+
+                    float err = Ulp_Error_Half(q[j], correct);
+                    int fail = !(fabsf(err) <= half_ulps);
+
+                    if (fail)
+                    {
+                        if (ftz)
+                        {
+                            // retry per section 6.5.3.2
+                            if (IsHalfResultSubnormal(correct, half_ulps))
+                            {
+                                fail = fail && (test != 0.0f);
+                                if (!fail) err = 0.0f;
+                            }
+                        }
+                    }
+                    if (fabsf(err) > maxError)
+                    {
+                        maxError = fabsf(err);
+                        maxErrorVal = p[j];
+                    }
+                    if (fail)
+                    {
+                        vlog_error(
+                            "\n%s%s: %f ulp error at 0x%04x \nExpected: %a "
+                            "(0x%04x) \nActual: %a (0x%04x)\n",
+                            f->name, sizeNames[k], err, p[j],
+                            cl_half_to_float(r[j]), r[j], test, q[j]);
+                        return -1;
+                    }
+                }
+            }
+        }
+
+        if (0 == (i & 0x0fffffff))
+        {
+            if (gVerboseBruteForce)
+            {
+                vlog("base:%14" PRIu64 " step:%10" PRIu64
+                     "  bufferSize:%10zu \n",
+                     i, step, bufferSize);
+            }
+            else
+            {
+                vlog(".");
+            }
+            fflush(stdout);
+        }
+    }
+
+    if (!gSkipCorrectnessTesting)
+    {
+        if (gWimpyMode)
+            vlog("Wimp pass");
+        else
+            vlog("passed");
+    }
+
+    if (!gSkipCorrectnessTesting) vlog("\t%8.2f @ %a", maxError, maxErrorVal);
+    vlog("\n");
+
+    return error;
+}
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index 652d990a21..264fc7a435 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -22,6 +22,7 @@
 #include "harness/testHarness.h"
 #include "harness/ThreadPool.h"
 #include "harness/conversions.h"
+#include "CL/cl_half.h"
 
 #define BUFFER_SIZE (1024 * 1024 * 2)
 #define EMBEDDED_REDUCTION_FACTOR (64)
@@ -61,10 +62,20 @@ extern int gFastRelaxedDerived;
 extern int gWimpyMode;
 extern int gHostFill;
 extern int gIsInRTZMode;
+extern int gHasHalf;
+extern int gInfNanSupport;
+extern int gIsEmbedded;
 extern int gVerboseBruteForce;
 extern uint32_t gMaxVectorSizeIndex;
 extern uint32_t gMinVectorSizeIndex;
 extern cl_device_fp_config gFloatCapabilities;
+extern cl_device_fp_config gHalfCapabilities;
+extern RoundingMode gFloatToHalfRoundingMode;
+
+extern cl_half_rounding_mode gHalfRoundingMode;
+
+#define HFF(num) cl_half_from_float(num, gHalfRoundingMode)
+#define HTF(num) cl_half_to_float(num)
 
 #define LOWER_IS_BETTER 0
 #define HIGHER_IS_BETTER 1
@@ -115,6 +126,12 @@ inline int IsFloatResultSubnormal(double x, float ulps)
     return x < MAKE_HEX_DOUBLE(0x1.0p-126, 0x1, -126);
 }
 
+inline int IsHalfResultSubnormal(float x, float ulps)
+{
+    x = fabs(x) - MAKE_HEX_FLOAT(0x1.0p-24, 0x1, -24) * ulps; // ulps * 2^-24
+    return x < MAKE_HEX_FLOAT(0x1.0p-14, 0x1, -14); // true when below 2^-14
+}
+
 inline int IsFloatResultSubnormalAbsError(double x, float abs_err)
 {
     x = x - abs_err;
@@ -157,6 +174,26 @@ inline int IsFloatNaN(double x)
     return ((u.u & 0x7fffffffU) > 0x7F800000U);
 }
 
+inline bool IsHalfNaN(const cl_half v)
+{
+    // Split the raw bits into the 5-bit exponent and 10-bit mantissa fields
+    uint16_t h_exp = (((cl_half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
+    uint16_t h_mant = ((cl_half)v) & 0x3FF;
+
+    // NaN: exponent field all ones together with a non-zero mantissa
+    return (h_exp == 0x1F && h_mant != 0);
+}
+
+inline bool IsHalfInfinity(const cl_half v)
+{
+    // Split the raw bits into the 5-bit exponent and 10-bit mantissa fields
+    uint16_t h_exp = (((cl_half)v) >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
+    uint16_t h_mant = ((cl_half)v) & 0x3FF;
+
+    // Infinity: exponent field all ones with a zero mantissa; sign ignored
+    return (h_exp == 0x1F && h_mant == 0);
+}
+
 cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
 
 // Windows (since long double got deprecated) sets the x87 to 53-bit precision

From 79fc236e4aa971713d6e9cb27d0fafa8986ade7b Mon Sep 17 00:00:00 2001
From: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com>
Date: Tue, 16 Jan 2024 10:50:07 -0700
Subject: [PATCH 3/7] bruteforce: Remove unnecessary half to float conversion
 (#1874)

---
 .../math_brute_force/binary_operator_half.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp
index e7f53af871..31e5f49f16 100644
--- a/test_conformance/math_brute_force/binary_operator_half.cpp
+++ b/test_conformance/math_brute_force/binary_operator_half.cpp
@@ -198,7 +198,6 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
             if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00;
         }
     }
-
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
                                       buffer_size, p, 0, NULL, NULL)))
     {
@@ -374,8 +373,8 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                         double correct2, correct3;
                         float err2, err3;
 
-                        correct2 = HTF(func.f_ff(0.0, s2[j]));
-                        correct3 = HTF(func.f_ff(-0.0, s2[j]));
+                        correct2 = func.f_ff(0.0, s2[j]);
+                        correct3 = func.f_ff(-0.0, s2[j]);
 
                         // Per section 10 paragraph 6, accept any result if an
                         // input or output is a infinity or NaN or overflow
@@ -407,16 +406,17 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                             if (!fail) err = 0.0f;
                         }
 
+
                         // try with both args as zero
                         if (IsHalfSubnormal(p2[j]))
                         {
                             double correct4, correct5;
                             float err4, err5;
 
-                            correct2 = HTF(func.f_ff(0.0, 0.0));
-                            correct3 = HTF(func.f_ff(-0.0, 0.0));
-                            correct4 = HTF(func.f_ff(0.0, -0.0));
-                            correct5 = HTF(func.f_ff(-0.0, -0.0));
+                            correct2 = func.f_ff(0.0, 0.0);
+                            correct3 = func.f_ff(-0.0, 0.0);
+                            correct4 = func.f_ff(0.0, -0.0);
+                            correct5 = func.f_ff(-0.0, -0.0);
 
                             // Per section 10 paragraph 6, accept any result if
                             // an input or output is a infinity or NaN or
@@ -466,8 +466,9 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                         double correct2, correct3;
                         float err2, err3;
 
-                        correct2 = HTF(func.f_ff(s[j], 0.0));
-                        correct3 = HTF(func.f_ff(s[j], -0.0));
+                        correct2 = func.f_ff(s[j], 0.0);
+                        correct3 = func.f_ff(s[j], -0.0);
+
 
                         // Per section 10 paragraph 6, accept any result if an
                         // input or output is a infinity or NaN or overflow

From d338b42e8f97da34dfcaf71683acb1bd31c09fcc Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald@gigawatt.nl>
Date: Tue, 6 Feb 2024 17:25:31 +0000
Subject: [PATCH 4/7] Fix testing of half-precision fma. (#1882)

Half-precision functions are generally tested against the
single-precision reference. This causes double rounding: first to single
precision, then from there to half precision. For the most part, it is
good enough, but specifically in the case of fma, a correctly rounded
result is required and is not obtained, for instance for arguments
0x1.eacp+7, 0x1.3f4p+4, 0x1.c04p+14, which produce an exact result of
0x1.065fffp+15 which should be rounded to half-precision 0x1.064p+15,
but was previously first rounded to single-precision 0x1.066p+15, and
from there to half-precision 0x1.068p+15. Testing against reference_fmal
gives us sufficient precision that double rounding does not cause
issues.

The f_fma(..., FLUSHED) calls for FTZ testing cannot be updated the same
way but do not need to be: these calls all have at least one constant
operand of zero. If one operand is zero, double rounding cannot be an
issue.
---
 test_common/harness/errorHelpers.cpp               |  2 +-
 test_common/harness/errorHelpers.h                 |  2 +-
 test_conformance/math_brute_force/ternary_half.cpp | 14 +++++++-------
 test_conformance/math_brute_force/utility.h        |  1 +
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/test_common/harness/errorHelpers.cpp b/test_common/harness/errorHelpers.cpp
index eaccf64119..e45c3b6a8c 100644
--- a/test_common/harness/errorHelpers.cpp
+++ b/test_common/harness/errorHelpers.cpp
@@ -369,7 +369,7 @@ static float Ulp_Error_Half_Float(float test, double reference)
     return (float)scalbn(testVal - reference, ulp_exp);
 }
 
-float Ulp_Error_Half(cl_half test, float reference)
+float Ulp_Error_Half(cl_half test, double reference)
 {
     return Ulp_Error_Half_Float(cl_half_to_float(test), reference);
 }
diff --git a/test_common/harness/errorHelpers.h b/test_common/harness/errorHelpers.h
index 3f1d8fb145..e6d4620b4d 100644
--- a/test_common/harness/errorHelpers.h
+++ b/test_common/harness/errorHelpers.h
@@ -185,7 +185,7 @@ static int vlog_win32(const char *format, ...);
 
 extern const char *IGetErrorString(int clErrorCode);
 
-extern float Ulp_Error_Half(cl_half test, float reference);
+extern float Ulp_Error_Half(cl_half test, double reference);
 extern float Ulp_Error(float test, double reference);
 extern float Ulp_Error_Double(double test, long double reference);
 
diff --git a/test_conformance/math_brute_force/ternary_half.cpp b/test_conformance/math_brute_force/ternary_half.cpp
index ba6dd4d480..5c9fd87c8c 100644
--- a/test_conformance/math_brute_force/ternary_half.cpp
+++ b/test_conformance/math_brute_force/ternary_half.cpp
@@ -237,8 +237,8 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
             for (size_t j = 0; j < bufferElements; j++)
             {
                 feclearexcept(FE_OVERFLOW);
-                res[j] = HFF((float)f->func.f_fma(
-                    HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED));
+                res[j] = HFD((double)f->dfunc.f_fff(HTF(hp0[j]), HTF(hp1[j]),
+                                                    HTF(hp2[j])));
                 overflow[j] =
                     FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
             }
@@ -246,8 +246,8 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
         else
         {
             for (size_t j = 0; j < bufferElements; j++)
-                res[j] = HFF((float)f->func.f_fma(
-                    HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]), CORRECTLY_ROUNDED));
+                res[j] = HFD((double)f->dfunc.f_fff(HTF(hp0[j]), HTF(hp1[j]),
+                                                    HTF(hp2[j])));
         }
 
         // Read the data back
@@ -277,9 +277,9 @@ int TestFunc_Half_Half_Half_Half(const Func *f, MTdata d, bool relaxedMode)
                 {
                     int fail;
                     cl_half test = ((cl_half *)q)[j];
-                    float ref1 = f->func.f_fma(HTF(hp0[j]), HTF(hp1[j]),
-                                               HTF(hp2[j]), CORRECTLY_ROUNDED);
-                    cl_half correct = HFF(ref1);
+                    double ref1 = (double)f->dfunc.f_fff(
+                        HTF(hp0[j]), HTF(hp1[j]), HTF(hp2[j]));
+                    cl_half correct = HFD(ref1);
 
                     // Per section 10 paragraph 6, accept any result if an input
                     // or output is a infinity or NaN or overflow
diff --git a/test_conformance/math_brute_force/utility.h b/test_conformance/math_brute_force/utility.h
index 264fc7a435..2635800a44 100644
--- a/test_conformance/math_brute_force/utility.h
+++ b/test_conformance/math_brute_force/utility.h
@@ -75,6 +75,7 @@ extern RoundingMode gFloatToHalfRoundingMode;
 extern cl_half_rounding_mode gHalfRoundingMode;
 
 #define HFF(num) cl_half_from_float(num, gHalfRoundingMode)
+#define HFD(num) cl_half_from_double(num, gHalfRoundingMode)
 #define HTF(num) cl_half_to_float(num)
 
 #define LOWER_IS_BETTER 0

From 620c689919aefa7e87b7a082c0b67677c4ccecbd Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Sat, 2 Mar 2024 16:48:45 -0800
Subject: [PATCH 5/7] update fp16 staging branch from main (#1903)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* allocations: Move results array from stack to heap (#1857)

* allocations: Fix stack overflow

* check format fixes

* Fix windows stack overflow. (#1839)

* thread_dimensions: Avoid combinations of very small LWS and very large GWS (#1856)

Modify the existing condition to include extremely small LWS like
1x1 on large GWS values

* c11_atomics: Reduce the loopcounter for sequential consistency tests (#1853)

Reduce the loop from 1000000 to 500000 since the former value
makes the test run too long and causes system issues on certain
platforms

* Limit individual allocation size using the global memory size (#1835)

Signed-off-by: Ahmed Hesham <ahmed.hesham@arm.com>

* geometrics: fix Wsign-compare warnings (#1855)

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>

* integer_ops: fix -Wformat warnings (#1860)

The main sources of warnings were:

 * Printing of a `size_t` which requires the `%zu` specifier.

 * Printing of `cl_long`/`cl_ulong` which is now done using the
   `PRI*64` macros to ensure portability across 32 and 64-bit builds.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>

* Replace OBSOLETE_FORAMT with OBSOLETE_FORMAT (#1776)

* Replace OBSOLETE_FORAMT with OBSOLETE_FORMAT

In imageHelpers.cpp and a few other places in image tests, OBSOLETE_FORMAT is misspelled as OBSOLETE_FORAMT.
Fix misspelling by replacing it with OBSOLETE_FORMAT.

Fixes #1769

* Remove code guarded by OBSOLETE_FORMAT

Remove code guarded by OBSOLETE_FORMAT
as suggested by review comments

Fixes #1769

* Fix formatting issues for OBSOLETE_FORMAT changes

Fix formatting issues observed in files while removing
code guarded by OBSOLETE_FORMAT

Fixes #1769

* Some more formatting fixes

Some more formatting fixes to get CI clean

Fixes #1769

* Final formatting fixes

Final formatting fixes for #1769

* Enhancement: Thread dimensions user parameters (#1384)

* Fix format in the test scope

* Add user params to limit testing

Add parameters to reduce amount of testing.
Helpful for debugging or for machines with lower performance.

* Restore default value

* Print info only if testing params bigger than 0.

* [NFC] conversions: reenable Wunused-but-set-variable (#1845)

Remove an assigned-to but unused variable.

Reenable the Wunused-but-set-variable warning for the conversions
suite, as it now compiles cleanly with this warning enabled.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>

* Fix bug of conversion from long to double (#1847)

* Fix bug of conversion from long to double

If the input is long type, it should be loaded as long type, not ulong.

* update long2float

* math_brute_force: fix exp/exp2 rlx ULP calculation (#1848)

Fix the ULP error calculation for the `exp` and `exp2` builtins in
relaxed math mode for the full profile.

Previously, the `ulps` value kept being added to while verifying the
result buffer in a loop.  `ulps` could even become a `NaN` when the
input argument being tested was a `NaN`.

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>

* Enable LARGEADDRESSAWARE for 32 bit compilation (#1858)

* Enable LARGEADDRESSAWARE for 32 bit compilation

32-bit executables built with MSVC linker have only 2GB virtual memory
address space by default, which might not be sufficient for some tests.

Enable LARGEADDRESSAWARE linker flag for 32-bit targets to allow tests
to handle addresses larger than 2 gigabytes.

https://learn.microsoft.com/en-us/cpp/build/reference/largeaddressaware-handle-large-addresses?view=msvc-170

Signed-off-by: Guo, Yilong <yilong.guo@intel.com>

* Apply suggestion

Co-authored-by: Ben Ashbaugh <ben.ashbaugh@intel.com>

---------

Signed-off-by: Guo, Yilong <yilong.guo@intel.com>
Co-authored-by: Ben Ashbaugh <ben.ashbaugh@intel.com>

* fix return code when readwrite image is not supported (#1873)

This function (do_test) starts by testing write and read individually.
Both of them can have errors.

When readwrite image is not supported, the function returns
TEST_SKIPPED_ITSELF potentially masking errors leading to the test
returning EXIT_SUCCESS even with errors along the way.

* fix macos builds by avoiding double compilation of function_list.cpp for test_spir (#1866)

* modernize CMakeLists for test_spir

* add the operating system release to the sccache key

* include the math brute force function list vs. building it twice

* fix the license header on the spirv-new tests (#1865)

The source files for the spirv-new tests were using the older Khronos
license instead of the proper Apache license.  Fixed the license in
all source files.

* compiler: fix grammar in error message (#1877)

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>

* Updated semaphore tests to use clSemaphoreReImportSyncFdKHR. (#1854)

* Updated semaphore tests to use clSemaphoreReImportSyncFdKHR.

Additionally updated common semaphore code to handle spec updates
that restrict simultaneous importing/exporting of handles.

* Fix build issues on CI

* gcc build issues

* Make clReImportSemaphoreSyncFdKHR a required API
call if cl_khr_external_semaphore_sync_fd is present.

* Implement signal and wait for all semaphore types.

* subgroups: fix for testing too large WG sizes (#1620)

It seemed to be a typo; the comment says that it
tries to fetch the local size for a subgroup count
above the max WG size, but it just used the previous
subgroup count.

The test on purpose sets a SG count to be a larger
number than the max work-items in the work group.
Given the minimum SG size is 1 WI, the number of SGs can
be at most the maximum work-group size (with each SG being
1 WI in size). Thus, if we request a number of SGs that
exceeds the local size, the query should fail as expected.

* add SPIR-V version testing (#1861)

* basic SPIR-V 1.3 testing support

* updated script to compile for more SPIR-V versions

* switch to general SPIR-V versions test

* update copyright text and fix license

* improve output while test is running

* check for higher SPIR-V versions first

* fix formatting

* fix the reported platform information for math brute force (#1884)

When the math brute force test printed the platform version it always
printed information for the first platform in the system, which could
be different than the platform for the passed-in device.  Fixed by
querying the platform from the passed-in device instead.

* api tests fix: Use MTdataHolder in test_get_image_info (#1871)

* Minor fixes in mutable dispatch tests. (#1829)

* Minor fixes in mutable dispatch tests.

* Fix size of newWrapper in MutableDispatchSVMArguments.
* Fix erroneous clCommandNDRangeKernelKHR call.

Signed-off-by: John Kesapides <john.kesapides@arm.com>

* * Set the row_pitch for imageInfo in MutableDispatchImage1DArguments
and MutableDispatchImage2DArguments. The row_pitch is
used by get_image_size() to calculate the size of
the host pointers by generate_random_image_data.

Signed-off-by: John Kesapides <john.kesapides@arm.com>

---------

Signed-off-by: John Kesapides <john.kesapides@arm.com>

* add test for cl_khr_spirv_linkonce_odr (#1226)

* initial version of the test with placeholders for linkonce_odr linkage

* add OpExtension SPV_KHR_linkonce_odr extension

* add check for extension

* switch to actual LinkOnceODR linkage

* fix formatting

* add a test case to ensure a function with linkonce_odr is exported

* add back the extension check

* fix formatting

* undo compiler optimization and actually add the call to function a

* [NFC] subgroups: remove unnecessary extern keywords (#1892)

In C and C++ all functions have external linkage by default.

Also remove the unused `gMTdata` and `test_pipe_functions`
declarations.

Fixes https://github.com/KhronosGroup/OpenCL-CTS/issues/1137

Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>

* Added cl_khr_fp16 extension support for test_decorate from spirv_new (#1770)

* Added cl_khr_fp16 extension support for test_decorate from spirv_new, work in progress

* Complemented test_decorate saturation test to support cl_khr_fp16 extension (issue #142)

* Fixed clang format

* scope of modifications:

-changed naming convention of saturation .spvasm files related to
test_decorate of spirv_new
-restored float to char/uchar saturation tests
-few minor corrections

* fix ranges for half testing

* fix formatting

* one more formatting fix

* remove unused function

* use isnan instead of std::isnan

isnan is currently implemented as a macro, not as a function, so
we can't use std::isnan.

* fix Clang warning about inexact conversion

---------

Co-authored-by: Ben Ashbaugh <ben.ashbaugh@intel.com>

* add support for custom devices (#1891)

enable the CTS to run on custom devices

---------

Signed-off-by: Ahmed Hesham <ahmed.hesham@arm.com>
Signed-off-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
Signed-off-by: Guo, Yilong <yilong.guo@intel.com>
Signed-off-by: John Kesapides <john.kesapides@arm.com>
Co-authored-by: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com>
Co-authored-by: Haonan Yang <haonan.yang@intel.com>
Co-authored-by: Ahmed Hesham <117350656+ahesham-arm@users.noreply.github.com>
Co-authored-by: Sven van Haastregt <sven.vanhaastregt@arm.com>
Co-authored-by: niranjanjoshi121 <43807392+niranjanjoshi121@users.noreply.github.com>
Co-authored-by: Grzegorz Wawiorko <grzegorz.wawiorko@intel.com>
Co-authored-by: Wenwan Xing <wenwan.xing@intel.com>
Co-authored-by: Yilong Guo <yilong.guo@intel.com>
Co-authored-by: Romaric Jodin <89833130+rjodinchr@users.noreply.github.com>
Co-authored-by: joshqti <127994991+joshqti@users.noreply.github.com>
Co-authored-by: Pekka Jääskeläinen <pekka.jaaskelainen@tuni.fi>
Co-authored-by: imilenkovic00 <155085410+imilenkovic00@users.noreply.github.com>
Co-authored-by: John Kesapides <46718829+JohnKesapidesARM@users.noreply.github.com>
Co-authored-by: Marcin Hajder <marcin.hajder@gmail.com>
Co-authored-by: Aharon Abramson <aharon.abramson@mobileye.com>
---
 CMakeLists.txt                                |    5 +
 test_common/harness/crc32.h                   |   31 +-
 test_common/harness/imageHelpers.cpp          |   36 +-
 test_common/harness/imageHelpers.h            |   42 -
 test_common/harness/testHarness.cpp           |   35 +-
 test_conformance/SVM/test_migrate.cpp         |   26 +-
 .../allocations/allocation_execute.cpp        |   13 +-
 test_conformance/allocations/main.cpp         |    5 +
 test_conformance/api/test_mem_object_info.cpp |    5 +-
 .../api/test_sub_group_dispatch.cpp           |    5 +-
 .../basic/test_work_item_functions.cpp        |   13 +-
 test_conformance/c11_atomics/test_atomics.cpp |   12 +-
 .../vulkan_wrapper/opencl_vulkan_wrapper.cpp  |  178 ++-
 .../vulkan_wrapper/opencl_vulkan_wrapper.hpp  |   51 +-
 .../compiler/test_feature_macro.cpp           |    4 +-
 test_conformance/conversions/CMakeLists.txt   |    2 +-
 .../conversions/conversions_data_info.h       |    6 +-
 test_conformance/conversions/fplib.cpp        |    3 -
 .../mutable_command_arguments.cpp             |    4 +-
 .../mutable_command_image_arguments.cpp       |    2 +
 .../mutable_command_info.cpp                  |    3 +-
 .../test_external_semaphore.cpp               |   88 +-
 test_conformance/geometrics/CMakeLists.txt    |    2 -
 .../geometrics/test_geometrics_double.cpp     |   17 +-
 test_conformance/images/common.cpp            |   20 +-
 .../images/kernel_read_write/main.cpp         |    2 +-
 test_conformance/integer_ops/test_abs.cpp     |   58 +-
 test_conformance/integer_ops/test_absdiff.cpp |   57 +-
 test_conformance/integer_ops/test_add_sat.cpp |   17 +-
 .../integer_ops/test_integers.cpp             |   48 +-
 .../integer_ops/test_intmad24.cpp             |   12 +-
 .../integer_ops/test_intmul24.cpp             |    6 +-
 .../integer_ops/test_popcount.cpp             |   45 +-
 test_conformance/integer_ops/test_sub_sat.cpp |   17 +-
 .../integer_ops/test_unary_ops.cpp            |   11 +-
 .../integer_ops/test_upsample.cpp             |    2 +-
 .../verification_and_generation_functions.cpp |  290 ++++-
 test_conformance/math_brute_force/main.cpp    |    5 +-
 .../math_brute_force/unary_float.cpp          |    9 +-
 test_conformance/spir/CMakeLists.txt          |   16 +-
 test_conformance/spir/main.cpp                |    5 +
 test_conformance/spirv_new/CMakeLists.txt     |   13 -
 test_conformance/spirv_new/assemble_spirv.py  |   58 +-
 test_conformance/spirv_new/main.cpp           |   42 +-
 test_conformance/spirv_new/procs.h            |   29 +-
 .../spirv_new/spirv_asm/basic.spvasm32        |   33 +
 .../spirv_new/spirv_asm/basic.spvasm64        |   38 +
 .../decorate_rounding_rte_half_short.spvasm32 |   42 +
 .../decorate_rounding_rte_half_short.spvasm64 |   46 +
 .../decorate_rounding_rtn_half_short.spvasm32 |   42 +
 .../decorate_rounding_rtn_half_short.spvasm64 |   46 +
 .../decorate_rounding_rtp_half_short.spvasm32 |   42 +
 .../decorate_rounding_rtp_half_short.spvasm64 |   46 +
 .../decorate_rounding_rtz_half_short.spvasm32 |   42 +
 .../decorate_rounding_rtz_half_short.spvasm64 |   46 +
 ...turated_conversion_double_to_int.spvasm32} |    2 +-
 ...turated_conversion_double_to_int.spvasm64} |    2 +-
 ...urated_conversion_double_to_uint.spvasm32} |    2 +-
 ...urated_conversion_double_to_uint.spvasm64} |    2 +-
 ...turated_conversion_float_to_char.spvasm32} |    2 +-
 ...turated_conversion_float_to_char.spvasm64} |    2 +-
 ...urated_conversion_float_to_short.spvasm32} |    2 +-
 ...urated_conversion_float_to_short.spvasm64} |    2 +-
 ...urated_conversion_float_to_uchar.spvasm32} |    2 +-
 ...urated_conversion_float_to_uchar.spvasm64} |    2 +-
 ...rated_conversion_float_to_ushort.spvasm32} |    2 +-
 ...rated_conversion_float_to_ushort.spvasm64} |    2 +-
 ...saturated_conversion_half_to_char.spvasm32 |   47 +
 ...saturated_conversion_half_to_char.spvasm64 |   51 +
 ...aturated_conversion_half_to_uchar.spvasm32 |   47 +
 ...aturated_conversion_half_to_uchar.spvasm64 |   51 +
 .../linkage_linkonce_odr_main.spvasm32        |   47 +
 .../linkage_linkonce_odr_main.spvasm64        |   51 +
 .../linkage_linkonce_odr_noa_main.spvasm32    |   44 +
 .../linkage_linkonce_odr_noa_main.spvasm64    |   48 +
 .../linkage_linkonce_odr_obj.spvasm32         |   28 +
 .../linkage_linkonce_odr_obj.spvasm64         |   28 +
 .../spirv_new/spirv_asm/spv1.1/basic.spvasm32 |   33 +
 .../spirv_new/spirv_asm/spv1.1/basic.spvasm64 |   38 +
 .../spirv_new/spirv_asm/spv1.2/basic.spvasm32 |   33 +
 .../spirv_new/spirv_asm/spv1.2/basic.spvasm64 |   38 +
 .../spirv_new/spirv_asm/spv1.3/basic.spvasm32 |   33 +
 .../spirv_new/spirv_asm/spv1.3/basic.spvasm64 |   38 +
 .../spirv_new/spirv_asm/spv1.4/basic.spvasm32 |   33 +
 .../spirv_new/spirv_asm/spv1.4/basic.spvasm64 |   38 +
 .../spirv_new/spirv_asm/spv1.5/basic.spvasm32 |   33 +
 .../spirv_new/spirv_asm/spv1.5/basic.spvasm64 |   38 +
 .../spirv_new/spirv_asm/spv1.6/basic.spvasm32 |   33 +
 .../spirv_new/spirv_asm/spv1.6/basic.spvasm64 |   38 +
 test_conformance/spirv_new/testBase.h         |   28 +-
 .../spirv_new/test_basic_versions.cpp         |  123 ++
 ...l_khr_spirv_no_integer_wrap_decoration.cpp |   27 +-
 test_conformance/spirv_new/test_decorate.cpp  |  339 +++--
 .../spirv_new/test_get_program_il.cpp         |   29 +-
 test_conformance/spirv_new/test_linkage.cpp   |  120 +-
 test_conformance/spirv_new/test_op_atomic.cpp |   27 +-
 test_conformance/spirv_new/test_op_branch.cpp |   27 +-
 .../spirv_new/test_op_branch_conditional.cpp  |   27 +-
 .../spirv_new/test_op_composite_construct.cpp |   27 +-
 .../spirv_new/test_op_constant.cpp            |   27 +-
 .../spirv_new/test_op_copy_object.cpp         |   27 +-
 test_conformance/spirv_new/test_op_fmath.cpp  |   27 +-
 .../spirv_new/test_op_function.cpp            |   27 +-
 .../spirv_new/test_op_lifetime.cpp            |   27 +-
 .../spirv_new/test_op_loop_merge.cpp          |   27 +-
 test_conformance/spirv_new/test_op_negate.cpp |   27 +-
 test_conformance/spirv_new/test_op_opaque.cpp |   27 +-
 test_conformance/spirv_new/test_op_phi.cpp    |   27 +-
 .../spirv_new/test_op_selection_merge.cpp     |   27 +-
 .../spirv_new/test_op_spec_constant.cpp       |   29 +-
 test_conformance/spirv_new/test_op_undef.cpp  |   27 +-
 .../spirv_new/test_op_vector_extract.cpp      |   27 +-
 .../spirv_new/test_op_vector_insert.cpp       |   27 +-
 .../spirv_new/test_op_vector_times_scalar.cpp |   27 +-
 test_conformance/spirv_new/types.hpp          |   34 +-
 test_conformance/subgroups/procs.h            |  100 +-
 test_conformance/thread_dimensions/main.cpp   |   79 +-
 test_conformance/thread_dimensions/procs.h    |   63 +-
 .../test_thread_dimensions.cpp                | 1140 +++++++++++------
 test_conformance/vulkan/main.cpp              |    4 +-
 .../vulkan/test_vulkan_interop_buffer.cpp     |   50 +-
 .../vulkan/test_vulkan_interop_image.cpp      |    8 +-
 122 files changed, 3879 insertions(+), 1445 deletions(-)
 create mode 100644 test_conformance/spirv_new/spirv_asm/basic.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/basic.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_rounding_rte_half_short.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_rounding_rte_half_short.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_rounding_rtn_half_short.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_rounding_rtn_half_short.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_rounding_rtp_half_short.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_rounding_rtp_half_short.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_rounding_rtz_half_short.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_rounding_rtz_half_short.spvasm64
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_int.spvasm32 => decorate_saturated_conversion_double_to_int.spvasm32} (97%)
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_uint.spvasm64 => decorate_saturated_conversion_double_to_int.spvasm64} (98%)
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_uint.spvasm32 => decorate_saturated_conversion_double_to_uint.spvasm32} (97%)
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_int.spvasm64 => decorate_saturated_conversion_double_to_uint.spvasm64} (98%)
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_char.spvasm32 => decorate_saturated_conversion_float_to_char.spvasm32} (98%)
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_char.spvasm64 => decorate_saturated_conversion_float_to_char.spvasm64} (98%)
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_short.spvasm32 => decorate_saturated_conversion_float_to_short.spvasm32} (97%)
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_short.spvasm64 => decorate_saturated_conversion_float_to_short.spvasm64} (98%)
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_uchar.spvasm32 => decorate_saturated_conversion_float_to_uchar.spvasm32} (97%)
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_uchar.spvasm64 => decorate_saturated_conversion_float_to_uchar.spvasm64} (98%)
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_ushort.spvasm32 => decorate_saturated_conversion_float_to_ushort.spvasm32} (97%)
 rename test_conformance/spirv_new/spirv_asm/{decorate_saturated_conversion_ushort.spvasm64 => decorate_saturated_conversion_float_to_ushort.spvasm64} (98%)
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_char.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_char.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_uchar.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_uchar.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_main.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_main.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_noa_main.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_noa_main.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_obj.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_obj.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.1/basic.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.1/basic.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.2/basic.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.2/basic.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.3/basic.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.3/basic.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.4/basic.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.4/basic.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.5/basic.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.5/basic.spvasm64
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.6/basic.spvasm32
 create mode 100644 test_conformance/spirv_new/spirv_asm/spv1.6/basic.spvasm64
 create mode 100644 test_conformance/spirv_new/test_basic_versions.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c9bbf6f9b..5cfef6b332 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -157,6 +157,11 @@ if( WIN32 AND "${CMAKE_CXX_COMPILER_ID}" MATCHES "Intel" )
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Qlong-double -Qpc80 /DWIN32 /D_WINDOWS /W3 /GR /EHsc -nologo -Od -D_CRT_NONSTDC_NO_WARNINGS -EHsc -Wall -Qdiag-disable:68,111,177,186,161,869,1028,2259,2553,181,239,265,1188 -fp:strict -fp:source")
 endif()
 
+# To handle addresses larger than 2 gigabytes for 32bit targets
+if(WIN32 AND ${CLConform_TARGET_ARCH} STREQUAL "x86")
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /LARGEADDRESSAWARE")
+endif()
+
 list(APPEND CLConform_LIBRARIES ${OPENCL_LIBRARIES})
 if(ANDROID)
     list(APPEND CLConform_LIBRARIES m)
diff --git a/test_common/harness/crc32.h b/test_common/harness/crc32.h
index 6958701108..aa45016385 100644
--- a/test_common/harness/crc32.h
+++ b/test_common/harness/crc32.h
@@ -1,19 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc.
-All Rights Reserved.  This code is protected by copyright laws and
-contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed
-in whole or in part to third parties, and may not be reproduced, republished,
-distributed, transmitted, displayed, broadcast or otherwise exploited in any
-manner without the express prior written permission of Khronos Group.
-
-The receipt or possession of this code does not convey any rights to
-reproduce, disclose, or distribute its contents, or to
-manufacture, use, or sell anything that it may describe, in whole
-or in part other than under the terms of the Khronos Adopters
-Agreement or Khronos Conformance Test Source License Agreement as
-executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #ifndef CRC32_H_
 #define CRC32_H_
diff --git a/test_common/harness/imageHelpers.cpp b/test_common/harness/imageHelpers.cpp
index 3e1a34422b..49d5402a87 100644
--- a/test_common/harness/imageHelpers.cpp
+++ b/test_common/harness/imageHelpers.cpp
@@ -97,23 +97,9 @@ uint32_t get_channel_data_type_size(cl_channel_type channelType)
         case CL_UNSIGNED_INT32: return sizeof(cl_int);
 
         case CL_UNORM_SHORT_565:
-        case CL_UNORM_SHORT_555:
-#ifdef OBSOLETE_FORAMT
-        case CL_UNORM_SHORT_565_REV:
-        case CL_UNORM_SHORT_555_REV:
-#endif
-            return 2;
-
-#ifdef OBSOLETE_FORAMT
-        case CL_UNORM_INT_8888:
-        case CL_UNORM_INT_8888_REV: return 4;
-#endif
+        case CL_UNORM_SHORT_555: return 2;
 
-        case CL_UNORM_INT_101010:
-#ifdef OBSOLETE_FORAMT
-        case CL_UNORM_INT_101010_REV:
-#endif
-            return 4;
+        case CL_UNORM_INT_101010: return 4;
 
         case CL_FLOAT: return sizeof(cl_float);
 
@@ -294,23 +280,9 @@ uint32_t get_pixel_size(const cl_image_format *format)
             return get_format_channel_count(format) * sizeof(cl_int);
 
         case CL_UNORM_SHORT_565:
-        case CL_UNORM_SHORT_555:
-#ifdef OBSOLETE_FORAMT
-        case CL_UNORM_SHORT_565_REV:
-        case CL_UNORM_SHORT_555_REV:
-#endif
-            return 2;
-
-#ifdef OBSOLETE_FORAMT
-        case CL_UNORM_INT_8888:
-        case CL_UNORM_INT_8888_REV: return 4;
-#endif
+        case CL_UNORM_SHORT_555: return 2;
 
-        case CL_UNORM_INT_101010:
-#ifdef OBSOLETE_FORAMT
-        case CL_UNORM_INT_101010_REV:
-#endif
-            return 4;
+        case CL_UNORM_INT_101010: return 4;
 
         case CL_FLOAT:
             return get_format_channel_count(format) * sizeof(cl_float);
diff --git a/test_common/harness/imageHelpers.h b/test_common/harness/imageHelpers.h
index f8ae4fb960..455f0edb4b 100644
--- a/test_common/harness/imageHelpers.h
+++ b/test_common/harness/imageHelpers.h
@@ -347,48 +347,6 @@ void read_image_pixel(void *imageData, image_descriptor *imageInfo, int x,
             break;
         }
 
-#ifdef OBSOLETE_FORMAT
-        case CL_UNORM_SHORT_565_REV: {
-            unsigned short *dPtr = (unsigned short *)ptr;
-            tempData[2] = (T)(dPtr[0] >> 11);
-            tempData[1] = (T)((dPtr[0] >> 5) & 63);
-            tempData[0] = (T)(dPtr[0] & 31);
-            break;
-        }
-
-        case CL_UNORM_SHORT_555_REV: {
-            unsigned short *dPtr = (unsigned short *)ptr;
-            tempData[2] = (T)((dPtr[0] >> 10) & 31);
-            tempData[1] = (T)((dPtr[0] >> 5) & 31);
-            tempData[0] = (T)(dPtr[0] & 31);
-            break;
-        }
-
-        case CL_UNORM_INT_8888: {
-            unsigned int *dPtr = (unsigned int *)ptr;
-            tempData[3] = (T)(dPtr[0] >> 24);
-            tempData[2] = (T)((dPtr[0] >> 16) & 0xff);
-            tempData[1] = (T)((dPtr[0] >> 8) & 0xff);
-            tempData[0] = (T)(dPtr[0] & 0xff);
-            break;
-        }
-        case CL_UNORM_INT_8888_REV: {
-            unsigned int *dPtr = (unsigned int *)ptr;
-            tempData[0] = (T)(dPtr[0] >> 24);
-            tempData[1] = (T)((dPtr[0] >> 16) & 0xff);
-            tempData[2] = (T)((dPtr[0] >> 8) & 0xff);
-            tempData[3] = (T)(dPtr[0] & 0xff);
-            break;
-        }
-
-        case CL_UNORM_INT_101010_REV: {
-            unsigned int *dPtr = (unsigned int *)ptr;
-            tempData[2] = (T)((dPtr[0] >> 20) & 0x3ff);
-            tempData[1] = (T)((dPtr[0] >> 10) & 0x3ff);
-            tempData[0] = (T)(dPtr[0] & 0x3ff);
-            break;
-        }
-#endif
         case CL_UNORM_SHORT_555: {
             cl_ushort *dPtr = (cl_ushort *)ptr;
             tempData[0] = (T)((dPtr[0] >> 10) & 31);
diff --git a/test_common/harness/testHarness.cpp b/test_common/harness/testHarness.cpp
index 3e5d7c9501..0e3c49e9a3 100644
--- a/test_common/harness/testHarness.cpp
+++ b/test_common/harness/testHarness.cpp
@@ -185,6 +185,9 @@ int runTestHarnessWithCheck(int argc, const char *argv[], int testNum,
         else if (strcmp(env_mode, "accelerator") == 0
                  || strcmp(env_mode, "CL_DEVICE_TYPE_ACCELERATOR") == 0)
             device_type = CL_DEVICE_TYPE_ACCELERATOR;
+        else if (strcmp(env_mode, "custom") == 0
+                 || strcmp(env_mode, "CL_DEVICE_TYPE_CUSTOM") == 0)
+            device_type = CL_DEVICE_TYPE_CUSTOM;
         else if (strcmp(env_mode, "default") == 0
                  || strcmp(env_mode, "CL_DEVICE_TYPE_DEFAULT") == 0)
             device_type = CL_DEVICE_TYPE_DEFAULT;
@@ -314,6 +317,12 @@ int runTestHarnessWithCheck(int argc, const char *argv[], int testNum,
             device_type = CL_DEVICE_TYPE_ACCELERATOR;
             argc--;
         }
+        else if (strcmp(argv[argc - 1], "custom") == 0
+                 || strcmp(argv[argc - 1], "CL_DEVICE_TYPE_CUSTOM") == 0)
+        {
+            device_type = CL_DEVICE_TYPE_CUSTOM;
+            argc--;
+        }
         else if (strcmp(argv[argc - 1], "CL_DEVICE_TYPE_DEFAULT") == 0)
         {
             device_type = CL_DEVICE_TYPE_DEFAULT;
@@ -351,6 +360,9 @@ int runTestHarnessWithCheck(int argc, const char *argv[], int testNum,
         case CL_DEVICE_TYPE_ACCELERATOR:
             log_info("Requesting Accelerator device ");
             break;
+        case CL_DEVICE_TYPE_CUSTOM:
+            log_info("Requesting Custom device ");
+            break;
         case CL_DEVICE_TYPE_DEFAULT:
             log_info("Requesting Default device ");
             break;
@@ -1196,18 +1208,21 @@ Version get_device_spirv_il_version(cl_device_id device)
         ASSERT_SUCCESS(err, "clGetDeviceInfo");
     }
 
-    if (strstr(str.data(), "SPIR-V_1.0") != NULL)
-        return Version(1, 0);
-    else if (strstr(str.data(), "SPIR-V_1.1") != NULL)
-        return Version(1, 1);
-    else if (strstr(str.data(), "SPIR-V_1.2") != NULL)
-        return Version(1, 2);
-    else if (strstr(str.data(), "SPIR-V_1.3") != NULL)
-        return Version(1, 3);
+    // Because this query returns a space-separated list of IL version strings
+    // we should check for SPIR-V versions in reverse order, to return the
+    // highest version supported.
+    if (strstr(str.data(), "SPIR-V_1.5") != NULL)
+        return Version(1, 5);
     else if (strstr(str.data(), "SPIR-V_1.4") != NULL)
         return Version(1, 4);
-    else if (strstr(str.data(), "SPIR-V_1.5") != NULL)
-        return Version(1, 5);
+    else if (strstr(str.data(), "SPIR-V_1.3") != NULL)
+        return Version(1, 3);
+    else if (strstr(str.data(), "SPIR-V_1.2") != NULL)
+        return Version(1, 2);
+    else if (strstr(str.data(), "SPIR-V_1.1") != NULL)
+        return Version(1, 1);
+    else if (strstr(str.data(), "SPIR-V_1.0") != NULL)
+        return Version(1, 0);
 
     throw std::runtime_error(std::string("Unknown SPIR-V version: ")
                              + str.data());
diff --git a/test_conformance/SVM/test_migrate.cpp b/test_conformance/SVM/test_migrate.cpp
index f624bcd933..b767a70a29 100644
--- a/test_conformance/SVM/test_migrate.cpp
+++ b/test_conformance/SVM/test_migrate.cpp
@@ -16,6 +16,8 @@
 #include "common.h"
 #include "harness/mt19937.h"
 
+#include <vector>
+
 #define GLOBAL_SIZE 65536
 
 static const char *sources[] = {
@@ -75,9 +77,9 @@ wait_and_release(const char* s, cl_event* evs, int n)
 
 int test_svm_migrate(cl_device_id deviceID, cl_context c, cl_command_queue queue, int num_elements)
 {
-    cl_uint amem[GLOBAL_SIZE];
-    cl_uint bmem[GLOBAL_SIZE];
-    cl_uint cmem[GLOBAL_SIZE];
+    std::vector<cl_uint> amem(GLOBAL_SIZE);
+    std::vector<cl_uint> bmem(GLOBAL_SIZE);
+    std::vector<cl_uint> cmem(GLOBAL_SIZE);
     cl_event evs[20];
 
     const size_t global_size = GLOBAL_SIZE;
@@ -145,9 +147,9 @@ int test_svm_migrate(cl_device_id deviceID, cl_context c, cl_command_queue queue
     test_error(error, "clSetKernelArgSVMPointer failed");
 
     // Initialize host copy of data (and result)
-    fill_buffer(amem, global_size, seed);
-    fill_buffer(bmem, global_size, seed);
-    fill_buffer(cmem, global_size, seed);
+    fill_buffer(amem.data(), global_size, seed);
+    fill_buffer(bmem.data(), global_size, seed);
+    fill_buffer(cmem.data(), global_size, seed);
 
     // Now we're ready to start
     {
@@ -218,9 +220,9 @@ int test_svm_migrate(cl_device_id deviceID, cl_context c, cl_command_queue queue
     if (error)
         return -1;
 
-    memcpy((void *)asvm, (void *)amem, global_size*sizeof(cl_uint));
-    memcpy((void *)bsvm, (void *)bmem, global_size*sizeof(cl_uint));
-    memcpy((void *)csvm, (void *)cmem, global_size*sizeof(cl_uint));
+    memcpy((void *)asvm, (void *)amem.data(), global_size * sizeof(cl_uint));
+    memcpy((void *)bsvm, (void *)bmem.data(), global_size * sizeof(cl_uint));
+    memcpy((void *)csvm, (void *)cmem.data(), global_size * sizeof(cl_uint));
 
     {
         error = clEnqueueSVMUnmap(queues[1], (void *)asvm, 0, NULL, &evs[0]);
@@ -304,9 +306,9 @@ int test_svm_migrate(cl_device_id deviceID, cl_context c, cl_command_queue queue
         return -1;
 
     // Check kernel results
-    bool ok = check("memory a", (cl_uint *)asvm, amem, global_size);
-    ok &= check("memory b", (cl_uint *)bsvm, bmem, global_size);
-    ok &= check("memory c", (cl_uint *)csvm, cmem, global_size);
+    bool ok = check("memory a", (cl_uint *)asvm, amem.data(), global_size);
+    ok &= check("memory b", (cl_uint *)bsvm, bmem.data(), global_size);
+    ok &= check("memory c", (cl_uint *)csvm, cmem.data(), global_size);
 
     {
         void *ptrs[] = { asvm, bsvm, csvm };
diff --git a/test_conformance/allocations/allocation_execute.cpp b/test_conformance/allocations/allocation_execute.cpp
index fb19cccc73..1762711067 100644
--- a/test_conformance/allocations/allocation_execute.cpp
+++ b/test_conformance/allocations/allocation_execute.cpp
@@ -16,6 +16,8 @@
 #include "allocation_execute.h"
 #include "allocation_functions.h"
 
+#include <vector>
+
 
 const char *buffer_kernel_pattern = {
     "__kernel void sample_test(%s __global uint *result, __global %s *array_sizes, uint per_item)\n"
@@ -155,7 +157,8 @@ int execute_kernel(cl_context context, cl_command_queue *queue, cl_device_id dev
     size_t global_dims[3];
     cl_uint per_item;
     cl_uint per_item_uint;
-    cl_uint returned_results[NUM_OF_WORK_ITEMS], final_result;
+    cl_uint final_result;
+    std::vector<cl_uint> returned_results(NUM_OF_WORK_ITEMS);
     clEventWrapper event;
     cl_int event_status;
 
@@ -236,7 +239,9 @@ int execute_kernel(cl_context context, cl_command_queue *queue, cl_device_id dev
     }
 
     // Set the result
-    result_mem = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint)*NUM_OF_WORK_ITEMS, &returned_results, &error);
+    result_mem = clCreateBuffer(
+        context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+        sizeof(cl_uint) * NUM_OF_WORK_ITEMS, returned_results.data(), &error);
     test_error(error, "clCreateBuffer failed");
     error = clSetKernelArg(kernel, i, sizeof(result_mem), &result_mem);
     test_error(error, "clSetKernelArg failed");
@@ -342,7 +347,9 @@ int execute_kernel(cl_context context, cl_command_queue *queue, cl_device_id dev
 
     // Verify the checksum.
     // Read back the result
-    error = clEnqueueReadBuffer(*queue, result_mem, CL_TRUE, 0, sizeof(cl_uint)*NUM_OF_WORK_ITEMS, &returned_results, 0, NULL, NULL);
+    error = clEnqueueReadBuffer(*queue, result_mem, CL_TRUE, 0,
+                                sizeof(cl_uint) * NUM_OF_WORK_ITEMS,
+                                returned_results.data(), 0, NULL, NULL);
     test_error_abort(error, "clEnqueueReadBuffer failed");
     final_result = 0;
     if (test == BUFFER || test == IMAGE_READ || test == BUFFER_NON_BLOCKING || test == IMAGE_READ_NON_BLOCKING) {
diff --git a/test_conformance/allocations/main.cpp b/test_conformance/allocations/main.cpp
index 827072fc7f..6ef83c680d 100644
--- a/test_conformance/allocations/main.cpp
+++ b/test_conformance/allocations/main.cpp
@@ -93,6 +93,11 @@ test_status init_cl( cl_device_id device ) {
         // queue, kernel code on GPU.
         g_global_mem_size *= 0.60;
     }
+    /* Cap the allocation size: the global size may have been reduced above */
+    if (g_max_individual_allocation_size > g_global_mem_size)
+    {
+        g_max_individual_allocation_size = g_global_mem_size;
+    }
 
     if( gReSeed )
     {
diff --git a/test_conformance/api/test_mem_object_info.cpp b/test_conformance/api/test_mem_object_info.cpp
index 7eedec856f..6228783a81 100644
--- a/test_conformance/api/test_mem_object_info.cpp
+++ b/test_conformance/api/test_mem_object_info.cpp
@@ -476,7 +476,8 @@ int test_get_image_info( cl_device_id deviceID, cl_context context, cl_mem_objec
         CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR,
         CL_MEM_HOST_NO_ACCESS | CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
     };
-    MTdata d;
+    MTdataHolder d_holder(gRandomSeed);
+    MTdata d = static_cast<MTdata>(d_holder);
 
     PASSIVE_REQUIRE_IMAGE_SUPPORT( deviceID )
 
@@ -495,8 +496,6 @@ int test_get_image_info( cl_device_id deviceID, cl_context context, cl_mem_objec
     imageInfo.buffer = NULL;
 #endif
 
-    d = init_genrand( gRandomSeed );
-
     for ( unsigned int i = 0; i < sizeof(imageFlags) / sizeof(cl_mem_flags); ++i )
     {
         imageInfo.image_row_pitch = 0;
diff --git a/test_conformance/api/test_sub_group_dispatch.cpp b/test_conformance/api/test_sub_group_dispatch.cpp
index 9a3bf95979..3375990b25 100644
--- a/test_conformance/api/test_sub_group_dispatch.cpp
+++ b/test_conformance/api/test_sub_group_dispatch.cpp
@@ -188,8 +188,9 @@ int test_sub_group_dispatch(cl_device_id deviceID, cl_context context, cl_comman
         }
     }
 
-    // test when input subgroup count exceeds max wg size
-    size_t large_sg_size = kernel_subgroup_count + 1;
+    // test when input subgroup count exceeds max wg size: with one work-item
+    // per subgroup there can be at most max_local subgroups in a work-group
+    size_t large_sg_size = max_local + 1;
     error = clGetKernelSubGroupInfo(kernel, deviceID, CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, sizeof(size_t), &large_sg_size, sizeof(ret_ndrange1d), &ret_ndrange1d, &realSize);
         test_error(error, "clGetKernelSubGroupInfo failed for CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT");
     if (ret_ndrange1d != 0)
diff --git a/test_conformance/basic/test_work_item_functions.cpp b/test_conformance/basic/test_work_item_functions.cpp
index 9683a8342f..d326bb8bbc 100644
--- a/test_conformance/basic/test_work_item_functions.cpp
+++ b/test_conformance/basic/test_work_item_functions.cpp
@@ -20,7 +20,7 @@
 #include <string.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-
+#include <vector>
 
 #include "procs.h"
 #include "harness/conversions.h"
@@ -72,7 +72,7 @@ int test_work_item_functions(cl_device_id deviceID, cl_context context, cl_comma
     clProgramWrapper program;
     clKernelWrapper kernel;
     clMemWrapper outData;
-    work_item_data    testData[ 10240 ];
+    std::vector<work_item_data> testData(10240);
     size_t threads[3], localThreads[3];
     MTdata d;
 
@@ -80,7 +80,9 @@ int test_work_item_functions(cl_device_id deviceID, cl_context context, cl_comma
     error = create_single_kernel_helper( context, &program, &kernel, 1, &workItemKernelCode, "sample_kernel" );
     test_error( error, "Unable to create testing kernel" );
 
-    outData = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof( testData ), NULL, &error );
+    outData =
+        clCreateBuffer(context, CL_MEM_READ_WRITE,
+                       sizeof(work_item_data) * testData.size(), NULL, &error);
     test_error( error, "Unable to create output buffer" );
 
     error = clSetKernelArg( kernel, 0, sizeof( outData ), &outData );
@@ -105,7 +107,10 @@ int test_work_item_functions(cl_device_id deviceID, cl_context context, cl_comma
             error = clEnqueueNDRangeKernel( queue, kernel, (cl_uint)dim, NULL, threads, localThreads, 0, NULL, NULL );
             test_error( error, "Unable to run kernel" );
 
-            error = clEnqueueReadBuffer( queue, outData, CL_TRUE, 0, sizeof( testData ), testData, 0, NULL, NULL );
+            error =
+                clEnqueueReadBuffer(queue, outData, CL_TRUE, 0,
+                                    sizeof(work_item_data) * testData.size(),
+                                    testData.data(), 0, NULL, NULL);
             test_error( error, "Unable to read results" );
 
             // Validate
diff --git a/test_conformance/c11_atomics/test_atomics.cpp b/test_conformance/c11_atomics/test_atomics.cpp
index ca2c224225..82438a743b 100644
--- a/test_conformance/c11_atomics/test_atomics.cpp
+++ b/test_conformance/c11_atomics/test_atomics.cpp
@@ -2827,7 +2827,7 @@ class CBasicTestFence
             // value from other thread
             // - reads value from other thread's variable
             // - repeats the above steps when both values are the same (and less
-            // than 1000000)
+            // than 500000)
             // - stores the last value read from other thread (in additional
             // variable) At the end of execution at least one thread should know
             // the last value from other thread
@@ -2846,7 +2846,7 @@ class CBasicTestFence
                   "memory_order_relaxed"
                 + MemoryScopeStr()
                 + ");\n"
-                  "  } while(myValue == hisValue && myValue < 1000000);\n"
+                  "  } while(myValue == hisValue && myValue < 500000);\n"
                   "  "
                 + nonAtomic + "[myId] = hisValue; \n";
         }
@@ -2972,7 +2972,7 @@ class CBasicTestFence
                 host_atomic_thread_fence(MemoryOrder());
                 hisValue = host_atomic_load<HostAtomicType, HostDataType>(
                     &destMemory[hisId], MEMORY_ORDER_RELAXED);
-            } while (myValue == hisValue && hisValue < 1000000);
+            } while (myValue == hisValue && hisValue < 500000);
             oldValues[tid] = hisValue;
         }
         else
@@ -3053,11 +3053,11 @@ class CBasicTestFence
                     if (myValue == hisValue)
                     {
                         // a draw - both threads should reach final value
-                        // 1000000
-                        if (myValue != 1000000)
+                        // 500000
+                        if (myValue != 500000)
                         {
                             log_error("ERROR: Invalid reference value #%u (%d "
-                                      "instead of 1000000)\n",
+                                      "instead of 500000)\n",
                                       workOffset + i, myValue);
                             correct = false;
                             return true;
diff --git a/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp b/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp
index b69be1197c..c137580227 100644
--- a/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp
+++ b/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.cpp
@@ -34,8 +34,10 @@ pfnclEnqueueReleaseExternalMemObjectsKHR
     clEnqueueReleaseExternalMemObjectsKHRptr;
 pfnclReleaseSemaphoreKHR clReleaseSemaphoreKHRptr;
 pfnclGetSemaphoreHandleForTypeKHR clGetSemaphoreHandleForTypeKHRptr;
+pfnclReImportSemaphoreSyncFdKHR clReImportSemaphoreSyncFdKHRptr;
 
-void init_cl_vk_ext(cl_platform_id opencl_platform)
+void init_cl_vk_ext(cl_platform_id opencl_platform, cl_uint num_devices,
+                    cl_device_id *deviceIds)
 {
     clEnqueueWaitSemaphoresKHRptr =
         (pfnclEnqueueWaitSemaphoresKHR)clGetExtensionFunctionAddressForPlatform(
@@ -79,6 +81,21 @@ void init_cl_vk_ext(cl_platform_id opencl_platform)
         throw std::runtime_error("Failed to get the function pointer of "
                                  "clGetSemaphoreHandleForTypeKHRptr!");
     }
+
+    // Required only if cl_khr_external_semaphore_sync_fd is reported
+    clReImportSemaphoreSyncFdKHRptr = (pfnclReImportSemaphoreSyncFdKHR)
+        clGetExtensionFunctionAddressForPlatform(
+            opencl_platform, "clReImportSemaphoreSyncFdKHR");
+    for (cl_uint i = 0; i < num_devices; i++)
+    {
+        if (is_extension_available(deviceIds[i],
+                                   "cl_khr_external_semaphore_sync_fd")
+            && (NULL == clReImportSemaphoreSyncFdKHRptr))
+        {
+            throw std::runtime_error("Failed to get the function pointer of "
+                                     "clReImportSemaphoreSyncFdKHR!");
+        }
+    }
 }
 
 cl_int setMaxImageDimensions(cl_device_id deviceID, size_t &max_width,
@@ -669,7 +686,6 @@ clExternalMemoryImage::clExternalMemoryImage(
             break;
 #elif !defined(__APPLE__)
         case VULKAN_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD:
-            log_info(" Opaque file descriptors are not supported on Windows\n");
             fd = (int)deviceMemory.getHandle(externalMemoryHandleType);
             errcode_ret = check_external_memory_handle_type(
                 devList[0], CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR);
@@ -738,7 +754,9 @@ clExternalMemoryImage::clExternalMemoryImage() {}
 // clExternalSemaphore implementation //
 //////////////////////////////////////////
 
-clExternalSemaphore::clExternalSemaphore(
+clExternalSemaphore::~clExternalSemaphore() = default;
+
+clExternalImportableSemaphore::clExternalImportableSemaphore(
     const VulkanSemaphore &semaphore, cl_context context,
     VulkanExternalSemaphoreHandleType externalSemaphoreHandleType,
     cl_device_id deviceId)
@@ -759,17 +777,12 @@ clExternalSemaphore::clExternalSemaphore(
     switch (externalSemaphoreHandleType)
     {
         case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD:
-#ifdef _WIN32
-            ASSERT(0);
-#else
-            log_info(" Opaque file descriptors are not supported on Windows\n");
             fd = (int)semaphore.getHandle(externalSemaphoreHandleType);
             err = check_external_semaphore_handle_type(
                 devList[0], CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR);
             sema_props.push_back(
                 (cl_semaphore_properties_khr)CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR);
             sema_props.push_back((cl_semaphore_properties_khr)fd);
-#endif
             break;
         case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_NT:
 #ifndef _WIN32
@@ -802,12 +815,10 @@ clExternalSemaphore::clExternalSemaphore(
         case VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD:
             err = check_external_semaphore_handle_type(
                 devList[0], CL_SEMAPHORE_HANDLE_SYNC_FD_KHR);
-            sema_props.push_back(static_cast<cl_semaphore_properties_khr>(
-                CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR));
+
             sema_props.push_back(static_cast<cl_semaphore_properties_khr>(
                 CL_SEMAPHORE_HANDLE_SYNC_FD_KHR));
-            sema_props.push_back(static_cast<cl_semaphore_properties_khr>(
-                CL_SEMAPHORE_EXPORT_HANDLE_TYPES_LIST_END_KHR));
+            sema_props.push_back(static_cast<cl_semaphore_properties_khr>(-1));
             break;
         default:
             ASSERT(0);
@@ -837,7 +848,7 @@ clExternalSemaphore::clExternalSemaphore(
     }
 }
 
-clExternalSemaphore::~clExternalSemaphore() noexcept(false)
+clExternalImportableSemaphore::~clExternalImportableSemaphore()
 {
     cl_int err = clReleaseSemaphoreKHRptr(m_externalSemaphore);
     if (err != CL_SUCCESS)
@@ -846,7 +857,89 @@ clExternalSemaphore::~clExternalSemaphore() noexcept(false)
     }
 }
 
-int clExternalSemaphore::signal(cl_command_queue cmd_queue)
+int clExternalImportableSemaphore::wait(cl_command_queue cmd_queue)
+{
+    // SYNC_FD handles: fetch the handle from the Vulkan semaphore and
+    // re-import it into the CL semaphore before enqueuing the wait.
+    if (m_externalHandleType == VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD)
+    {
+        fd = (int)m_deviceSemaphore.getHandle(m_externalHandleType);
+        const cl_int import_err =
+            clReImportSemaphoreSyncFdKHRptr(m_externalSemaphore, nullptr, fd);
+        if (import_err != CL_SUCCESS)
+        {
+            return import_err;
+        }
+    }
+
+    return clEnqueueWaitSemaphoresKHRptr(cmd_queue, 1, &m_externalSemaphore,
+                                         NULL, 0, NULL, NULL);
+}
+
+int clExternalImportableSemaphore::signal(cl_command_queue cmd_queue)
+{
+    return clEnqueueSignalSemaphoresKHRptr(cmd_queue, 1, &m_externalSemaphore,
+                                           NULL, 0, NULL, NULL);
+}
+
+cl_semaphore_khr &clExternalImportableSemaphore::getCLSemaphore()
+{
+    return m_externalSemaphore;
+}
+
+
+clExternalExportableSemaphore::clExternalExportableSemaphore(
+    const VulkanSemaphore &semaphore, cl_context context,
+    VulkanExternalSemaphoreHandleType externalSemaphoreHandleType,
+    cl_device_id deviceId)
+    : m_deviceSemaphore(semaphore)
+{
+
+    cl_int err = 0;
+    cl_device_id devList[] = { deviceId, NULL };
+    m_externalHandleType = externalSemaphoreHandleType;
+    m_externalSemaphore = nullptr;
+    m_device = deviceId;
+    m_context = context;
+
+    std::vector<cl_semaphore_properties_khr> sema_props{
+        (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_KHR,
+        (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_BINARY_KHR,
+    };
+    sema_props.push_back(
+        (cl_semaphore_properties_khr)CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR);
+    sema_props.push_back(
+        (cl_semaphore_properties_khr)getCLSemaphoreTypeFromVulkanType(
+            externalSemaphoreHandleType));
+    sema_props.push_back((cl_semaphore_properties_khr)
+                             CL_SEMAPHORE_EXPORT_HANDLE_TYPES_LIST_END_KHR);
+    sema_props.push_back(
+        (cl_semaphore_properties_khr)CL_SEMAPHORE_DEVICE_HANDLE_LIST_KHR);
+    sema_props.push_back((cl_semaphore_properties_khr)devList[0]);
+    sema_props.push_back(
+        (cl_semaphore_properties_khr)CL_SEMAPHORE_DEVICE_HANDLE_LIST_END_KHR);
+    sema_props.push_back(0);
+    m_externalSemaphore =
+        clCreateSemaphoreWithPropertiesKHRptr(context, sema_props.data(), &err);
+    if (CL_SUCCESS != err)
+    {
+        log_error("clCreateSemaphoreWithPropertiesKHRptr failed with %d\n",
+                  err);
+        throw std::runtime_error(
+            "clCreateSemaphoreWithPropertiesKHRptr failed! ");
+    }
+}
+
+clExternalExportableSemaphore::~clExternalExportableSemaphore()
+{
+    // Implicitly noexcept: throwing here would call std::terminate, so a
+    // failed release is only logged.
+    cl_int err = clReleaseSemaphoreKHRptr(m_externalSemaphore);
+    if (err != CL_SUCCESS)
+        log_error("clReleaseSemaphoreKHR failed with %d\n", err);
+}
+
+int clExternalExportableSemaphore::signal(cl_command_queue cmd_queue)
 {
     int err = clEnqueueSignalSemaphoresKHRptr(
         cmd_queue, 1, &m_externalSemaphore, NULL, 0, NULL, nullptr);
@@ -886,60 +979,13 @@ int clExternalSemaphore::signal(cl_command_queue cmd_queue)
     return err;
 }
 
-int clExternalSemaphore::wait(cl_command_queue cmd_queue)
+int clExternalExportableSemaphore::wait(cl_command_queue command_queue)
 {
-    int err = CL_SUCCESS;
-    if (m_externalHandleType == VULKAN_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD)
-    {
-        cl_int err = 0;
-        cl_device_id devList[] = { m_device, NULL };
-        std::vector<cl_semaphore_properties_khr> sema_props{
-            (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_KHR,
-            (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_BINARY_KHR,
-        };
-        fd = (int)m_deviceSemaphore.getHandle(m_externalHandleType);
-
-        err = check_external_semaphore_handle_type(
-            devList[0], CL_SEMAPHORE_HANDLE_SYNC_FD_KHR);
-        if (CL_SUCCESS != err)
-        {
-            log_error("CL_SEMAPHORE_HANDLE_SYNC_FD_KHR not supported\n");
-            return err;
-        }
-
-        sema_props.push_back(
-            (cl_semaphore_properties_khr)CL_SEMAPHORE_HANDLE_SYNC_FD_KHR);
-        sema_props.push_back((cl_semaphore_properties_khr)fd);
-
-        sema_props.push_back(0);
-
-        if (m_externalSemaphore)
-        {
-            err = clReleaseSemaphoreKHRptr(m_externalSemaphore);
-            if (err != CL_SUCCESS)
-            {
-                log_error("Failed to release CL external semaphore\n");
-                return err;
-            }
-            m_externalSemaphore = nullptr;
-        }
-
-        m_externalSemaphore = clCreateSemaphoreWithPropertiesKHRptr(
-            m_context, sema_props.data(), &err);
-        if (CL_SUCCESS != err)
-        {
-            log_error("clCreateSemaphoreWithPropertiesKHRptr failed with %d\n",
-                      err);
-            return err;
-        }
-    }
-
-    err = clEnqueueWaitSemaphoresKHRptr(cmd_queue, 1, &m_externalSemaphore,
-                                        NULL, 0, NULL, NULL);
-    return err;
+    return clEnqueueWaitSemaphoresKHRptr(command_queue, 1, &m_externalSemaphore,
+                                         NULL, 0, NULL, nullptr);
 }
 
-cl_semaphore_khr &clExternalSemaphore::getCLSemaphore()
+cl_semaphore_khr &clExternalExportableSemaphore::getCLSemaphore()
 {
     return m_externalSemaphore;
 }
@@ -1006,4 +1052,4 @@ VulkanImageTiling vkClExternalMemoryHandleTilingAssumption(
     }
 
     return mode;
-}
+}
\ No newline at end of file
diff --git a/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.hpp b/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.hpp
index 12d467d8be..16389c44c7 100644
--- a/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.hpp
+++ b/test_conformance/common/vulkan_wrapper/opencl_vulkan_wrapper.hpp
@@ -54,6 +54,9 @@ typedef cl_int (*pfnclGetSemaphoreHandleForTypeKHR)(
     cl_semaphore_khr sema_object, cl_device_id device,
     cl_external_semaphore_handle_type_khr handleType, size_t handle_size,
     void *handle, size_t *handleSize);
+typedef cl_int (*pfnclReImportSemaphoreSyncFdKHR)(
+    cl_semaphore_khr sema_object,
+    cl_semaphore_reimport_properties_khr *reimport_props, int fd);
 
 extern pfnclCreateSemaphoreWithPropertiesKHR
     clCreateSemaphoreWithPropertiesKHRptr;
@@ -64,6 +67,7 @@ extern pfnclEnqueueAcquireExternalMemObjectsKHR
 extern pfnclEnqueueReleaseExternalMemObjectsKHR
     clEnqueueReleaseExternalMemObjectsKHRptr;
 extern pfnclReleaseSemaphoreKHR clReleaseSemaphoreKHRptr;
+extern pfnclReImportSemaphoreSyncFdKHR clReImportSemaphoreSyncFdKHRptr;
 
 cl_int getCLImageInfoFromVkImageInfo(const VkImageCreateInfo *, size_t,
                                      cl_image_format *, cl_image_desc *);
@@ -97,7 +101,6 @@ class clExternalMemoryImage {
     cl_mem m_externalMemory;
     int fd;
     void *handle;
-    cl_command_queue cmd_queue;
     clExternalMemoryImage();
 
 public:
@@ -112,6 +115,36 @@ class clExternalMemoryImage {
 };
 
 class clExternalSemaphore {
+public:
+    virtual int signal(cl_command_queue command_queue) = 0;
+    virtual int wait(cl_command_queue command_queue) = 0;
+    virtual cl_semaphore_khr &getCLSemaphore() = 0;
+    virtual ~clExternalSemaphore() = 0;
+};
+
+
+class clExternalImportableSemaphore : public virtual clExternalSemaphore {
+protected:
+    cl_semaphore_khr m_externalSemaphore;
+    VulkanExternalSemaphoreHandleType m_externalHandleType;
+    cl_device_id m_device;
+    cl_context m_context;
+    const VulkanSemaphore &m_deviceSemaphore;
+    int fd;
+    void *handle;
+
+public:
+    clExternalImportableSemaphore(
+        const VulkanSemaphore &deviceSemaphore, cl_context context,
+        VulkanExternalSemaphoreHandleType externalSemaphoreHandleType,
+        cl_device_id deviceId);
+    ~clExternalImportableSemaphore() override;
+    int wait(cl_command_queue command_queue) override;
+    int signal(cl_command_queue command_queue) override;
+    cl_semaphore_khr &getCLSemaphore() override;
+};
+
+class clExternalExportableSemaphore : public virtual clExternalSemaphore {
 protected:
     cl_semaphore_khr m_externalSemaphore;
     VulkanExternalSemaphoreHandleType m_externalHandleType;
@@ -122,21 +155,21 @@ class clExternalSemaphore {
     void *handle;
 
 public:
-    clExternalSemaphore(
+    clExternalExportableSemaphore(
         const VulkanSemaphore &deviceSemaphore, cl_context context,
         VulkanExternalSemaphoreHandleType externalSemaphoreHandleType,
         cl_device_id deviceId);
-    virtual ~clExternalSemaphore() noexcept(false);
-    int signal(cl_command_queue command_queue);
-    int wait(cl_command_queue command_queue);
-    cl_semaphore_khr &getCLSemaphore();
-    // operator openclExternalSemaphore_t() const;
+    ~clExternalExportableSemaphore() override;
+    int signal(cl_command_queue command_queue) override;
+    int wait(cl_command_queue command_queue) override;
+    cl_semaphore_khr &getCLSemaphore() override;
 };
 
-extern void init_cl_vk_ext(cl_platform_id);
+extern void init_cl_vk_ext(cl_platform_id, cl_uint num_devices,
+                           cl_device_id *deviceIds);
 
 VulkanImageTiling vkClExternalMemoryHandleTilingAssumption(
     cl_device_id deviceId,
     VulkanExternalMemoryHandleType vkExternalMemoryHandleType, int *error_ret);
 
-#endif // _opencl_vulkan_wrapper_hpp_
+#endif // _opencl_vulkan_wrapper_hpp_
\ No newline at end of file
diff --git a/test_conformance/compiler/test_feature_macro.cpp b/test_conformance/compiler/test_feature_macro.cpp
index ef3c002849..7858c3c25f 100644
--- a/test_conformance/compiler/test_feature_macro.cpp
+++ b/test_conformance/compiler/test_feature_macro.cpp
@@ -171,8 +171,8 @@ cl_int check_compiler_feature_info(cl_device_id deviceID, cl_context context,
     }
     else
     {
-        log_error("Error: The macro feature is defined and undefined "
-                  "in the same time\n");
+        log_error("Error: The feature macro is defined and undefined "
+                  "at the same time\n");
         error = OutputBuildLogs(program_supported, 1, &deviceID);
         test_error(error, "OutputBuildLogs failed.\n");
         error = OutputBuildLogs(program_not_supported, 1, &deviceID);
diff --git a/test_conformance/conversions/CMakeLists.txt b/test_conformance/conversions/CMakeLists.txt
index 8ed3ba184c..11106439b1 100644
--- a/test_conformance/conversions/CMakeLists.txt
+++ b/test_conformance/conversions/CMakeLists.txt
@@ -16,6 +16,6 @@ set_source_files_properties(
         COMPILE_FLAGS -march=i686)
 endif(NOT CMAKE_CL_64 AND NOT MSVC AND NOT ANDROID)
 
-set_gnulike_module_compile_flags("-Wno-unused-but-set-variable -Wno-sign-compare")
+set_gnulike_module_compile_flags("-Wno-sign-compare")
 
 include(../CMakeCommon.txt)
diff --git a/test_conformance/conversions/conversions_data_info.h b/test_conformance/conversions/conversions_data_info.h
index 043c509d1f..bf887edecc 100644
--- a/test_conformance/conversions/conversions_data_info.h
+++ b/test_conformance/conversions/conversions_data_info.h
@@ -467,11 +467,11 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
         if (std::is_same<cl_double, OutType>::value)
         {
 #if defined(_MSC_VER)
-            cl_ulong l = ((cl_ulong *)in)[0];
             double result;
 
             if (std::is_same<cl_ulong, InType>::value)
             {
+                cl_ulong l = ((cl_ulong *)in)[0];
                 cl_long sl = ((cl_long)l < 0) ? (cl_long)((l >> 1) | (l & 1))
                                               : (cl_long)l;
 #if defined(_M_X64)
@@ -484,6 +484,7 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
             }
             else
             {
+                cl_long l = ((cl_long *)in)[0];
 #if defined(_M_X64)
                 _mm_store_sd(&result, _mm_cvtsi64_sd(_mm_setzero_pd(), l));
 #else
@@ -504,10 +505,10 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
             cl_float outVal = 0.f;
 
 #if defined(_MSC_VER) && defined(_M_X64)
-            cl_ulong l = ((cl_ulong *)in)[0];
             float result;
             if (std::is_same<cl_ulong, InType>::value)
             {
+                cl_ulong l = ((cl_ulong *)in)[0];
                 cl_long sl = ((cl_long)l < 0) ? (cl_long)((l >> 1) | (l & 1))
                                               : (cl_long)l;
                 _mm_store_ss(&result, _mm_cvtsi64_ss(_mm_setzero_ps(), sl));
@@ -516,6 +517,7 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
             }
             else
             {
+                cl_long l = ((cl_long *)in)[0];
                 _mm_store_ss(&result, _mm_cvtsi64_ss(_mm_setzero_ps(), l));
                 outVal = (l == 0 ? 0.0f : result); // Per IEEE-754-2008 5.4.1,
                                                    // 0's always convert to +0.0
diff --git a/test_conformance/conversions/fplib.cpp b/test_conformance/conversions/fplib.cpp
index 3b19b56df4..8e6caba7b0 100644
--- a/test_conformance/conversions/fplib.cpp
+++ b/test_conformance/conversions/fplib.cpp
@@ -198,7 +198,6 @@ float qcom_u64_2_f32(uint64_t data, bool sat, roundingMode rnd)
                 return as_float(result);
         }
         case qcomRTN: {
-            int inExact = 0;
             if (!data)
                 return 0.0f;
             uint32_t  exponent    = (127 + 64 - clz(data) - 1) << (FLT_MANT_DIG - 1); //add 1 for the implied 1.0 in normalized fp32 numbers
@@ -206,8 +205,6 @@ float qcom_u64_2_f32(uint64_t data, bool sat, roundingMode rnd)
             uint32_t  mantissa;
             if (mantShift >= 0){
                 uint64_t temp = (uint64_t)data >> mantShift;
-                if (temp << mantShift != data)
-                    inExact = 1;
                 mantissa = (uint32_t)temp;
             }
             else
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_arguments.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_arguments.cpp
index 5c8291f05d..55c27ccfe0 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_arguments.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_arguments.cpp
@@ -664,8 +664,8 @@ struct MutableDispatchSVMArguments : public BasicMutableCommandBufferTest
 
         // Allocate and initialize SVM for modified execution
 
-        cl_int *newWrapper =
-            (cl_int *)clSVMAlloc(context, CL_MEM_READ_WRITE, sizeof(cl_int), 0);
+        cl_int *newWrapper = (cl_int *)clSVMAlloc(context, CL_MEM_READ_WRITE,
+                                                  sizeof(cl_int *), 0);
         cl_int *newBuffer = (cl_int *)clSVMAlloc(
             context, CL_MEM_READ_WRITE, num_elements * sizeof(cl_int), 0);
         test_assert_error(newWrapper != nullptr && newBuffer != nullptr,
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_image_arguments.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_image_arguments.cpp
index b1ce25ec14..d8036e17dd 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_image_arguments.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_image_arguments.cpp
@@ -92,6 +92,7 @@ struct MutableDispatchImage1DArguments : public BasicMutableCommandBufferTest
         imageInfo.type = CL_MEM_OBJECT_IMAGE1D;
         imageInfo.format = &formats;
         imageInfo.width = 4;
+        imageInfo.rowPitch = imageInfo.width * get_pixel_size(imageInfo.format);
 
         BufferOwningPtr<char> imageValues_input, imageValues_output, outputData;
         MTdataHolder d(gRandomSeed);
@@ -285,6 +286,7 @@ struct MutableDispatchImage2DArguments : public BasicMutableCommandBufferTest
         imageInfo.width = 4;
         imageInfo.height = 4;
         imageInfo.format = &formats;
+        imageInfo.rowPitch = imageInfo.width * get_pixel_size(imageInfo.format);
 
         BufferOwningPtr<char> imageValues_input, imageValues_output;
 
diff --git a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
index 61600dc90f..71b9017ec3 100644
--- a/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
+++ b/test_conformance/extensions/cl_khr_command_buffer/cl_khr_command_buffer_mutable_dispatch/mutable_command_info.cpp
@@ -207,7 +207,7 @@ struct Dimensions : public InfoMutableCommandBufferTest
     {
         cl_int error = clCommandNDRangeKernelKHR(
             command_buffer, nullptr, nullptr, kernel, dimensions, nullptr,
-            &global_work_size, nullptr, 0, nullptr, nullptr, &command);
+            global_work_size_3d, nullptr, 0, nullptr, nullptr, &command);
         test_error(error, "clCommandNDRangeKernelKHR failed");
 
         cl_uint test_dimensions = 0;
@@ -231,6 +231,7 @@ struct Dimensions : public InfoMutableCommandBufferTest
 
     cl_mutable_command_khr command = nullptr;
     const size_t dimensions = 3;
+    const size_t global_work_size_3d[3] = { 64, 1, 1 };
 };
 
 struct InfoType : public InfoMutableCommandBufferTest
diff --git a/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp b/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp
index 8c0c64f42c..8b282cbc71 100644
--- a/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp
+++ b/test_conformance/extensions/cl_khr_external_semaphore/test_external_semaphore.cpp
@@ -64,7 +64,7 @@ static void log_info_semaphore_type(
     log_info("%s", semaphore_type_description.str().c_str());
 }
 
-static int init_vuikan_device()
+static int init_vuikan_device(cl_uint num_devices, cl_device_id* deviceIds)
 {
     cl_platform_id platform = nullptr;
 
@@ -77,7 +77,7 @@ static int init_vuikan_device()
         return err;
     }
 
-    init_cl_vk_ext(platform);
+    init_cl_vk_ext(platform, num_devices, deviceIds);
 
     return CL_SUCCESS;
 }
@@ -101,7 +101,7 @@ int test_external_semaphores_queries(cl_device_id deviceID, cl_context context,
         return TEST_SKIPPED_ITSELF;
     }
 
-    if (init_vuikan_device())
+    if (init_vuikan_device(1, &deviceID))
     {
         log_info("Cannot initialise Vulkan. "
                  "Skipping test.\n");
@@ -130,8 +130,8 @@ int test_external_semaphores_queries(cl_device_id deviceID, cl_context context,
         VulkanSemaphore vkVk2CLSemaphore(vkDevice,
                                          vkExternalSemaphoreHandleType);
 
-        clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                     vkExternalSemaphoreHandleType, deviceID);
+        clExternalImportableSemaphore sema_ext(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceID);
 
         // Needed by the macro
         cl_semaphore_khr sema = sema_ext.getCLSemaphore();
@@ -181,7 +181,7 @@ int test_external_semaphores_multi_context(cl_device_id deviceID,
         return TEST_SKIPPED_ITSELF;
     }
 
-    if (init_vuikan_device())
+    if (init_vuikan_device(1, &deviceID))
     {
         log_info("Cannot initialise Vulkan. "
                  "Skipping test.\n");
@@ -219,10 +219,11 @@ int test_external_semaphores_multi_context(cl_device_id deviceID,
             return TEST_FAIL;
         }
 
-        clExternalSemaphore sema_ext_1(vkVk2CLSemaphore, context,
-                                       vkExternalSemaphoreHandleType, deviceID);
-        clExternalSemaphore sema_ext_2(vkVk2CLSemaphore, context2,
-                                       vkExternalSemaphoreHandleType, deviceID);
+        clExternalExportableSemaphore sema_ext_1(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceID);
+        clExternalExportableSemaphore sema_ext_2(vkVk2CLSemaphore, context2,
+                                                 vkExternalSemaphoreHandleType,
+                                                 deviceID);
 
         clCommandQueueWrapper queue1 =
             clCreateCommandQueue(context, deviceID, 0, &err);
@@ -288,7 +289,7 @@ static int semaphore_external_cross_queue_helper(cl_device_id deviceID,
         return TEST_SKIPPED_ITSELF;
     }
 
-    if (init_vuikan_device())
+    if (init_vuikan_device(1, &deviceID))
     {
         log_info("Cannot initialise Vulkan. "
                  "Skipping test.\n");
@@ -313,8 +314,8 @@ static int semaphore_external_cross_queue_helper(cl_device_id deviceID,
         VulkanSemaphore vkVk2CLSemaphore(vkDevice,
                                          vkExternalSemaphoreHandleType);
 
-        clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                     vkExternalSemaphoreHandleType, deviceID);
+        clExternalExportableSemaphore sema_ext(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceID);
 
         // Obtain pointers to semaphore's API
         GET_PFN(deviceID, clEnqueueSignalSemaphoresKHR);
@@ -362,7 +363,7 @@ int test_external_semaphores_simple_1(cl_device_id deviceID, cl_context context,
         return TEST_SKIPPED_ITSELF;
     }
 
-    if (init_vuikan_device())
+    if (init_vuikan_device(1, &deviceID))
     {
         log_info("Cannot initialise Vulkan. "
                  "Skipping test.\n");
@@ -392,8 +393,8 @@ int test_external_semaphores_simple_1(cl_device_id deviceID, cl_context context,
         VulkanSemaphore vkVk2CLSemaphore(vkDevice,
                                          vkExternalSemaphoreHandleType);
 
-        clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                     vkExternalSemaphoreHandleType, deviceID);
+        clExternalExportableSemaphore sema_ext(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceID);
 
         cl_int err = CL_SUCCESS;
 
@@ -439,7 +440,7 @@ int test_external_semaphores_simple_2(cl_device_id deviceID, cl_context context,
         return TEST_SKIPPED_ITSELF;
     }
 
-    if (init_vuikan_device())
+    if (init_vuikan_device(1, &deviceID))
     {
         log_info("Cannot initialise Vulkan. "
                  "Skipping test.\n");
@@ -468,8 +469,8 @@ int test_external_semaphores_simple_2(cl_device_id deviceID, cl_context context,
         VulkanSemaphore vkVk2CLSemaphore(vkDevice,
                                          vkExternalSemaphoreHandleType);
 
-        clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                     vkExternalSemaphoreHandleType, deviceID);
+        clExternalExportableSemaphore sema_ext(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceID);
 
         cl_int err = CL_SUCCESS;
 
@@ -545,7 +546,7 @@ int test_external_semaphores_reuse(cl_device_id deviceID, cl_context context,
         return TEST_SKIPPED_ITSELF;
     }
 
-    if (init_vuikan_device())
+    if (init_vuikan_device(1, &deviceID))
     {
         log_info("Cannot initialise Vulkan. "
                  "Skipping test.\n");
@@ -574,8 +575,8 @@ int test_external_semaphores_reuse(cl_device_id deviceID, cl_context context,
         VulkanSemaphore vkVk2CLSemaphore(vkDevice,
                                          vkExternalSemaphoreHandleType);
 
-        clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                     vkExternalSemaphoreHandleType, deviceID);
+        clExternalExportableSemaphore sema_ext(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceID);
 
         cl_int err = CL_SUCCESS;
 
@@ -668,7 +669,7 @@ static int external_semaphore_cross_queue_helper(cl_device_id deviceID,
         return TEST_SKIPPED_ITSELF;
     }
 
-    if (init_vuikan_device())
+    if (init_vuikan_device(1, &deviceID))
     {
         log_info("Cannot initialise Vulkan. "
                  "Skipping test.\n");
@@ -697,8 +698,8 @@ static int external_semaphore_cross_queue_helper(cl_device_id deviceID,
         VulkanSemaphore vkVk2CLSemaphore(vkDevice,
                                          vkExternalSemaphoreHandleType);
 
-        clExternalSemaphore sema_ext(vkVk2CLSemaphore, context,
-                                     vkExternalSemaphoreHandleType, deviceID);
+        clExternalExportableSemaphore sema_ext(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceID);
 
         cl_int err = CL_SUCCESS;
 
@@ -785,7 +786,7 @@ int test_external_semaphores_cross_queues_io2(cl_device_id deviceID,
         return TEST_SKIPPED_ITSELF;
     }
 
-    if (init_vuikan_device())
+    if (init_vuikan_device(1, &deviceID))
     {
         log_info("Cannot initialise Vulkan. "
                  "Skipping test.\n");
@@ -823,10 +824,11 @@ int test_external_semaphores_cross_queues_io2(cl_device_id deviceID,
         VulkanSemaphore vkVk2CLSemaphore(vkDevice,
                                          vkExternalSemaphoreHandleType);
 
-        clExternalSemaphore sema_ext_1(vkVk2CLSemaphore, context,
-                                       vkExternalSemaphoreHandleType, deviceID);
-        clExternalSemaphore sema_ext_2(vkVk2CLSemaphore, context2,
-                                       vkExternalSemaphoreHandleType, deviceID);
+        clExternalExportableSemaphore sema_ext_1(
+            vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceID);
+        clExternalExportableSemaphore sema_ext_2(vkVk2CLSemaphore, context2,
+                                                 vkExternalSemaphoreHandleType,
+                                                 deviceID);
 
         clCommandQueueWrapper queue1 =
             clCreateCommandQueue(context, deviceID, 0, &err);
@@ -891,7 +893,7 @@ int test_external_semaphores_multi_signal(cl_device_id deviceID,
         return TEST_SKIPPED_ITSELF;
     }
 
-    if (init_vuikan_device())
+    if (init_vuikan_device(1, &deviceID))
     {
         log_info("Cannot initialise Vulkan. "
                  "Skipping test.\n");
@@ -922,10 +924,12 @@ int test_external_semaphores_multi_signal(cl_device_id deviceID,
         VulkanSemaphore vkVk2CLSemaphore2(vkDevice,
                                           vkExternalSemaphoreHandleType);
 
-        clExternalSemaphore sema_ext_1(vkVk2CLSemaphore1, context,
-                                       vkExternalSemaphoreHandleType, deviceID);
-        clExternalSemaphore sema_ext_2(vkVk2CLSemaphore2, context,
-                                       vkExternalSemaphoreHandleType, deviceID);
+        clExternalExportableSemaphore sema_ext_1(vkVk2CLSemaphore1, context,
+                                                 vkExternalSemaphoreHandleType,
+                                                 deviceID);
+        clExternalExportableSemaphore sema_ext_2(vkVk2CLSemaphore2, context,
+                                                 vkExternalSemaphoreHandleType,
+                                                 deviceID);
 
         cl_int err = CL_SUCCESS;
 
@@ -980,7 +984,7 @@ int test_external_semaphores_multi_wait(cl_device_id deviceID,
         return TEST_SKIPPED_ITSELF;
     }
 
-    if (init_vuikan_device())
+    if (init_vuikan_device(1, &deviceID))
     {
         log_info("Cannot initialise Vulkan. "
                  "Skipping test.\n");
@@ -1011,10 +1015,12 @@ int test_external_semaphores_multi_wait(cl_device_id deviceID,
         VulkanSemaphore vkVk2CLSemaphore2(vkDevice,
                                           vkExternalSemaphoreHandleType);
 
-        clExternalSemaphore sema_ext_1(vkVk2CLSemaphore1, context,
-                                       vkExternalSemaphoreHandleType, deviceID);
-        clExternalSemaphore sema_ext_2(vkVk2CLSemaphore2, context,
-                                       vkExternalSemaphoreHandleType, deviceID);
+        clExternalExportableSemaphore sema_ext_1(vkVk2CLSemaphore1, context,
+                                                 vkExternalSemaphoreHandleType,
+                                                 deviceID);
+        clExternalExportableSemaphore sema_ext_2(vkVk2CLSemaphore2, context,
+                                                 vkExternalSemaphoreHandleType,
+                                                 deviceID);
 
         cl_int err = CL_SUCCESS;
 
@@ -1056,4 +1062,4 @@ int test_external_semaphores_multi_wait(cl_device_id deviceID,
     }
 
     return TEST_PASS;
-}
+}
\ No newline at end of file
diff --git a/test_conformance/geometrics/CMakeLists.txt b/test_conformance/geometrics/CMakeLists.txt
index 8a6f25c6cf..3fee05fbd5 100644
--- a/test_conformance/geometrics/CMakeLists.txt
+++ b/test_conformance/geometrics/CMakeLists.txt
@@ -6,7 +6,5 @@ set(${MODULE_NAME}_SOURCES
     test_geometrics.cpp
 )
 
-set_gnulike_module_compile_flags("-Wno-sign-compare")
-
 include(../CMakeCommon.txt)
 
diff --git a/test_conformance/geometrics/test_geometrics_double.cpp b/test_conformance/geometrics/test_geometrics_double.cpp
index 222017e6c7..66a671fd33 100644
--- a/test_conformance/geometrics/test_geometrics_double.cpp
+++ b/test_conformance/geometrics/test_geometrics_double.cpp
@@ -189,7 +189,7 @@ int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command
         clKernelWrapper kernel;
         clMemWrapper streams[3];
         cl_double testVector[4];
-        int error, i;
+        int error;
         size_t threads[1], localThreads[1];
         BufferOwningPtr<cl_double> A(malloc(bufSize));
         BufferOwningPtr<cl_double> B(malloc(bufSize));
@@ -203,7 +203,7 @@ int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command
             return -1;
 
         /* Generate some streams. Note: deliberately do some random data in w to verify that it gets ignored */
-        for( i = 0; i < size * vecsize; i++ )
+        for (unsigned int i = 0; i < size * vecsize; i++)
         {
             inDataA[ i ] = get_random_double( -512.f, 512.f, d );
             inDataB[ i ] = get_random_double( -512.f, 512.f, d );
@@ -233,7 +233,7 @@ int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command
         }
 
         /* Assign streams and execute */
-        for( i = 0; i < 3; i++ )
+        for (unsigned int i = 0; i < 3; i++)
         {
             error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]);
             test_error( error, "Unable to set indexed kernel arguments" );
@@ -253,7 +253,7 @@ int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command
         test_error( error, "Unable to read output array!" );
 
         /* And verify! */
-        for( i = 0; i < size; i++ )
+        for (unsigned int i = 0; i < size; i++)
         {
             double errorTolerances[ 4 ];
             // On an embedded device w/ round-to-zero, 3 ulps is the worst-case tolerance for cross product
@@ -265,9 +265,12 @@ int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command
 
             if( errs[ 0 ] > errorTolerances[ 0 ] || errs[ 1 ] > errorTolerances[ 1 ] || errs[ 2 ] > errorTolerances[ 2 ] )
             {
-                log_error( "ERROR: Data sample %d does not validate! Expected (%a,%a,%a,%a), got (%a,%a,%a,%a)\n",
-                          i, testVector[0], testVector[1], testVector[2], testVector[3],
-                          outData[i*vecsize], outData[i*vecsize+1], outData[i*vecsize+2], outData[i*vecsize+3] );
+                log_error("ERROR: Data sample %u does not validate! Expected "
+                          "(%a,%a,%a,%a), got (%a,%a,%a,%a)\n",
+                          i, testVector[0], testVector[1], testVector[2],
+                          testVector[3], outData[i * vecsize],
+                          outData[i * vecsize + 1], outData[i * vecsize + 2],
+                          outData[i * vecsize + 3]);
                 log_error( "    Input: (%a %a %a) and (%a %a %a)\n",
                           inDataA[ i * vecsize + 0 ], inDataA[ i * vecsize + 1 ], inDataA[ i * vecsize + 2 ],
                           inDataB[ i * vecsize + 0 ], inDataB[ i * vecsize + 1 ], inDataB[ i * vecsize + 2 ] );
diff --git a/test_conformance/images/common.cpp b/test_conformance/images/common.cpp
index 7323f11c1c..0b2c956cc6 100644
--- a/test_conformance/images/common.cpp
+++ b/test_conformance/images/common.cpp
@@ -16,26 +16,12 @@
 #include "common.h"
 
 cl_channel_type floatFormats[] = {
-    CL_UNORM_SHORT_565,
-    CL_UNORM_SHORT_555,
-    CL_UNORM_INT_101010,
-#ifdef OBSOLETE_FORAMT
-    CL_UNORM_SHORT_565_REV,
-    CL_UNORM_SHORT_555_REV,
-    CL_UNORM_INT_8888,
-    CL_UNORM_INT_8888_REV,
-    CL_UNORM_INT_101010_REV,
-#endif
+    CL_UNORM_SHORT_565, CL_UNORM_SHORT_555, CL_UNORM_INT_101010,
 #ifdef CL_SFIXED14_APPLE
     CL_SFIXED14_APPLE,
 #endif
-    CL_UNORM_INT8,
-    CL_SNORM_INT8,
-    CL_UNORM_INT16,
-    CL_SNORM_INT16,
-    CL_FLOAT,
-    CL_HALF_FLOAT,
-    (cl_channel_type)-1,
+    CL_UNORM_INT8,      CL_SNORM_INT8,      CL_UNORM_INT16,      CL_SNORM_INT16,
+    CL_FLOAT,           CL_HALF_FLOAT,      (cl_channel_type)-1,
 };
 
 cl_channel_type intFormats[] = {
diff --git a/test_conformance/images/kernel_read_write/main.cpp b/test_conformance/images/kernel_read_write/main.cpp
index 0a93a97415..debbdf18a4 100644
--- a/test_conformance/images/kernel_read_write/main.cpp
+++ b/test_conformance/images/kernel_read_write/main.cpp
@@ -202,7 +202,7 @@ static int doTest( cl_device_id device, cl_context context, cl_command_queue que
     if ((testTypesToRun & kReadWriteTests)
         && checkForReadWriteImageSupport(device))
     {
-        return TEST_SKIPPED_ITSELF;
+        return ret;
     }
 
     if( ( testTypesToRun & kReadWriteTests ) && !gTestMipmaps )
diff --git a/test_conformance/integer_ops/test_abs.cpp b/test_conformance/integer_ops/test_abs.cpp
index 24d0555b99..1e1bb305a6 100644
--- a/test_conformance/integer_ops/test_abs.cpp
+++ b/test_conformance/integer_ops/test_abs.cpp
@@ -15,6 +15,8 @@
 //
 #include "harness/compat.h"
 
+#include <cinttypes>
+
 #include <stdio.h>
 #include <string.h>
 #include <limits.h>
@@ -35,7 +37,12 @@ static int verify_abs_char( const void *p, const void *q, size_t n, const char *
         if( inA[i] < 0 )
             r = -inA[i];
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for abs( (char%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i],r, outptr[i] ); return -1; }
+        {
+            log_info("%zu) Failure for abs( (char%s) 0x%2.2x) = *0x%2.2x vs "
+                     "0x%2.2x\n",
+                     i, sizeName, inA[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -52,7 +59,12 @@ static int verify_abs_short( const void *p, const void *q, size_t n, const char
         if( inA[i] < 0 )
             r = -inA[i];
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for abs( (short%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i],r, outptr[i] ); return -1; }
+        {
+            log_info("%zu) Failure for abs( (short%s) 0x%4.4x) = *0x%4.4x vs "
+                     "0x%4.4x\n",
+                     i, sizeName, inA[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -68,7 +80,12 @@ static int verify_abs_int( const void *p, const void *q, size_t n, const char *s
         if( inA[i] < 0 )
             r = -inA[i];
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for abs( (int%s) 0x%2.2x) = *0x%8.8x vs 0x%8.8x\n", i, sizeName, inA[i],r, outptr[i] ); return -1; }
+        { // Log the first mismatching element, then fail.
+            log_info("%zu) Failure for abs( (int%s) 0x%8.8x) = *0x%8.8x vs "
+                     "0x%8.8x\n",
+                     i, sizeName, inA[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -84,7 +101,12 @@ static int verify_abs_long( const void *p, const void *q, size_t n, const char *
         if( inA[i] < 0 )
             r = -inA[i];
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for abs( (long%s) 0x%16.16llx) = *0x%16.16llx vs 0x%16.16llx\n", i, sizeName, inA[i],r, outptr[i] ); return -1; }
+        {
+            log_info("%zu) Failure for abs( (long%s) 0x%16.16" PRIx64
+                     ") = *0x%16.16" PRIx64 " vs 0x%16.16" PRIx64 "\n",
+                     i, sizeName, inA[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -100,7 +122,12 @@ static int verify_abs_uchar( const void *p, const void *q, size_t n, const char
     {
         cl_uchar r = inA[i];
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for abs( (uchar%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i],r, outptr[i] ); return -1; }
+        {
+            log_info("%zu) Failure for abs( (uchar%s) 0x%2.2x) = *0x%2.2x vs "
+                     "0x%2.2x\n",
+                     i, sizeName, inA[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -115,7 +142,12 @@ static int verify_abs_ushort( const void *p, const void *q, size_t n, const char
     {
         cl_ushort r = inA[i];
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for abs( (short%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i],r, outptr[i] ); return -1; }
+        { // Log the first mismatching element, then fail.
+            log_info("%zu) Failure for abs( (ushort%s) 0x%4.4x) = *0x%4.4x vs "
+                     "0x%4.4x\n",
+                     i, sizeName, inA[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -129,7 +161,12 @@ static int verify_abs_uint( const void *p, const void *q, size_t n, const char *
     {
         cl_uint r = inA[i];
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for abs( (int%s) 0x%2.2x) = *0x%8.8x vs 0x%8.8x\n", i, sizeName, inA[i],r, outptr[i] ); return -1; }
+        { // Log the first mismatching element, then fail.
+            log_info("%zu) Failure for abs( (uint%s) 0x%8.8x) = *0x%8.8x vs "
+                     "0x%8.8x\n",
+                     i, sizeName, inA[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -143,7 +180,12 @@ static int verify_abs_ulong( const void *p, const void *q, size_t n, const char
     {
         cl_ulong r = inA[i];
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for abs( (long%s) 0x%16.16llx) = *0x%16.16llx vs 0x%16.16llx\n", i, sizeName, inA[i],r, outptr[i] ); return -1; }
+        { // Log the first mismatching element, then fail.
+            log_info("%zu) Failure for abs( (ulong%s) 0x%16.16" PRIx64
+                     ") = *0x%16.16" PRIx64 " vs 0x%16.16" PRIx64 "\n",
+                     i, sizeName, inA[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
diff --git a/test_conformance/integer_ops/test_absdiff.cpp b/test_conformance/integer_ops/test_absdiff.cpp
index 710b9c4eac..0d672f2d0c 100644
--- a/test_conformance/integer_ops/test_absdiff.cpp
+++ b/test_conformance/integer_ops/test_absdiff.cpp
@@ -20,6 +20,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <cinttypes>
+
 #include "procs.h"
 
 template <class Integer>
@@ -43,7 +45,12 @@ static int verify_absdiff_char( const void *p, const void *q, const void *r, siz
     {
         cl_uchar r = abs_diff(inA[i], inB[i]);
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for absdiff( (char%s) 0x%2.2x, (char%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
+        {
+            log_info("%zu) Failure for absdiff( (char%s) 0x%2.2x, (char%s) "
+                     "0x%2.2x) = *0x%2.2x vs 0x%2.2x\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -58,7 +65,12 @@ static int verify_absdiff_uchar( const void *p, const void *q, const void *r, si
     {
         cl_uchar r = abs_diff(inA[i], inB[i]);
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for absdiff( (uchar%s) 0x%2.2x, (uchar%s) 0x%2.2x) = *0x%2.2x vs 0x%2.2x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
+        {
+            log_info("%zu) Failure for absdiff( (uchar%s) 0x%2.2x, (uchar%s) "
+                     "0x%2.2x) = *0x%2.2x vs 0x%2.2x\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -73,7 +85,12 @@ static int verify_absdiff_short( const void *p, const void *q, const void *r, si
     {
         cl_ushort r = abs_diff(inA[i], inB[i]);
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for absdiff( (short%s) 0x%4.4x, (short%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
+        {
+            log_info("%zu) Failure for absdiff( (short%s) 0x%4.4x, (short%s) "
+                     "0x%4.4x) = *0x%4.4x vs 0x%4.4x\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -88,7 +105,12 @@ static int verify_absdiff_ushort( const void *p, const void *q, const void *r, s
     {
         cl_ushort r = abs_diff(inA[i], inB[i]);
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for absdiff( (ushort%s) 0x%4.4x, (ushort%s) 0x%4.4x) = *0x%4.4x vs 0x%4.4x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
+        {
+            log_info("%zu) Failure for absdiff( (ushort%s) 0x%4.4x, (ushort%s) "
+                     "0x%4.4x) = *0x%4.4x vs 0x%4.4x\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -104,7 +126,9 @@ static int verify_absdiff_int( const void *p, const void *q, const void *r, size
         cl_uint r = abs_diff(inA[i], inB[i]);
         if( r != outptr[i] )
         {
-            log_info( "%ld) Failure for absdiff( (int%s) 0x%8.8x, (int%s) 0x%8.8x) = *0x%8.8x vs 0x%8.8x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] );
+            log_info("%zu) Failure for absdiff( (int%s) 0x%8.8x, (int%s) "
+                     "0x%8.8x) = *0x%8.8x vs 0x%8.8x\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
             return -1;
         }
     }
@@ -121,7 +145,12 @@ static int verify_absdiff_uint( const void *p, const void *q, const void *r, siz
     {
         cl_uint r = abs_diff(inA[i], inB[i]);
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for absdiff( (uint%s) 0x%8.8x, (uint%s) 0x%8.8x) = *0x%8.8x vs 0x%8.8x\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
+        {
+            log_info("%zu) Failure for absdiff( (uint%s) 0x%8.8x, (uint%s) "
+                     "0x%8.8x) = *0x%8.8x vs 0x%8.8x\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -136,7 +165,13 @@ static int verify_absdiff_long( const void *p, const void *q, const void *r, siz
     {
         cl_ulong r = abs_diff(inA[i], inB[i]);
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for absdiff( (long%s) 0x%16.16llx, (long%s) 0x%16.16llx) = *0x%16.16llx vs 0x%16.16llx\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
+        {
+            log_info("%zu) Failure for absdiff( (long%s) 0x%16.16" PRIx64
+                     ", (long%s) 0x%16.16" PRIx64 ") = *0x%16.16" PRIx64
+                     " vs 0x%16.16" PRIx64 "\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -151,7 +186,13 @@ static int verify_absdiff_ulong( const void *p, const void *q, const void *r, si
     {
         cl_ulong r = abs_diff(inA[i], inB[i]);
         if( r != outptr[i] )
-        { log_info( "%ld) Failure for absdiff( (ulong%s) 0x%16.16llx, (ulong%s) 0x%16.16llx) = *0x%16.16llx vs 0x%16.16llx\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
+        {
+            log_info("%zu) Failure for absdiff( (ulong%s) 0x%16.16" PRIx64
+                     ", (ulong%s) 0x%16.16" PRIx64 ") = *0x%16.16" PRIx64
+                     " vs 0x%16.16" PRIx64 "\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
diff --git a/test_conformance/integer_ops/test_add_sat.cpp b/test_conformance/integer_ops/test_add_sat.cpp
index e33f5c672d..a62b979c90 100644
--- a/test_conformance/integer_ops/test_add_sat.cpp
+++ b/test_conformance/integer_ops/test_add_sat.cpp
@@ -22,6 +22,7 @@
 #include <sys/stat.h>
 
 #include <algorithm>
+#include <cinttypes>
 
 #include "procs.h"
 
@@ -140,7 +141,13 @@ static int verify_addsat_long( const cl_long *inA, const cl_long *inB, const cl_
                 r = CL_LONG_MIN;
         }
         if( r != outptr[i] )
-        { log_info( "%d) Failure for add_sat( (long%s) 0x%16.16llx, (long%s) 0x%16.16llx) = *0x%16.16llx vs 0x%16.16llx\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
+        {
+            log_info("%d) Failure for add_sat( (long%s) 0x%16.16" PRIx64
+                     ", (long%s) 0x%16.16" PRIx64 ") = *0x%16.16" PRIx64
+                     " vs 0x%16.16" PRIx64 "\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -154,7 +161,13 @@ static int verify_addsat_ulong( const cl_ulong *inA, const cl_ulong *inB, const
         if( r < inA[i] )
             r = CL_ULONG_MAX;
         if( r != outptr[i] )
-        { log_info( "%d) Failure for add_sat( (ulong%s) 0x%16.16llx, (ulong%s) 0x%16.16llx) = *0x%16.16llx vs 0x%16.16llx\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
+        {
+            log_info("%d) Failure for add_sat( (ulong%s) 0x%16.16" PRIx64
+                     ", (ulong%s) 0x%16.16" PRIx64 ") = *0x%16.16" PRIx64
+                     " vs 0x%16.16" PRIx64 "\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
diff --git a/test_conformance/integer_ops/test_integers.cpp b/test_conformance/integer_ops/test_integers.cpp
index 6fa18e1e0b..20f19a29ca 100644
--- a/test_conformance/integer_ops/test_integers.cpp
+++ b/test_conformance/integer_ops/test_integers.cpp
@@ -17,6 +17,7 @@
 #include "harness/conversions.h"
 
 #include <algorithm>
+#include <cinttypes>
 
 #define TEST_SIZE 512
 
@@ -198,13 +199,23 @@ int test_single_param_integer_kernel(cl_command_queue queue, cl_context context,
 
                     case 8:
                         if( useOpKernel )
-                            log_error( "ERROR: Data sample %d:%d does not validate! Expected (0x%16.16llx), got (0x%16.16llx), sources (0x%16.16llx, 0x%16.16llx)\n",
-                                      (int)i, (int)j, ((cl_ulong*)&expected)[0], *( (cl_ulong *)p ),
-                                      *( (cl_ulong *)in ), *( (cl_ulong *)in2 ) );
+                            log_error("ERROR: Data sample %d:%d does not "
+                                      "validate! Expected (0x%16.16" PRIx64
+                                      "), got (0x%16.16" PRIx64
+                                      "), sources (0x%16.16" PRIx64
+                                      ", 0x%16.16" PRIx64 ")\n",
+                                      (int)i, (int)j,
+                                      ((cl_ulong *)&expected)[0],
+                                      *((cl_ulong *)p), *((cl_ulong *)in),
+                                      *((cl_ulong *)in2));
                         else
-                        log_error( "ERROR: Data sample %d:%d does not validate! Expected (0x%16.16llx), got (0x%16.16llx), sources (0x%16.16llx)\n",
-                                  (int)i, (int)j, ((cl_ulong*)&expected)[0], *( (cl_ulong *)p ),
-                                            *( (cl_ulong *)in ) );
+                            log_error("ERROR: Data sample %d:%d does not "
+                                      "validate! Expected (0x%16.16" PRIx64
+                                      "), got (0x%16.16" PRIx64
+                                      "), sources (0x%16.16" PRIx64 ")\n",
+                                      (int)i, (int)j,
+                                      ((cl_ulong *)&expected)[0],
+                                      *((cl_ulong *)p), *((cl_ulong *)in));
                         break;
                 }
                 return -1;
@@ -750,10 +761,14 @@ int test_two_param_integer_kernel(cl_command_queue queue, cl_context context, co
                         break;
 
                     case 8:
-                        log_error( "ERROR: Data sample %d:%d does not validate! Expected (0x%16.16llx), got (0x%16.16llx), sources (0x%16.16llx, 0x%16.16llx)\n",
-                                  (int)i, (int)j, ((cl_ulong*)&expected)[ 0 ], *( (cl_ulong *)out ),
-                                            *( (cl_ulong *)inA ),
-                                            *( (cl_ulong *)inB ) );
+                        log_error("ERROR: Data sample %d:%d does not validate! "
+                                  "Expected (0x%16.16" PRIx64
+                                  "), got (0x%16.16" PRIx64
+                                  "), sources (0x%16.16" PRIx64
+                                  ", 0x%16.16" PRIx64 ")\n",
+                                  (int)i, (int)j, ((cl_ulong *)&expected)[0],
+                                  *((cl_ulong *)out), *((cl_ulong *)inA),
+                                  *((cl_ulong *)inB));
                         break;
                 }
                 return -1;
@@ -1417,11 +1432,14 @@ int test_three_param_integer_kernel(cl_command_queue queue, cl_context context,
                         break;
 
                     case 8:
-                        log_error( "ERROR: Data sample %d:%d does not validate! Expected (0x%16.16llx), got (0x%16.16llx), sources (0x%16.16llx, 0x%16.16llx, 0x%16.16llx)\n",
-                                  (int)i, (int)j, ((cl_ulong*)&expected)[ 0 ], *( (cl_ulong *)out ),
-                                            *( (cl_ulong *)inA ),
-                                            *( (cl_ulong *)inB ),
-                                            *( (cl_ulong *)inC ) );
+                        log_error("ERROR: Data sample %d:%d does not validate! "
+                                  "Expected (0x%16.16" PRIx64
+                                  "), got (0x%16.16" PRIx64
+                                  "), sources (0x%16.16" PRIx64
+                                  ", 0x%16.16" PRIx64 ", 0x%16.16" PRIx64 ")\n",
+                                  (int)i, (int)j, ((cl_ulong *)&expected)[0],
+                                  *((cl_ulong *)out), *((cl_ulong *)inA),
+                                  *((cl_ulong *)inB), *((cl_ulong *)inC));
                         break;
                 }
                 return -1;
diff --git a/test_conformance/integer_ops/test_intmad24.cpp b/test_conformance/integer_ops/test_intmad24.cpp
index 1b1d549c19..d0fb3af9f9 100644
--- a/test_conformance/integer_ops/test_intmad24.cpp
+++ b/test_conformance/integer_ops/test_intmad24.cpp
@@ -139,8 +139,10 @@ verify_int_mad24(int *inptrA, int *inptrB, int *inptrC, int *outptr, size_t n, s
         r = a * b + inptrC[i];
         if (r != outptr[i])
         {
-            log_error( "Failed at %ld)  0x%8.8x * 0x%8.8x + 0x%8.8x = *0x%8.8x vs 0x%8.8x\n", i, a, b, inptrC[i], r, outptr[i] );
-             return -1;
+            log_error("Failed at %zu)  0x%8.8x * 0x%8.8x + 0x%8.8x = *0x%8.8x "
+                      "vs 0x%8.8x\n",
+                      i, a, b, inptrC[i], r, outptr[i]);
+            return -1;
         }
     }
 
@@ -160,8 +162,10 @@ verify_uint_mad24(cl_uint *inptrA, cl_uint *inptrB, cl_uint *inptrC, cl_uint *ou
         r = a * b + inptrC[i];
         if (r != outptr[i])
         {
-            log_error( "Failed at %ld)  0x%8.8x * 0x%8.8x + 0x%8.8x = *0x%8.8x vs 0x%8.8x\n", i, a, b, inptrC[i], r, outptr[i] );
-             return -1;
+            log_error("Failed at %zu)  0x%8.8x * 0x%8.8x + 0x%8.8x = *0x%8.8x "
+                      "vs 0x%8.8x\n",
+                      i, a, b, inptrC[i], r, outptr[i]);
+            return -1;
         }
     }
 
diff --git a/test_conformance/integer_ops/test_intmul24.cpp b/test_conformance/integer_ops/test_intmul24.cpp
index 0985a6aeca..5ba683eea0 100644
--- a/test_conformance/integer_ops/test_intmul24.cpp
+++ b/test_conformance/integer_ops/test_intmul24.cpp
@@ -153,8 +153,10 @@ verify_uint_mul24(cl_uint *inptrA, cl_uint *inptrB, cl_uint *outptr, size_t n, s
         r = (inptrA[i] & 0xffffffU) * (inptrB[i] & 0xffffffU);
         if (r != outptr[i])
         {
-            log_error( "failed at %ld: 0x%8.8x * 0x%8.8x = *0x%8.8x vs 0x%8.8x\n", i, inptrA[i], inptrB[i], r, outptr[i] );
-             return -1;
+            log_error(
+                "failed at %zu: 0x%8.8x * 0x%8.8x = *0x%8.8x vs 0x%8.8x\n", i,
+                inptrA[i], inptrB[i], r, outptr[i]);
+            return -1;
         }
     }
 
diff --git a/test_conformance/integer_ops/test_popcount.cpp b/test_conformance/integer_ops/test_popcount.cpp
index 31e4061572..04f30c9c5e 100644
--- a/test_conformance/integer_ops/test_popcount.cpp
+++ b/test_conformance/integer_ops/test_popcount.cpp
@@ -20,6 +20,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <cinttypes>
+
 #include "procs.h"
 
 #define str(s) #s
@@ -36,26 +38,29 @@
         } \
     }
 
-#define __verify_popcount_func(__T) \
-    static int verify_popcount_##__T( const void *p, const void *r, size_t n, const char *sizeName, size_t vecSize ) \
-    { \
-        const __T *inA = (const __T *) p; \
-        const __T *outptr = (const __T *) r; \
-        size_t i; \
-        int _n = sizeof(__T)*8; \
-        __T ref; \
-        for(i = 0; i < n; i++) \
-        { \
-            __T x = inA[i]; \
-            __T res = outptr[i]; \
-            __popcnt(x, __T, _n, ref); \
-            if(res != ref) \
-            { \
-                log_info( "%ld) Failure for popcount( (%s%s) 0x%x ) = *%d vs %d\n", i, str(__T), sizeName, x, (int)ref, (int)res ); \
-                return -1; \
-            }\
-        } \
-        return 0; \
+#define __verify_popcount_func(__T)                                            \
+    static int verify_popcount_##__T(const void *p, const void *r, size_t n,   \
+                                     const char *sizeName, size_t vecSize)     \
+    {                                                                          \
+        const __T *inA = (const __T *)p;                                       \
+        const __T *outptr = (const __T *)r;                                    \
+        size_t i;                                                              \
+        int _n = sizeof(__T) * 8;                                              \
+        __T ref;                                                               \
+        for (i = 0; i < n; i++)                                                \
+        {                                                                      \
+            __T x = inA[i];                                                    \
+            __T res = outptr[i];                                               \
+            __popcnt(x, __T, _n, ref);                                         \
+            if (res != ref)                                                    \
+            {                                                                  \
+                log_info(                                                      \
+                    "%zu) Failure for popcount( (%s%s) 0x%x ) = *%d vs %d\n",  \
+                    i, str(__T), sizeName, (int)x, (int)ref, (int)res);        \
+                return -1;                                                     \
+            }                                                                  \
+        }                                                                      \
+        return 0;                                                              \
     }
 
 __verify_popcount_func(cl_char);
diff --git a/test_conformance/integer_ops/test_sub_sat.cpp b/test_conformance/integer_ops/test_sub_sat.cpp
index 2a88ee0df7..d5348728de 100644
--- a/test_conformance/integer_ops/test_sub_sat.cpp
+++ b/test_conformance/integer_ops/test_sub_sat.cpp
@@ -22,6 +22,7 @@
 #include <sys/stat.h>
 
 #include <algorithm>
+#include <cinttypes>
 
 #include "procs.h"
 
@@ -140,7 +141,13 @@ static int verify_subsat_long( const cl_long *inA, const cl_long *inB, const cl_
                 r = CL_LONG_MIN;
         }
         if( r != outptr[i] )
-        { log_info( "%d) Failure for sub_sat( (long%s) 0x%16.16llx, (long%s) 0x%16.16llx) = *0x%16.16llx vs 0x%16.16llx\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
+        {
+            log_info("%d) Failure for sub_sat( (long%s) 0x%16.16" PRIx64
+                     ", (long%s) 0x%16.16" PRIx64 ") = *0x%16.16" PRIx64
+                     " vs 0x%16.16" PRIx64 "\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
@@ -154,7 +161,13 @@ static int verify_subsat_ulong( const cl_ulong *inA, const cl_ulong *inB, const
         if(  inA[i] < inB[i] )
             r = 0;
         if( r != outptr[i] )
-        { log_info( "%d) Failure for sub_sat( (ulong%s) 0x%16.16llx, (ulong%s) 0x%16.16llx) = *0x%16.16llx vs 0x%16.16llx\n", i, sizeName, inA[i], sizeName, inB[i], r, outptr[i] ); return -1; }
+        {
+            log_info("%d) Failure for sub_sat( (ulong%s) 0x%16.16" PRIx64
+                     ", (ulong%s) 0x%16.16" PRIx64 ") = *0x%16.16" PRIx64
+                     " vs 0x%16.16" PRIx64 "\n",
+                     i, sizeName, inA[i], sizeName, inB[i], r, outptr[i]);
+            return -1;
+        }
     }
     return 0;
 }
diff --git a/test_conformance/integer_ops/test_unary_ops.cpp b/test_conformance/integer_ops/test_unary_ops.cpp
index c91c85aeb4..da3de6d1c4 100644
--- a/test_conformance/integer_ops/test_unary_ops.cpp
+++ b/test_conformance/integer_ops/test_unary_ops.cpp
@@ -16,6 +16,8 @@
 #include "testBase.h"
 #include "harness/conversions.h"
 
+#include <cinttypes>
+
 #define TEST_SIZE 512
 
 enum OpKonstants
@@ -71,8 +73,8 @@ int test_unary_op( cl_command_queue queue, cl_context context, OpKonstants which
     }
     else
     {
-        sprintf( loadLine, "vload%ld( tid, inOut )", vecSize );
-        sprintf( storeLine, "vstore%ld( inOutVal, tid, inOut )", vecSize );
+        sprintf(loadLine, "vload%zu( tid, inOut )", vecSize);
+        sprintf(storeLine, "vstore%zu( inOutVal, tid, inOut )", vecSize);
     }
 
     char sizeNames[][4] = { "", "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" };
@@ -159,8 +161,9 @@ template<typename T> int VerifyFn( void * actualPtr, void * inputPtr, size_t vec
 
             if( actualData[ index ] != nextVal )
             {
-                log_error( "ERROR: Validation failed on vector %ld:%ld (expected %lld, got %lld)", i, j,
-                          (cl_long)nextVal, (cl_long)actualData[ index ] );
+                log_error("ERROR: Validation failed on vector %zu:%zu "
+                          "(expected %" PRId64 ", got %" PRId64 ")\n",
+                          i, j, (cl_long)nextVal, (cl_long)actualData[index]);
                 return -1;
             }
         }
diff --git a/test_conformance/integer_ops/test_upsample.cpp b/test_conformance/integer_ops/test_upsample.cpp
index 9ae3f0c38b..33ecb586fa 100644
--- a/test_conformance/integer_ops/test_upsample.cpp
+++ b/test_conformance/integer_ops/test_upsample.cpp
@@ -213,7 +213,7 @@ void * create_upsample_data( ExplicitType type, void *sourceA, void *sourceB, si
             }
             break;
         default:
-            log_error( "ERROR: unknown type size: %ld\n", tSize );
+            log_error("ERROR: unknown type size: %zu\n", tSize);
             return NULL;
     }
 
diff --git a/test_conformance/integer_ops/verification_and_generation_functions.cpp b/test_conformance/integer_ops/verification_and_generation_functions.cpp
index 25fbe7174a..1b7459996e 100644
--- a/test_conformance/integer_ops/verification_and_generation_functions.cpp
+++ b/test_conformance/integer_ops/verification_and_generation_functions.cpp
@@ -20,6 +20,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 
+#include <cinttypes>
+
 #include "procs.h"
 #include "harness/conversions.h"
 
@@ -227,20 +229,50 @@ verify_long(int test, size_t vector_size, cl_long *inptrA, cl_long *inptrB, cl_l
             if (r != outptr[i]) {
                 // Shift is tricky
                 if (test == 8 || test == 9) {
-                    log_error("cl_long Verification failed at element %ld of %ld : 0x%llx %s 0x%llx = 0x%llx, got 0x%llx\n", i, n, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
-                    log_error("\t1) Vector shift failure at element %ld: original is 0x%llx %s %d (0x%llx)\n", i, inptrA[i], tests[test], (int)inptrB[i], inptrB[i]);
-                    log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %lld (0x%llx).\n", (int)log2(sizeof(cl_long)*8),  inptrB[i]&shift_mask, inptrB[i]&shift_mask);
+                    log_error("cl_long Verification failed at element %zu of "
+                              "%zu : 0x%" PRIx64 " %s 0x%" PRIx64
+                              " = 0x%" PRIx64 ", got 0x%" PRIx64 "\n",
+                              i, n, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
+                    log_error(
+                        "\t1) Vector shift failure at element %zu: original is "
+                        "0x%" PRIx64 " %s %d (0x%" PRIx64 ")\n",
+                        i, inptrA[i], tests[test], (int)inptrB[i], inptrB[i]);
+                    log_error("\t2) Take the %d LSBs of the shift to get the "
+                              "final shift amount %" PRId64 " (0x%" PRIx64
+                              ").\n",
+                              (int)log2(sizeof(cl_long) * 8),
+                              inptrB[i] & shift_mask, inptrB[i] & shift_mask);
                 }
                 else if (test == 10 || test == 11) {
 
-                    log_error("cl_long Verification failed at element %ld of %ld (%ld): 0x%llx %s 0x%llx = 0x%llx, got 0x%llx\n", i, n, j, inptrA[i], tests[test], inptrB[j], r, outptr[i]);
-                    log_error("\t1) Scalar shift failure at element %ld: original is 0x%llx %s %d (0x%llx)\n", i, inptrA[i], tests[test], (int)inptrB[j], inptrB[j]);
-                    log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %lld (0x%llx).\n", (int)log2(sizeof(cl_long)*8),  inptrB[j]&shift_mask, inptrB[j]&shift_mask);
+                    log_error("cl_long Verification failed at element %zu of "
+                              "%zu (%zu): 0x%" PRIx64 " %s 0x%" PRIx64
+                              " = 0x%" PRIx64 ", got 0x%" PRIx64 "\n",
+                              i, n, j, inptrA[i], tests[test], inptrB[j], r,
+                              outptr[i]);
+                    log_error(
+                        "\t1) Scalar shift failure at element %zu: original is "
+                        "0x%" PRIx64 " %s %d (0x%" PRIx64 ")\n",
+                        i, inptrA[i], tests[test], (int)inptrB[j], inptrB[j]);
+                    log_error("\t2) Take the %d LSBs of the shift to get the "
+                              "final shift amount %" PRId64 " (0x%" PRIx64
+                              ").\n",
+                              (int)log2(sizeof(cl_long) * 8),
+                              inptrB[j] & shift_mask, inptrB[j] & shift_mask);
                 } else if (test == 13) {
-                    log_error("cl_int Verification failed at element %ld (%ld): (0x%llx < 0x%llx) ? 0x%llx : 0x%llx = 0x%llx, got 0x%llx\n", i, j, inptrA[j], inptrB[j],
-                              inptrA[i], inptrB[i], r, outptr[i]);
+                    log_error("cl_long Verification failed at element %zu "
+                              "(%zu): (0x%" PRIx64 " < 0x%" PRIx64
+                              ") ? 0x%" PRIx64 " : 0x%" PRIx64 " = 0x%" PRIx64
+                              ", got 0x%" PRIx64 "\n",
+                              i, j, inptrA[j], inptrB[j], inptrA[i], inptrB[i],
+                              r, outptr[i]);
                 } else {
-                    log_error("cl_long Verification failed at element %ld of %ld: 0x%llx %s 0x%llx = 0x%llx, got 0x%llx\n", i, n, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
+                    log_error("cl_long Verification failed at element %zu of "
+                              "%zu: 0x%" PRIx64 " %s 0x%" PRIx64 " = 0x%" PRIx64
+                              ", got 0x%" PRIx64 "\n",
+                              i, n, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
                 }
                 count++;
                 if (count >= MAX_ERRORS_TO_PRINT) {
@@ -423,19 +455,49 @@ verify_ulong(int test, size_t vector_size, cl_ulong *inptrA, cl_ulong *inptrB, c
             if (r != outptr[i]) {
                 // Shift is tricky
                 if (test == 8 || test == 9) {
-                    log_error("cl_ulong Verification failed at element %ld of %ld: 0x%llx %s 0x%llx = 0x%llx, got 0x%llx\n", i, n, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
-                    log_error("\t1) Shift failure at element %ld: original is 0x%llx %s %d (0x%llx)\n", i, inptrA[i], tests[test], (int)inptrB[i], inptrB[i]);
-                    log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %llu (0x%llx).\n", (int)log2(sizeof(cl_ulong)*8),  inptrB[i]&shift_mask, inptrB[i]&shift_mask);
+                    log_error("cl_ulong Verification failed at element %zu of "
+                              "%zu: 0x%" PRIx64 " %s 0x%" PRIx64 " = 0x%" PRIx64
+                              ", got 0x%" PRIx64 "\n",
+                              i, n, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
+                    log_error("\t1) Shift failure at element %zu: original is "
+                              "0x%" PRIx64 " %s %d (0x%" PRIx64 ")\n",
+                              i, inptrA[i], tests[test], (int)inptrB[i],
+                              inptrB[i]);
+                    log_error("\t2) Take the %d LSBs of the shift to get the "
+                              "final shift amount %" PRIu64 " (0x%" PRIx64
+                              ").\n",
+                              (int)log2(sizeof(cl_ulong) * 8),
+                              inptrB[i] & shift_mask, inptrB[i] & shift_mask);
                 }
                 else if (test == 10 || test == 11) {
-                    log_error("cl_ulong Verification failed at element %ld of %ld (%ld): 0x%llx %s 0x%llx = 0x%llx, got 0x%llx\n", i, n, j, inptrA[i], tests[test], inptrB[j], r, outptr[i]);
-                    log_error("\t1) Scalar shift failure at element %ld: original is 0x%llx %s %d (0x%llx)\n", i, inptrA[i], tests[test], (int)inptrB[j], inptrB[j]);
-                    log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %lld (0x%llx).\n", (int)log2(sizeof(cl_long)*8),  inptrB[j]&shift_mask, inptrB[j]&shift_mask);
+                    log_error("cl_ulong Verification failed at element %zu of "
+                              "%zu (%zu): 0x%" PRIx64 " %s 0x%" PRIx64
+                              " = 0x%" PRIx64 ", got 0x%" PRIx64 "\n",
+                              i, n, j, inptrA[i], tests[test], inptrB[j], r,
+                              outptr[i]);
+                    log_error(
+                        "\t1) Scalar shift failure at element %zu: original is "
+                        "0x%" PRIx64 " %s %d (0x%" PRIx64 ")\n",
+                        i, inptrA[i], tests[test], (int)inptrB[j], inptrB[j]);
+                    log_error("\t2) Take the %d LSBs of the shift to get the "
+                              "final shift amount %" PRIu64 " (0x%" PRIx64
+                              ").\n",
+                              (int)log2(sizeof(cl_ulong) * 8),
+                              inptrB[j] & shift_mask, inptrB[j] & shift_mask);
                 } else if (test == 13) {
-                    log_error("cl_int Verification failed at element %ld of %ld (%ld): (0x%llx < 0x%llx) ? 0x%llx : 0x%llx = 0x%llx, got 0x%llx\n", i, n, j, inptrA[j], inptrB[j],
-                              inptrA[i], inptrB[i], r, outptr[i]);
+                    log_error("cl_ulong Verification failed at element %zu of "
+                              "%zu (%zu): (0x%" PRIx64 " < 0x%" PRIx64
+                              ") ? 0x%" PRIx64 " : 0x%" PRIx64 " = 0x%" PRIx64
+                              ", got 0x%" PRIx64 "\n",
+                              i, n, j, inptrA[j], inptrB[j], inptrA[i],
+                              inptrB[i], r, outptr[i]);
                 } else {
-                    log_error("cl_ulong Verification failed at element %ld of %ld: 0x%llx %s 0x%llx = 0x%llx, got 0x%llx\n", i, n, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
+                    log_error("cl_ulong Verification failed at element %zu of "
+                              "%zu: 0x%" PRIx64 " %s 0x%" PRIx64 " = 0x%" PRIx64
+                              ", got 0x%" PRIx64 "\n",
+                              i, n, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
                 }
                 count++;
                 if (count >= MAX_ERRORS_TO_PRINT) {
@@ -624,19 +686,37 @@ verify_int(int test, size_t vector_size, cl_int *inptrA, cl_int *inptrB, cl_int
             if (r != outptr[i]) {
                 // Shift is tricky
                 if (test == 8 || test == 9) {
-                    log_error("cl_int Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
-                    log_error("\t1) Shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[i], inptrB[i]);
+                    log_error("cl_int Verification failed at element %zu: 0x%x "
+                              "%s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
+                    log_error("\t1) Shift failure at element %zu: original is "
+                              "0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[i],
+                              inptrB[i]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_int)*8),  inptrB[i]&shift_mask, inptrB[i]&shift_mask);
                 }
                 else if (test == 10 || test == 11) {
-                    log_error("cl_int Verification failed at element %ld (%ld): 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[i], tests[test], inptrB[j], r, outptr[i]);
-                    log_error("\t1) Scalar shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[j], inptrB[j]);
+                    log_error("cl_int Verification failed at element %zu "
+                              "(%zu): 0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, j, inptrA[i], tests[test], inptrB[j], r,
+                              outptr[i]);
+                    log_error("\t1) Scalar shift failure at element %zu: "
+                              "original is 0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[j],
+                              inptrB[j]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_int)*8),  inptrB[j]&shift_mask, inptrB[j]&shift_mask);
                 } else if (test == 13) {
-                    log_error("cl_int Verification failed at element %ld (%ld): (0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[j], inptrB[j],
-                              inptrA[i], inptrB[i], r, outptr[i]);
+                    log_error(
+                        "cl_int Verification failed at element %zu (%zu): "
+                        "(0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n",
+                        i, j, inptrA[j], inptrB[j], inptrA[i], inptrB[i], r,
+                        outptr[i]);
                 } else {
-                    log_error("cl_int Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
+                    log_error("cl_int Verification failed at element %zu: 0x%x "
+                              "%s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
                 }
                 count++;
                 if (count >= MAX_ERRORS_TO_PRINT) {
@@ -819,19 +899,37 @@ verify_uint(int test, size_t vector_size, cl_uint *inptrA, cl_uint *inptrB, cl_u
             if (r != outptr[i]) {
                 // Shift is tricky
                 if (test == 8 || test == 9) {
-                    log_error("cl_uint Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
-                    log_error("\t1) Shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[i], inptrB[i]);
+                    log_error("cl_uint Verification failed at element %zu: "
+                              "0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
+                    log_error("\t1) Shift failure at element %zu: original is "
+                              "0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[i],
+                              inptrB[i]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_uint)*8),  inptrB[i]&shift_mask, inptrB[i]&shift_mask);
                 }
                 else if (test == 10 || test == 11) {
-                    log_error("cl_uint Verification failed at element %ld (%ld): 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[i], tests[test], inptrB[j], r, outptr[i]);
-                    log_error("\t1) Scalar shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[j], inptrB[j]);
+                    log_error("cl_uint Verification failed at element %zu "
+                              "(%zu): 0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, j, inptrA[i], tests[test], inptrB[j], r,
+                              outptr[i]);
+                    log_error("\t1) Scalar shift failure at element %zu: "
+                              "original is 0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[j],
+                              inptrB[j]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_uint)*8),  inptrB[j]&shift_mask, inptrB[j]&shift_mask);
                 } else if (test == 13) {
-                    log_error("cl_int Verification failed at element %ld (%ld): (0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[j], inptrB[j],
-                              inptrA[i], inptrB[i], r, outptr[i]);
+                    log_error(
+                        "cl_uint Verification failed at element %zu (%zu): "
+                        "(0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n",
+                        i, j, inptrA[j], inptrB[j], inptrA[i], inptrB[i], r,
+                        outptr[i]);
                 } else {
-                    log_error("cl_uint Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
+                    log_error("cl_uint Verification failed at element %zu: "
+                              "0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
                 }
                 count++;
                 if (count >= MAX_ERRORS_TO_PRINT) {
@@ -1015,19 +1113,37 @@ verify_short(int test, size_t vector_size, cl_short *inptrA, cl_short *inptrB, c
             if (r != outptr[i]) {
                 // Shift is tricky
                 if (test == 8 || test == 9) {
-                    log_error("cl_short Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
-                    log_error("\t1) Shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[i], inptrB[i]);
+                    log_error("cl_short Verification failed at element %zu: "
+                              "0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
+                    log_error("\t1) Shift failure at element %zu: original is "
+                              "0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[i],
+                              inptrB[i]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_short)*8),  inptrB[i]&shift_mask, inptrB[i]&shift_mask);
                 }
                 else if (test == 10 || test == 11) {
-                    log_error("cl_short Verification failed at element %ld (%ld): 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[i], tests[test], inptrB[j], r, outptr[i]);
-                    log_error("\t1) Scalar shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[j], inptrB[j]);
+                    log_error("cl_short Verification failed at element %zu "
+                              "(%zu): 0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, j, inptrA[i], tests[test], inptrB[j], r,
+                              outptr[i]);
+                    log_error("\t1) Scalar shift failure at element %zu: "
+                              "original is 0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[j],
+                              inptrB[j]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_short)*8),  inptrB[j]&shift_mask, inptrB[j]&shift_mask);
                 } else if (test == 13) {
-                    log_error("cl_int Verification failed at element %ld (%ld): (0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[j], inptrB[j],
-                              inptrA[i], inptrB[i], r, outptr[i]);
+                    log_error( // test 13: ternary select mismatch
+                        "cl_short Verification failed at element %zu (%zu): "
+                        "(0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n",
+                        i, j, inptrA[j], inptrB[j], inptrA[i], inptrB[i], r,
+                        outptr[i]);
                 } else {
-                    log_error("cl_short Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
+                    log_error("cl_short Verification failed at element %zu: "
+                              "0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
                 }
                 count++;
                 if (count >= MAX_ERRORS_TO_PRINT) {
@@ -1213,19 +1329,37 @@ verify_ushort(int test, size_t vector_size, cl_ushort *inptrA, cl_ushort *inptrB
             if (r != outptr[i]) {
                 // Shift is tricky
                 if (test == 8 || test == 9) {
-                    log_error("cl_ushort Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
-                    log_error("\t1) Shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[i], inptrB[i]);
+                    log_error("cl_ushort Verification failed at element %zu: "
+                              "0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
+                    log_error("\t1) Shift failure at element %zu: original is "
+                              "0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[i],
+                              inptrB[i]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_ushort)*8),  inptrB[i]&shift_mask, inptrB[i]&shift_mask);
                 }
                 else if (test == 10 || test == 11) {
-                    log_error("cl_ushort Verification failed at element %ld (%ld): 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[i], tests[test], inptrB[j], r, outptr[i]);
-                    log_error("\t1) Scalar shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[j], inptrB[j]);
+                    log_error("cl_ushort Verification failed at element %zu "
+                              "(%zu): 0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, j, inptrA[i], tests[test], inptrB[j], r,
+                              outptr[i]);
+                    log_error("\t1) Scalar shift failure at element %zu: "
+                              "original is 0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[j],
+                              inptrB[j]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_ushort)*8),  inptrB[j]&shift_mask, inptrB[j]&shift_mask);
                 } else if (test == 13) {
-                    log_error("cl_int Verification failed at element %ld (%ld): (0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[j], inptrB[j],
-                              inptrA[i], inptrB[i], r, outptr[i]);
+                    log_error( // test 13: ternary select mismatch
+                        "cl_ushort Verification failed at element %zu (%zu): "
+                        "(0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n",
+                        i, j, inptrA[j], inptrB[j], inptrA[i], inptrB[i], r,
+                        outptr[i]);
                 } else {
-                    log_error("cl_ushort Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
+                    log_error("cl_ushort Verification failed at element %zu: "
+                              "0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
                 }
                 count++;
                 if (count >= MAX_ERRORS_TO_PRINT) {
@@ -1413,19 +1547,37 @@ verify_char(int test, size_t vector_size, cl_char *inptrA, cl_char *inptrB, cl_c
             if (r != outptr[i]) {
                 // Shift is tricky
                 if (test == 8 || test == 9) {
-                    log_error("cl_char Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
-                    log_error("\t1) Shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[i], inptrB[i]);
+                    log_error("cl_char Verification failed at element %zu: "
+                              "0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
+                    log_error("\t1) Shift failure at element %zu: original is "
+                              "0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[i],
+                              inptrB[i]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_char)*8),  inptrB[i]&shift_mask, inptrB[i]&shift_mask);
                 }
                 else if (test == 10 || test == 11) {
-                    log_error("cl_char Verification failed at element %ld (%ld): 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[i], tests[test], inptrB[j], r, outptr[i]);
-                    log_error("\t1) Scalar shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[j], inptrB[j]);
+                    log_error("cl_char Verification failed at element %zu "
+                              "(%zu): 0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, j, inptrA[i], tests[test], inptrB[j], r,
+                              outptr[i]);
+                    log_error("\t1) Scalar shift failure at element %zu: "
+                              "original is 0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[j],
+                              inptrB[j]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_long)*8),  inptrB[j]&shift_mask, inptrB[j]&shift_mask);
                 } else if (test == 13) {
-                    log_error("cl_int Verification failed at element %ld (%ld): (0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[j], inptrB[j],
-                              inptrA[i], inptrB[i], r, outptr[i]);
+                    log_error( // test 13: ternary select mismatch
+                        "cl_char Verification failed at element %zu (%zu): "
+                        "(0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n",
+                        i, j, inptrA[j], inptrB[j], inptrA[i], inptrB[i], r,
+                        outptr[i]);
                 } else {
-                    log_error("cl_char Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
+                    log_error("cl_char Verification failed at element %zu: "
+                              "0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
                 }
                 count++;
                 if (count >= MAX_ERRORS_TO_PRINT) {
@@ -1619,19 +1771,37 @@ verify_uchar(int test, size_t vector_size, cl_uchar *inptrA, cl_uchar *inptrB, c
             if (r != outptr[i]) {
                 // Shift is tricky
                 if (test == 8 || test == 9) {
-                    log_error("cl_uchar Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
-                    log_error("\t1) Shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[i], inptrB[i]);
+                    log_error("cl_uchar Verification failed at element %zu: "
+                              "0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
+                    log_error("\t1) Shift failure at element %zu: original is "
+                              "0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[i],
+                              inptrB[i]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_uchar)*8),  inptrB[i]&shift_mask, inptrB[i]&shift_mask);
                 }
                 else if (test == 10 || test == 11) {
-                    log_error("cl_uchar Verification failed at element %ld (%ld): 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[i], tests[test], inptrB[j], r, outptr[i]);
-                    log_error("\t1) Scalar shift failure at element %ld: original is 0x%x %s %d (0x%x)\n", i, inptrA[i], tests[test], (int)inptrB[j], inptrB[j]);
+                    log_error("cl_uchar Verification failed at element %zu "
+                              "(%zu): 0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, j, inptrA[i], tests[test], inptrB[j], r,
+                              outptr[i]);
+                    log_error("\t1) Scalar shift failure at element %zu: "
+                              "original is 0x%x %s %d (0x%x)\n",
+                              i, inptrA[i], tests[test], (int)inptrB[j],
+                              inptrB[j]);
                     log_error("\t2) Take the %d LSBs of the shift to get the final shift amount %d (0x%x).\n", (int)log2(sizeof(cl_uchar)*8),  inptrB[j]&shift_mask, inptrB[j]&shift_mask);
                 } else if (test == 13) {
-                    log_error("cl_int Verification failed at element %ld (%ld): (0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n", i, j, inptrA[j], inptrB[j],
-                              inptrA[i], inptrB[i], r, outptr[i]);
+                    log_error( // test 13: ternary select mismatch
+                        "cl_uchar Verification failed at element %zu (%zu): "
+                        "(0x%x < 0x%x) ? 0x%x : 0x%x = 0x%x, got 0x%x\n",
+                        i, j, inptrA[j], inptrB[j], inptrA[i], inptrB[i], r,
+                        outptr[i]);
                 } else {
-                    log_error("cl_uchar Verification failed at element %ld: 0x%x %s 0x%x = 0x%x, got 0x%x\n", i, inptrA[i], tests[test], inptrB[i], r, outptr[i]);
+                    log_error("cl_uchar Verification failed at element %zu: "
+                              "0x%x %s 0x%x = 0x%x, got 0x%x\n",
+                              i, inptrA[i], tests[test], inptrB[i], r,
+                              outptr[i]);
                 }
                 count++;
                 if (count >= MAX_ERRORS_TO_PRINT) {
diff --git a/test_conformance/math_brute_force/main.cpp b/test_conformance/math_brute_force/main.cpp
index 8d8acb1b19..b09a224dd8 100644
--- a/test_conformance/math_brute_force/main.cpp
+++ b/test_conformance/math_brute_force/main.cpp
@@ -843,10 +843,11 @@ test_status InitCL(cl_device_id device)
     IsTininessDetectedBeforeRounding();
 
     cl_platform_id platform;
-    int err = clGetPlatformIDs(1, &platform, NULL);
+    int err = clGetDeviceInfo(gDevice, CL_DEVICE_PLATFORM, sizeof(platform),
+                              &platform, NULL);
     if (err)
     {
-        print_error(err, "clGetPlatformIDs failed");
+        print_error(err, "clGetDeviceInfo for CL_DEVICE_PLATFORM failed");
         return TEST_FAIL;
     }
 
diff --git a/test_conformance/math_brute_force/unary_float.cpp b/test_conformance/math_brute_force/unary_float.cpp
index 0c497bc435..9666d5ea49 100644
--- a/test_conformance/math_brute_force/unary_float.cpp
+++ b/test_conformance/math_brute_force/unary_float.cpp
@@ -303,15 +303,14 @@ cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
 
                     if (strcmp(fname, "exp") == 0 || strcmp(fname, "exp2") == 0)
                     {
-                        float exp_error = ulps;
-
+                        // For full profile, ULP depends on input value.
+                        // For embedded profile, ULP comes from functionList.
                         if (!gIsEmbedded)
                         {
-                            exp_error += floor(fabs(2 * s[j]));
+                            ulps = 3.0f + floor(fabs(2 * s[j]));
                         }
 
-                        fail = !(fabsf(err) <= exp_error);
-                        ulps = exp_error;
+                        fail = !(fabsf(err) <= ulps);
                     }
                     if (strcmp(fname, "tan") == 0)
                     {
diff --git a/test_conformance/spir/CMakeLists.txt b/test_conformance/spir/CMakeLists.txt
index f65c03139f..1ac49ac36d 100644
--- a/test_conformance/spir/CMakeLists.txt
+++ b/test_conformance/spir/CMakeLists.txt
@@ -1,6 +1,3 @@
-# Import function list from math_brute_force
-add_definitions(-DFUNCTION_LIST_ULPS_ONLY)
-
 set(SPIR_OUT ${CONFORMANCE_PREFIX}spir${CONFORMANCE_SUFFIX})
 
 set (SPIR_SOURCES
@@ -9,25 +6,18 @@ set (SPIR_SOURCES
     run_build_test.cpp
     run_services.cpp
     kernelargs.cpp
-    ../math_brute_force/function_list.cpp
 )
 
 add_executable(${SPIR_OUT}
     ${SPIR_SOURCES})
 
 if(UNIX)
-    set_target_properties(${SPIR_OUT} PROPERTIES
-       COMPILE_FLAGS "-fexceptions -frtti")
+    target_compile_options(${SPIR_OUT} PRIVATE -fexceptions -frtti)
 elseif(MSVC)
-    set_target_properties(${SPIR_OUT} PROPERTIES
-       COMPILE_FLAGS "/GR /EHs /EHc")
+    target_compile_options(${SPIR_OUT} PRIVATE /GR /EHs /EHc)
 endif()
 
-TARGET_LINK_LIBRARIES(${SPIR_OUT} harness
-    ${CLConform_LIBRARIES})
-
-
-set_source_files_properties(${SPIR_SOURCES} PROPERTIES LANGUAGE CXX)
+target_link_libraries(${SPIR_OUT} harness ${CLConform_LIBRARIES})
 
 # Need to copy the spir zips to sit beside the executable
 
diff --git a/test_conformance/spir/main.cpp b/test_conformance/spir/main.cpp
index b02da734ac..ca02e5c21b 100644
--- a/test_conformance/spir/main.cpp
+++ b/test_conformance/spir/main.cpp
@@ -13,6 +13,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+
+// Import function list from math_brute_force
+#define FUNCTION_LIST_ULPS_ONLY
+#include "../math_brute_force/function_list.cpp"
+
 #include "harness/compat.h"
 
 #include <stdio.h>
diff --git a/test_conformance/spirv_new/CMakeLists.txt b/test_conformance/spirv_new/CMakeLists.txt
index 6872097505..89f43f26f1 100644
--- a/test_conformance/spirv_new/CMakeLists.txt
+++ b/test_conformance/spirv_new/CMakeLists.txt
@@ -1,16 +1,3 @@
-######################################################################################################
-#Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-#
-#This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-#This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-#third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-#broadcast or otherwise exploited in any manner without the express prior written permission
-#of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-#disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-#in whole or in part other than under the terms of the Khronos Adopters Agreement
-#or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-######################################################################################################
-
 set(MODULE_NAME SPIRV_NEW)
 
 file(GLOB SPIRV_NEW_SOURCES "*.cpp")
diff --git a/test_conformance/spirv_new/assemble_spirv.py b/test_conformance/spirv_new/assemble_spirv.py
index 99b16adf4b..d02e5421e5 100755
--- a/test_conformance/spirv_new/assemble_spirv.py
+++ b/test_conformance/spirv_new/assemble_spirv.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 #####################################################################
-# Copyright (c) 2020 The Khronos Group Inc. All Rights Reserved.
+# Copyright (c) 2020-2023 The Khronos Group Inc. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -30,6 +30,16 @@
 import sys
 from textwrap import wrap
 
+# sub-directories for specific SPIR-V environments
+spirv_envs = [
+    '', # all files in the root directory are considered SPIR-V 1.0
+    'spv1.1',
+    'spv1.2',
+    'spv1.3',
+    'spv1.4',
+    'spv1.5',
+    'spv1.6',
+]
 
 def fatal(message):
     """Print an error message and exit with a non-zero status, to
@@ -39,7 +49,7 @@ def fatal(message):
     sys.exit(1)
 
 
-def assemble_spirv(asm_dir, bin_dir, spirv_as, verbose):
+def assemble_spirv(asm_dir, bin_dir, spirv_as, spirv_env, verbose):
     """Assemble SPIR-V source into binaries."""
 
     if not os.path.exists(bin_dir):
@@ -57,8 +67,8 @@ def assemble_spirv(asm_dir, bin_dir, spirv_as, verbose):
             bin_file = asm_file_root + asm_file_ext.replace('asm', '')
             bin_file_path = os.path.join(bin_dir, bin_file)
 
-            command = '"{}" --target-env spv1.0 "{}" -o "{}"'.format(
-                spirv_as, asm_file_path, bin_file_path)
+            command = '"{}" --target-env "{}" "{}" -o "{}"'.format(
+                spirv_as, spirv_env, asm_file_path, bin_file_path)
             if subprocess.call(command, shell=True) != 0:
                 assembly_failures = True
                 print('ERROR: Failure assembling {}: '
@@ -72,7 +82,7 @@ def assemble_spirv(asm_dir, bin_dir, spirv_as, verbose):
             'messages from the assembler, if any.')))
 
 
-def validate_spirv(bin_dir, spirv_val, verbose):
+def validate_spirv(bin_dir, spirv_val, spirv_env, verbose):
     """Validates SPIR-V binaries.  Ignores known failures."""
 
     validation_failures = False
@@ -83,8 +93,8 @@ def validate_spirv(bin_dir, spirv_val, verbose):
             if verbose:
                 print(' Validating {}'.format(bin_file))
 
-            command = '"{}" "{}"'.format(
-                spirv_val, bin_file_path)
+            command = '"{}" --target-env "{}" "{}"'.format(
+                spirv_val, spirv_env, bin_file_path)
             if subprocess.call(command, shell=True) != 0:
                 print('ERROR: Failure validating {}: '
                       'see above output.'.format(
@@ -95,8 +105,6 @@ def validate_spirv(bin_dir, spirv_val, verbose):
     if validation_failures:
         fatal('ERROR: Validation failure(s) found.  '
               'See above for validation output.')
-    else:
-        print('All SPIR-V binaries validated successfully.')
 
 
 def parse_args():
@@ -144,18 +152,26 @@ def main():
 
     args = parse_args()
 
-    print('Assembling SPIR-V source into binaries...')
-    assemble_spirv(args.source_dir, args.output_dir, args.assembler,
-                   args.verbose)
-    print('Finished assembling SPIR-V binaries.')
-    print()
-
-    if args.skip_validation:
-        print('Skipping validation of SPIR-V binaries as requested.')
-    else:
-        print('Validating SPIR-V binaries...')
-        validate_spirv(args.output_dir, args.validator, args.verbose)
-    print()
+    for subdir in spirv_envs:
+        src_dir = os.path.join(args.source_dir, subdir)
+        out_dir = os.path.join(args.output_dir, subdir)
+        spirv_env = 'spv1.0' if subdir == '' else subdir
+        print('Assembling SPIR-V source into binaries for target {}...'.
+              format(spirv_env))
+        assemble_spirv(src_dir, out_dir, args.assembler,
+                    spirv_env, args.verbose)
+        print('Finished assembling SPIR-V binaries.')
+        print()
+
+        if args.skip_validation:
+            print('Skipping validation of SPIR-V binaries as requested.')
+        else:
+            print('Validating SPIR-V binaries for target {}...'.
+                  format(spirv_env))
+            validate_spirv(out_dir, args.validator,
+                    spirv_env, args.verbose)
+            print('All SPIR-V binaries validated successfully.')
+        print()
 
     print('Done.')
 
diff --git a/test_conformance/spirv_new/main.cpp b/test_conformance/spirv_new/main.cpp
index 4156683737..fc3c0bec7f 100644
--- a/test_conformance/spirv_new/main.cpp
+++ b/test_conformance/spirv_new/main.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include <stdio.h>
 #include <string.h>
@@ -30,9 +33,12 @@ const std::string slash = "/";
 #endif
 
 const std::string spvExt = ".spv";
+bool gVersionSkip = false;
 std::string gAddrWidth = "";
 std::string spvBinariesPath = "spirv_bin";
-std::string spvBinariesPathArg = "--spirv-binaries-path";
+
+const std::string spvBinariesPathArg = "--spirv-binaries-path";
+const std::string spvVersionSkipArg = "--skip-spirv-version-check";
 
 std::vector<unsigned char> readBinary(const char *file_name)
 {
@@ -224,7 +230,10 @@ test_status InitCL(cl_device_id id)
 
 void printUsage() {
     log_info("Reading SPIR-V files from default '%s' path.\n", spvBinariesPath.c_str());
-    log_info("In case you want to set other directory use '%s' argument.\n", spvBinariesPathArg.c_str());
+    log_info("In case you want to set other directory use '%s' argument.\n",
+             spvBinariesPathArg.c_str());
+    log_info("To skip the SPIR-V version check use the '%s' argument.\n",
+             spvVersionSkipArg.c_str());
 }
 
 int main(int argc, const char *argv[])
@@ -243,6 +252,11 @@ int main(int argc, const char *argv[])
                 modifiedSpvBinariesPath = true;
             }
         }
+        if (argv[i] == spvVersionSkipArg)
+        {
+            gVersionSkip = true;
+            argsRemoveNum++;
+        }
 
         if (argsRemoveNum > 0) {
             for (int j = i; j < (argc - argsRemoveNum); ++j)
diff --git a/test_conformance/spirv_new/procs.h b/test_conformance/spirv_new/procs.h
index b293a52059..a80d4edc85 100644
--- a/test_conformance/spirv_new/procs.h
+++ b/test_conformance/spirv_new/procs.h
@@ -1,17 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to
-the Khronos Group, Inc. This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not
-be disclosed in whole or in part to third parties, and may not be reproduced,
-republished, distributed, transmitted, displayed, broadcast or otherwise
-exploited in any manner without the express prior written permission of Khronos
-Group. The receipt or possession of this code does not convey any rights to
-reproduce, disclose, or distribute its contents, or to manufacture, use, or sell
-anything that it may describe, in whole or in part other than under the terms of
-the Khronos Adopters Agreement or Khronos Conformance Test Source License
-Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #pragma once
 
diff --git a/test_conformance/spirv_new/spirv_asm/basic.spvasm32 b/test_conformance/spirv_new/spirv_asm/basic.spvasm32
new file mode 100644
index 0000000000..a640c8c9bb
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/basic.spvasm32
@@ -0,0 +1,33 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 18
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+          %1 = OpExtInstImport "OpenCL.std" ; imported, but %1 is not referenced by any instruction below
+               OpMemoryModel Physical32 OpenCL ; 32-bit physical addressing variant of this test
+               OpEntryPoint Kernel %9 "test_basic" %gl_GlobalInvocationID ; kernel: each work-item copies one uint from the 2nd pointer argument to the 1st
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+       %uint = OpTypeInt 32 0 ; 32-bit integer, signedness operand 0
+     %v3uint = OpTypeVector %uint 3 ; type of the 3-component invocation id
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint ; pointer type shared by both kernel arguments
+          %8 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void(uint*, uint*)
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %9 = OpFunction %void None %8
+         %10 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; 1st argument: destination (target of the OpStore below)
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; 2nd argument: source (read by the OpLoad below)
+         %12 = OpLabel
+         %13 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 16 ; full 3-component invocation id
+         %14 = OpCompositeExtract %uint %13 0 ; component 0 of the id, used as the element index
+         %15 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %14 ; address of source[index]
+         %16 = OpLoad %uint %15 Aligned 4 ; value = source[index]
+         %17 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %10 %14 ; address of destination[index]
+               OpStore %17 %16 Aligned 4 ; destination[index] = value
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/basic.spvasm64 b/test_conformance/spirv_new/spirv_asm/basic.spvasm64
new file mode 100644
index 0000000000..662bffbf6f
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/basic.spvasm64
@@ -0,0 +1,38 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 22
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; this module declares the 64-bit %ulong type below
+          %1 = OpExtInstImport "OpenCL.std" ; imported, but %1 is not referenced by any instruction below
+               OpMemoryModel Physical64 OpenCL ; 64-bit physical addressing variant of this test
+               OpEntryPoint Kernel %10 "test_basic" %gl_GlobalInvocationID ; kernel: each work-item copies one uint from the 2nd pointer argument to the 1st
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0 ; 64-bit integer, signedness operand 0
+       %uint = OpTypeInt 32 0 ; 32-bit integer, signedness operand 0
+    %v3ulong = OpTypeVector %ulong 3 ; type of the 3-component invocation id
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint ; pointer type shared by both kernel arguments
+          %9 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void(uint*, uint*)
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+         %10 = OpFunction %void None %9
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; 1st argument: destination (target of the OpStore below)
+         %12 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; 2nd argument: source (read by the OpLoad below)
+         %13 = OpLabel
+         %14 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 32 ; full 3-component invocation id
+         %15 = OpCompositeExtract %ulong %14 0 ; component 0 of the id (64-bit)
+         %16 = OpUConvert %uint %15 ; narrow the id component to 32 bits
+         %17 = OpSConvert %ulong %16 ; widen back to 64 bits to index the source pointer
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %12 %17 ; address of source[index]
+         %19 = OpLoad %uint %18 Aligned 4 ; value = source[index]
+         %20 = OpSConvert %ulong %16 ; same widening repeated for the destination index
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %20 ; address of destination[index]
+               OpStore %21 %19 Aligned 4 ; destination[index] = value
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_rounding_rte_half_short.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rte_half_short.spvasm32
new file mode 100644
index 0000000000..1e693709d8
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rte_half_short.spvasm32
@@ -0,0 +1,42 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 20
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Float16 ; required by the 16-bit float type %half
+               OpCapability Int16 ; required by the 16-bit integer type %ushort
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "decorate_rounding_rte_half_short" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %in "in"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %6 FPRoundingMode RTE ; %6 is the half-to-integer conversion in the kernel body
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+     %ushort = OpTypeInt 16 0
+%_ptr_CrossWorkgroup_ushort = OpTypePointer CrossWorkgroup %ushort
+       %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %14 = OpTypeFunction %void %_ptr_CrossWorkgroup_ushort %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %14 ; kernel body: res[gid] = convert(in[gid])
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_ushort
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %15 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 16 ; 3 x 32-bit vector aligned as 4 components (16 bytes); 0 is not a power-of-two alignment
+         %16 = OpCompositeExtract %uint %15 0 ; component 0 of the global invocation id
+         %17 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %16
+         %18 = OpLoad %half %17 Aligned 2
+          %6 = OpConvertFToS %ushort %18 ; float-to-signed-integer conversion carrying the FPRoundingMode decoration
+         %19 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ushort %res %16
+               OpStore %19 %6 ; write the converted value to res at the same index
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_rounding_rte_half_short.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rte_half_short.spvasm64
new file mode 100644
index 0000000000..db50de90d4
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rte_half_short.spvasm64
@@ -0,0 +1,46 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 23
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Int16 ; required by the 16-bit integer type %ushort
+               OpCapability Float16 ; required by the 16-bit float type %half
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "decorate_rounding_rte_half_short" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %in "in"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %6 FPRoundingMode RTE ; %6 is the half-to-integer conversion in the kernel body
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+     %ushort = OpTypeInt 16 0
+%_ptr_CrossWorkgroup_ushort = OpTypePointer CrossWorkgroup %ushort
+       %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %14 = OpTypeFunction %void %_ptr_CrossWorkgroup_ushort %_ptr_CrossWorkgroup_half
+   %ulong_32 = OpConstant %ulong 32
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %14 ; kernel body: res[gid] = convert(in[gid])
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_ushort
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %16 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 32 ; 3 x 64-bit vector aligned as 4 components (32 bytes); 0 is not a power-of-two alignment
+         %17 = OpCompositeExtract %ulong %16 0 ; component 0 of the global invocation id
+         %18 = OpShiftLeftLogical %ulong %17 %ulong_32 ; shift left by 32, then ...
+         %19 = OpShiftRightArithmetic %ulong %18 %ulong_32 ; ... arithmetic shift right by 32: low 32 bits of the id, sign-extended
+         %20 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %19
+         %21 = OpLoad %half %20 Aligned 2
+          %6 = OpConvertFToS %ushort %21 ; float-to-signed-integer conversion carrying the FPRoundingMode decoration
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ushort %res %19
+               OpStore %22 %6 ; write the converted value to res at the same index
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtn_half_short.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtn_half_short.spvasm32
new file mode 100644
index 0000000000..c7c496c402
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtn_half_short.spvasm32
@@ -0,0 +1,42 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 21
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Float16 ; required by the 16-bit float type %half
+               OpCapability Int16 ; required by the 16-bit integer type %ushort
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "decorate_rounding_rtn_half_short" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %in "in"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %6 FPRoundingMode RTN ; %6 is the half-to-integer conversion in the kernel body
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+     %ushort = OpTypeInt 16 0
+%_ptr_CrossWorkgroup_ushort = OpTypePointer CrossWorkgroup %ushort
+       %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %15 = OpTypeFunction %void %_ptr_CrossWorkgroup_ushort %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %15 ; kernel body: res[gid] = convert(in[gid])
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_ushort
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %16 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 16 ; 3 x 32-bit vector aligned as 4 components (16 bytes); 0 is not a power-of-two alignment
+         %17 = OpCompositeExtract %uint %16 0 ; component 0 of the global invocation id
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %17
+         %19 = OpLoad %half %18 Aligned 2
+          %6 = OpConvertFToS %ushort %19 ; float-to-signed-integer conversion carrying the FPRoundingMode decoration
+         %20 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ushort %res %17
+               OpStore %20 %6 ; write the converted value to res at the same index
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtn_half_short.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtn_half_short.spvasm64
new file mode 100644
index 0000000000..a30f6450c8
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtn_half_short.spvasm64
@@ -0,0 +1,46 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 23
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Float16 ; required by the 16-bit float type %half
+               OpCapability Int16 ; required by the 16-bit integer type %ushort
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "decorate_rounding_rtn_half_short" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %in "in"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %6 FPRoundingMode RTN ; %6 is the half-to-integer conversion in the kernel body
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+     %ushort = OpTypeInt 16 0
+%_ptr_CrossWorkgroup_ushort = OpTypePointer CrossWorkgroup %ushort
+       %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %14 = OpTypeFunction %void %_ptr_CrossWorkgroup_ushort %_ptr_CrossWorkgroup_half
+   %ulong_32 = OpConstant %ulong 32
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %14 ; kernel body: res[gid] = convert(in[gid])
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_ushort
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %16 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 32 ; 3 x 64-bit vector aligned as 4 components (32 bytes); 0 is not a power-of-two alignment
+         %17 = OpCompositeExtract %ulong %16 0 ; component 0 of the global invocation id
+         %18 = OpShiftLeftLogical %ulong %17 %ulong_32 ; shift left by 32, then ...
+         %19 = OpShiftRightArithmetic %ulong %18 %ulong_32 ; ... arithmetic shift right by 32: low 32 bits of the id, sign-extended
+         %20 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %19
+         %21 = OpLoad %half %20 Aligned 2
+          %6 = OpConvertFToS %ushort %21 ; float-to-signed-integer conversion carrying the FPRoundingMode decoration
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ushort %res %19
+               OpStore %22 %6 ; write the converted value to res at the same index
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtp_half_short.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtp_half_short.spvasm32
new file mode 100644
index 0000000000..43c7fa24d1
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtp_half_short.spvasm32
@@ -0,0 +1,42 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 21
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Float16 ; required by the 16-bit float type %half
+               OpCapability Int16 ; required by the 16-bit integer type %ushort
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "decorate_rounding_rtp_half_short" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %in "in"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %6 FPRoundingMode RTP ; %6 is the half-to-integer conversion in the kernel body
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+     %ushort = OpTypeInt 16 0
+%_ptr_CrossWorkgroup_ushort = OpTypePointer CrossWorkgroup %ushort
+       %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %15 = OpTypeFunction %void %_ptr_CrossWorkgroup_ushort %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %15 ; kernel body: res[gid] = convert(in[gid])
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_ushort
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %16 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 16 ; 3 x 32-bit vector aligned as 4 components (16 bytes); 0 is not a power-of-two alignment
+         %17 = OpCompositeExtract %uint %16 0 ; component 0 of the global invocation id
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %17
+         %19 = OpLoad %half %18 Aligned 2
+          %6 = OpConvertFToS %ushort %19 ; float-to-signed-integer conversion carrying the FPRoundingMode decoration
+         %20 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ushort %res %17
+               OpStore %20 %6 ; write the converted value to res at the same index
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtp_half_short.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtp_half_short.spvasm64
new file mode 100644
index 0000000000..e3a6b4052e
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtp_half_short.spvasm64
@@ -0,0 +1,46 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 23
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Float16 ; required by the 16-bit float type %half
+               OpCapability Int16 ; required by the 16-bit integer type %ushort
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "decorate_rounding_rtp_half_short" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %in "in"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %6 FPRoundingMode RTP ; %6 is the half-to-integer conversion in the kernel body
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+     %ushort = OpTypeInt 16 0
+%_ptr_CrossWorkgroup_ushort = OpTypePointer CrossWorkgroup %ushort
+       %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %14 = OpTypeFunction %void %_ptr_CrossWorkgroup_ushort %_ptr_CrossWorkgroup_half
+   %ulong_32 = OpConstant %ulong 32
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %14 ; kernel body: res[gid] = convert(in[gid])
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_ushort
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %16 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 32 ; 3 x 64-bit vector aligned as 4 components (32 bytes); 0 is not a power-of-two alignment
+         %17 = OpCompositeExtract %ulong %16 0 ; component 0 of the global invocation id
+         %18 = OpShiftLeftLogical %ulong %17 %ulong_32 ; shift left by 32, then ...
+         %19 = OpShiftRightArithmetic %ulong %18 %ulong_32 ; ... arithmetic shift right by 32: low 32 bits of the id, sign-extended
+         %20 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %19
+         %21 = OpLoad %half %20 Aligned 2
+          %6 = OpConvertFToS %ushort %21 ; float-to-signed-integer conversion carrying the FPRoundingMode decoration
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ushort %res %19
+               OpStore %22 %6 ; write the converted value to res at the same index
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtz_half_short.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtz_half_short.spvasm32
new file mode 100644
index 0000000000..2d931cbdb1
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtz_half_short.spvasm32
@@ -0,0 +1,42 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 21
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Float16 ; required by the 16-bit float type %half
+               OpCapability Int16 ; required by the 16-bit integer type %ushort
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "decorate_rounding_rtz_half_short" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %in "in"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %6 FPRoundingMode RTZ ; %6 is the half-to-integer conversion in the kernel body
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+     %ushort = OpTypeInt 16 0
+%_ptr_CrossWorkgroup_ushort = OpTypePointer CrossWorkgroup %ushort
+       %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %15 = OpTypeFunction %void %_ptr_CrossWorkgroup_ushort %_ptr_CrossWorkgroup_half
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %15 ; kernel body: res[gid] = convert(in[gid])
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_ushort
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %16 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 16 ; 3 x 32-bit vector aligned as 4 components (16 bytes); 0 is not a power-of-two alignment
+         %17 = OpCompositeExtract %uint %16 0 ; component 0 of the global invocation id
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %17
+         %19 = OpLoad %half %18 Aligned 2
+          %6 = OpConvertFToS %ushort %19 ; float-to-signed-integer conversion carrying the FPRoundingMode decoration
+         %20 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ushort %res %17
+               OpStore %20 %6 ; write the converted value to res at the same index
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtz_half_short.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtz_half_short.spvasm64
new file mode 100644
index 0000000000..e237448f0c
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_rounding_rtz_half_short.spvasm64
@@ -0,0 +1,46 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 23
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64
+               OpCapability Float16 ; required by the 16-bit float type %half
+               OpCapability Int16 ; required by the 16-bit integer type %ushort
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "decorate_rounding_rtz_half_short" %gl_GlobalInvocationID
+               OpName %res "res"
+               OpName %in "in"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %6 FPRoundingMode RTZ ; %6 is the half-to-integer conversion in the kernel body
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+     %ushort = OpTypeInt 16 0
+%_ptr_CrossWorkgroup_ushort = OpTypePointer CrossWorkgroup %ushort
+       %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %14 = OpTypeFunction %void %_ptr_CrossWorkgroup_ushort %_ptr_CrossWorkgroup_half
+   %ulong_32 = OpConstant %ulong 32
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %14 ; kernel body: res[gid] = convert(in[gid])
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_ushort
+         %in = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %16 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 32 ; 3 x 64-bit vector aligned as 4 components (32 bytes); 0 is not a power-of-two alignment
+         %17 = OpCompositeExtract %ulong %16 0 ; component 0 of the global invocation id
+         %18 = OpShiftLeftLogical %ulong %17 %ulong_32 ; shift left by 32, then ...
+         %19 = OpShiftRightArithmetic %ulong %18 %ulong_32 ; ... arithmetic shift right by 32: low 32 bits of the id, sign-extended
+         %20 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %in %19
+         %21 = OpLoad %half %20 Aligned 2
+          %6 = OpConvertFToS %ushort %21 ; float-to-signed-integer conversion carrying the FPRoundingMode decoration
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_ushort %res %19
+               OpStore %22 %6 ; write the converted value to res at the same index
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_int.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_int.spvasm32
similarity index 97%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_int.spvasm32
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_int.spvasm32
index 3fa47c9755..1b811208a6 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_int.spvasm32
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_int.spvasm32
@@ -8,7 +8,7 @@
                OpCapability Kernel
                OpCapability Float64
                OpMemoryModel Physical32 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_int" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_double_to_int" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uint.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_int.spvasm64
similarity index 98%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uint.spvasm64
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_int.spvasm64
index 7d9efb08bc..5bec065f64 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uint.spvasm64
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_int.spvasm64
@@ -9,7 +9,7 @@
                OpCapability Int64
                OpCapability Float64
                OpMemoryModel Physical64 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_uint" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_double_to_int" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uint.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_uint.spvasm32
similarity index 97%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uint.spvasm32
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_uint.spvasm32
index 0672489157..c48185d3e3 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uint.spvasm32
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_uint.spvasm32
@@ -8,7 +8,7 @@
                OpCapability Kernel
                OpCapability Float64
                OpMemoryModel Physical32 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_uint" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_double_to_uint" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_int.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_uint.spvasm64
similarity index 98%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_int.spvasm64
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_uint.spvasm64
index 8609e20896..49d19b8a6c 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_int.spvasm64
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_double_to_uint.spvasm64
@@ -9,7 +9,7 @@
                OpCapability Int64
                OpCapability Float64
                OpMemoryModel Physical64 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_int" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_double_to_uint" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_char.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_char.spvasm32
similarity index 98%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_char.spvasm32
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_char.spvasm32
index 5437067f93..5e1a9c261f 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_char.spvasm32
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_char.spvasm32
@@ -8,7 +8,7 @@
                OpCapability Kernel
                OpCapability Int8
                OpMemoryModel Physical32 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_char" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_float_to_char" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_char.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_char.spvasm64
similarity index 98%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_char.spvasm64
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_char.spvasm64
index ba4d6492ac..af74058918 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_char.spvasm64
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_char.spvasm64
@@ -9,7 +9,7 @@
                OpCapability Int64
                OpCapability Int8
                OpMemoryModel Physical64 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_char" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_float_to_char" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_short.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_short.spvasm32
similarity index 97%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_short.spvasm32
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_short.spvasm32
index dbb3b44dc5..d256583048 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_short.spvasm32
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_short.spvasm32
@@ -8,7 +8,7 @@
                OpCapability Kernel
                OpCapability Int16
                OpMemoryModel Physical32 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_short" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_float_to_short" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_short.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_short.spvasm64
similarity index 98%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_short.spvasm64
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_short.spvasm64
index 2915c12c0b..7b9cfa8052 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_short.spvasm64
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_short.spvasm64
@@ -9,7 +9,7 @@
                OpCapability Int64
                OpCapability Int16
                OpMemoryModel Physical64 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_short" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_float_to_short" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uchar.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_uchar.spvasm32
similarity index 97%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uchar.spvasm32
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_uchar.spvasm32
index 9bffb686c8..150a340224 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uchar.spvasm32
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_uchar.spvasm32
@@ -8,7 +8,7 @@
                OpCapability Kernel
                OpCapability Int8
                OpMemoryModel Physical32 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_uchar" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_float_to_uchar" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uchar.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_uchar.spvasm64
similarity index 98%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uchar.spvasm64
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_uchar.spvasm64
index 354639fe65..3152a02695 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_uchar.spvasm64
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_uchar.spvasm64
@@ -9,7 +9,7 @@
                OpCapability Int64
                OpCapability Int8
                OpMemoryModel Physical64 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_uchar" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_float_to_uchar" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_ushort.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_ushort.spvasm32
similarity index 97%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_ushort.spvasm32
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_ushort.spvasm32
index ffbb41776e..26dc05f57d 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_ushort.spvasm32
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_ushort.spvasm32
@@ -8,7 +8,7 @@
                OpCapability Kernel
                OpCapability Int16
                OpMemoryModel Physical32 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_ushort" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_float_to_ushort" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_ushort.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_ushort.spvasm64
similarity index 98%
rename from test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_ushort.spvasm64
rename to test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_ushort.spvasm64
index 317f99299f..a89239bfa7 100644
--- a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_ushort.spvasm64
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_float_to_ushort.spvasm64
@@ -9,7 +9,7 @@
                OpCapability Int64
                OpCapability Int16
                OpMemoryModel Physical64 OpenCL
-               OpEntryPoint Kernel %1 "decorate_saturated_conversion_ushort" %gl_GlobalInvocationID
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_float_to_ushort" %gl_GlobalInvocationID
                OpName %res "res"
                OpName %lhs "lhs"
                OpName %rhs "rhs"
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_char.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_char.spvasm32
new file mode 100644
index 0000000000..713d37cd7e
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_char.spvasm32
@@ -0,0 +1,47 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 25
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int8 ; needed for the 8-bit %uchar result type
+               OpCapability Float16 ; needed for the 16-bit %half inputs
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_half_to_char" %gl_GlobalInvocationID ; kernel: res[i] = signed_convert(lhs[i] * rhs[i])
+               OpName %res "res"
+               OpName %lhs "lhs"
+               OpName %rhs "rhs"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %7 SaturatedConversion ; %7 is the OpConvertFToS result in the kernel body
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+      %uchar = OpTypeInt 8 0 ; 8-bit integer, signedness operand 0; "uchar" is only the id name, the signed convert below also uses it
+%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
+      %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %16 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_half %_ptr_CrossWorkgroup_half ; void(res, lhs, rhs)
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %16
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
+        %lhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+        %rhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %17 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0 ; NOTE(review): alignment literal is 0 here; other modules in this patch use 16/32 for this load - confirm intended
+         %18 = OpCompositeExtract %uint %17 0 ; component 0 of the global invocation id, used as the element index
+         %19 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %lhs %18
+         %20 = OpLoad %half %19 Aligned 2 ; lhs[index]
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %rhs %18
+         %22 = OpLoad %half %21 Aligned 2 ; rhs[index]
+         %23 = OpFMul %half %20 %22 ; product computed in half precision
+          %7 = OpConvertFToS %uchar %23 ; float -> signed 8-bit integer; carries the SaturatedConversion decoration
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %res %18
+               OpStore %24 %7 ; res[index] = converted product
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_char.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_char.spvasm64
new file mode 100644
index 0000000000..10d8caa542
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_char.spvasm64
@@ -0,0 +1,51 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 28
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; needed for the 64-bit %ulong invocation id
+               OpCapability Int8 ; needed for the 8-bit %uchar result type
+               OpCapability Float16 ; needed for the 16-bit %half inputs
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_half_to_char" %gl_GlobalInvocationID ; kernel: res[i] = signed_convert(lhs[i] * rhs[i])
+               OpName %res "res"
+               OpName %lhs "lhs"
+               OpName %rhs "rhs"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %7 SaturatedConversion ; %7 is the OpConvertFToS result in the kernel body
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+      %uchar = OpTypeInt 8 0 ; 8-bit integer, signedness operand 0; "uchar" is only the id name, the signed convert below also uses it
+%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
+      %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %16 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_half %_ptr_CrossWorkgroup_half ; void(res, lhs, rhs)
+   %ulong_32 = OpConstant %ulong 32 ; shift amount for the two shifts in the kernel body
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %16
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
+        %lhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+        %rhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %18 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0 ; NOTE(review): alignment literal is 0 here; other modules in this patch use 16/32 for this load - confirm intended
+         %19 = OpCompositeExtract %ulong %18 0 ; component 0 of the global invocation id
+         %20 = OpShiftLeftLogical %ulong %19 %ulong_32 ; move the low 32 bits into the high half
+         %21 = OpShiftRightArithmetic %ulong %20 %ulong_32 ; arithmetic shift back: sign-extended low 32 bits, used as the element index
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %lhs %21
+         %23 = OpLoad %half %22 Aligned 2 ; lhs[index]
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %rhs %21
+         %25 = OpLoad %half %24 Aligned 2 ; rhs[index]
+         %26 = OpFMul %half %23 %25 ; product computed in half precision
+          %7 = OpConvertFToS %uchar %26 ; float -> signed 8-bit integer; carries the SaturatedConversion decoration
+         %27 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %res %21
+               OpStore %27 %7 ; res[index] = converted product
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_uchar.spvasm32 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_uchar.spvasm32
new file mode 100644
index 0000000000..41b6830271
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_uchar.spvasm32
@@ -0,0 +1,47 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 25
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int8 ; needed for the 8-bit %uchar result type
+               OpCapability Float16 ; needed for the 16-bit %half inputs
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_half_to_uchar" %gl_GlobalInvocationID ; kernel: res[i] = unsigned_convert(lhs[i] * rhs[i])
+               OpName %res "res"
+               OpName %lhs "lhs"
+               OpName %rhs "rhs"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %7 SaturatedConversion ; %7 is the OpConvertFToU result in the kernel body
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+      %uchar = OpTypeInt 8 0 ; 8-bit integer, signedness operand 0
+%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
+      %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %16 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_half %_ptr_CrossWorkgroup_half ; void(res, lhs, rhs)
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %1 = OpFunction %void None %16
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
+        %lhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+        %rhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %17 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 0 ; NOTE(review): alignment literal is 0 here; other modules in this patch use 16/32 for this load - confirm intended
+         %18 = OpCompositeExtract %uint %17 0 ; component 0 of the global invocation id, used as the element index
+         %19 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %lhs %18
+         %20 = OpLoad %half %19 Aligned 2 ; lhs[index]
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %rhs %18
+         %22 = OpLoad %half %21 Aligned 2 ; rhs[index]
+         %23 = OpFMul %half %20 %22 ; product computed in half precision
+          %7 = OpConvertFToU %uchar %23 ; float -> unsigned 8-bit integer; carries the SaturatedConversion decoration
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %res %18
+               OpStore %24 %7 ; res[index] = converted product
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_uchar.spvasm64 b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_uchar.spvasm64
new file mode 100644
index 0000000000..066b9d367f
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/decorate_saturated_conversion_half_to_uchar.spvasm64
@@ -0,0 +1,51 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos SPIR-V Tools Assembler; 0
+; Bound: 28
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; needed for the 64-bit %ulong invocation id
+               OpCapability Int8 ; needed for the 8-bit %uchar result type
+               OpCapability Float16 ; needed for the 16-bit %half inputs
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %1 "decorate_saturated_conversion_half_to_uchar" %gl_GlobalInvocationID ; kernel: res[i] = unsigned_convert(lhs[i] * rhs[i])
+               OpName %res "res"
+               OpName %lhs "lhs"
+               OpName %rhs "rhs"
+               OpName %entry "entry"
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_GlobalInvocationId" Import
+               OpDecorate %7 SaturatedConversion ; %7 is the OpConvertFToU result in the kernel body
+      %ulong = OpTypeInt 64 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+      %uchar = OpTypeInt 8 0 ; 8-bit integer, signedness operand 0
+%_ptr_CrossWorkgroup_uchar = OpTypePointer CrossWorkgroup %uchar
+      %half = OpTypeFloat 16
+%_ptr_CrossWorkgroup_half = OpTypePointer CrossWorkgroup %half
+         %16 = OpTypeFunction %void %_ptr_CrossWorkgroup_uchar %_ptr_CrossWorkgroup_half %_ptr_CrossWorkgroup_half ; void(res, lhs, rhs)
+   %ulong_32 = OpConstant %ulong 32 ; shift amount for the two shifts in the kernel body
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+          %1 = OpFunction %void None %16
+        %res = OpFunctionParameter %_ptr_CrossWorkgroup_uchar
+        %lhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+        %rhs = OpFunctionParameter %_ptr_CrossWorkgroup_half
+      %entry = OpLabel
+         %18 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 0 ; NOTE(review): alignment literal is 0 here; other modules in this patch use 16/32 for this load - confirm intended
+         %19 = OpCompositeExtract %ulong %18 0 ; component 0 of the global invocation id
+         %20 = OpShiftLeftLogical %ulong %19 %ulong_32 ; move the low 32 bits into the high half
+         %21 = OpShiftRightArithmetic %ulong %20 %ulong_32 ; arithmetic shift back: sign-extended low 32 bits, used as the element index
+         %22 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %lhs %21
+         %23 = OpLoad %half %22 Aligned 2 ; lhs[index]
+         %24 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_half %rhs %21
+         %25 = OpLoad %half %24 Aligned 2 ; rhs[index]
+         %26 = OpFMul %half %23 %25 ; product computed in half precision
+          %7 = OpConvertFToU %uchar %26 ; float -> unsigned 8-bit integer; carries the SaturatedConversion decoration
+         %27 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uchar %res %21
+               OpStore %27 %7 ; res[index] = converted product
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_main.spvasm32 b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_main.spvasm32
new file mode 100644
index 0000000000..dbdbe32e00
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_main.spvasm32
@@ -0,0 +1,47 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 27
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpExtension "SPV_KHR_linkonce_odr" ; declared alongside the LinkOnceODR linkage type used on %a below
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %17 "test_linkonce_odr" %__spirv_BuiltInGlobalInvocationId ; kernel: out[id] = a(id) + b(id)
+               OpDecorate %__spirv_BuiltInGlobalInvocationId BuiltIn GlobalInvocationId
+               OpDecorate %__spirv_BuiltInGlobalInvocationId Constant
+               OpDecorate %18 FuncParamAttr NoCapture
+               OpDecorate %a LinkageAttributes "a" LinkOnceODR ; %a has a body in this module
+               OpDecorate %b LinkageAttributes "b" Import ; %b is only declared in this module
+               OpDecorate %__spirv_BuiltInGlobalInvocationId LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %uint_5 = OpConstant %uint 5
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+          %6 = OpTypeFunction %uint %uint ; uint(uint), shared by %a and %b
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+         %16 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint ; kernel signature: void(uint pointer)
+%__spirv_BuiltInGlobalInvocationId = OpVariable %_ptr_Input_v3uint Input
+          %b = OpFunction %uint None %6 ; declaration only: no OpLabel before OpFunctionEnd
+          %8 = OpFunctionParameter %uint
+               OpFunctionEnd
+          %a = OpFunction %uint Pure %6
+         %10 = OpFunctionParameter %uint
+         %11 = OpLabel
+         %13 = OpIAdd %uint %10 %uint_5 ; a(x) = x + 5
+               OpReturnValue %13
+               OpFunctionEnd
+         %17 = OpFunction %void None %16
+         %18 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %19 = OpLabel
+         %20 = OpLoad %v3uint %__spirv_BuiltInGlobalInvocationId Aligned 16
+         %21 = OpCompositeExtract %uint %20 0 ; component 0 of the global invocation id
+         %22 = OpFunctionCall %uint %a %21
+         %23 = OpFunctionCall %uint %b %21
+         %24 = OpIAdd %uint %22 %23 ; a(id) + b(id)
+         %25 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %18 %21
+               OpStore %25 %24 Aligned 4 ; out[id] = a(id) + b(id)
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_main.spvasm64 b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_main.spvasm64
new file mode 100644
index 0000000000..243ab6b70e
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_main.spvasm64
@@ -0,0 +1,51 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 30
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; needed for the 64-bit %ulong invocation id
+               OpExtension "SPV_KHR_linkonce_odr" ; declared alongside the LinkOnceODR linkage type used on %a below
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %18 "test_linkonce_odr" %__spirv_BuiltInGlobalInvocationId ; kernel: out[id] = a(id) + b(id)
+               OpDecorate %__spirv_BuiltInGlobalInvocationId BuiltIn GlobalInvocationId
+               OpDecorate %__spirv_BuiltInGlobalInvocationId Constant
+               OpDecorate %19 FuncParamAttr NoCapture
+               OpDecorate %a LinkageAttributes "a" LinkOnceODR ; %a has a body in this module
+               OpDecorate %b LinkageAttributes "b" Import ; %b is only declared in this module
+               OpDecorate %__spirv_BuiltInGlobalInvocationId LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+       %uint = OpTypeInt 32 0
+     %uint_5 = OpConstant %uint 5
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+          %7 = OpTypeFunction %uint %uint ; uint(uint), shared by %a and %b
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+         %17 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint ; kernel signature: void(uint pointer)
+%__spirv_BuiltInGlobalInvocationId = OpVariable %_ptr_Input_v3ulong Input
+          %b = OpFunction %uint None %7 ; declaration only: no OpLabel before OpFunctionEnd
+          %9 = OpFunctionParameter %uint
+               OpFunctionEnd
+          %a = OpFunction %uint Pure %7
+         %11 = OpFunctionParameter %uint
+         %12 = OpLabel
+         %14 = OpIAdd %uint %11 %uint_5 ; a(x) = x + 5
+               OpReturnValue %14
+               OpFunctionEnd
+         %18 = OpFunction %void None %17
+         %19 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %20 = OpLabel
+         %21 = OpLoad %v3ulong %__spirv_BuiltInGlobalInvocationId Aligned 32
+         %22 = OpCompositeExtract %ulong %21 0 ; component 0 of the global invocation id
+         %23 = OpUConvert %uint %22 ; narrow the 64-bit id to 32 bits for the uint(uint) calls
+         %24 = OpFunctionCall %uint %a %23
+         %25 = OpFunctionCall %uint %b %23
+         %26 = OpIAdd %uint %24 %25 ; a(id) + b(id)
+         %27 = OpSConvert %ulong %23 ; widen the 32-bit id back to 64 bits for indexing
+         %28 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %19 %27
+               OpStore %28 %26 Aligned 4 ; out[id] = a(id) + b(id)
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_noa_main.spvasm32 b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_noa_main.spvasm32
new file mode 100644
index 0000000000..e0b01b6695
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_noa_main.spvasm32
@@ -0,0 +1,44 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 27
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpExtension "SPV_KHR_linkonce_odr" ; NOTE(review): no LinkOnceODR decoration appears in this module; presumably kept to pair with the module that defines "a" - confirm
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %17 "test_linkonce_odr" %__spirv_BuiltInGlobalInvocationId ; kernel: out[id] = a(id) + b(id)
+               OpDecorate %__spirv_BuiltInGlobalInvocationId BuiltIn GlobalInvocationId
+               OpDecorate %__spirv_BuiltInGlobalInvocationId Constant
+               OpDecorate %18 FuncParamAttr NoCapture
+               OpDecorate %a LinkageAttributes "a" Import ; %a is only declared in this module
+               OpDecorate %b LinkageAttributes "b" Import ; %b is only declared in this module
+               OpDecorate %__spirv_BuiltInGlobalInvocationId LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %uint_5 = OpConstant %uint 5 ; not referenced by any instruction in this module
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+          %6 = OpTypeFunction %uint %uint ; uint(uint), shared by %a and %b
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+         %16 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint ; kernel signature: void(uint pointer)
+%__spirv_BuiltInGlobalInvocationId = OpVariable %_ptr_Input_v3uint Input
+          %b = OpFunction %uint None %6 ; declaration only: no OpLabel before OpFunctionEnd
+          %8 = OpFunctionParameter %uint
+               OpFunctionEnd
+          %a = OpFunction %uint None %6 ; declaration only: no OpLabel before OpFunctionEnd
+         %10 = OpFunctionParameter %uint
+               OpFunctionEnd
+         %17 = OpFunction %void None %16
+         %18 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %19 = OpLabel
+         %20 = OpLoad %v3uint %__spirv_BuiltInGlobalInvocationId Aligned 16
+         %21 = OpCompositeExtract %uint %20 0 ; component 0 of the global invocation id
+         %22 = OpFunctionCall %uint %a %21
+         %23 = OpFunctionCall %uint %b %21
+         %24 = OpIAdd %uint %22 %23 ; a(id) + b(id)
+         %25 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %18 %21
+               OpStore %25 %24 Aligned 4 ; out[id] = a(id) + b(id)
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_noa_main.spvasm64 b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_noa_main.spvasm64
new file mode 100644
index 0000000000..bb02202765
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_noa_main.spvasm64
@@ -0,0 +1,48 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 30
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; needed for the 64-bit %ulong invocation id
+               OpExtension "SPV_KHR_linkonce_odr" ; NOTE(review): no LinkOnceODR decoration appears in this module; presumably kept to pair with the module that defines "a" - confirm
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %18 "test_linkonce_odr" %__spirv_BuiltInGlobalInvocationId ; kernel: out[id] = a(id) + b(id)
+               OpDecorate %__spirv_BuiltInGlobalInvocationId BuiltIn GlobalInvocationId
+               OpDecorate %__spirv_BuiltInGlobalInvocationId Constant
+               OpDecorate %19 FuncParamAttr NoCapture
+               OpDecorate %a LinkageAttributes "a" Import ; %a is only declared in this module
+               OpDecorate %b LinkageAttributes "b" Import ; %b is only declared in this module
+               OpDecorate %__spirv_BuiltInGlobalInvocationId LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+       %uint = OpTypeInt 32 0
+     %uint_5 = OpConstant %uint 5 ; not referenced by any instruction in this module
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+          %7 = OpTypeFunction %uint %uint ; uint(uint), shared by %a and %b
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+         %17 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint ; kernel signature: void(uint pointer)
+%__spirv_BuiltInGlobalInvocationId = OpVariable %_ptr_Input_v3ulong Input
+          %b = OpFunction %uint None %7 ; declaration only: no OpLabel before OpFunctionEnd
+          %9 = OpFunctionParameter %uint
+               OpFunctionEnd
+          %a = OpFunction %uint None %7 ; declaration only: no OpLabel before OpFunctionEnd
+         %11 = OpFunctionParameter %uint
+               OpFunctionEnd
+         %18 = OpFunction %void None %17
+         %19 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %20 = OpLabel
+         %21 = OpLoad %v3ulong %__spirv_BuiltInGlobalInvocationId Aligned 32
+         %22 = OpCompositeExtract %ulong %21 0 ; component 0 of the global invocation id
+         %23 = OpUConvert %uint %22 ; narrow the 64-bit id to 32 bits for the uint(uint) calls
+         %24 = OpFunctionCall %uint %a %23
+         %25 = OpFunctionCall %uint %b %23
+         %26 = OpIAdd %uint %24 %25 ; a(id) + b(id)
+         %27 = OpSConvert %ulong %23 ; widen the 32-bit id back to 64 bits for indexing
+         %28 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %19 %27
+               OpStore %28 %26 Aligned 4 ; out[id] = a(id) + b(id)
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_obj.spvasm32 b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_obj.spvasm32
new file mode 100644
index 0000000000..59c3f9d936
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_obj.spvasm32
@@ -0,0 +1,28 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 14
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpExtension "SPV_KHR_linkonce_odr" ; declared alongside the LinkOnceODR linkage type used on %a below
+               OpMemoryModel Physical32 OpenCL ; no OpEntryPoint follows: this module only provides functions "a" and "b"
+               OpDecorate %a LinkageAttributes "a" LinkOnceODR
+               OpDecorate %b LinkageAttributes "b" Export
+       %uint = OpTypeInt 32 0
+     %uint_5 = OpConstant %uint 5
+     %uint_0 = OpConstant %uint 0
+          %3 = OpTypeFunction %uint %uint ; uint(uint), shared by %a and %b
+          %a = OpFunction %uint Pure %3
+          %5 = OpFunctionParameter %uint
+          %6 = OpLabel
+          %8 = OpIAdd %uint %5 %uint_5 ; a(x) = x + 5
+               OpReturnValue %8
+               OpFunctionEnd
+          %b = OpFunction %uint Pure %3
+         %10 = OpFunctionParameter %uint
+         %11 = OpLabel
+         %13 = OpISub %uint %uint_0 %10 ; b(x) = 0 - x
+               OpReturnValue %13
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_obj.spvasm64 b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_obj.spvasm64
new file mode 100644
index 0000000000..5df6fdfecf
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/linkage_linkonce_odr_obj.spvasm64
@@ -0,0 +1,28 @@
+; SPIR-V
+; Version: 1.0
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 14
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpExtension "SPV_KHR_linkonce_odr" ; declared alongside the LinkOnceODR linkage type used on %a below
+               OpMemoryModel Physical64 OpenCL ; no OpEntryPoint follows: this module only provides functions "a" and "b"
+               OpDecorate %a LinkageAttributes "a" LinkOnceODR
+               OpDecorate %b LinkageAttributes "b" Export
+       %uint = OpTypeInt 32 0
+     %uint_5 = OpConstant %uint 5
+     %uint_0 = OpConstant %uint 0
+          %3 = OpTypeFunction %uint %uint ; uint(uint), shared by %a and %b
+          %a = OpFunction %uint Pure %3
+          %5 = OpFunctionParameter %uint
+          %6 = OpLabel
+          %8 = OpIAdd %uint %5 %uint_5 ; a(x) = x + 5
+               OpReturnValue %8
+               OpFunctionEnd
+          %b = OpFunction %uint Pure %3
+         %10 = OpFunctionParameter %uint
+         %11 = OpLabel
+         %13 = OpISub %uint %uint_0 %10 ; b(x) = 0 - x
+               OpReturnValue %13
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.1/basic.spvasm32 b/test_conformance/spirv_new/spirv_asm/spv1.1/basic.spvasm32
new file mode 100644
index 0000000000..2388c8402e
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.1/basic.spvasm32
@@ -0,0 +1,33 @@
+; SPIR-V
+; Version: 1.1
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 18
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+          %1 = OpExtInstImport "OpenCL.std" ; imported, but no instruction in this module references %1
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %9 "test_basic" %gl_GlobalInvocationID ; kernel: copies one uint per work-item from the second parameter to the first
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %8 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void(uint pointer, uint pointer)
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %9 = OpFunction %void None %8
+         %10 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %12 = OpLabel
+         %13 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 16
+         %14 = OpCompositeExtract %uint %13 0 ; component 0 of the global invocation id, used as the element index
+         %15 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %14
+         %16 = OpLoad %uint %15 Aligned 4 ; read element [index] through the second parameter (%11)
+         %17 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %10 %14
+               OpStore %17 %16 Aligned 4 ; write it to element [index] through the first parameter (%10)
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.1/basic.spvasm64 b/test_conformance/spirv_new/spirv_asm/spv1.1/basic.spvasm64
new file mode 100644
index 0000000000..80bc770a34
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.1/basic.spvasm64
@@ -0,0 +1,38 @@
+; SPIR-V
+; Version: 1.1
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 22
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; needed for the 64-bit %ulong invocation id
+          %1 = OpExtInstImport "OpenCL.std" ; imported, but no instruction in this module references %1
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %10 "test_basic" %gl_GlobalInvocationID ; kernel: copies one uint per work-item from the second parameter to the first
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+       %uint = OpTypeInt 32 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %9 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void(uint pointer, uint pointer)
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+         %10 = OpFunction %void None %9
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %12 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %13 = OpLabel
+         %14 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 32
+         %15 = OpCompositeExtract %ulong %14 0 ; component 0 of the global invocation id
+         %16 = OpUConvert %uint %15 ; narrow the 64-bit id to 32 bits
+         %17 = OpSConvert %ulong %16 ; widen back to 64 bits to index the source
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %12 %17
+         %19 = OpLoad %uint %18 Aligned 4 ; read element [index] through the second parameter (%12)
+         %20 = OpSConvert %ulong %16 ; same widening again, to index the destination
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %20
+               OpStore %21 %19 Aligned 4 ; write it to element [index] through the first parameter (%11)
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.2/basic.spvasm32 b/test_conformance/spirv_new/spirv_asm/spv1.2/basic.spvasm32
new file mode 100644
index 0000000000..f3233224ca
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.2/basic.spvasm32
@@ -0,0 +1,33 @@
+; SPIR-V
+; Version: 1.2
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 18
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+          %1 = OpExtInstImport "OpenCL.std" ; imported, but no instruction in this module references %1
+               OpMemoryModel Physical32 OpenCL
+               OpEntryPoint Kernel %9 "test_basic" %gl_GlobalInvocationID ; kernel: copies one uint per work-item from the second parameter to the first
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %8 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void(uint pointer, uint pointer)
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %9 = OpFunction %void None %8
+         %10 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %12 = OpLabel
+         %13 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 16
+         %14 = OpCompositeExtract %uint %13 0 ; component 0 of the global invocation id, used as the element index
+         %15 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %14
+         %16 = OpLoad %uint %15 Aligned 4 ; read element [index] through the second parameter (%11)
+         %17 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %10 %14
+               OpStore %17 %16 Aligned 4 ; write it to element [index] through the first parameter (%10)
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.2/basic.spvasm64 b/test_conformance/spirv_new/spirv_asm/spv1.2/basic.spvasm64
new file mode 100644
index 0000000000..fcf8b44e73
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.2/basic.spvasm64
@@ -0,0 +1,38 @@
+; SPIR-V
+; Version: 1.2
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 22
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; needed for the 64-bit %ulong invocation id
+          %1 = OpExtInstImport "OpenCL.std" ; imported, but no instruction in this module references %1
+               OpMemoryModel Physical64 OpenCL
+               OpEntryPoint Kernel %10 "test_basic" %gl_GlobalInvocationID ; kernel: copies one uint per work-item from the second parameter to the first
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+       %uint = OpTypeInt 32 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %9 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void(uint pointer, uint pointer)
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+         %10 = OpFunction %void None %9
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %12 = OpFunctionParameter %_ptr_CrossWorkgroup_uint
+         %13 = OpLabel
+         %14 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 32
+         %15 = OpCompositeExtract %ulong %14 0 ; component 0 of the global invocation id
+         %16 = OpUConvert %uint %15 ; narrow the 64-bit id to 32 bits
+         %17 = OpSConvert %ulong %16 ; widen back to 64 bits to index the source
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %12 %17
+         %19 = OpLoad %uint %18 Aligned 4 ; read element [index] through the second parameter (%12)
+         %20 = OpSConvert %ulong %16 ; same widening again, to index the destination
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %20
+               OpStore %21 %19 Aligned 4 ; write it to element [index] through the first parameter (%11)
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.3/basic.spvasm32 b/test_conformance/spirv_new/spirv_asm/spv1.3/basic.spvasm32
new file mode 100644
index 0000000000..b47fbbf507
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.3/basic.spvasm32
@@ -0,0 +1,33 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 18
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+          %1 = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical32 OpenCL ; 32-bit physical addressing
+               OpEntryPoint Kernel %9 "test_basic" %gl_GlobalInvocationID ; kernel "test_basic" is function %9
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %8 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void (uint *, uint *), both CrossWorkgroup
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %9 = OpFunction %void None %8
+         %10 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 0: written by the OpStore below
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 1: read by the OpLoad below
+         %12 = OpLabel
+         %13 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 16
+         %14 = OpCompositeExtract %uint %13 0 ; component 0 of the global invocation id
+         %15 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %14 ; address of element %14 of parameter 1
+         %16 = OpLoad %uint %15 Aligned 4
+         %17 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %10 %14 ; address of element %14 of parameter 0
+               OpStore %17 %16 Aligned 4 ; parameter0[id] = parameter1[id]
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.3/basic.spvasm64 b/test_conformance/spirv_new/spirv_asm/spv1.3/basic.spvasm64
new file mode 100644
index 0000000000..ba5d232c45
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.3/basic.spvasm64
@@ -0,0 +1,38 @@
+; SPIR-V
+; Version: 1.3
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 22
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; 64-bit integers: %ulong below
+          %1 = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical64 OpenCL ; 64-bit physical addressing
+               OpEntryPoint Kernel %10 "test_basic" %gl_GlobalInvocationID ; kernel "test_basic" is function %10
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+       %uint = OpTypeInt 32 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %9 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void (uint *, uint *), both CrossWorkgroup
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+         %10 = OpFunction %void None %9
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 0: written by the OpStore below
+         %12 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 1: read by the OpLoad below
+         %13 = OpLabel
+         %14 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 32
+         %15 = OpCompositeExtract %ulong %14 0 ; component 0 of the global invocation id
+         %16 = OpUConvert %uint %15 ; narrow the id to 32 bits
+         %17 = OpSConvert %ulong %16 ; sign-extend back to 64 bits for indexing
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %12 %17 ; address of element %17 of parameter 1
+         %19 = OpLoad %uint %18 Aligned 4
+         %20 = OpSConvert %ulong %16 ; same conversion again for the store index
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %20 ; address of element %20 of parameter 0
+               OpStore %21 %19 Aligned 4 ; parameter0[id] = parameter1[id]
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.4/basic.spvasm32 b/test_conformance/spirv_new/spirv_asm/spv1.4/basic.spvasm32
new file mode 100644
index 0000000000..407ef51d4c
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.4/basic.spvasm32
@@ -0,0 +1,33 @@
+; SPIR-V
+; Version: 1.4
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 18
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+          %1 = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical32 OpenCL ; 32-bit physical addressing
+               OpEntryPoint Kernel %9 "test_basic" %gl_GlobalInvocationID ; kernel "test_basic" is function %9
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %8 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void (uint *, uint *), both CrossWorkgroup
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %9 = OpFunction %void None %8
+         %10 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 0: written by the OpStore below
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 1: read by the OpLoad below
+         %12 = OpLabel
+         %13 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 16
+         %14 = OpCompositeExtract %uint %13 0 ; component 0 of the global invocation id
+         %15 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %14 ; address of element %14 of parameter 1
+         %16 = OpLoad %uint %15 Aligned 4
+         %17 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %10 %14 ; address of element %14 of parameter 0
+               OpStore %17 %16 Aligned 4 ; parameter0[id] = parameter1[id]
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.4/basic.spvasm64 b/test_conformance/spirv_new/spirv_asm/spv1.4/basic.spvasm64
new file mode 100644
index 0000000000..c2debf9c80
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.4/basic.spvasm64
@@ -0,0 +1,38 @@
+; SPIR-V
+; Version: 1.4
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 22
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; 64-bit integers: %ulong below
+          %1 = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical64 OpenCL ; 64-bit physical addressing
+               OpEntryPoint Kernel %10 "test_basic" %gl_GlobalInvocationID ; kernel "test_basic" is function %10
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+       %uint = OpTypeInt 32 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %9 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void (uint *, uint *), both CrossWorkgroup
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+         %10 = OpFunction %void None %9
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 0: written by the OpStore below
+         %12 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 1: read by the OpLoad below
+         %13 = OpLabel
+         %14 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 32
+         %15 = OpCompositeExtract %ulong %14 0 ; component 0 of the global invocation id
+         %16 = OpUConvert %uint %15 ; narrow the id to 32 bits
+         %17 = OpSConvert %ulong %16 ; sign-extend back to 64 bits for indexing
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %12 %17 ; address of element %17 of parameter 1
+         %19 = OpLoad %uint %18 Aligned 4
+         %20 = OpSConvert %ulong %16 ; same conversion again for the store index
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %20 ; address of element %20 of parameter 0
+               OpStore %21 %19 Aligned 4 ; parameter0[id] = parameter1[id]
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.5/basic.spvasm32 b/test_conformance/spirv_new/spirv_asm/spv1.5/basic.spvasm32
new file mode 100644
index 0000000000..6b51ad5fb8
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.5/basic.spvasm32
@@ -0,0 +1,33 @@
+; SPIR-V
+; Version: 1.5
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 18
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+          %1 = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical32 OpenCL ; 32-bit physical addressing
+               OpEntryPoint Kernel %9 "test_basic" %gl_GlobalInvocationID ; kernel "test_basic" is function %9
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %8 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void (uint *, uint *), both CrossWorkgroup
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %9 = OpFunction %void None %8
+         %10 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 0: written by the OpStore below
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 1: read by the OpLoad below
+         %12 = OpLabel
+         %13 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 16
+         %14 = OpCompositeExtract %uint %13 0 ; component 0 of the global invocation id
+         %15 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %14 ; address of element %14 of parameter 1
+         %16 = OpLoad %uint %15 Aligned 4
+         %17 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %10 %14 ; address of element %14 of parameter 0
+               OpStore %17 %16 Aligned 4 ; parameter0[id] = parameter1[id]
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.5/basic.spvasm64 b/test_conformance/spirv_new/spirv_asm/spv1.5/basic.spvasm64
new file mode 100644
index 0000000000..fefc130c0e
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.5/basic.spvasm64
@@ -0,0 +1,38 @@
+; SPIR-V
+; Version: 1.5
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 22
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; 64-bit integers: %ulong below
+          %1 = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical64 OpenCL ; 64-bit physical addressing
+               OpEntryPoint Kernel %10 "test_basic" %gl_GlobalInvocationID ; kernel "test_basic" is function %10
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+       %uint = OpTypeInt 32 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %9 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void (uint *, uint *), both CrossWorkgroup
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+         %10 = OpFunction %void None %9
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 0: written by the OpStore below
+         %12 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 1: read by the OpLoad below
+         %13 = OpLabel
+         %14 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 32
+         %15 = OpCompositeExtract %ulong %14 0 ; component 0 of the global invocation id
+         %16 = OpUConvert %uint %15 ; narrow the id to 32 bits
+         %17 = OpSConvert %ulong %16 ; sign-extend back to 64 bits for indexing
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %12 %17 ; address of element %17 of parameter 1
+         %19 = OpLoad %uint %18 Aligned 4
+         %20 = OpSConvert %ulong %16 ; same conversion again for the store index
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %20 ; address of element %20 of parameter 0
+               OpStore %21 %19 Aligned 4 ; parameter0[id] = parameter1[id]
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.6/basic.spvasm32 b/test_conformance/spirv_new/spirv_asm/spv1.6/basic.spvasm32
new file mode 100644
index 0000000000..ff5745ae32
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.6/basic.spvasm32
@@ -0,0 +1,33 @@
+; SPIR-V
+; Version: 1.6
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 18
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+          %1 = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical32 OpenCL ; 32-bit physical addressing
+               OpEntryPoint Kernel %9 "test_basic" %gl_GlobalInvocationID ; kernel "test_basic" is function %9
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+       %uint = OpTypeInt 32 0
+     %v3uint = OpTypeVector %uint 3
+%_ptr_Input_v3uint = OpTypePointer Input %v3uint
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %8 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void (uint *, uint *), both CrossWorkgroup
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3uint Input
+          %9 = OpFunction %void None %8
+         %10 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 0: written by the OpStore below
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 1: read by the OpLoad below
+         %12 = OpLabel
+         %13 = OpLoad %v3uint %gl_GlobalInvocationID Aligned 16
+         %14 = OpCompositeExtract %uint %13 0 ; component 0 of the global invocation id
+         %15 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %14 ; address of element %14 of parameter 1
+         %16 = OpLoad %uint %15 Aligned 4
+         %17 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %10 %14 ; address of element %14 of parameter 0
+               OpStore %17 %16 Aligned 4 ; parameter0[id] = parameter1[id]
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/spirv_asm/spv1.6/basic.spvasm64 b/test_conformance/spirv_new/spirv_asm/spv1.6/basic.spvasm64
new file mode 100644
index 0000000000..bed9d3aa0d
--- /dev/null
+++ b/test_conformance/spirv_new/spirv_asm/spv1.6/basic.spvasm64
@@ -0,0 +1,38 @@
+; SPIR-V
+; Version: 1.6
+; Generator: Khronos LLVM/SPIR-V Translator; 14
+; Bound: 22
+; Schema: 0
+               OpCapability Addresses
+               OpCapability Linkage
+               OpCapability Kernel
+               OpCapability Int64 ; 64-bit integers: %ulong below
+          %1 = OpExtInstImport "OpenCL.std"
+               OpMemoryModel Physical64 OpenCL ; 64-bit physical addressing
+               OpEntryPoint Kernel %10 "test_basic" %gl_GlobalInvocationID ; kernel "test_basic" is function %10
+               OpDecorate %gl_GlobalInvocationID BuiltIn GlobalInvocationId
+               OpDecorate %gl_GlobalInvocationID Constant
+               OpDecorate %gl_GlobalInvocationID LinkageAttributes "__spirv_BuiltInGlobalInvocationId" Import
+      %ulong = OpTypeInt 64 0
+       %uint = OpTypeInt 32 0
+    %v3ulong = OpTypeVector %ulong 3
+%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
+       %void = OpTypeVoid
+%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
+          %9 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint %_ptr_CrossWorkgroup_uint ; void (uint *, uint *), both CrossWorkgroup
+%gl_GlobalInvocationID = OpVariable %_ptr_Input_v3ulong Input
+         %10 = OpFunction %void None %9
+         %11 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 0: written by the OpStore below
+         %12 = OpFunctionParameter %_ptr_CrossWorkgroup_uint ; parameter 1: read by the OpLoad below
+         %13 = OpLabel
+         %14 = OpLoad %v3ulong %gl_GlobalInvocationID Aligned 32
+         %15 = OpCompositeExtract %ulong %14 0 ; component 0 of the global invocation id
+         %16 = OpUConvert %uint %15 ; narrow the id to 32 bits
+         %17 = OpSConvert %ulong %16 ; sign-extend back to 64 bits for indexing
+         %18 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %12 %17 ; address of element %17 of parameter 1
+         %19 = OpLoad %uint %18 Aligned 4
+         %20 = OpSConvert %ulong %16 ; same conversion again for the store index
+         %21 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %11 %20 ; address of element %20 of parameter 0
+               OpStore %21 %19 Aligned 4 ; parameter0[id] = parameter1[id]
+               OpReturn
+               OpFunctionEnd
diff --git a/test_conformance/spirv_new/testBase.h b/test_conformance/spirv_new/testBase.h
index 1bcd6df5bf..54fe15bdab 100644
--- a/test_conformance/spirv_new/testBase.h
+++ b/test_conformance/spirv_new/testBase.h
@@ -1,15 +1,19 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
 #pragma once
 
 #ifndef _testBase_h
diff --git a/test_conformance/spirv_new/test_basic_versions.cpp b/test_conformance/spirv_new/test_basic_versions.cpp
new file mode 100644
index 0000000000..afe173906a
--- /dev/null
+++ b/test_conformance/spirv_new/test_basic_versions.cpp
@@ -0,0 +1,123 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "testBase.h"
+#include "types.hpp"
+
+#include <map>
+#include <sstream>
+#include <string>
+
+extern bool gVersionSkip;
+
+TEST_SPIRV_FUNC(basic_versions)
+{
+    // Runs the "basic" copy kernel built for each SPIR-V version and checks
+    // that the destination buffer ends up holding the source data.
+    cl_int error = CL_SUCCESS;
+    MTdataHolder rng(gRandomSeed);
+    std::vector<cl_int> hostSrc(num_elements);
+    generate_random_data(kInt, hostSrc.size(), rng, hostSrc.data());
+    const size_t bufferBytes = hostSrc.size() * sizeof(cl_int);
+
+    clMemWrapper src = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+                                      bufferBytes, hostSrc.data(), &error);
+    test_error(error, "Unable to create source buffer");
+
+    clMemWrapper dst = clCreateBuffer(context, 0, bufferBytes, NULL, &error);
+    test_error(error, "Unable to create destination buffer");
+
+    // IL version name -> subdirectory that holds the matching binaries.
+    const std::map<std::string, std::string> subdirForIL({
+        { "SPIR-V_1.0", "" }, // SPIR-V 1.0 files are in the base directory
+        { "SPIR-V_1.1", "spv1.1" },
+        { "SPIR-V_1.2", "spv1.2" },
+        { "SPIR-V_1.3", "spv1.3" },
+        { "SPIR-V_1.4", "spv1.4" },
+        { "SPIR-V_1.5", "spv1.5" },
+        { "SPIR-V_1.6", "spv1.6" },
+    });
+
+    // Two-step query: first the string size, then the string itself.
+    size_t ilSize = 0;
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IL_VERSION, 0, NULL, &ilSize);
+    test_error(error, "Unable to query device IL versions size");
+
+    std::string deviceILs(ilSize, '\0');
+    error = clGetDeviceInfo(deviceID, CL_DEVICE_IL_VERSION, ilSize,
+                            &deviceILs[0], NULL);
+    test_error(error, "Unable to query device IL versions string");
+
+    for (const auto& entry : subdirForIL)
+    {
+        const std::string& ilName = entry.first;
+        if (!gVersionSkip)
+        {
+            if (deviceILs.find(ilName) == std::string::npos)
+            {
+                log_info("    Version %s is not supported; skipping test.\n",
+                         ilName.c_str());
+                continue;
+            }
+            log_info("    testing %s...\n", ilName.c_str());
+        }
+        else
+        {
+            log_info("    Skipping version check for %s.\n", ilName.c_str());
+        }
+
+        // Zero dst before each run so the check sees this version's output.
+        const cl_int zero = 0;
+        error = clEnqueueFillBuffer(queue, dst, &zero, sizeof(zero), 0,
+                                    bufferBytes, 0, NULL, NULL);
+        test_error(error, "Unable to initialize destination buffer");
+
+        const std::string filename = entry.second + "/basic";
+        clProgramWrapper prog;
+        error = get_program_with_il(prog, deviceID, context, filename.c_str());
+        test_error(error, "Unable to build SPIR-V program");
+
+        clKernelWrapper kernel = clCreateKernel(prog, "test_basic", &error);
+        test_error(error, "Unable to create SPIR-V kernel");
+
+        // Argument 0 is the destination, argument 1 the source.
+        error |= clSetKernelArg(kernel, 0, sizeof(dst), &dst);
+        error |= clSetKernelArg(kernel, 1, sizeof(src), &src);
+        test_error(error, "Unable to set kernel arguments");
+
+        size_t global = num_elements;
+        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0,
+                                       NULL, NULL);
+        test_error(error, "Unable to enqueue kernel");
+
+        std::vector<cl_int> hostDst(num_elements);
+        error = clEnqueueReadBuffer(queue, dst, CL_TRUE, 0, bufferBytes,
+                                    hostDst.data(), 0, NULL, NULL);
+        test_error(error, "Unable to read destination buffer");
+
+        for (int i = 0; i < num_elements; i++)
+        {
+            if (hostDst[i] == hostSrc[i])
+            {
+                continue;
+            }
+            log_error("Values do not match at location %d\n", i);
+            return TEST_FAIL;
+        }
+    }
+
+    return TEST_PASS;
+}
diff --git a/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp b/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp
index 0728ea0379..95227b28ff 100644
--- a/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp
+++ b/test_conformance/spirv_new/test_cl_khr_spirv_no_integer_wrap_decoration.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2018 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2018-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_decorate.cpp b/test_conformance/spirv_new/test_decorate.cpp
index ccd7431554..db58e6b77b 100644
--- a/test_conformance/spirv_new/test_decorate.cpp
+++ b/test_conformance/spirv_new/test_decorate.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed h_in whole or h_in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited h_in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-h_in whole or h_in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
@@ -19,10 +22,7 @@ or Khronos Conformance Test Source License Agreement as executed between Khronos
 #include <limits>
 #include <cmath>
 
-#ifndef isnan
-// Ensure isnan is always present as a macro
-#define isnan std::isnan
-#endif
+#include <CL/cl_half.h>
 
 long double reference_remainderl(long double x, long double y);
 int gIsInRTZMode = 0;
@@ -30,7 +30,6 @@ int gDeviceILogb0 = 1;
 int gDeviceILogbNaN = 1;
 int gCheckTininessBeforeRounding = 1;
 
-
 static int verify_results(cl_device_id deviceID,
                           cl_context context,
                           cl_command_queue queue,
@@ -44,7 +43,8 @@ static int verify_results(cl_device_id deviceID,
     cl_int err = 0;
 
     RandomSeed seed(gRandomSeed);
-    for (int i = 0; i < num; i++) {
+    for (int i = 0; i < num; i++)
+    {
         h_lhs[i] = genrand<cl_int>(seed);
         h_rhs[i] = genrand<cl_int>(seed);
     }
@@ -86,8 +86,10 @@ static int verify_results(cl_device_id deviceID,
     err = clEnqueueReadBuffer(queue, res, CL_TRUE, 0, bytes, &h_res[0], 0, NULL, NULL);
     SPIRV_CHECK_ERROR(err, "Failed to read to output");
 
-    for (int i = 0; i < num; i++) {
-        if (h_res[i] != (h_lhs[i] + h_rhs[i])) {
+    for (int i = 0; i < num; i++)
+    {
+        if (h_res[i] != (h_lhs[i] + h_rhs[i]))
+        {
             log_error("Values do not match at location %d\n", i);
             return -1;
         }
@@ -132,12 +134,10 @@ TEST_SPIRV_FUNC(decorate_constant)
 
 TEST_SPIRV_FUNC(decorate_cpacked)
 {
-    PACKED(
-        struct packed_struct_t {
-            cl_int ival;
-            cl_char cval;
-        }
-        );
+    PACKED(struct packed_struct_t {
+        cl_int ival;
+        cl_char cval;
+    });
 
     typedef struct packed_struct_t packed_t;
 
@@ -166,9 +166,10 @@ TEST_SPIRV_FUNC(decorate_cpacked)
     err = clEnqueueReadBuffer(queue, res, CL_TRUE, 0, bytes, &h_res[0], 0, NULL, NULL);
     SPIRV_CHECK_ERROR(err, "Failed to read to output");
 
-    for (int i = 0; i < num; i++) {
-        if (h_res[i].ival != 2100483600 ||
-            h_res[i].cval != 127) {
+    for (int i = 0; i < num; i++)
+    {
+        if (h_res[i].ival != 2100483600 || h_res[i].cval != 127)
+        {
             log_error("Values do not match at location %d\n", i);
             return -1;
         }
@@ -177,20 +178,79 @@ TEST_SPIRV_FUNC(decorate_cpacked)
     return 0;
 }
 
-template<typename Ti, typename Tl, typename To>
-int verify_saturated_results(cl_device_id deviceID,
-                             cl_context context,
-                             cl_command_queue queue,
-                             const char *kname,
-                             const clProgramWrapper &prog)
+// Random lhs operand: a genrand draw multiplied by the width of To's range.
+template <typename Ti, typename Tl, typename To>
+static inline Ti generate_saturated_lhs_input(RandomSeed &seed)
 {
-    if(std::string(kname).find("double") != std::string::npos) {
-        if(!is_extension_available(deviceID, "cl_khr_fp64")) {
-            log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
-            return 0;
+    constexpr Tl lowest = (Tl)(std::numeric_limits<To>::min());
+    constexpr Tl highest = (Tl)(std::numeric_limits<To>::max());
+    constexpr Tl span = highest - lowest;
+    // For cl_half the product is formed in float and then encoded as fp16.
+    if (!std::is_same<cl_half, Ti>::value)
+    {
+        return genrand<Ti>(seed) * span;
+    }
+    return cl_half_from_float(genrand<float>(seed) * span, CL_HALF_RTE);
+}
+
+template <typename Ti, typename Tl, typename To>
+static inline Ti generate_saturated_rhs_input(RandomSeed &seed)
+{
+    constexpr auto hiVal = std::numeric_limits<To>::max();
+    // Random rhs operand: a Tl draw reduced modulo the largest To value.
+    Tl val = genrand<Tl>(seed) % hiVal;
+    if (std::is_same<cl_half, Ti>::value) // cl_half is built from a float
+    {
+        if (val > 0 && val * 20 < hiVal) // small positive draws become NaN
+        {
+            return cl_half_from_float(NAN, CL_HALF_RTE);
         }
+        return cl_half_from_float(val, CL_HALF_RTE);
     }
 
+    if (val > 0 && val * 20 < hiVal) // same NaN rule for the other types
+    {
+        return (Ti)NAN;
+    }
+    return val;
+}
+
+// Host reference for the saturated Ti -> To conversion of lhs * rhs; a NaN rhs
+// gives 0. cl_half operands are raw fp16 bits, so that math is done in float.
+template <typename Ti, typename Tl, typename To>
+static inline To compute_saturated_output(Ti lhs, Ti rhs)
+{
+    constexpr auto loVal = std::numeric_limits<To>::min();
+    constexpr auto hiVal = std::numeric_limits<To>::max();
+
+    if (std::is_same<Ti, cl_half>::value)
+    {
+        // Decode rhs before the NaN test: it holds fp16 bits, not a float.
+        // Testing first also keeps a NaN product away from the integer cast.
+        if (isnan(cl_half_to_float(rhs)))
+        {
+            return 0;
+        }
+        cl_float f = cl_half_to_float(lhs) * cl_half_to_float(rhs);
+
+        // Quantize to fp16:
+        f = cl_half_to_float(cl_half_from_float(f, CL_HALF_RTE));
+        return (To)std::min<float>(std::max<float>(f, loVal), hiVal);
+    }
+
+    if (isnan(rhs))
+    {
+        return 0;
+    }
+    Tl ival = (Tl)(lhs * rhs);
+    return (To)std::min<Ti>(std::max<Ti>(ival, loVal), hiVal);
+}
+
+template <typename Ti, typename Tl, typename To>
+int verify_saturated_results(cl_device_id deviceID, cl_context context,
+                             cl_command_queue queue, const char *kname,
+                             const clProgramWrapper &prog)
+{
     cl_int err = 0;
 
     const int num = 1 << 20;
@@ -204,21 +264,11 @@ int verify_saturated_results(cl_device_id deviceID,
     std::vector<Ti> h_lhs(num);
     std::vector<Ti> h_rhs(num);
 
-    To loVal = std::numeric_limits<To>::min();
-    To hiVal = std::numeric_limits<To>::max();
-
-    Tl range = (Tl)(hiVal) - (Tl)(loVal);
-
     RandomSeed seed(gRandomSeed);
-    for (int i = 0; i < num; i++) {
-        h_lhs[i] = genrand<Ti>(seed) * range;
-        Tl val = (genrand<Tl>(seed) % hiVal);
-        // randomly set some values on rhs to NaN
-        if (val * 20 < hiVal) {
-            h_rhs[i] = NAN;
-        } else {
-            h_rhs[i] = (Ti)(val);
-        }
+    for (int i = 0; i < num; i++)
+    {
+        h_lhs[i] = generate_saturated_lhs_input<Ti, Tl, To>(seed);
+        h_rhs[i] = generate_saturated_rhs_input<Ti, Tl, To>(seed);
     }
 
     clMemWrapper lhs = clCreateBuffer(context, CL_MEM_READ_ONLY, in_bytes, NULL, &err);
@@ -253,16 +303,13 @@ int verify_saturated_results(cl_device_id deviceID,
     err = clEnqueueReadBuffer(queue, res, CL_TRUE, 0, out_bytes, &h_res[0], 0, NULL, NULL);
     SPIRV_CHECK_ERROR(err, "Failed to read to output");
 
-    for (int i = 0; i < num; i++) {
-        Tl ival = (Tl)(h_lhs[i] * h_rhs[i]);
-        To val = (To)std::min<Ti>(std::max<Ti>(ival, loVal), hiVal);
-
-        if (isnan(h_rhs[i])) {
-            val = 0;
-        }
+    for (int i = 0; i < num; i++)
+    {
+        To val = compute_saturated_output<Ti, Tl, To>(h_lhs[i], h_rhs[i]);
 
-        if (val != h_res[i]) {
-            log_error("Value error at %d\n", i);
+        if (val != h_res[i])
+        { // val: host-computed reference; h_res[i]: value read back from `res`
+            log_error("Value error at %d: got %d, want %d\n", i, h_res[i], val);
             return -1;
         }
     }
@@ -278,31 +325,47 @@ int test_saturate_full(cl_device_id deviceID,
                        const char *name,
                        const char *types)
 {
-    if(std::string(types).find("double") != std::string::npos) {
-        if(!is_extension_available(deviceID, "cl_khr_fp64")) {
-            log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
+    if (std::string(types).find("double") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp64"))
+        {
+            log_info("Extension cl_khr_fp64 not supported; skipping double "
+                     "tests.\n");
             return 0;
         }
     }
+
+    if (std::string(types).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info(
+                "Extension cl_khr_fp16 not supported; skipping half tests.\n");
+            return 0;
+        }
+    }
+
     clProgramWrapper prog;
     cl_int err = 0;
     err = get_program_with_il(prog, deviceID, context, name);
     SPIRV_CHECK_ERROR(err, "Failed to build program");
-    return verify_saturated_results<Ti, Tl, To>(deviceID, context, queue, name, prog);
+    return verify_saturated_results<Ti, Tl, To>(deviceID, context, queue, name,
+                                                prog);
 }
 
-#define TEST_SATURATED_CONVERSION(Ti, Tl, To)           \
-    TEST_SPIRV_FUNC(decorate_saturated_conversion_##To) \
-    {                                                   \
-        typedef cl_##Ti cl_Ti;                          \
-        typedef cl_##Tl cl_Tl;                          \
-        typedef cl_##To cl_To;                          \
-        return test_saturate_full<cl_Ti, cl_Tl, cl_To>  \
-            (deviceID, context, queue,                  \
-             "decorate_saturated_conversion_" #To,      \
-             #Ti #Tl #To);                              \
-    }                                                   \
+#define TEST_SATURATED_CONVERSION(Ti, Tl, To)                                  \
+    TEST_SPIRV_FUNC(decorate_saturated_conversion_##Ti##_to_##To)              \
+    { /* the #Ti #Tl #To string lets the callee gate on "double"/"half" */     \
+        typedef cl_##Ti cl_Ti;                                                 \
+        typedef cl_##Tl cl_Tl;                                                 \
+        typedef cl_##To cl_To;                                                 \
+        const char *name = "decorate_saturated_conversion_" #Ti "_to_" #To;    \
+        return test_saturate_full<cl_Ti, cl_Tl, cl_To>(                        \
+            deviceID, context, queue, name, #Ti #Tl #To);                      \
+    }
 
+TEST_SATURATED_CONVERSION(half, short, char)
+TEST_SATURATED_CONVERSION(half, ushort, uchar)
 TEST_SATURATED_CONVERSION(float, int, char)
 TEST_SATURATED_CONVERSION(float, uint, uchar)
 TEST_SATURATED_CONVERSION(float, int, short)
@@ -318,13 +381,26 @@ int test_fp_rounding(cl_device_id deviceID,
                      std::vector<Ti> &h_in,
                      std::vector<To> &h_out)
 {
-    if(std::string(name).find("double") != std::string::npos) {
-        if(!is_extension_available(deviceID, "cl_khr_fp64")) {
-            log_info("Extension cl_khr_fp64 not supported; skipping double tests.\n");
+    if (std::string(name).find("double") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp64"))
+        {
+            log_info("Extension cl_khr_fp64 not supported; skipping double "
+                     "tests.\n");
+            return 0;
+        }
+    }
+
+    if (std::string(name).find("half") != std::string::npos)
+    {
+        if (!is_extension_available(deviceID, "cl_khr_fp16"))
+        {
+            log_info(
+                "Extension cl_khr_fp16 not supported; skipping half tests.\n");
             return 0;
         }
     }
- 
+
     const int num = h_in.size();
     const size_t in_bytes = num * sizeof(Ti);
     const size_t out_bytes = num * sizeof(To);
@@ -359,9 +435,12 @@ int test_fp_rounding(cl_device_id deviceID,
     err = clEnqueueReadBuffer(queue, out, CL_TRUE, 0, out_bytes, &h_res[0], 0, NULL, NULL);
     SPIRV_CHECK_ERROR(err, "Failed to read from output");
 
-    for (int i = 0; i < num; i++) {
-        if (h_res[i] != h_out[i]) {
-            log_error("Values do not match at location %d. Original :%lf, Expected: %ld, Found %ld\n",
+    for (int i = 0; i < num; i++)
+    {
+        if (h_res[i] != h_out[i])
+        {
+            log_error("Values do not match at location %d. Original :%lf, "
+                      "Expected: %ld, Found %ld\n",
                       i, h_in[i], h_out[i], h_res[i]);
             return -1;
         }
@@ -370,60 +449,80 @@ int test_fp_rounding(cl_device_id deviceID,
     return 0;
 }
 
-template<typename Ti, typename To>
-inline To round_to_zero(Ti in)
+template <typename T> static inline double to_double(T in) { return in; }
+// cl_half goes through cl_half_to_float instead of a plain implicit widening.
+template <> inline double to_double(cl_half in) { return cl_half_to_float(in); }
+
+template <typename Ti, typename To> static inline To round_to_zero(Ti in)
 {
-    To out = (To)(in);
-    return out;
+    return (To)to_double(in);
 }
 
-template<typename T>
-int sign(T val)
+template <typename T> static inline int sign(T val)
 {
     if (val < 0) return -1;
     if (val > 0) return 1;
     return 0;
 }
 
-template<typename Ti, typename To>
-inline To round_to_even(Ti in)
+template <typename Ti, typename To> static inline To round_to_even(Ti in)
 {
-    // https://en.wikipedia.org/wiki/Rounding#Round_half_to_even
-    return std::floor(in + 0.5) - 1 + std::abs(sign(reference_remainderl((long double)in, 2) - 0.5));
+    double din = to_double(in);
+    return std::floor(din + 0.5) - 1
+        + std::abs(sign(reference_remainderl((long double)din, 2) - 0.5));
 }
 
-template<typename Ti, typename To>
-inline To round_to_posinf(Ti in)
+template <typename Ti, typename To> static inline To round_to_posinf(Ti in)
 {
-    To out = std::ceil(in);
-    return out;
+    return std::ceil(to_double(in));
 }
 
-template<typename Ti, typename To>
-inline To round_to_neginf(Ti in)
+template <typename Ti, typename To> static inline To round_to_neginf(Ti in)
 {
-    To out = std::floor(in);
-    return out;
+    return std::floor(to_double(in));
 }
 
-#define TEST_SPIRV_FP_ROUNDING_DECORATE(name, func, Ti, To)             \
-    TEST_SPIRV_FUNC(decorate_fp_rounding_mode_##name##_##Ti##_##To)     \
-    {                                                                   \
-        typedef cl_##Ti clTi;                                           \
-        typedef cl_##To clTo;                                           \
-        const int num = 1 << 16;                                        \
-        std::vector<clTi> in(num);                                      \
-        std::vector<clTo>  out(num);                                    \
-        RandomSeed seed(gRandomSeed);                                   \
-                                                                        \
-        for (int i = 0; i < num; i++) {                                 \
-            in[i] = num * genrand<clTi>(seed) - num/2;                  \
-            out[i] = func<clTi, clTo>(in[i]);                           \
-        }                                                               \
-        const char *name = "decorate_rounding_" #name "_" #Ti "_" #To;  \
-        return test_fp_rounding(deviceID, context, queue,               \
-                                name, in, out);                         \
-    }                                                                   \
+template <typename Ti, typename To>
+static inline Ti generate_fprounding_input(RandomSeed &seed)
+{ // Draws one random input bounded by half of To's numeric_limits min/max.
+    if (std::is_same<cl_half, Ti>::value)
+    { // cl_half: draw the value as cl_float, then convert with CL_HALF_RTE.
+        constexpr auto minVal =
+            static_cast<cl_float>(std::numeric_limits<To>::min() / 2);
+        constexpr auto maxVal =
+            static_cast<cl_float>(std::numeric_limits<To>::max() / 2);
+        cl_float f = genrandReal_range<cl_float>(minVal, maxVal, seed);
+        return cl_half_from_float(f, CL_HALF_RTE);
+    }
+    // Every other Ti is drawn directly in Ti by genrandReal_range.
+    constexpr auto minVal = static_cast<Ti>(std::numeric_limits<To>::min() / 2);
+    constexpr auto maxVal = static_cast<Ti>(std::numeric_limits<To>::max() / 2);
+    return genrandReal_range<Ti>(minVal, maxVal, seed);
+}
+
+#define TEST_SPIRV_FP_ROUNDING_DECORATE(name, func, Ti, To)                    \
+    TEST_SPIRV_FUNC(decorate_fp_rounding_mode_##name##_##Ti##_##To)            \
+    { /* func is the host reference for rounding mode `name` */                \
+        typedef cl_##Ti clTi;                                                  \
+        typedef cl_##To clTo;                                                  \
+        const int num = 1 << 16;                                               \
+        std::vector<clTi> in(num);                                             \
+        std::vector<clTo> out(num);                                            \
+        RandomSeed seed(gRandomSeed);                                          \
+        /* Pair every random input with its host-side reference result. */     \
+        for (int i = 0; i < num; i++)                                          \
+        {                                                                      \
+            in[i] = generate_fprounding_input<clTi, clTo>(seed);               \
+            out[i] = func<clTi, clTo>(in[i]);                                  \
+        }                                                                      \
+        const char *name = "decorate_rounding_" #name "_" #Ti "_" #To;         \
+        return test_fp_rounding(deviceID, context, queue, name, in, out);      \
+    }
+
+TEST_SPIRV_FP_ROUNDING_DECORATE(rte, round_to_even, half, short);
+TEST_SPIRV_FP_ROUNDING_DECORATE(rtz, round_to_zero, half, short);
+TEST_SPIRV_FP_ROUNDING_DECORATE(rtp, round_to_posinf, half, short);
+TEST_SPIRV_FP_ROUNDING_DECORATE(rtn, round_to_neginf, half, short);
 
 TEST_SPIRV_FP_ROUNDING_DECORATE(rte, round_to_even, float, int);
 TEST_SPIRV_FP_ROUNDING_DECORATE(rtz, round_to_zero, float, int);
diff --git a/test_conformance/spirv_new/test_get_program_il.cpp b/test_conformance/spirv_new/test_get_program_il.cpp
index c535eb5367..e84f224292 100644
--- a/test_conformance/spirv_new/test_get_program_il.cpp
+++ b/test_conformance/spirv_new/test_get_program_il.cpp
@@ -1,17 +1,18 @@
-/******************************************************************
-Copyright (c) 2020 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to
-the Khronos Group, Inc. This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not
-be disclosed in whole or in part to third parties, and may not be reproduced,
-republished, distributed, transmitted, displayed, broadcast or otherwise
-exploited in any manner without the express prior written permission of Khronos
-Group. The receipt or possession of this code does not convey any rights to
-reproduce, disclose, or distribute its contents, or to manufacture, use, or sell
-anything that it may describe, in whole or in part other than under the terms of
-the Khronos Adopters Agreement or Khronos Conformance Test Source License
-Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2020-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 
diff --git a/test_conformance/spirv_new/test_linkage.cpp b/test_conformance/spirv_new/test_linkage.cpp
index cf518c3ed0..ea17040ad2 100644
--- a/test_conformance/spirv_new/test_linkage.cpp
+++ b/test_conformance/spirv_new/test_linkage.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed h_in whole or h_in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited h_in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-h_in whole or h_in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
@@ -141,3 +144,96 @@ TEST_SPIRV_FUNC(linkage_import_function_link)
 
     return 0;
 }
+
+static int test_linkonce_odr_helper(cl_device_id deviceID, cl_context context,
+                                    cl_command_queue queue,
+                                    const char *main_module_filename)
+{ // Compiles two modules, links them, runs "test_linkonce_odr", checks for 5s.
+    cl_int err = 0;
+    // The object module is always "linkage_linkonce_odr_obj".
+    clProgramWrapper prog_obj;
+    err = test_linkage_compile(deviceID, context, queue,
+                               "linkage_linkonce_odr_obj", prog_obj);
+    SPIRV_CHECK_ERROR(err, "Failed to compile export program");
+    // The main module is whichever file the caller names.
+    clProgramWrapper prog_main;
+    err = test_linkage_compile(deviceID, context, queue, main_module_filename,
+                               prog_main);
+    SPIRV_CHECK_ERROR(err, "Failed to compile import program");
+    // Link both compiled programs into one program for deviceID.
+    cl_program progs[] = { prog_obj, prog_main };
+
+    clProgramWrapper prog =
+        clLinkProgram(context, 1, &deviceID, NULL, 2, progs, NULL, NULL, &err);
+    SPIRV_CHECK_ERROR(err, "Failed to link programs");
+
+    clKernelWrapper kernel = clCreateKernel(prog, "test_linkonce_odr", &err);
+    SPIRV_CHECK_ERROR(err, "Failed to create spv kernel");
+    // Seed the buffer contents with genrand values reduced modulo 2048.
+    const int num = 256;
+    std::vector<cl_int> h_in(num);
+    RandomSeed seed(gRandomSeed);
+    for (int i = 0; i < num; i++)
+    {
+        h_in[i] = genrand<cl_int>(seed) % 2048;
+    }
+    // A single read-write buffer is written before the run and read after it.
+    size_t bytes = sizeof(cl_int) * num;
+    clMemWrapper in =
+        clCreateBuffer(context, CL_MEM_READ_WRITE, bytes, NULL, &err);
+    SPIRV_CHECK_ERROR(err, "Failed to create  in buffer");
+
+    err = clEnqueueWriteBuffer(queue, in, CL_TRUE, 0, bytes, &h_in[0], 0, NULL,
+                               NULL);
+    SPIRV_CHECK_ERROR(err, "Failed to copy to in buffer");
+
+    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in);
+    SPIRV_CHECK_ERROR(err, "Failed to set arg 1");
+    // 1-D launch with a global size of num.
+    size_t global = num;
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL,
+                                 NULL);
+    SPIRV_CHECK_ERROR(err, "Failed to enqueue cl kernel");
+    // Read back the same buffer the kernel received as argument 0.
+    std::vector<cl_int> h_out(num);
+    err = clEnqueueReadBuffer(queue, in, CL_TRUE, 0, bytes, &h_out[0], 0, NULL,
+                              NULL);
+    SPIRV_CHECK_ERROR(err, "Failed to read to output");
+    // Every element must equal 5; the first mismatch fails the test.
+    for (int i = 0; i < num; i++)
+    {
+        if (h_out[i] != 5)
+        {
+            log_error("Incorrect values at location %d\n", i);
+            return TEST_FAIL;
+        }
+    }
+
+    return TEST_PASS;
+}
+
+TEST_SPIRV_FUNC(linkage_linkonce_odr)
+{ // Runs the linkonce_odr link test against two different main modules.
+    if (!is_extension_available(deviceID, "cl_khr_spirv_linkonce_odr"))
+    {
+        log_info("Extension cl_khr_spirv_linkonce_odr not supported; skipping "
+                 "tests.\n");
+        return TEST_SKIPPED_ITSELF;
+    }
+    // Both variants always run; their return codes are OR-ed into `result`.
+    int result = TEST_PASS;
+
+    // For this test, use the default main module, which has an "a" function
+    // with the linkonce_odr linkage type.  This ensures that having two "a"
+    // functions with linkonce_odr works properly.
+    result |= test_linkonce_odr_helper(deviceID, context, queue,
+                                       "linkage_linkonce_odr_main");
+
+    // For this test, use a main module without the "a" function.  This ensures
+    // that the "a" function is properly exported with the linkonce_odr linkage
+    // type.
+    result |= test_linkonce_odr_helper(deviceID, context, queue,
+                                       "linkage_linkonce_odr_noa_main");
+
+    return result;
+}
diff --git a/test_conformance/spirv_new/test_op_atomic.cpp b/test_conformance/spirv_new/test_op_atomic.cpp
index e4b6feb12b..0ee2cfb0b1 100644
--- a/test_conformance/spirv_new/test_op_atomic.cpp
+++ b/test_conformance/spirv_new/test_op_atomic.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_branch.cpp b/test_conformance/spirv_new/test_op_branch.cpp
index ec61afd9e1..f87de6c60a 100644
--- a/test_conformance/spirv_new/test_op_branch.cpp
+++ b/test_conformance/spirv_new/test_op_branch.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_branch_conditional.cpp b/test_conformance/spirv_new/test_op_branch_conditional.cpp
index ad94b824d6..8f62879807 100644
--- a/test_conformance/spirv_new/test_op_branch_conditional.cpp
+++ b/test_conformance/spirv_new/test_op_branch_conditional.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_composite_construct.cpp b/test_conformance/spirv_new/test_op_composite_construct.cpp
index e009eadf80..7ca8762611 100644
--- a/test_conformance/spirv_new/test_op_composite_construct.cpp
+++ b/test_conformance/spirv_new/test_op_composite_construct.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_constant.cpp b/test_conformance/spirv_new/test_op_constant.cpp
index 7c3c146c8e..c026fd42ff 100644
--- a/test_conformance/spirv_new/test_op_constant.cpp
+++ b/test_conformance/spirv_new/test_op_constant.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_copy_object.cpp b/test_conformance/spirv_new/test_op_copy_object.cpp
index 868300d3fb..b012960efc 100644
--- a/test_conformance/spirv_new/test_op_copy_object.cpp
+++ b/test_conformance/spirv_new/test_op_copy_object.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_fmath.cpp b/test_conformance/spirv_new/test_op_fmath.cpp
index 3cf0183733..de887bfee2 100644
--- a/test_conformance/spirv_new/test_op_fmath.cpp
+++ b/test_conformance/spirv_new/test_op_fmath.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_function.cpp b/test_conformance/spirv_new/test_op_function.cpp
index 16183e8058..4a0f4d2632 100644
--- a/test_conformance/spirv_new/test_op_function.cpp
+++ b/test_conformance/spirv_new/test_op_function.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_lifetime.cpp b/test_conformance/spirv_new/test_op_lifetime.cpp
index b60e14d694..86e7ce0661 100644
--- a/test_conformance/spirv_new/test_op_lifetime.cpp
+++ b/test_conformance/spirv_new/test_op_lifetime.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_loop_merge.cpp b/test_conformance/spirv_new/test_op_loop_merge.cpp
index 23d257d97c..3cac328471 100644
--- a/test_conformance/spirv_new/test_op_loop_merge.cpp
+++ b/test_conformance/spirv_new/test_op_loop_merge.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_negate.cpp b/test_conformance/spirv_new/test_op_negate.cpp
index 5009be9316..281b1f8ce7 100644
--- a/test_conformance/spirv_new/test_op_negate.cpp
+++ b/test_conformance/spirv_new/test_op_negate.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_opaque.cpp b/test_conformance/spirv_new/test_op_opaque.cpp
index e621606154..52b54a25b8 100644
--- a/test_conformance/spirv_new/test_op_opaque.cpp
+++ b/test_conformance/spirv_new/test_op_opaque.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_phi.cpp b/test_conformance/spirv_new/test_op_phi.cpp
index f9c69d79fc..77c1a57248 100644
--- a/test_conformance/spirv_new/test_op_phi.cpp
+++ b/test_conformance/spirv_new/test_op_phi.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_selection_merge.cpp b/test_conformance/spirv_new/test_op_selection_merge.cpp
index 6ea47f356d..2b06ce85c0 100644
--- a/test_conformance/spirv_new/test_op_selection_merge.cpp
+++ b/test_conformance/spirv_new/test_op_selection_merge.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_spec_constant.cpp b/test_conformance/spirv_new/test_op_spec_constant.cpp
index a280a4f7af..67ca7b5fea 100644
--- a/test_conformance/spirv_new/test_op_spec_constant.cpp
+++ b/test_conformance/spirv_new/test_op_spec_constant.cpp
@@ -1,17 +1,18 @@
-/******************************************************************
-Copyright (c) 2020 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to
-the Khronos Group, Inc. This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not
-be disclosed in whole or in part to third parties, and may not be reproduced,
-republished, distributed, transmitted, displayed, broadcast or otherwise
-exploited in any manner without the express prior written permission of Khronos
-Group. The receipt or possession of this code does not convey any rights to
-reproduce, disclose, or distribute its contents, or to manufacture, use, or sell
-anything that it may describe, in whole or in part other than under the terms of
-the Khronos Adopters Agreement or Khronos Conformance Test Source License
-Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2020-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_undef.cpp b/test_conformance/spirv_new/test_op_undef.cpp
index 659ab1a930..43610f82b4 100644
--- a/test_conformance/spirv_new/test_op_undef.cpp
+++ b/test_conformance/spirv_new/test_op_undef.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_vector_extract.cpp b/test_conformance/spirv_new/test_op_vector_extract.cpp
index f77aa7a2e1..62a155b407 100644
--- a/test_conformance/spirv_new/test_op_vector_extract.cpp
+++ b/test_conformance/spirv_new/test_op_vector_extract.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_vector_insert.cpp b/test_conformance/spirv_new/test_op_vector_insert.cpp
index 62fc78cb51..ed47238557 100644
--- a/test_conformance/spirv_new/test_op_vector_insert.cpp
+++ b/test_conformance/spirv_new/test_op_vector_insert.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
index 0be4e8b71c..da79d3bea3 100644
--- a/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
+++ b/test_conformance/spirv_new/test_op_vector_times_scalar.cpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #include "testBase.h"
 #include "types.hpp"
diff --git a/test_conformance/spirv_new/types.hpp b/test_conformance/spirv_new/types.hpp
index 728b244558..e40197afe5 100644
--- a/test_conformance/spirv_new/types.hpp
+++ b/test_conformance/spirv_new/types.hpp
@@ -1,15 +1,18 @@
-/******************************************************************
-Copyright (c) 2016 The Khronos Group Inc. All Rights Reserved.
-
-This code is protected by copyright laws and contains material proprietary to the Khronos Group, Inc.
-This is UNPUBLISHED PROPRIETARY SOURCE CODE that may not be disclosed in whole or in part to
-third parties, and may not be reproduced, republished, distributed, transmitted, displayed,
-broadcast or otherwise exploited in any manner without the express prior written permission
-of Khronos Group. The receipt or possession of this code does not convey any rights to reproduce,
-disclose, or distribute its contents, or to manufacture, use, or sell anything that it may describe,
-in whole or in part other than under the terms of the Khronos Adopters Agreement
-or Khronos Conformance Test Source License Agreement as executed between Khronos and the recipient.
-******************************************************************/
+//
+// Copyright (c) 2016-2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
 
 #pragma once
 #include <CL/cl.h>
@@ -91,6 +94,13 @@ T genrandReal(RandomSeed &seed)
     return genrand_real1(seed);
 }
 
+// Longer-term this could be factored out to replace random_float():
+template <typename T> T genrandReal_range(T low, T high, RandomSeed &seed)
+{
+    T t = genrand_real1(seed);
+    return (1.0 - t) * low + t * high;
+}
+
 template<typename T, int N>
 T genrandRealVec(RandomSeed &seed)
 {
diff --git a/test_conformance/subgroups/procs.h b/test_conformance/subgroups/procs.h
index af6444c056..ba40a9f90e 100644
--- a/test_conformance/subgroups/procs.h
+++ b/test_conformance/subgroups/procs.h
@@ -21,67 +21,51 @@
 #include "harness/errorHelpers.h"
 #include "harness/conversions.h"
 #include "harness/typeWrappers.h"
-#include "harness/mt19937.h"
 
-extern MTdata gMTdata;
-
-extern int test_sub_group_info_ext(cl_device_id device, cl_context context,
-                                   cl_command_queue queue, int num_elements);
-extern int test_sub_group_info_core(cl_device_id device, cl_context context,
-                                    cl_command_queue queue, int num_elements);
-extern int test_work_item_functions_ext(cl_device_id device, cl_context context,
-                                        cl_command_queue queue,
-                                        int num_elements);
-extern int test_work_item_functions_core(cl_device_id device,
-                                         cl_context context,
-                                         cl_command_queue queue,
-                                         int num_elements);
-extern int test_subgroup_functions_ext(cl_device_id device, cl_context context,
-                                       cl_command_queue queue,
-                                       int num_elements);
-extern int test_subgroup_functions_core(cl_device_id device, cl_context context,
-                                        cl_command_queue queue,
-                                        int num_elements);
-extern int test_barrier_functions_ext(cl_device_id device, cl_context context,
-                                      cl_command_queue queue, int num_elements);
-extern int test_barrier_functions_core(cl_device_id device, cl_context context,
-                                       cl_command_queue queue,
-                                       int num_elements);
-extern int test_pipe_functions(cl_device_id device, cl_context context,
+int test_sub_group_info_ext(cl_device_id device, cl_context context,
+                            cl_command_queue queue, int num_elements);
+int test_sub_group_info_core(cl_device_id device, cl_context context,
+                             cl_command_queue queue, int num_elements);
+int test_work_item_functions_ext(cl_device_id device, cl_context context,
+                                 cl_command_queue queue, int num_elements);
+int test_work_item_functions_core(cl_device_id device, cl_context context,
+                                  cl_command_queue queue, int num_elements);
+int test_subgroup_functions_ext(cl_device_id device, cl_context context,
+                                cl_command_queue queue, int num_elements);
+int test_subgroup_functions_core(cl_device_id device, cl_context context,
+                                 cl_command_queue queue, int num_elements);
+int test_barrier_functions_ext(cl_device_id device, cl_context context,
                                cl_command_queue queue, int num_elements);
-extern int test_ifp_ext(cl_device_id device, cl_context context,
-                        cl_command_queue queue, int num_elements);
-extern int test_ifp_core(cl_device_id device, cl_context context,
-                         cl_command_queue queue, int num_elements);
-extern int test_subgroup_functions_extended_types(cl_device_id device,
-                                                  cl_context context,
-                                                  cl_command_queue queue,
-                                                  int num_elements);
-extern int test_subgroup_functions_non_uniform_vote(cl_device_id device,
-                                                    cl_context context,
-                                                    cl_command_queue queue,
-                                                    int num_elements);
-extern int test_subgroup_functions_non_uniform_arithmetic(
-    cl_device_id device, cl_context context, cl_command_queue queue,
-    int num_elements);
-extern int test_subgroup_functions_ballot(cl_device_id device,
-                                          cl_context context,
-                                          cl_command_queue queue,
-                                          int num_elements);
-extern int test_subgroup_functions_clustered_reduce(cl_device_id device,
-                                                    cl_context context,
-                                                    cl_command_queue queue,
-                                                    int num_elements);
-extern int test_subgroup_functions_shuffle(cl_device_id device,
+int test_barrier_functions_core(cl_device_id device, cl_context context,
+                                cl_command_queue queue, int num_elements);
+int test_ifp_ext(cl_device_id device, cl_context context,
+                 cl_command_queue queue, int num_elements);
+int test_ifp_core(cl_device_id device, cl_context context,
+                  cl_command_queue queue, int num_elements);
+int test_subgroup_functions_extended_types(cl_device_id device,
                                            cl_context context,
                                            cl_command_queue queue,
                                            int num_elements);
-extern int test_subgroup_functions_shuffle_relative(cl_device_id device,
-                                                    cl_context context,
-                                                    cl_command_queue queue,
-                                                    int num_elements);
-extern int test_subgroup_functions_rotate(cl_device_id device,
-                                          cl_context context,
-                                          cl_command_queue queue,
-                                          int num_elements);
+int test_subgroup_functions_non_uniform_vote(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
+                                                   cl_context context,
+                                                   cl_command_queue queue,
+                                                   int num_elements);
+int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
+                                   cl_command_queue queue, int num_elements);
+int test_subgroup_functions_clustered_reduce(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+int test_subgroup_functions_shuffle(cl_device_id device, cl_context context,
+                                    cl_command_queue queue, int num_elements);
+int test_subgroup_functions_shuffle_relative(cl_device_id device,
+                                             cl_context context,
+                                             cl_command_queue queue,
+                                             int num_elements);
+int test_subgroup_functions_rotate(cl_device_id device, cl_context context,
+                                   cl_command_queue queue, int num_elements);
 #endif /*_procs_h*/
diff --git a/test_conformance/thread_dimensions/main.cpp b/test_conformance/thread_dimensions/main.cpp
index 9a1ce609af..236d773171 100644
--- a/test_conformance/thread_dimensions/main.cpp
+++ b/test_conformance/thread_dimensions/main.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -19,25 +19,74 @@
 #include <string.h>
 #include "procs.h"
 
+// Additional parameters to limit test scope (-n,-b,-x)
+cl_uint maxThreadDimension = 0;
+cl_uint bufferSize = 0;
+cl_uint bufferStep = 0;
+
 test_definition test_list[] = {
-    ADD_TEST( quick_1d_explicit_local ),
-    ADD_TEST( quick_2d_explicit_local ),
-    ADD_TEST( quick_3d_explicit_local ),
-    ADD_TEST( quick_1d_implicit_local ),
-    ADD_TEST( quick_2d_implicit_local ),
-    ADD_TEST( quick_3d_implicit_local ),
-    ADD_TEST( full_1d_explicit_local ),
-    ADD_TEST( full_2d_explicit_local ),
-    ADD_TEST( full_3d_explicit_local ),
-    ADD_TEST( full_1d_implicit_local ),
-    ADD_TEST( full_2d_implicit_local ),
-    ADD_TEST( full_3d_implicit_local ),
+    ADD_TEST(quick_1d_explicit_local), ADD_TEST(quick_2d_explicit_local),
+    ADD_TEST(quick_3d_explicit_local), ADD_TEST(quick_1d_implicit_local),
+    ADD_TEST(quick_2d_implicit_local), ADD_TEST(quick_3d_implicit_local),
+    ADD_TEST(full_1d_explicit_local),  ADD_TEST(full_2d_explicit_local),
+    ADD_TEST(full_3d_explicit_local),  ADD_TEST(full_1d_implicit_local),
+    ADD_TEST(full_2d_implicit_local),  ADD_TEST(full_3d_implicit_local),
 };
 
-const int test_num = ARRAY_SIZE( test_list );
+const int test_num = ARRAY_SIZE(test_list);
 
 int main(int argc, const char *argv[])
 {
+    // Strip the test-specific options (-n, -b, -x) and their values from
+    // argv so that only the arguments the harness understands are passed on.
+    for (auto i = 0; i < argc; i++)
+    {
+        // Number of argv entries consumed by the option found at index i.
+        int delArg = 0;
+        if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
+        {
+            log_info("Thread dimensions options:\n");
+            log_info("\t-n\tMaximum thread dimension value\n");
+            log_info("\t-b\tSpecifies a buffer size for calculations\n");
+            log_info("\t-x\tSpecifies a step for calculations\n");
+        }
+        if (strcmp(argv[i], "-n") == 0)
+        {
+            // argv[argc] is NULL, so a missing value must be rejected here.
+            if (i + 1 >= argc || atoi(argv[i + 1]) < 1)
+            {
+                log_info("ERROR: -n Maximum thread dimension value must be "
+                         "greater than 0\n");
+                return TEST_FAIL;
+            }
+            maxThreadDimension = atoi(argv[i + 1]);
+            delArg = 2;
+        }
+        if (strcmp(argv[i], "-b") == 0)
+        {
+            if (i + 1 >= argc || atoi(argv[i + 1]) < 1)
+            {
+                log_info("ERROR: -b Buffer size must be greater than 0\n");
+                return TEST_FAIL;
+            }
+            bufferSize = atoi(argv[i + 1]);
+            delArg = 2;
+        }
+        if (strcmp(argv[i], "-x") == 0)
+        {
+            if (i + 1 >= argc || atoi(argv[i + 1]) < 1)
+            {
+                log_info("ERROR: -x Buffer step must be greater than 0\n");
+                return TEST_FAIL;
+            }
+            bufferStep = atoi(argv[i + 1]);
+            delArg = 2;
+        }
+        // Close the gap left by the consumed entries.
+        for (int j = i; j < argc - delArg; j++) argv[j] = argv[j + delArg];
+        argc -= delArg;
+        if (delArg > 0) i--; // re-examine the entry shifted into slot i
+    }
+
     return runTestHarness(argc, argv, test_num, test_list, false, 0);
 }
-
diff --git a/test_conformance/thread_dimensions/procs.h b/test_conformance/thread_dimensions/procs.h
index d01d3c507d..261d8bf2a6 100644
--- a/test_conformance/thread_dimensions/procs.h
+++ b/test_conformance/thread_dimensions/procs.h
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -21,17 +21,52 @@
 
 extern const int kVectorSizeCount;
 
-extern int test_quick_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_quick_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_quick_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_quick_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_quick_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_quick_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-
-extern int test_full_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_full_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_full_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_full_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_full_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
-extern int test_full_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements);
+extern int test_quick_1d_explicit_local(cl_device_id deviceID,
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements);
+extern int test_quick_2d_explicit_local(cl_device_id deviceID,
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements);
+extern int test_quick_3d_explicit_local(cl_device_id deviceID,
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements);
+extern int test_quick_1d_implicit_local(cl_device_id deviceID,
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements);
+extern int test_quick_2d_implicit_local(cl_device_id deviceID,
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements);
+extern int test_quick_3d_implicit_local(cl_device_id deviceID,
+                                        cl_context context,
+                                        cl_command_queue queue,
+                                        int num_elements);
 
+extern int test_full_1d_explicit_local(cl_device_id deviceID,
+                                       cl_context context,
+                                       cl_command_queue queue,
+                                       int num_elements);
+extern int test_full_2d_explicit_local(cl_device_id deviceID,
+                                       cl_context context,
+                                       cl_command_queue queue,
+                                       int num_elements);
+extern int test_full_3d_explicit_local(cl_device_id deviceID,
+                                       cl_context context,
+                                       cl_command_queue queue,
+                                       int num_elements);
+extern int test_full_1d_implicit_local(cl_device_id deviceID,
+                                       cl_context context,
+                                       cl_command_queue queue,
+                                       int num_elements);
+extern int test_full_2d_implicit_local(cl_device_id deviceID,
+                                       cl_context context,
+                                       cl_command_queue queue,
+                                       int num_elements);
+extern int test_full_3d_implicit_local(cl_device_id deviceID,
+                                       cl_context context,
+                                       cl_command_queue queue,
+                                       int num_elements);
diff --git a/test_conformance/thread_dimensions/test_thread_dimensions.cpp b/test_conformance/thread_dimensions/test_thread_dimensions.cpp
index c8d22c66a5..8eec15c103 100644
--- a/test_conformance/thread_dimensions/test_thread_dimensions.cpp
+++ b/test_conformance/thread_dimensions/test_thread_dimensions.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -25,38 +25,46 @@
 #define ITERATIONS 4
 #define DEBUG 0
 
-// If the environment variable DO_NOT_LIMIT_THREAD_SIZE is not set, the test will limit the maximum total
-// global dimensions tested to this value.
-#define MAX_TOTAL_GLOBAL_THREADS_FOR_TEST (1<<24)
+// If the environment variable DO_NOT_LIMIT_THREAD_SIZE is not set, the test
+// will limit the maximum total global dimensions tested to this value.
+#define MAX_TOTAL_GLOBAL_THREADS_FOR_TEST (1 << 24)
 int limit_size = 0;
 
-static int
-get_maximums(cl_kernel kernel, cl_context context,
-             size_t *max_workgroup_size_result,
-             cl_ulong *max_allcoation_result,
-             cl_ulong *max_physical_result) {
+extern cl_uint maxThreadDimension;
+extern cl_uint bufferSize;
+extern cl_uint bufferStep;
+
+static int get_maximums(cl_kernel kernel, cl_context context,
+                        size_t *max_workgroup_size_result,
+                        cl_ulong *max_allcoation_result,
+                        cl_ulong *max_physical_result)
+{
     int err = 0;
     cl_uint i;
     cl_device_id *devices;
 
     // Get all the devices in the device group
     size_t num_devices_returned;
-    err = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &num_devices_returned);
-    if(err != CL_SUCCESS)
+    err = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL,
+                           &num_devices_returned);
+    if (err != CL_SUCCESS)
     {
         log_error("clGetContextInfo() failed (%d).\n", err);
         return -10;
     }
     devices = (cl_device_id *)malloc(num_devices_returned);
-    err = clGetContextInfo(context, CL_CONTEXT_DEVICES, num_devices_returned, devices, NULL);
-    if(err != CL_SUCCESS)
+    err = clGetContextInfo(context, CL_CONTEXT_DEVICES, num_devices_returned,
+                           devices, NULL);
+    if (err != CL_SUCCESS)
     {
         log_error("clGetContextInfo() failed (%d).\n", err);
         return -10;
     }
     num_devices_returned /= sizeof(cl_device_id);
-    if (num_devices_returned > 1) log_info("%d devices in device group.\n", (int)num_devices_returned);
-    if (num_devices_returned < 1) {
+    if (num_devices_returned > 1)
+        log_info("%d devices in device group.\n", (int)num_devices_returned);
+    if (num_devices_returned < 1)
+    {
         log_error("0 devices found for this kernel.\n");
         return -1;
     }
@@ -69,12 +77,16 @@ get_maximums(cl_kernel kernel, cl_context context,
     cl_ulong max_physical = 0;
     cl_ulong current_physical = 0;
 
-    for (i=0; i<num_devices_returned; i++) {
+    for (i = 0; i < num_devices_returned; i++)
+    {
         // Max workgroup size for this kernel on this device
-        err = clGetKernelWorkGroupInfo(kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(current_workgroup_size), &current_workgroup_size, NULL);
-        if(err != CL_SUCCESS)
+        err = clGetKernelWorkGroupInfo(
+            kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE,
+            sizeof(current_workgroup_size), &current_workgroup_size, NULL);
+        if (err != CL_SUCCESS)
         {
-            log_error("clGetKernelWorkGroupInfo() failed (%d) for device %d.\n", err, i);
+            log_error("clGetKernelWorkGroupInfo() failed (%d) for device %d.\n",
+                      err, i);
             return -10;
         }
         if (max_workgroup_size == 0)
@@ -83,10 +95,14 @@ get_maximums(cl_kernel kernel, cl_context context,
             max_workgroup_size = current_workgroup_size;
 
         // Get the maximum allocation size
-        err = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(current_allocation), &current_allocation, NULL);
-        if(err != CL_SUCCESS)
+        err = clGetDeviceInfo(devices[i], CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                              sizeof(current_allocation), &current_allocation,
+                              NULL);
+        if (err != CL_SUCCESS)
         {
-            log_error("clGetDeviceConfigInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE) failed (%d) for device %d.\n", err, i);
+            log_error("clGetDeviceConfigInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE) "
+                      "failed (%d) for device %d.\n",
+                      err, i);
             return -10;
         }
         if (max_allocation == 0)
@@ -95,10 +111,14 @@ get_maximums(cl_kernel kernel, cl_context context,
             max_allocation = current_allocation;
 
         // Get the maximum physical size
-        err = clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(current_physical), &current_physical, NULL);
-        if(err != CL_SUCCESS)
+        err =
+            clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE,
+                            sizeof(current_physical), &current_physical, NULL);
+        if (err != CL_SUCCESS)
         {
-            log_error("clGetDeviceConfigInfo(CL_DEVICE_GLOBAL_MEM_SIZE) failed (%d) for device %d.\n", err, i);
+            log_error("clGetDeviceConfigInfo(CL_DEVICE_GLOBAL_MEM_SIZE) failed "
+                      "(%d) for device %d.\n",
+                      err, i);
             return -10;
         }
         if (max_physical == 0)
@@ -108,8 +128,11 @@ get_maximums(cl_kernel kernel, cl_context context,
     }
     free(devices);
 
-    log_info("Device maximums: max local workgroup size:%d, max allocation size: %g MB, max physical memory %gMB\n",
-             (int)max_workgroup_size, (double)(max_allocation/1024.0/1024.0), (double)(max_physical/1024.0/1024.0));
+    log_info("Device maximums: max local workgroup size:%d, max allocation "
+             "size: %g MB, max physical memory %gMB\n",
+             (int)max_workgroup_size,
+             (double)(max_allocation / 1024.0 / 1024.0),
+             (double)(max_physical / 1024.0 / 1024.0));
     *max_workgroup_size_result = max_workgroup_size;
     *max_allcoation_result = max_allocation;
     *max_physical_result = max_physical;
@@ -117,128 +140,152 @@ get_maximums(cl_kernel kernel, cl_context context,
 }
 
 static const char *thread_dimension_kernel_code_atomic_long =
-"\n"
-"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
-"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"__kernel void test_thread_dimension_atomic(__global uint *dst, \n"
-"          uint final_x_size,   uint final_y_size,   uint final_z_size,\n"
-"          ulong start_address,  ulong end_address)\n"
-"{\n"
-"    uint error = 0;\n"
-"            if (get_global_id(0) >= final_x_size)\n"
-"                error = 64;\n"
-"            if (get_global_id(1) >= final_y_size)\n"
-"                error = 128;\n"
-"            if (get_global_id(2) >= final_z_size)\n"
-"                error = 256;\n"
-"\n"
-"        unsigned long t_address = (unsigned long)get_global_id(2)*(unsigned long)final_y_size*(unsigned long)final_x_size + \n"
-"                (unsigned long)get_global_id(1)*(unsigned long)final_x_size + (unsigned long)get_global_id(0);\n"
-"        if ((t_address >= start_address) && (t_address < end_address))\n"
-"                atom_add(&dst[t_address-start_address], 1u);\n"
-"        if (error)\n"
-"                atom_or(&dst[t_address-start_address], error);\n"
-"\n"
-"}\n";
+    "\n"
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+    "__kernel void test_thread_dimension_atomic(__global uint *dst, \n"
+    "          uint final_x_size,   uint final_y_size,   uint final_z_size,\n"
+    "          ulong start_address,  ulong end_address)\n"
+    "{\n"
+    "    uint error = 0;\n"
+    "            if (get_global_id(0) >= final_x_size)\n"
+    "                error = 64;\n"
+    "            if (get_global_id(1) >= final_y_size)\n"
+    "                error = 128;\n"
+    "            if (get_global_id(2) >= final_z_size)\n"
+    "                error = 256;\n"
+    "\n"
+    "        unsigned long t_address = (unsigned "
+    "long)get_global_id(2)*(unsigned long)final_y_size*(unsigned "
+    "long)final_x_size + \n"
+    "                (unsigned long)get_global_id(1)*(unsigned "
+    "long)final_x_size + (unsigned long)get_global_id(0);\n"
+    "        if ((t_address >= start_address) && (t_address < end_address))\n"
+    "                atom_add(&dst[t_address-start_address], 1u);\n"
+    "        if (error)\n"
+    "                atom_or(&dst[t_address-start_address], error);\n"
+    "\n"
+    "}\n";
 
 static const char *thread_dimension_kernel_code_not_atomic_long =
-"\n"
-"__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n"
-"          uint final_x_size,   uint final_y_size,   uint final_z_size,\n"
-"          ulong start_address,  ulong end_address)\n"
-"{\n"
-"    uint error = 0;\n"
-"            if (get_global_id(0) >= final_x_size)\n"
-"                error = 64;\n"
-"            if (get_global_id(1) >= final_y_size)\n"
-"                error = 128;\n"
-"            if (get_global_id(2) >= final_z_size)\n"
-"                error = 256;\n"
-"\n"
-"        unsigned long t_address = (unsigned long)get_global_id(2)*(unsigned long)final_y_size*(unsigned long)final_x_size + \n"
-"                (unsigned long)get_global_id(1)*(unsigned long)final_x_size + (unsigned long)get_global_id(0);\n"
-"        if ((t_address >= start_address) && (t_address < end_address))\n"
-"                dst[t_address-start_address]++;\n"
-"        if (error)\n"
-"                dst[t_address-start_address]|=error;\n"
-"\n"
-"}\n";
+    "\n"
+    "__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n"
+    "          uint final_x_size,   uint final_y_size,   uint final_z_size,\n"
+    "          ulong start_address,  ulong end_address)\n"
+    "{\n"
+    "    uint error = 0;\n"
+    "            if (get_global_id(0) >= final_x_size)\n"
+    "                error = 64;\n"
+    "            if (get_global_id(1) >= final_y_size)\n"
+    "                error = 128;\n"
+    "            if (get_global_id(2) >= final_z_size)\n"
+    "                error = 256;\n"
+    "\n"
+    "        unsigned long t_address = (unsigned "
+    "long)get_global_id(2)*(unsigned long)final_y_size*(unsigned "
+    "long)final_x_size + \n"
+    "                (unsigned long)get_global_id(1)*(unsigned "
+    "long)final_x_size + (unsigned long)get_global_id(0);\n"
+    "        if ((t_address >= start_address) && (t_address < end_address))\n"
+    "                dst[t_address-start_address]++;\n"
+    "        if (error)\n"
+    "                dst[t_address-start_address]|=error;\n"
+    "\n"
+    "}\n";
 
 static const char *thread_dimension_kernel_code_atomic_not_long =
-"\n"
-"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
-"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"__kernel void test_thread_dimension_atomic(__global uint *dst, \n"
-"         uint final_x_size,   uint final_y_size,   uint final_z_size,\n"
-"         uint start_address,  uint end_address)\n"
-"{\n"
-"    uint error = 0;\n"
-"           if (get_global_id(0) >= final_x_size)\n"
-"               error = 64;\n"
-"           if (get_global_id(1) >= final_y_size)\n"
-"               error = 128;\n"
-"           if (get_global_id(2) >= final_z_size)\n"
-"               error = 256;\n"
-"\n"
-"       unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned int)final_y_size*(unsigned int)final_x_size + \n"
-"               (unsigned int)get_global_id(1)*(unsigned int)final_x_size + (unsigned int)get_global_id(0);\n"
-"       if ((t_address >= start_address) && (t_address < end_address))\n"
-"               atom_add(&dst[t_address-start_address], 1u);\n"
-"       if (error)\n"
-"               atom_or(&dst[t_address-start_address], error);\n"
-"\n"
-"}\n";
+    "\n"
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+    "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+    "__kernel void test_thread_dimension_atomic(__global uint *dst, \n"
+    "         uint final_x_size,   uint final_y_size,   uint final_z_size,\n"
+    "         uint start_address,  uint end_address)\n"
+    "{\n"
+    "    uint error = 0;\n"
+    "           if (get_global_id(0) >= final_x_size)\n"
+    "               error = 64;\n"
+    "           if (get_global_id(1) >= final_y_size)\n"
+    "               error = 128;\n"
+    "           if (get_global_id(2) >= final_z_size)\n"
+    "               error = 256;\n"
+    "\n"
+    "       unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned "
+    "int)final_y_size*(unsigned int)final_x_size + \n"
+    "               (unsigned int)get_global_id(1)*(unsigned int)final_x_size "
+    "+ (unsigned int)get_global_id(0);\n"
+    "       if ((t_address >= start_address) && (t_address < end_address))\n"
+    "               atom_add(&dst[t_address-start_address], 1u);\n"
+    "       if (error)\n"
+    "               atom_or(&dst[t_address-start_address], error);\n"
+    "\n"
+    "}\n";
 
 static const char *thread_dimension_kernel_code_not_atomic_not_long =
-"\n"
-"__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n"
-"         uint final_x_size,   uint final_y_size,   uint final_z_size,\n"
-"         uint start_address,  uint end_address)\n"
-"{\n"
-"    uint error = 0;\n"
-"           if (get_global_id(0) >= final_x_size)\n"
-"               error = 64;\n"
-"           if (get_global_id(1) >= final_y_size)\n"
-"               error = 128;\n"
-"           if (get_global_id(2) >= final_z_size)\n"
-"               error = 256;\n"
-"\n"
-"       unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned int)final_y_size*(unsigned int)final_x_size + \n"
-"               (unsigned int)get_global_id(1)*(unsigned int)final_x_size + (unsigned int)get_global_id(0);\n"
-"       if ((t_address >= start_address) && (t_address < end_address))\n"
-"               dst[t_address-start_address]++;\n"
-"       if (error)\n"
-"               dst[t_address-start_address]|=error;\n"
-"\n"
-"}\n";
+    "\n"
+    "__kernel void test_thread_dimension_not_atomic(__global uint *dst, \n"
+    "         uint final_x_size,   uint final_y_size,   uint final_z_size,\n"
+    "         uint start_address,  uint end_address)\n"
+    "{\n"
+    "    uint error = 0;\n"
+    "           if (get_global_id(0) >= final_x_size)\n"
+    "               error = 64;\n"
+    "           if (get_global_id(1) >= final_y_size)\n"
+    "               error = 128;\n"
+    "           if (get_global_id(2) >= final_z_size)\n"
+    "               error = 256;\n"
+    "\n"
+    "       unsigned int t_address = (unsigned int)get_global_id(2)*(unsigned "
+    "int)final_y_size*(unsigned int)final_x_size + \n"
+    "               (unsigned int)get_global_id(1)*(unsigned int)final_x_size "
+    "+ (unsigned int)get_global_id(0);\n"
+    "       if ((t_address >= start_address) && (t_address < end_address))\n"
+    "               dst[t_address-start_address]++;\n"
+    "       if (error)\n"
+    "               dst[t_address-start_address]|=error;\n"
+    "\n"
+    "}\n";
 
 char dim_str[128];
-char *
-print_dimensions(size_t x, size_t y, size_t z, cl_uint dim) {
+char *print_dimensions(size_t x, size_t y, size_t z, cl_uint dim)
+{
     // Not thread safe...
-    if (dim == 1) {
+    if (dim == 1)
+    {
         snprintf(dim_str, 128, "[%d]", (int)x);
-    } else if (dim == 2) {
+    }
+    else if (dim == 2)
+    {
         snprintf(dim_str, 128, "[%d x %d]", (int)x, (int)y);
-    } else if (dim == 3) {
+    }
+    else if (dim == 3)
+    {
         snprintf(dim_str, 128, "[%d x %d x %d]", (int)x, (int)y, (int)z);
-    } else {
+    }
+    else
+    {
         snprintf(dim_str, 128, "INVALID DIM: %d", dim);
     }
     return dim_str;
 }
 
 char dim_str2[128];
-char *
-print_dimensions2(size_t x, size_t y, size_t z, cl_uint dim) {
+char *print_dimensions2(size_t x, size_t y, size_t z, cl_uint dim)
+{
     // Not thread safe...
-    if (dim == 1) {
+    if (dim == 1)
+    {
         snprintf(dim_str2, 128, "[%d]", (int)x);
-    } else if (dim == 2) {
+    }
+    else if (dim == 2)
+    {
         snprintf(dim_str2, 128, "[%d x %d]", (int)x, (int)y);
-    } else if (dim == 3) {
+    }
+    else if (dim == 3)
+    {
         snprintf(dim_str2, 128, "[%d x %d x %d]", (int)x, (int)y, (int)z);
-    } else {
+    }
+    else
+    {
         snprintf(dim_str2, 128, "INVALID DIM: %d", dim);
     }
     return dim_str2;
@@ -246,57 +293,64 @@ print_dimensions2(size_t x, size_t y, size_t z, cl_uint dim) {
 
 
 /*
- This tests thread dimensions by executing a kernel across a range of dimensions.
- Each kernel instance does an atomic write into a specific location in a buffer to
- ensure that the correct dimensions are run. To handle large dimensions, the kernel
- masks its execution region internally. This allows a small (128MB) buffer to be used
- for very large executions by running the kernel multiple times.
+ This tests thread dimensions by executing a kernel across a range of
+ dimensions. Each kernel instance does an atomic write into a specific location
+ in a buffer to ensure that the correct dimensions are run. To handle large
+ dimensions, the kernel masks its execution region internally. This allows a
+ small (128MB) buffer to be used for very large executions by running the kernel
+ multiple times.
  */
-int run_test(cl_context context, cl_command_queue queue, cl_kernel kernel, cl_mem array, cl_uint memory_size, cl_uint dimensions,
+int run_test(cl_context context, cl_command_queue queue, cl_kernel kernel,
+             cl_mem array, cl_uint memory_size, cl_uint dimensions,
              cl_uint final_x_size, cl_uint final_y_size, cl_uint final_z_size,
              cl_uint local_x_size, cl_uint local_y_size, cl_uint local_z_size,
              int explict_local)
 {
     cl_uint errors = 0;
     size_t global_size[3], local_size[3];
-    global_size[0] = final_x_size;        local_size[0] = local_x_size;
-    global_size[1] = final_y_size;        local_size[1] = local_y_size;
-    global_size[2] = final_z_size;        local_size[2] = local_z_size;
+    global_size[0] = final_x_size;
+    local_size[0] = local_x_size;
+    global_size[1] = final_y_size;
+    local_size[1] = local_y_size;
+    global_size[2] = final_z_size;
+    local_size[2] = local_z_size;
 
     cl_ulong start_valid_memory_address = 0;
     cl_ulong end_valid_memory_address = memory_size;
-    cl_ulong last_memory_address = (cl_ulong)final_x_size*(cl_ulong)final_y_size*(cl_ulong)final_z_size*sizeof(cl_uint);
+    cl_ulong last_memory_address = (cl_ulong)final_x_size
+        * (cl_ulong)final_y_size * (cl_ulong)final_z_size * sizeof(cl_uint);
     if (end_valid_memory_address > last_memory_address)
         end_valid_memory_address = last_memory_address;
 
-    int number_of_iterations_required = (int)ceil((double)last_memory_address/(double)memory_size);
-    log_info("\t\tTest requires %gMB (%d test iterations using an allocation of %gMB).\n",
-             (double)last_memory_address/(1024.0*1024.0), number_of_iterations_required, (double)memory_size/(1024.0*1024.0));
-    //log_info("Last memory address: %llu, memory_size: %llu\n", last_memory_address, memory_size);
+    int number_of_iterations_required =
+        (int)ceil((double)last_memory_address / (double)memory_size);
+    log_info("\t\tTest requires %gMB (%d test iterations using an allocation "
+             "of %gMB).\n",
+             (double)last_memory_address / (1024.0 * 1024.0),
+             number_of_iterations_required,
+             (double)memory_size / (1024.0 * 1024.0));
+    // log_info("Last memory address: %llu, memory_size: %llu\n",
+    // last_memory_address, memory_size);
 
     while (end_valid_memory_address <= last_memory_address)
     {
         int err;
         const int fill_pattern = 0x0;
-        err = clEnqueueFillBuffer(queue, 
-                                  array, 
-                                  (void*)&fill_pattern, 
-                                  sizeof(fill_pattern), 
-                                  0, 
-                                  memory_size, 
-                                  0, 
-                                  NULL, 
+        err = clEnqueueFillBuffer(queue, array, (void *)&fill_pattern,
+                                  sizeof(fill_pattern), 0, memory_size, 0, NULL,
                                   NULL);
-        if (err != CL_SUCCESS) {
-            print_error( err, "Failed to set fill buffer.");
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "Failed to set fill buffer.");
             return -3;
         }
 
-        cl_ulong start_valid_index = start_valid_memory_address/sizeof(cl_uint);
-        cl_ulong end_valid_index = end_valid_memory_address/sizeof(cl_uint);
+        cl_ulong start_valid_index =
+            start_valid_memory_address / sizeof(cl_uint);
+        cl_ulong end_valid_index = end_valid_memory_address / sizeof(cl_uint);
 
-        cl_uint start_valid_index_int = (cl_uint) start_valid_index;
-        cl_uint end_valid_index_int   = (cl_uint) end_valid_index;
+        cl_uint start_valid_index_int = (cl_uint)start_valid_index;
+        cl_uint end_valid_index_int = (cl_uint)end_valid_index;
 
         // Set the arguments
         err = clSetKernelArg(kernel, 0, sizeof(array), &array);
@@ -305,115 +359,149 @@ int run_test(cl_context context, cl_command_queue queue, cl_kernel kernel, cl_me
         err |= clSetKernelArg(kernel, 3, sizeof(final_z_size), &final_z_size);
         if (gHasLong)
         {
-            err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index), &start_valid_index);
-            err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index), &end_valid_index);
+            err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index),
+                                  &start_valid_index);
+            err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index),
+                                  &end_valid_index);
         }
         else
         {
-            err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index_int), &start_valid_index_int);
-            err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index_int), &end_valid_index_int);
+            err |= clSetKernelArg(kernel, 4, sizeof(start_valid_index_int),
+                                  &start_valid_index_int);
+            err |= clSetKernelArg(kernel, 5, sizeof(end_valid_index_int),
+                                  &end_valid_index_int);
         }
 
-        if (err != CL_SUCCESS) {
-            print_error( err, "Failed to set arguments.");
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "Failed to set arguments.");
             return -3;
         }
 
 
         // Execute the kernel
-        if (explict_local == 0) {
-            err = clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL, global_size, NULL, 0, NULL, NULL);
-            if (DEBUG) log_info("\t\t\tExecuting kernel with global %s, NULL local, %d dim, start address %llu, end address %llu.\n",
-                                print_dimensions(global_size[0], global_size[1], global_size[2], dimensions),
-                                dimensions, start_valid_memory_address, end_valid_memory_address);
-        } else {
-            err = clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL, global_size, local_size, 0, NULL, NULL);
-            if (DEBUG) log_info("\t\t\tExecuting kernel with global %s, local %s, %d dim, start address %llu, end address %llu.\n",
-                                print_dimensions(global_size[0], global_size[1], global_size[2], dimensions), print_dimensions2(local_size[0], local_size[1], local_size[2], dimensions),
-                                dimensions, start_valid_memory_address, end_valid_memory_address);
+        if (explict_local == 0)
+        {
+            err = clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL,
+                                         global_size, NULL, 0, NULL, NULL);
+            if (DEBUG)
+                log_info("\t\t\tExecuting kernel with global %s, NULL local, "
+                         "%d dim, start address %llu, end address %llu.\n",
+                         print_dimensions(global_size[0], global_size[1],
+                                          global_size[2], dimensions),
+                         dimensions, start_valid_memory_address,
+                         end_valid_memory_address);
         }
-        if (err == CL_OUT_OF_RESOURCES) {
-            log_info("WARNING: kernel reported CL_OUT_OF_RESOURCES, indicating the global dimensions are too large. Skipping this size.\n");
+        else
+        {
+            err =
+                clEnqueueNDRangeKernel(queue, kernel, dimensions, NULL,
+                                       global_size, local_size, 0, NULL, NULL);
+            if (DEBUG)
+                log_info("\t\t\tExecuting kernel with global %s, local %s, %d "
+                         "dim, start address %llu, end address %llu.\n",
+                         print_dimensions(global_size[0], global_size[1],
+                                          global_size[2], dimensions),
+                         print_dimensions2(local_size[0], local_size[1],
+                                           local_size[2], dimensions),
+                         dimensions, start_valid_memory_address,
+                         end_valid_memory_address);
+        }
+        if (err == CL_OUT_OF_RESOURCES)
+        {
+            log_info(
+                "WARNING: kernel reported CL_OUT_OF_RESOURCES, indicating the "
+                "global dimensions are too large. Skipping this size.\n");
             return 0;
         }
-        if (err != CL_SUCCESS) {
-            print_error( err, "Failed to execute kernel\n");
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "Failed to execute kernel\n");
             return -3;
         }
 
-        void* mapped = clEnqueueMapBuffer(queue, array, CL_TRUE, CL_MAP_READ, 0, memory_size, 0, NULL, NULL, &err );
-        if (err != CL_SUCCESS) {
-            print_error( err, "Failed to map results\n");
+        void *mapped = clEnqueueMapBuffer(queue, array, CL_TRUE, CL_MAP_READ, 0,
+                                          memory_size, 0, NULL, NULL, &err);
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "Failed to map results\n");
             return -4;
         }
-        cl_uint* data = (cl_uint*)mapped;
+        cl_uint *data = (cl_uint *)mapped;
 
         // Verify the data
         cl_uint i;
-        cl_uint last_address = (cl_uint)(end_valid_memory_address - start_valid_memory_address)/(cl_uint)sizeof(cl_uint);
-        for (i=0; i<last_address; i++) {
-            if (i < last_address) {
-                if (data[i] != 1) {
+        cl_uint last_address =
+            (cl_uint)(end_valid_memory_address - start_valid_memory_address)
+            / (cl_uint)sizeof(cl_uint);
+        for (i = 0; i < last_address; i++)
+        {
+            if (i < last_address)
+            {
+                if (data[i] != 1)
+                {
                     errors++;
                     //        log_info("%d expected 1 got %d\n", i, data[i]);
                 }
-            } else {
-                if (data[i] != 0) {
+            }
+            else
+            {
+                if (data[i] != 0)
+                {
                     errors++;
                     log_info("%d expected 0 got %d\n", i, data[i]);
                 }
             }
         }
 
-        err = clEnqueueUnmapMemObject(queue, array, mapped, 0, NULL, NULL );
-        if (err != CL_SUCCESS) {
-            print_error( err, "Failed to unmap results\n");
+        err = clEnqueueUnmapMemObject(queue, array, mapped, 0, NULL, NULL);
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "Failed to unmap results\n");
             return -4;
         }
 
         err = clFlush(queue);
-        if (err != CL_SUCCESS) {
-            print_error( err, "Failed to flush\n");
+        if (err != CL_SUCCESS)
+        {
+            print_error(err, "Failed to flush\n");
             return -4;
         }
 
         // Increment the addresses
-        if (end_valid_memory_address == last_memory_address)
-            break;
-        start_valid_memory_address += memory_size;
-        end_valid_memory_address += memory_size;
+        if (end_valid_memory_address == last_memory_address) break;
+        start_valid_memory_address +=
+            memory_size * (bufferStep ? bufferStep : 1);
+        end_valid_memory_address += memory_size * (bufferStep ? bufferStep : 1);
         if (end_valid_memory_address > last_memory_address)
             end_valid_memory_address = last_memory_address;
     }
 
-    if (errors)
-        log_error("%d errors.\n", errors);
+    if (errors) log_error("%d errors.\n", errors);
     return errors;
 }
 
 
+static cl_uint max_x_size = 1, min_x_size = 1, max_y_size = 1, min_y_size = 1,
+               max_z_size = 1, min_z_size = 1; // per-axis bounds for set_min
 
-
-static cl_uint max_x_size=1, min_x_size=1, max_y_size=1, min_y_size=1, max_z_size=1, min_z_size=1;
-
-static void set_min(cl_uint *x, cl_uint *y, cl_uint *z) {
-    if (*x < min_x_size)
-        *x = min_x_size;
-    if (*y < min_y_size)
-        *y = min_y_size;
-    if (*z < min_z_size)
-        *z = min_z_size;
-    if (*x > max_x_size)
-        *x = max_x_size;
-    if (*y > max_y_size)
-        *y = max_y_size;
-    if (*z > max_z_size)
-        *z = max_z_size;
+static void set_min(cl_uint *x, cl_uint *y, cl_uint *z) // clamp in place
+{ // Despite the name, each value is bounded on both sides: min, then max.
+    if (*x < min_x_size) *x = min_x_size; // raise to the lower bound
+    if (*y < min_y_size) *y = min_y_size;
+    if (*z < min_z_size) *z = min_z_size;
+    if (*x > max_x_size) *x = max_x_size; // cap at the upper bound
+    if (*y > max_y_size) *y = max_y_size;
+    if (*z > max_z_size) *z = max_z_size;
 }
 
 
-int
-test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue queue, cl_uint dimensions, cl_uint min_dim, cl_uint max_dim, cl_uint quick_test, cl_uint size_increase_per_iteration, int explicit_local) {
+int test_thread_dimensions(cl_device_id device, cl_context context,
+                           cl_command_queue queue, cl_uint dimensions,
+                           cl_uint min_dim, cl_uint max_dim, cl_uint quick_test,
+                           cl_uint size_increase_per_iteration,
+                           int explicit_local)
+{
     cl_mem array;
     cl_program program;
     cl_kernel kernel;
@@ -424,9 +512,10 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
     int use_atomics = 1;
     MTdata d;
 
-    if (getenv("CL_WIMPY_MODE") && !quick_test) {
-      log_info("CL_WIMPY_MODE enabled, skipping test\n");
-      return 0;
+    if (getenv("CL_WIMPY_MODE") && !quick_test)
+    {
+        log_info("CL_WIMPY_MODE enabled, skipping test\n");
+        return 0;
     }
 
     // Unconditionally test larger sizes for CL 1.1
@@ -434,41 +523,74 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
     limit_size = 0;
 
     /* Check if atomics are supported. */
-    if (!is_extension_available(device, "cl_khr_global_int32_base_atomics")) {
-        log_info("WARNING: Base atomics not supported (cl_khr_global_int32_base_atomics). Test will not be guaranteed to catch overlaping thread dimensions.\n");
+    if (!is_extension_available(device, "cl_khr_global_int32_base_atomics"))
+    {
+        log_info("WARNING: Base atomics not supported "
+                 "(cl_khr_global_int32_base_atomics). Test will not be "
+                 "guaranteed to catch overlaping thread dimensions.\n");
         use_atomics = 0;
     }
 
     if (quick_test)
-        log_info("WARNING: Running quick test. This will only test the base dimensions (power of two) and base-1 with all local threads fixed in one dim.\n");
+        log_info("WARNING: Running quick test. This will only test the base "
+                 "dimensions (power of two) and base-1 with all local threads "
+                 "fixed in one dim.\n");
 
     // Verify that we can test this many dimensions
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(device_max_dimensions), &device_max_dimensions, NULL);
-    test_error(err, "clGetDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed");
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
+                          sizeof(device_max_dimensions), &device_max_dimensions,
+                          NULL);
+    test_error(err,
+               "clGetDeviceInfo for CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed");
 
-    if (dimensions > device_max_dimensions) {
-        log_info("Can not test %d dimensions when device only supports %d.\n", dimensions, device_max_dimensions);
+    if (dimensions > device_max_dimensions)
+    {
+        log_info("Can not test %d dimensions when device only supports %d.\n",
+                 dimensions, device_max_dimensions);
         return 0;
     }
 
     log_info("Setting random seed to 0.\n");
 
-    if (gHasLong) {
-        if (use_atomics) {
-            err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_atomic_long, "test_thread_dimension_atomic" );
-        } else {
-            err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_not_atomic_long, "test_thread_dimension_not_atomic" );
+    if (gHasLong)
+    {
+        if (use_atomics)
+        {
+            err = create_single_kernel_helper(
+                context, &program, &kernel, 1,
+                &thread_dimension_kernel_code_atomic_long,
+                "test_thread_dimension_atomic");
+        }
+        else
+        {
+            err = create_single_kernel_helper(
+                context, &program, &kernel, 1,
+                &thread_dimension_kernel_code_not_atomic_long,
+                "test_thread_dimension_not_atomic");
         }
-    } else {
-        if (use_atomics) {
-            err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_atomic_not_long, "test_thread_dimension_atomic" );
-        } else {
-            err = create_single_kernel_helper( context, &program, &kernel, 1, &thread_dimension_kernel_code_not_atomic_not_long, "test_thread_dimension_not_atomic" );
+    }
+    else
+    {
+        if (use_atomics)
+        {
+            err = create_single_kernel_helper(
+                context, &program, &kernel, 1,
+                &thread_dimension_kernel_code_atomic_not_long,
+                "test_thread_dimension_atomic");
+        }
+        else
+        {
+            err = create_single_kernel_helper(
+                context, &program, &kernel, 1,
+                &thread_dimension_kernel_code_not_atomic_not_long,
+                "test_thread_dimension_not_atomic");
         }
     }
-    test_error( err, "Unable to create testing kernel" );
+    test_error(err, "Unable to create testing kernel");
 
-    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
+    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
+                          sizeof(max_local_workgroup_size),
+                          max_local_workgroup_size, NULL);
     test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
 
     // Get the maximum sizes supported by this device
@@ -477,50 +599,68 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
     cl_ulong max_physical = 0;
     int found_size = 0;
 
-    err = get_maximums(kernel, context,
-                       &max_workgroup_size, &max_allocation, &max_physical);
+    err = get_maximums(kernel, context, &max_workgroup_size, &max_allocation,
+                       &max_physical);
 
-    // Make sure we don't try to allocate more than half the physical memory present.
-    if (max_allocation > (max_physical/2)) {
-        log_info("Limiting max allocation to half of the maximum physical memory (%gMB of %gMB physical).\n",
-                 (max_physical/2/(1024.0*1024.0)), (max_physical/(1024.0*1024.0)));
-        max_allocation = max_physical/2;
+    // Make sure we don't try to allocate more than half the physical memory
+    // present.
+    if (max_allocation > (max_physical / 2))
+    {
+        log_info("Limiting max allocation to half of the maximum physical "
+                 "memory (%gMB of %gMB physical).\n",
+                 (max_physical / 2 / (1024.0 * 1024.0)),
+                 (max_physical / (1024.0 * 1024.0)));
+        max_allocation = max_physical / 2;
     }
 
     // Limit the maximum we'll allocate for this test to 512 to be reasonable.
-    if (max_allocation > 1024*1024*512) {
-        log_info("Limiting max allocation to 512MB from device maximum allocation of %gMB.\n", (max_allocation/1024.0/1024.0));
-        max_allocation = 1024*1024*512;
+    if (max_allocation > 1024 * 1024 * 512)
+    {
+        log_info("Limiting max allocation to 512MB from device maximum "
+                 "allocation of %gMB.\n",
+                 (max_allocation / 1024.0 / 1024.0));
+        max_allocation = 1024 * 1024 * 512;
     }
 
-    max_memory_size = (cl_uint)(max_allocation);
-    if (max_memory_size > 512*1024*1024)
-        max_memory_size = 512*1024*1024;
+    max_memory_size = bufferSize ? bufferSize : (cl_uint)(max_allocation);
+    if (max_memory_size > 512 * 1024 * 1024)
+        max_memory_size = 512 * 1024 * 1024;
     memory_size = max_memory_size;
 
-    log_info("Memory allocation size to use is %gMB, max workgroup size is %d.\n", max_memory_size/(1024.0*1024.0), (int)max_workgroup_size);
+    log_info(
+        "Memory allocation size to use is %gMB, max workgroup size is %d.\n",
+        max_memory_size / (1024.0 * 1024.0), (int)max_workgroup_size);
 
-    while (!found_size && memory_size >= max_memory_size/8) {
+    while (!found_size && memory_size >= max_memory_size / 8)
+    {
         array =
             clCreateBuffer(context, CL_MEM_READ_WRITE, memory_size, NULL, &err);
-        if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE || err == CL_OUT_OF_HOST_MEMORY) {
-            memory_size -= max_memory_size/16;
+        if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE
+            || err == CL_OUT_OF_HOST_MEMORY)
+        {
+            memory_size -= max_memory_size / 16;
             continue;
         }
-        if (err) {
-            print_error( err, "clCreateBuffer failed");
+        if (err)
+        {
+            print_error(err, "clCreateBuffer failed");
             return -1;
         }
         found_size = 1;
     }
 
-    if (!found_size) {
-        log_error("Failed to find a working size greater than 1/8th of the reported allocation size.\n");
+    if (!found_size)
+    {
+        log_error("Failed to find a working size greater than 1/8th of the "
+                  "reported allocation size.\n");
         return -1;
     }
 
-    if (memory_size < max_memory_size) {
-        log_info("Note: failed to allocate %gMB, using %gMB instead.\n", max_memory_size/(1024.0*1024.0), memory_size/(1024.0*1024.0));
+    if (memory_size < max_memory_size)
+    {
+        log_info("Note: failed to allocate %gMB, using %gMB instead.\n",
+                 max_memory_size / (1024.0 * 1024.0),
+                 memory_size / (1024.0 * 1024.0));
     }
 
     int errors = 0;
@@ -530,171 +670,290 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
     // 2 tests with each dimensions +/- 1
     // 2 tests with all dimensions +/- 1
     // 2 random tests
-    cl_uint tests_per_size = 1 + 2*dimensions + 2 + 2;
+    cl_uint tests_per_size = 1 + 2 * dimensions + 2 + 2;
 
     // 1 test with 1 as the local threads in each dimensions
     // 1 test with all the local threads in each dimension
     // 2 random tests
     cl_uint local_tests_per_size = 1 + dimensions + 2;
-    if (explicit_local == 0)
-        local_tests_per_size = 1;
+    if (explicit_local == 0) local_tests_per_size = 1;
 
-    max_x_size=1, min_x_size=1, max_y_size=1, min_y_size=1, max_z_size=1, min_z_size=1;
+    max_x_size = 1, min_x_size = 1, max_y_size = 1, min_y_size = 1,
+    max_z_size = 1, min_z_size = 1;
 
-    if (dimensions > 3) {
+    if (dimensions > 3)
+    {
         log_error("Invalid dimensions: %d\n", dimensions);
         return -1;
     }
     max_x_size = max_dim;
     min_x_size = min_dim;
-    if (dimensions > 1) {
+    if (dimensions > 1)
+    {
         max_y_size = max_dim;
         min_y_size = min_dim;
     }
-    if (dimensions > 2) {
+    if (dimensions > 2)
+    {
         max_z_size = max_dim;
         min_z_size = min_dim;
     }
 
-    log_info("Testing with dimensions up to %s.\n", print_dimensions(max_x_size, max_y_size, max_z_size, dimensions));
+    log_info("Testing with dimensions up to %s.\n",
+             print_dimensions(max_x_size, max_y_size, max_z_size, dimensions));
+    if (bufferSize)
+    {
+        log_info("Testing with buffer size %d.\n", bufferSize);
+    }
+    if (bufferStep)
+    {
+        log_info("Testing with buffer step %d.\n", bufferStep);
+    }
     cl_uint x_size, y_size, z_size;
 
-    d = init_genrand( gRandomSeed );
+    d = init_genrand(gRandomSeed);
     z_size = min_z_size;
-    while (z_size <= max_z_size) {
+    while (z_size <= max_z_size)
+    {
         y_size = min_y_size;
-        while (y_size <= max_y_size) {
+        while (y_size <= max_y_size)
+        {
             x_size = min_x_size;
-            while (x_size <= max_x_size) {
+            while (x_size <= max_x_size)
+            {
 
-                log_info("Base test size %s:\n", print_dimensions(x_size, y_size, z_size, dimensions));
+                log_info("Base test size %s:\n",
+                         print_dimensions(x_size, y_size, z_size, dimensions));
 
                 cl_uint sub_test;
                 cl_uint final_x_size, final_y_size, final_z_size;
-                for (sub_test = 0; sub_test < tests_per_size; sub_test++) {
+                for (sub_test = 0; sub_test < tests_per_size; sub_test++)
+                {
                     final_x_size = x_size;
                     final_y_size = y_size;
                     final_z_size = z_size;
 
-                    if (sub_test == 0) {
-                        if (DEBUG) log_info("\tTesting with base dimensions %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
-                    } else if (quick_test) {
-                        // If we are in quick mode just do 1 run with x-1, y-1, and z-1.
-                        if (sub_test > 1)
-                            break;
+                    if (sub_test == 0)
+                    {
+                        if (DEBUG)
+                            log_info(
+                                "\tTesting with base dimensions %s.\n",
+                                print_dimensions(final_x_size, final_y_size,
+                                                 final_z_size, dimensions));
+                    }
+                    else if (quick_test)
+                    {
+                        // If we are in quick mode just do 1 run with x-1, y-1,
+                        // and z-1.
+                        if (sub_test > 1) break;
                         final_x_size--;
                         final_y_size--;
                         final_z_size--;
                         set_min(&final_x_size, &final_y_size, &final_z_size);
-                        if (DEBUG) log_info("\tTesting with all base dimensions - 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
-                    } else if (sub_test <= dimensions*2) {
-                        int dim_to_change = (sub_test-1)%dimensions;
-                        //log_info ("dim_to_change: %d (sub_test:%d) dimensions %d\n", dim_to_change,sub_test, dimensions);
+                        if (DEBUG)
+                            log_info(
+                                "\tTesting with all base dimensions - 1 %s.\n",
+                                print_dimensions(final_x_size, final_y_size,
+                                                 final_z_size, dimensions));
+                    }
+                    else if (sub_test <= dimensions * 2)
+                    {
+                        int dim_to_change = (sub_test - 1) % dimensions;
+                        // log_info ("dim_to_change: %d (sub_test:%d) dimensions
+                        // %d\n", dim_to_change,sub_test, dimensions);
                         int up_down = (sub_test > dimensions) ? 0 : 1;
 
-                        if (dim_to_change == 0) {
+                        if (dim_to_change == 0)
+                        {
                             final_x_size += (up_down) ? -1 : +1;
-                        } else if (dim_to_change == 1) {
+                        }
+                        else if (dim_to_change == 1)
+                        {
                             final_y_size += (up_down) ? -1 : +1;
-                        } else if (dim_to_change == 2) {
+                        }
+                        else if (dim_to_change == 2)
+                        {
                             final_z_size += (up_down) ? -1 : +1;
-                        } else {
-                            log_error("Invalid dim_to_change: %d\n", dim_to_change);
+                        }
+                        else
+                        {
+                            log_error("Invalid dim_to_change: %d\n",
+                                      dim_to_change);
                             return -1;
                         }
                         set_min(&final_x_size, &final_y_size, &final_z_size);
-                        if (DEBUG) log_info("\tTesting with one base dimension +/- 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
-                    } else if (sub_test == (dimensions*2+1)) {
-                        if (dimensions == 1)
-                            continue;
+                        if (DEBUG)
+                            log_info(
+                                "\tTesting with one base dimension +/- 1 %s.\n",
+                                print_dimensions(final_x_size, final_y_size,
+                                                 final_z_size, dimensions));
+                    }
+                    else if (sub_test == (dimensions * 2 + 1))
+                    {
+                        if (dimensions == 1) continue;
                         final_x_size--;
                         final_y_size--;
                         final_z_size--;
                         set_min(&final_x_size, &final_y_size, &final_z_size);
-                        if (DEBUG) log_info("\tTesting with all base dimensions - 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
-                    } else if (sub_test == (dimensions*2+2)) {
-                        if (dimensions == 1)
-                            continue;
+                        if (DEBUG)
+                            log_info(
+                                "\tTesting with all base dimensions - 1 %s.\n",
+                                print_dimensions(final_x_size, final_y_size,
+                                                 final_z_size, dimensions));
+                    }
+                    else if (sub_test == (dimensions * 2 + 2))
+                    {
+                        if (dimensions == 1) continue;
                         final_x_size++;
                         final_y_size++;
                         final_z_size++;
                         set_min(&final_x_size, &final_y_size, &final_z_size);
-                        if (DEBUG) log_info("\tTesting with all base dimensions + 1 %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
-                    } else {
-                        final_x_size = (int)get_random_float(0, (x_size/size_increase_per_iteration), d)+x_size/size_increase_per_iteration;
-                        final_y_size = (int)get_random_float(0, (y_size/size_increase_per_iteration), d)+y_size/size_increase_per_iteration;
-                        final_z_size = (int)get_random_float(0, (z_size/size_increase_per_iteration), d)+z_size/size_increase_per_iteration;
+                        if (DEBUG)
+                            log_info(
+                                "\tTesting with all base dimensions + 1 %s.\n",
+                                print_dimensions(final_x_size, final_y_size,
+                                                 final_z_size, dimensions));
+                    }
+                    else
+                    {
+                        final_x_size =
+                            (int)get_random_float(
+                                0, (x_size / size_increase_per_iteration), d)
+                            + x_size / size_increase_per_iteration;
+                        final_y_size =
+                            (int)get_random_float(
+                                0, (y_size / size_increase_per_iteration), d)
+                            + y_size / size_increase_per_iteration;
+                        final_z_size =
+                            (int)get_random_float(
+                                0, (z_size / size_increase_per_iteration), d)
+                            + z_size / size_increase_per_iteration;
                         set_min(&final_x_size, &final_y_size, &final_z_size);
-                        if (DEBUG) log_info("\tTesting with random dimensions %s.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
+                        if (DEBUG)
+                            log_info(
+                                "\tTesting with random dimensions %s.\n",
+                                print_dimensions(final_x_size, final_y_size,
+                                                 final_z_size, dimensions));
                     }
 
-                    if (limit_size && final_x_size*final_y_size*final_z_size >= MAX_TOTAL_GLOBAL_THREADS_FOR_TEST) {
-                        log_info("Skipping size %s as it exceeds max test threads of %d.\n", print_dimensions(final_x_size, final_y_size, final_z_size, dimensions), MAX_TOTAL_GLOBAL_THREADS_FOR_TEST);
+                    if (limit_size
+                        && final_x_size * final_y_size * final_z_size
+                            >= MAX_TOTAL_GLOBAL_THREADS_FOR_TEST)
+                    {
+                        log_info("Skipping size %s as it exceeds max test "
+                                 "threads of %d.\n",
+                                 print_dimensions(final_x_size, final_y_size,
+                                                  final_z_size, dimensions),
+                                 MAX_TOTAL_GLOBAL_THREADS_FOR_TEST);
                         continue;
                     }
 
                     cl_uint local_test;
                     cl_uint local_x_size, local_y_size, local_z_size;
-                    cl_uint previous_local_x_size=0, previous_local_y_size=0, previous_local_z_size=0;
-                    for (local_test = 0; local_test < local_tests_per_size; local_test++) {
+                    cl_uint previous_local_x_size = 0,
+                            previous_local_y_size = 0,
+                            previous_local_z_size = 0;
+                    for (local_test = 0; local_test < local_tests_per_size;
+                         local_test++)
+                    {
 
                         local_x_size = 1;
                         local_y_size = 1;
                         local_z_size = 1;
 
-                        if (local_test == 0) {
-                        } else if (local_test <= dimensions) {
-                            int dim_to_change = (local_test-1)%dimensions;
-                            if (dim_to_change == 0) {
+                        if (local_test == 0)
+                        {
+                        }
+                        else if (local_test <= dimensions)
+                        {
+                            int dim_to_change = (local_test - 1) % dimensions;
+                            if (dim_to_change == 0)
+                            {
                                 local_x_size = (cl_uint)max_workgroup_size;
-                            } else if (dim_to_change == 1) {
+                            }
+                            else if (dim_to_change == 1)
+                            {
                                 local_y_size = (cl_uint)max_workgroup_size;
-                            } else if (dim_to_change == 2) {
+                            }
+                            else if (dim_to_change == 2)
+                            {
                                 local_z_size = (cl_uint)max_workgroup_size;
-                            } else {
-                                log_error("Invalid dim_to_change: %d\n", dim_to_change);
+                            }
+                            else
+                            {
+                                log_error("Invalid dim_to_change: %d\n",
+                                          dim_to_change);
                                 free_mtdata(d);
                                 return -1;
                             }
-                        } else {
-                            local_x_size = (int)get_random_float(1, (int)max_workgroup_size, d);
-                            while ((local_x_size > 1) && (final_x_size%local_x_size != 0))
+                        }
+                        else
+                        {
+                            local_x_size = (int)get_random_float(
+                                1, (int)max_workgroup_size, d);
+                            while ((local_x_size > 1)
+                                   && (final_x_size % local_x_size != 0))
                                 local_x_size--;
-                            int remainder = (int)floor((double)max_workgroup_size/local_x_size);
+                            int remainder = (int)floor(
+                                (double)max_workgroup_size / local_x_size);
                             // Evenly prefer dimensions 2 and 1 first
-                            if (local_test % 2) {
-                                if (dimensions > 1) {
-                                    local_y_size = (int)get_random_float(1, (int)remainder, d);
-                                    while ((local_y_size > 1) && (final_y_size%local_y_size != 0))
+                            if (local_test % 2)
+                            {
+                                if (dimensions > 1)
+                                {
+                                    local_y_size = (int)get_random_float(
+                                        1, (int)remainder, d);
+                                    while (
+                                        (local_y_size > 1)
+                                        && (final_y_size % local_y_size != 0))
                                         local_y_size--;
-                                    remainder = (int)floor((double)remainder/local_y_size);
+                                    remainder = (int)floor((double)remainder
+                                                           / local_y_size);
                                 }
-                                if (dimensions > 2) {
-                                    local_z_size = (int)get_random_float(1, (int)remainder, d);
-                                    while ((local_z_size > 1) && (final_z_size%local_z_size != 0))
+                                if (dimensions > 2)
+                                {
+                                    local_z_size = (int)get_random_float(
+                                        1, (int)remainder, d);
+                                    while (
+                                        (local_z_size > 1)
+                                        && (final_z_size % local_z_size != 0))
                                         local_z_size--;
                                 }
-                            } else {
-                                if (dimensions > 2) {
-                                    local_z_size = (int)get_random_float(1, (int)remainder, d);
-                                    while ((local_z_size > 1) && (final_z_size%local_z_size != 0))
+                            }
+                            else
+                            {
+                                if (dimensions > 2)
+                                {
+                                    local_z_size = (int)get_random_float(
+                                        1, (int)remainder, d);
+                                    while (
+                                        (local_z_size > 1)
+                                        && (final_z_size % local_z_size != 0))
                                         local_z_size--;
-                                    remainder = (int)floor((double)remainder/local_z_size);
+                                    remainder = (int)floor((double)remainder
+                                                           / local_z_size);
                                 }
-                                if (dimensions > 1) {
-                                    local_y_size = (int)get_random_float(1, (int)remainder, d);
-                                    while ((local_y_size > 1) && (final_y_size%local_y_size != 0))
+                                if (dimensions > 1)
+                                {
+                                    local_y_size = (int)get_random_float(
+                                        1, (int)remainder, d);
+                                    while (
+                                        (local_y_size > 1)
+                                        && (final_y_size % local_y_size != 0))
                                         local_y_size--;
                                 }
                             }
                         }
 
-                        // Put all the threads in one dimension to speed up the test in quick mode.
-                        if (quick_test) {
+                        // Put all the threads in one dimension to speed up the
+                        // test in quick mode.
+                        if (quick_test)
+                        {
                             local_y_size = 1;
                             local_z_size = 1;
                             local_x_size = 1;
-                            if (final_z_size > final_y_size && final_z_size > final_x_size)
+                            if (final_z_size > final_y_size
+                                && final_z_size > final_x_size)
                                 local_z_size = (cl_uint)max_workgroup_size;
                             else if (final_y_size > final_x_size)
                                 local_y_size = (cl_uint)max_workgroup_size;
@@ -704,56 +963,85 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
 
                         if (local_x_size > max_local_workgroup_size[0])
                             local_x_size = (int)max_local_workgroup_size[0];
-                        if (dimensions > 1 && local_y_size > max_local_workgroup_size[1])
+                        if (dimensions > 1
+                            && local_y_size > max_local_workgroup_size[1])
                             local_y_size = (int)max_local_workgroup_size[1];
-                        if (dimensions > 2 && local_z_size > max_local_workgroup_size[2])
+                        if (dimensions > 2
+                            && local_z_size > max_local_workgroup_size[2])
                             local_z_size = (int)max_local_workgroup_size[2];
 
                         // Cleanup the local dimensions
-                        while ((local_x_size > 1) && (final_x_size%local_x_size != 0))
+                        while ((local_x_size > 1)
+                               && (final_x_size % local_x_size != 0))
                             local_x_size--;
-                        while ((local_y_size > 1) && (final_y_size%local_y_size != 0))
+                        while ((local_y_size > 1)
+                               && (final_y_size % local_y_size != 0))
                             local_y_size--;
-                        while ((local_z_size > 1) && (final_z_size%local_z_size != 0))
+                        while ((local_z_size > 1)
+                               && (final_z_size % local_z_size != 0))
                             local_z_size--;
-                        if ((previous_local_x_size == local_x_size) && (previous_local_y_size == local_y_size) && (previous_local_z_size == local_z_size))
+                        if ((previous_local_x_size == local_x_size)
+                            && (previous_local_y_size == local_y_size)
+                            && (previous_local_z_size == local_z_size))
                             continue;
 
-                        if (explicit_local == 0) {
+                        if (explicit_local == 0)
+                        {
                             local_x_size = 0;
                             local_y_size = 0;
                             local_z_size = 0;
                         }
 
-                        if (DEBUG) log_info("\t\tTesting local size %s.\n", print_dimensions(local_x_size, local_y_size, local_z_size, dimensions));
-
-                        if (explicit_local == 0) {
-                            log_info("\tTesting global %s local [NULL]...\n",
-                                     print_dimensions(final_x_size, final_y_size, final_z_size, dimensions));
-                        } else {
-                            log_info("\tTesting global %s local %s...\n",
-                                     print_dimensions(final_x_size, final_y_size, final_z_size, dimensions),
-                                     print_dimensions2(local_x_size, local_y_size, local_z_size, dimensions));
+                        if (DEBUG)
+                            log_info(
+                                "\t\tTesting local size %s.\n",
+                                print_dimensions(local_x_size, local_y_size,
+                                                 local_z_size, dimensions));
+
+                        if (explicit_local == 0)
+                        {
+                            log_info(
+                                "\tTesting global %s local [NULL]...\n",
+                                print_dimensions(final_x_size, final_y_size,
+                                                 final_z_size, dimensions));
+                        }
+                        else
+                        {
+                            log_info(
+                                "\tTesting global %s local %s...\n",
+                                print_dimensions(final_x_size, final_y_size,
+                                                 final_z_size, dimensions),
+                                print_dimensions2(local_x_size, local_y_size,
+                                                  local_z_size, dimensions));
                         }
 
-                        // Avoid running with very small local sizes on very large global sizes
-                        cl_uint total_local_size = local_x_size * local_y_size * local_z_size;
+                        // Avoid running with very small local sizes on very
+                        // large global sizes
+                        cl_uint total_local_size =
+                            local_x_size * local_y_size * local_z_size;
                         long total_global_size = final_x_size * final_y_size * final_z_size;
                         if (total_local_size < max_workgroup_size) {
-                            if (total_global_size > 16384*16384) {
-                                if (total_local_size < 64) {
-                                    log_info("Skipping test as local_size is small and it will take a long time.\n");
-                                    continue;
-                                }
+                            if (((total_global_size > 16384 * 16384)
+                                 && (total_local_size < 64))
+                                || ((total_global_size > 8192 * 8192)
+                                    && (total_local_size < 16)))
+                            {
+                                log_info("Skipping test as local_size is small "
+                                         "and it will take a long time.\n");
+                                continue;
                             }
                         }
 
-                        err = run_test(context, queue, kernel, array, memory_size, dimensions,
-                                       final_x_size, final_y_size, final_z_size,
-                                       local_x_size, local_y_size, local_z_size, explicit_local);
+                        err =
+                            run_test(context, queue, kernel, array, memory_size,
+                                     dimensions, final_x_size, final_y_size,
+                                     final_z_size, local_x_size, local_y_size,
+                                     local_z_size, explicit_local);
 
-                        // If we failed to execute, then return so we don't crash.
-                        if (err < 0) {
+                        // If we failed to execute, then return so we don't
+                        // crash.
+                        if (err < 0)
+                        {
                             clReleaseMemObject(array);
                             clReleaseKernel(kernel);
                             clReleaseProgram(program);
@@ -762,10 +1050,14 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
                         }
 
                         // Otherwise, if we had errors add them up.
-                        if (err) {
-                            log_error("Test global %s local %s failed.\n",
-                                      print_dimensions(final_x_size, final_y_size, final_z_size, dimensions),
-                                      print_dimensions2(local_x_size, local_y_size, local_z_size, dimensions));
+                        if (err)
+                        {
+                            log_error(
+                                "Test global %s local %s failed.\n",
+                                print_dimensions(final_x_size, final_y_size,
+                                                 final_z_size, dimensions),
+                                print_dimensions2(local_x_size, local_y_size,
+                                                  local_z_size, dimensions));
                             errors++;
                             clReleaseMemObject(array);
                             clReleaseKernel(kernel);
@@ -780,30 +1072,23 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
                         previous_local_z_size = local_z_size;
 
                         // Only test one config in quick mode.
-                        if (quick_test)
-                            break;
+                        if (quick_test) break;
                     } // local_test size
                 } // sub_test
                   // Increment the x_size
-                if (x_size == max_x_size)
-                    break;
+                if (x_size == max_x_size) break;
                 x_size *= size_increase_per_iteration;
-                if (x_size > max_x_size)
-                    x_size = max_x_size;
+                if (x_size > max_x_size) x_size = max_x_size;
             } // x_size
               // Increment the y_size
-            if (y_size == max_y_size)
-                break;
+            if (y_size == max_y_size) break;
             y_size *= size_increase_per_iteration;
-            if (y_size > max_y_size)
-                y_size = max_y_size;
+            if (y_size > max_y_size) y_size = max_y_size;
         } // y_size
           // Increment the z_size
-        if (z_size == max_z_size)
-            break;
+        if (z_size == max_z_size) break;
         z_size *= size_increase_per_iteration;
-        if (z_size > max_z_size)
-            z_size = max_z_size;
+        if (z_size > max_z_size) z_size = max_z_size;
     } // z_size
 
 
@@ -811,75 +1096,108 @@ test_thread_dimensions(cl_device_id device, cl_context context, cl_command_queue
     clReleaseMemObject(array);
     clReleaseKernel(kernel);
     clReleaseProgram(program);
-    if (errors)
-        log_error("%d total errors.\n", errors);
+    if (errors) log_error("%d total errors.\n", errors);
     return errors;
-
 }
 
 #define QUICK 1
 #define FULL 0
 
-int test_quick_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_quick_1d_explicit_local(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*512, QUICK, 4, 1);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 65536 * 512;
+    return test_thread_dimensions(deviceID, context, queue, 1, 1, max_dim,
+                                  QUICK, 4, 1);
 }
 
-int test_quick_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_quick_2d_explicit_local(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, QUICK, 16, 1);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 65536 / 4;
+    return test_thread_dimensions(deviceID, context, queue, 2, 1, max_dim,
+                                  QUICK, 16, 1);
 }
 
-int test_quick_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_quick_3d_explicit_local(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, QUICK, 32, 1);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 1024;
+    return test_thread_dimensions(deviceID, context, queue, 3, 1, max_dim,
+                                  QUICK, 32, 1);
 }
 
 
-int test_quick_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_quick_1d_implicit_local(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*256, QUICK, 4, 0);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 65536 * 256;
+    return test_thread_dimensions(deviceID, context, queue, 1, 1, max_dim,
+                                  QUICK, 4, 0);
 }
 
-int test_quick_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_quick_2d_implicit_local(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, QUICK, 16, 0);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 65536 / 4;
+    return test_thread_dimensions(deviceID, context, queue, 2, 1, max_dim,
+                                  QUICK, 16, 0);
 }
 
-int test_quick_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_quick_3d_implicit_local(cl_device_id deviceID, cl_context context,
+                                 cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, QUICK, 32, 0);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 1024;
+    return test_thread_dimensions(deviceID, context, queue, 3, 1, max_dim,
+                                  QUICK, 32, 0);
 }
 
 
-int test_full_1d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_full_1d_explicit_local(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*512, FULL, 4, 1);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 65536 * 512;
+    return test_thread_dimensions(deviceID, context, queue, 1, 1, max_dim,
+                                  FULL, 4, 1);
 }
 
-int test_full_2d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_full_2d_explicit_local(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, FULL, 16, 1);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 65536 / 4;
+    return test_thread_dimensions(deviceID, context, queue, 2, 1, max_dim,
+                                  FULL, 16, 1);
 }
 
-int test_full_3d_explicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_full_3d_explicit_local(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, FULL, 32, 1);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 1024;
+    return test_thread_dimensions(deviceID, context, queue, 3, 1, max_dim,
+                                  FULL, 32, 1);
 }
 
 
-int test_full_1d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_full_1d_implicit_local(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 1, 1, 65536*256, FULL, 4, 0);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 65536 * 256;
+    return test_thread_dimensions(deviceID, context, queue, 1, 1, max_dim,
+                                  FULL, 4, 0);
 }
 
-int test_full_2d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_full_2d_implicit_local(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 2, 1, 65536/4, FULL, 16, 0);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 65536 / 4;
+    return test_thread_dimensions(deviceID, context, queue, 2, 1, max_dim,
+                                  FULL, 16, 0);
 }
 
-int test_full_3d_implicit_local(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
+int test_full_3d_implicit_local(cl_device_id deviceID, cl_context context,
+                                cl_command_queue queue, int num_elements)
 {
-    return test_thread_dimensions(deviceID, context, queue, 3, 1, 1024, FULL, 32, 0);
+    const auto max_dim = maxThreadDimension ? maxThreadDimension : 1024;
+    return test_thread_dimensions(deviceID, context, queue, 3, 1, max_dim,
+                                  FULL, 32, 0);
 }
-
diff --git a/test_conformance/vulkan/main.cpp b/test_conformance/vulkan/main.cpp
index 5c699b6303..aec3f7c52a 100644
--- a/test_conformance/vulkan/main.cpp
+++ b/test_conformance/vulkan/main.cpp
@@ -369,7 +369,7 @@ int main(int argc, const char *argv[])
         log_info(" TEST SKIPPED\n");
         return CL_SUCCESS;
     }
-    init_cl_vk_ext(platform);
+    init_cl_vk_ext(platform, num_devices, devices);
 
     // Execute tests.
     // Note: don't use the entire harness, because we have a different way of
@@ -381,4 +381,4 @@ int main(int argc, const char *argv[])
     errNum = parseAndCallCommandLineTests(argCount, argList, devices[device_no],
                                           test_num, test_list, config);
     return errNum;
-}
+}
\ No newline at end of file
diff --git a/test_conformance/vulkan/test_vulkan_interop_buffer.cpp b/test_conformance/vulkan/test_vulkan_interop_buffer.cpp
index 196a8f33b0..2787c17189 100644
--- a/test_conformance/vulkan/test_vulkan_interop_buffer.cpp
+++ b/test_conformance/vulkan/test_vulkan_interop_buffer.cpp
@@ -89,10 +89,10 @@ int run_test_with_two_queue(
 {
     int err = CL_SUCCESS;
     size_t global_work_size[1];
-    uint8_t *error_2;
-    cl_mem error_1;
-    cl_kernel update_buffer_kernel;
-    cl_kernel kernel_cq;
+    uint8_t *error_2 = nullptr;
+    cl_mem error_1 = nullptr;
+    cl_kernel update_buffer_kernel = nullptr;
+    cl_kernel kernel_cq = nullptr;
     clExternalSemaphore *clVk2CLExternalSemaphore = NULL;
     clExternalSemaphore *clCl2VkExternalSemaphore = NULL;
     const char *program_source_const = kernel_text_numbuffer_2;
@@ -140,9 +140,9 @@ int run_test_with_two_queue(
     }
     else
     {
-        clVk2CLExternalSemaphore = new clExternalSemaphore(
+        clVk2CLExternalSemaphore = new clExternalImportableSemaphore(
             vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-        clCl2VkExternalSemaphore = new clExternalSemaphore(
+        clCl2VkExternalSemaphore = new clExternalExportableSemaphore(
             vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
     }
 
@@ -413,8 +413,8 @@ int run_test_with_one_queue(
 {
     log_info("RUNNING TEST WITH ONE QUEUE...... \n\n");
     size_t global_work_size[1];
-    uint8_t *error_2;
-    cl_mem error_1;
+    uint8_t *error_2 = nullptr;
+    cl_mem error_1 = nullptr;
     cl_kernel update_buffer_kernel;
     clExternalSemaphore *clVk2CLExternalSemaphore = NULL;
     clExternalSemaphore *clCl2VkExternalSemaphore = NULL;
@@ -453,9 +453,9 @@ int run_test_with_one_queue(
     }
     else
     {
-        clVk2CLExternalSemaphore = new clExternalSemaphore(
+        clVk2CLExternalSemaphore = new clExternalImportableSemaphore(
             vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-        clCl2VkExternalSemaphore = new clExternalSemaphore(
+        clCl2VkExternalSemaphore = new clExternalExportableSemaphore(
             vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
     }
 
@@ -699,8 +699,8 @@ int run_test_with_multi_import_same_ctx(
     VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType)
 {
     size_t global_work_size[1];
-    uint8_t *error_2;
-    cl_mem error_1;
+    uint8_t *error_2 = nullptr;
+    cl_mem error_1 = nullptr;
     int numImports = numBuffers;
     cl_kernel update_buffer_kernel;
     clExternalSemaphore *clVk2CLExternalSemaphore = NULL;
@@ -742,9 +742,9 @@ int run_test_with_multi_import_same_ctx(
     }
     else
     {
-        clVk2CLExternalSemaphore = new clExternalSemaphore(
+        clVk2CLExternalSemaphore = new clExternalImportableSemaphore(
             vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-        clCl2VkExternalSemaphore = new clExternalSemaphore(
+        clCl2VkExternalSemaphore = new clExternalExportableSemaphore(
             vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
     }
 
@@ -1025,9 +1025,9 @@ int run_test_with_multi_import_diff_ctx(
     VulkanExternalSemaphoreHandleType vkExternalSemaphoreHandleType)
 {
     size_t global_work_size[1];
-    uint8_t *error_3;
-    cl_mem error_1;
-    cl_mem error_2;
+    uint8_t *error_3 = nullptr;
+    cl_mem error_1 = nullptr;
+    cl_mem error_2 = nullptr;
     int numImports = numBuffers;
     cl_kernel update_buffer_kernel1[MAX_IMPORTS];
     cl_kernel update_buffer_kernel2[MAX_IMPORTS];
@@ -1071,17 +1071,17 @@ int run_test_with_multi_import_diff_ctx(
     }
     else
     {
-        clVk2CLExternalSemaphore = new clExternalSemaphore(
+        clVk2CLExternalSemaphore = new clExternalImportableSemaphore(
             vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-        clCl2VkExternalSemaphore = new clExternalSemaphore(
+        clCl2VkExternalSemaphore = new clExternalExportableSemaphore(
             vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
 
-        clVk2CLExternalSemaphore2 =
-            new clExternalSemaphore(vkVk2CLSemaphore, context2,
-                                    vkExternalSemaphoreHandleType, deviceId);
-        clCl2VkExternalSemaphore2 =
-            new clExternalSemaphore(vkCl2VkSemaphore, context2,
-                                    vkExternalSemaphoreHandleType, deviceId);
+        clVk2CLExternalSemaphore2 = new clExternalImportableSemaphore(
+            vkVk2CLSemaphore, context2, vkExternalSemaphoreHandleType,
+            deviceId);
+        clCl2VkExternalSemaphore2 = new clExternalExportableSemaphore(
+            vkCl2VkSemaphore, context2, vkExternalSemaphoreHandleType,
+            deviceId);
     }
 
     const uint32_t maxIter = innerIterations;
diff --git a/test_conformance/vulkan/test_vulkan_interop_image.cpp b/test_conformance/vulkan/test_vulkan_interop_image.cpp
index 872044df9d..7ca7b7f321 100644
--- a/test_conformance/vulkan/test_vulkan_interop_image.cpp
+++ b/test_conformance/vulkan/test_vulkan_interop_image.cpp
@@ -251,9 +251,9 @@ int run_test_with_two_queue(
     clExternalSemaphore *clVk2CLExternalSemaphore = NULL;
     clExternalSemaphore *clCl2VkExternalSemaphore = NULL;
 
-    clVk2CLExternalSemaphore = new clExternalSemaphore(
+    clVk2CLExternalSemaphore = new clExternalImportableSemaphore(
         vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore = new clExternalSemaphore(
+    clCl2VkExternalSemaphore = new clExternalExportableSemaphore(
         vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
 
     std::vector<VulkanDeviceMemory *> vkImage2DListDeviceMemory1;
@@ -816,9 +816,9 @@ int run_test_with_one_queue(
     clExternalSemaphore *clVk2CLExternalSemaphore = NULL;
     clExternalSemaphore *clCl2VkExternalSemaphore = NULL;
 
-    clVk2CLExternalSemaphore = new clExternalSemaphore(
+    clVk2CLExternalSemaphore = new clExternalImportableSemaphore(
         vkVk2CLSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
-    clCl2VkExternalSemaphore = new clExternalSemaphore(
+    clCl2VkExternalSemaphore = new clExternalExportableSemaphore(
         vkCl2VkSemaphore, context, vkExternalSemaphoreHandleType, deviceId);
 
     std::vector<VulkanDeviceMemory *> vkImage2DListDeviceMemory1;

From 4019a26a5b79ad9d18899c6cd06d5e323b51fdc3 Mon Sep 17 00:00:00 2001
From: Sreelakshmi Haridas Maruthur <sharidas@quicinc.com>
Date: Tue, 9 Apr 2024 09:58:33 -0600
Subject: [PATCH 6/7] conversions: Fix verification of half subnormal cases
 with FTZ (#1914)

---
 .../conversions/basic_test_conversions.cpp    | 18 +++-------
 .../conversions/conversions_data_info.h       | 36 +++++++++++++++++++
 2 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp
index b5f59deab8..155c272b81 100644
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -1005,14 +1005,6 @@ double SubtractTime(uint64_t endTime, uint64_t startTime)
 }
 #endif
 
-static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
-{
-    cl_uint i;
-    for (i = 0; i < count; ++i)
-        allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
-}
-
-
 void MapResultValuesComplete(const std::unique_ptr<CalcRefValsBase> &ptr);
 
 void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
@@ -1337,15 +1329,13 @@ cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
         // Decide if we allow a zero result in addition to the correctly rounded
         // one
         memset(a, 0, count);
-        if (gForceFTZ)
+        if (gForceFTZ && (inType == kfloat || outType == kfloat))
         {
-            if (inType == kfloat || outType == kfloat)
-                setAllowZ((uint8_t *)a, (uint32_t *)s, count);
+            info->set_allow_zero_array((uint8_t *)a, d, s, count);
         }
-        if (gForceHalfFTZ)
+        if (gForceHalfFTZ && (inType == khalf || outType == khalf))
         {
-            if (inType == khalf || outType == khalf)
-                setAllowZ((uint8_t *)a, (uint32_t *)s, count);
+            info->set_allow_zero_array((uint8_t *)a, d, s, count);
         }
     }
     else
diff --git a/test_conformance/conversions/conversions_data_info.h b/test_conformance/conversions/conversions_data_info.h
index bf887edecc..807d8ee9b5 100644
--- a/test_conformance/conversions/conversions_data_info.h
+++ b/test_conformance/conversions/conversions_data_info.h
@@ -30,6 +30,7 @@ extern roundingMode qcom_rm;
 
 #include <CL/cl_half.h>
 
+#include "harness/conversions.h"
 #include "harness/mt19937.h"
 #include "harness/rounding_mode.h"
 #include "harness/typeWrappers.h"
@@ -82,6 +83,7 @@ struct DataInitBase : public DataInitInfo
     virtual void conv_array(void *out, void *in, size_t n) {}
     virtual void conv_array_sat(void *out, void *in, size_t n) {}
     virtual void init(const cl_uint &, const cl_uint &) {}
+    virtual void set_allow_zero_array(uint8_t *allow, void *out, void *in, size_t n) {}
 };
 
 template <typename InType, typename OutType, bool InFP, bool OutFP>
@@ -99,6 +101,9 @@ struct DataInfoSpec : public DataInitBase
     void conv(OutType *out, InType *in);
     void conv_sat(OutType *out, InType *in);
 
+    // Decide if we allow a zero result in addition to the correctly rounded one
+    void set_allow_zero(uint8_t *allow, OutType *out, InType *in);
+
     // min/max ranges for output type of data
     std::pair<OutType, OutType> ranges;
 
@@ -130,6 +135,10 @@ struct DataInfoSpec : public DataInitBase
     }
 
     void init(const cl_uint &, const cl_uint &) override;
+    void set_allow_zero_array(uint8_t *allow, void *out, void *in, size_t n) override {
+        for (size_t i = 0; i < n; i++) // element-wise; out/in are viewed as OutType/InType arrays
+            set_allow_zero(&allow[i], &((OutType *)out)[i], &((InType *)in)[i]);
+    }
     InType clamp(const InType &);
     inline float fclamp(float lo, float v, float hi)
     {
@@ -717,6 +726,33 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv_sat(OutType *out,
     }
 }
 
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+void DataInfoSpec<InType, OutType, InFP, OutFP>::set_allow_zero(uint8_t *allow,
+                                                                OutType *out,
+                                                                InType *in)
+{
+  // from double: flag the element when the input value is subnormal
+  if (std::is_same<InType, cl_double>::value)
+    *allow |= IsDoubleSubnormal(*in);
+  // from float: flag the element when the input value is subnormal
+  if (std::is_same<InType, cl_float>::value)
+    *allow |= IsFloatSubnormal(*in);
+  // from half: flag the element when the input value is subnormal
+  if (is_in_half())
+    *allow |= IsHalfSubnormal(*in);
+
+  // Also flag the element when the converted result is subnormal
+  // to double
+  if (std::is_same<OutType, cl_double>::value)
+    *allow |= IsDoubleSubnormal(*out);
+  // to float
+  if (std::is_same<OutType, cl_float>::value)
+    *allow |= IsFloatSubnormal(*out);
+  // to half
+  if (is_out_half())
+    *allow |= IsHalfSubnormal(*out);
+}
+
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 void DataInfoSpec<InType, OutType, InFP, OutFP>::init(const cl_uint &job_id,
                                                       const cl_uint &thread_id)

From 11e39f533bf9e560f3a0751cf9c97629dca8b8cb Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh <ben.ashbaugh@intel.com>
Date: Tue, 9 Apr 2024 09:58:37 -0700
Subject: [PATCH 7/7] remove special-case range clamps for the fp16 divide test
 (#1902)

In the fp32 divide test, the range clamps are needed for the "fast
relaxed math" testing, but there is no "fast relaxed math" testing
for fp16.  This means that the range clamps are unnecessary and can
be removed.
---
 .../math_brute_force/binary_operator_half.cpp | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/test_conformance/math_brute_force/binary_operator_half.cpp b/test_conformance/math_brute_force/binary_operator_half.cpp
index 31e5f49f16..6386580798 100644
--- a/test_conformance/math_brute_force/binary_operator_half.cpp
+++ b/test_conformance/math_brute_force/binary_operator_half.cpp
@@ -143,8 +143,6 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
         if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
     }
 
-    bool divide = strcmp(name, "divide") == 0;
-
     // Init input array
     cl_half *p = (cl_half *)gIn + thread_id * buffer_elements;
     cl_half *p2 = (cl_half *)gIn2 + thread_id * buffer_elements;
@@ -171,15 +169,6 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
                 y++;
                 if (y >= specialValuesHalfCount) break;
             }
-
-            if (divide)
-            {
-                cl_half pj = p[idx] & 0x7fff;
-                cl_half p2j = p2[idx] & 0x7fff;
-                // Replace values outside [2^-7, 2^7] with QNaN
-                if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN
-                if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00;
-            }
         }
     }
 
@@ -188,15 +177,6 @@ cl_int TestHalf(cl_uint job_id, cl_uint thread_id, void *data)
     {
         p[idx] = (cl_half)genrand_int32(d);
         p2[idx] = (cl_half)genrand_int32(d);
-
-        if (divide)
-        {
-            cl_half pj = p[idx] & 0x7fff;
-            cl_half p2j = p2[idx] & 0x7fff;
-            // Replace values outside [2^-7, 2^7] with QNaN
-            if (pj < 0x2000 || pj > 0x5800) p[idx] = 0x7e00; // HALF_NAN
-            if (p2j < 0x2000 || p2j > 0x5800) p2[idx] = 0x7e00;
-        }
     }
     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
                                       buffer_size, p, 0, NULL, NULL)))