From 7b83c8f7ab60ec5999bac8059da367f079464dc2 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Mon, 8 May 2023 19:20:05 +0200
Subject: [PATCH 1/8] Modernization of conversions test, preparation to handle
 cl_khr_fp16 extension

---
 .../conversions/basic_test_conversions.cpp    | 3317 +++++++----------
 .../conversions/basic_test_conversions.h      |  382 +-
 .../conversions/conversions_data_info.h       |  812 ++++
 test_conformance/conversions/fplib.h          |    5 +
 .../conversions/test_conversions.cpp          | 1331 +------
 5 files changed, 2534 insertions(+), 3313 deletions(-)
 create mode 100644 test_conformance/conversions/conversions_data_info.h
diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp
index dfb32279a..a01f60015 100644
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -1,6 +1,6 @@
 //
 // Copyright (c) 2017 The Khronos Group Inc.
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -15,2243 +15,1514 @@
 //
 #include "harness/testHarness.h"
 #include "harness/compat.h"
+#include "harness/rounding_mode.h"
+#include "harness/ThreadPool.h"
+#include "harness/testHarness.h"
+#include "harness/kernelHelpers.h"
+#include "harness/mt19937.h"
+#include "harness/kernelHelpers.h"
 
-#include "basic_test_conversions.h"
-#include <limits.h>
-#include <string.h>
+#if defined(__APPLE__)
+#include <sys/sysctl.h>
+#include <mach/mach_time.h>
+#endif
 
-#include "harness/mt19937.h"
+#if defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/sysctl.h>
+#endif
+#if defined(__linux__)
+#include <sys/param.h>
+#include <libgen.h>
+#endif
 
-#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
-#include "fplib.h"
+#if defined(__MINGW32__)
+#include <sys/param.h>
 #endif
 
-#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
-/* Rounding modes and saturation for use with qcom 64 bit to float conversion library */
-    bool            qcom_sat;
-    roundingMode    qcom_rm;
+#include <sstream>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#if !defined(_WIN32)
+#include <libgen.h>
+#include <sys/mman.h>
 #endif
+#include <time.h>
+
+#include <algorithm>
 
-static inline cl_ulong random64( MTdata d );
+#include <vector>
+#include <type_traits>
 
-#if defined (_WIN32)
-    #include <mmintrin.h>
-    #include <emmintrin.h>
+#include "basic_test_conversions.h"
+
+#if (defined(_WIN32) && defined(_MSC_VER))
+// needed for _controlfp_s and rounding modes in RoundingMode
+#include "harness/testHarness.h"
+#endif
+
+#if defined(_WIN32)
+#include <mmintrin.h>
+#include <emmintrin.h>
 #else // !_WIN32
-#if defined (__SSE__ )
-    #include <xmmintrin.h>
+#if defined(__SSE__)
+#include <xmmintrin.h>
 #endif
-#if defined (__SSE2__ )
-    #include <emmintrin.h>
+#if defined(__SSE2__)
+#include <emmintrin.h>
 #endif
 #endif // _WIN32
 
-const char *gTypeNames[ kTypeCount ] = {
-                                            "uchar", "char",
-                                            "ushort", "short",
-                                            "uint",   "int",
-                                            "float", "double",
-                                            "ulong", "long"
-                                        };
-
-const char *gRoundingModeNames[ kRoundingModeCount ] = {
-                                                            "",
-                                                            "_rte",
-                                                            "_rtp",
-                                                            "_rtn",
-                                                            "_rtz"
-                                                        };
-
-const char *gSaturationNames[ 2 ] = { "", "_sat" };
-
-size_t gTypeSizes[ kTypeCount ] = {
-                                    sizeof( cl_uchar ), sizeof( cl_char ),
-                                    sizeof( cl_ushort ), sizeof( cl_short ),
-                                    sizeof( cl_uint ), sizeof( cl_int ),
-                                    sizeof( cl_float ), sizeof( cl_double ),
-                                    sizeof( cl_ulong ), sizeof( cl_long ),
-                                };
-
-long lrintf_clamped( float f );
-long lrintf_clamped( float f )
-{
-    static const float magic[2] = { MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23), - MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23) };
+cl_context gContext = NULL;
+cl_command_queue gQueue = NULL;
+int gStartTestNumber = -1;
+int gEndTestNumber = 0;
+#if defined(__APPLE__)
+int gTimeResults = 1;
+#else
+int gTimeResults = 0;
+#endif
+int gReportAverageTimes = 0;
+void *gIn = NULL;
+void *gRef = NULL;
+void *gAllowZ = NULL;
+void *gOut[kCallStyleCount] = { NULL };
+cl_mem gInBuffer;
+cl_mem gOutBuffers[kCallStyleCount];
+size_t gComputeDevices = 0;
+uint32_t gDeviceFrequency = 0;
+int gWimpyMode = 0;
+int gWimpyReductionFactor = 128;
+int gSkipTesting = 0;
+int gForceFTZ = 0;
+int gIsRTZ = 0;
+uint32_t gSimdSize = 1;
+int gHasDouble = 0;
+int gTestDouble = 1;
+const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
+int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
+int gMinVectorSize = 0;
+int gMaxVectorSize = sizeof(vectorSizes) / sizeof(vectorSizes[0]);
+MTdata gMTdata;
+const char **argList = NULL;
+int argCount = 0;
+
+
+double SubtractTime(uint64_t endTime, uint64_t startTime);
+
+
+// clang-format off
+// for readability's sake keep this section unformatted
+
+std::vector<unsigned int> DataInitInfo::specialValuesUInt = {
+      uint32_t(INT_MIN), uint32_t(INT_MIN + 1), uint32_t(INT_MIN + 2),
+      uint32_t(-(1 << 30) - 3), uint32_t(-(1 << 30) - 2), uint32_t(-(1 << 30) - 1), uint32_t(-(1 << 30)),
+      uint32_t(-(1 << 30) + 1), uint32_t(-(1 << 30) + 2), uint32_t(-(1 << 30) + 3),
+      uint32_t(-(1 << 24) - 3), uint32_t(-(1 << 24) - 2),uint32_t(-(1 << 24) - 1),
+      uint32_t(-(1 << 24)), uint32_t(-(1 << 24) + 1), uint32_t(-(1 << 24) + 2), uint32_t(-(1 << 24) + 3),
+      uint32_t(-(1 << 23) - 3), uint32_t(-(1 << 23) - 2),uint32_t(-(1 << 23) - 1),
+      uint32_t(-(1 << 23)), uint32_t(-(1 << 23) + 1), uint32_t(-(1 << 23) + 2), uint32_t(-(1 << 23) + 3),
+      uint32_t(-(1 << 22) - 3), uint32_t(-(1 << 22) - 2),uint32_t(-(1 << 22) - 1),
+      uint32_t(-(1 << 22)), uint32_t(-(1 << 22) + 1), uint32_t(-(1 << 22) + 2), uint32_t(-(1 << 22) + 3),
+      uint32_t(-(1 << 21) - 3), uint32_t(-(1 << 21) - 2),uint32_t(-(1 << 21) - 1),
+      uint32_t(-(1 << 21)), uint32_t(-(1 << 21) + 1), uint32_t(-(1 << 21) + 2), uint32_t(-(1 << 21) + 3),
+      uint32_t(-(1 << 16) - 3), uint32_t(-(1 << 16) - 2),uint32_t(-(1 << 16) - 1),
+      uint32_t(-(1 << 16)), uint32_t(-(1 << 16) + 1), uint32_t(-(1 << 16) + 2), uint32_t(-(1 << 16) + 3),
+      uint32_t(-(1 << 15) - 3), uint32_t(-(1 << 15) - 2),uint32_t(-(1 << 15) - 1),
+      uint32_t(-(1 << 15)), uint32_t(-(1 << 15) + 1), uint32_t(-(1 << 15) + 2), uint32_t(-(1 << 15) + 3),
+      uint32_t(-(1 << 8) - 3), uint32_t(-(1 << 8) - 2),uint32_t(-(1 << 8) - 1),
+      uint32_t(-(1 << 8)), uint32_t(-(1 << 8) + 1), uint32_t(-(1 << 8) + 2), uint32_t(-(1 << 8) + 3),
+      uint32_t(-(1 << 7) - 3), uint32_t(-(1 << 7) - 2),uint32_t(-(1 << 7) - 1),
+      uint32_t(-(1 << 7)), uint32_t(-(1 << 7) + 1), uint32_t(-(1 << 7) + 2), uint32_t(-(1 << 7) + 3),
+      uint32_t(-4), uint32_t(-3), uint32_t(-2), uint32_t(-1), 0, 1, 2, 3, 4,
+      (1 << 7) - 3,(1 << 7) - 2,(1 << 7) - 1, (1 << 7), (1 << 7) + 1, (1 << 7) + 2, (1 << 7) + 3,
+      (1 << 8) - 3,(1 << 8) - 2,(1 << 8) - 1, (1 << 8), (1 << 8) + 1, (1 << 8) + 2, (1 << 8) + 3,
+      (1 << 15) - 3,(1 << 15) - 2,(1 << 15) - 1, (1 << 15), (1 << 15) + 1, (1 << 15) + 2, (1 << 15) + 3,
+      (1 << 16) - 3,(1 << 16) - 2,(1 << 16) - 1, (1 << 16), (1 << 16) + 1, (1 << 16) + 2, (1 << 16) + 3,
+      (1 << 21) - 3,(1 << 21) - 2,(1 << 21) - 1, (1 << 21), (1 << 21) + 1, (1 << 21) + 2, (1 << 21) + 3,
+      (1 << 22) - 3,(1 << 22) - 2,(1 << 22) - 1, (1 << 22), (1 << 22) + 1, (1 << 22) + 2, (1 << 22) + 3,
+      (1 << 23) - 3,(1 << 23) - 2,(1 << 23) - 1, (1 << 23), (1 << 23) + 1, (1 << 23) + 2, (1 << 23) + 3,
+      (1 << 24) - 3,(1 << 24) - 2,(1 << 24) - 1, (1 << 24), (1 << 24) + 1, (1 << 24) + 2, (1 << 24) + 3,
+      (1 << 30) - 3,(1 << 30) - 2,(1 << 30) - 1, (1 << 30), (1 << 30) + 1, (1 << 30) + 2, (1 << 30) + 3,
+      INT_MAX - 3, INT_MAX - 2, INT_MAX - 1, INT_MAX, // 0x80000000, 0x80000001 0x80000002 already covered above
+      UINT_MAX - 3, UINT_MAX - 2, UINT_MAX - 1, UINT_MAX
+};
 
-    if( f >= -(float) LONG_MIN )
-        return LONG_MAX;
+std::vector<float> DataInitInfo::specialValuesFloat = {
+    -NAN, -INFINITY, -FLT_MAX,
+    MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
+    -1000.f, -100.f, -4.0f, -3.5f, -3.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f,
+    MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f,
+    MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24), MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25), MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f,
+    MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27), MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN,
+    MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
+    MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150), MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
+    MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
+    MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f, +NAN, +INFINITY, +FLT_MAX,
+    MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
+    MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
+    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
+    MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
+    +1000.f, +100.f, +4.0f, +3.5f, +3.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23), +2.0f,
+    MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
+    MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
+    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
+    MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
+    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
+    MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
+    MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150), MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
+    MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
+    MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
+};
 
-    if( f <= (float) LONG_MIN )
-        return LONG_MIN;
+// A table of double inputs that are more difficult to convert correctly
+std::vector<double> DataInitInfo::specialValuesDouble = {
+    -NAN, -INFINITY, -DBL_MAX,
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(-0x1.80000000000001p64, -0x180000000000001LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.8p64, -0x18LL, 60), MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp64, -0x17ffffffffffffLL, 12),
+    MAKE_HEX_DOUBLE(-0x1.80000000000001p63, -0x180000000000001LL, 7), MAKE_HEX_DOUBLE(-0x1.8p63, -0x18LL, 59),
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp63, -0x17ffffffffffffLL, 11), MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.80000000000001p32, -0x180000000000001LL, -24), MAKE_HEX_DOUBLE(-0x1.8p32, -0x18LL, 28),
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp32, -0x17ffffffffffffLL, -20), MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(-0x1.80000000000001p31, -0x180000000000001LL, -25), MAKE_HEX_DOUBLE(-0x1.8p31, -0x18LL, 27),
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp31, -0x17ffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
+    -1000., -100., -4.0, -3.5, -3.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0,
+    MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5,
+    MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52), MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53), MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25,
+    MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55), MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
+    -DBL_MIN,
+    MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
+    MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
+    -0.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
+    MAKE_HEX_DOUBLE(0x1.80000000000001p63, 0x180000000000001LL, 7), MAKE_HEX_DOUBLE(0x1.8p63, 0x18LL, 59),
+    MAKE_HEX_DOUBLE(0x1.7ffffffffffffp63, 0x17ffffffffffffLL, 11), MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
+    MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.80000000000001p32, +0x180000000000001LL, -24), MAKE_HEX_DOUBLE(+0x1.8p32, +0x18LL, 28),
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp32, +0x17ffffffffffffLL, -20), MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
+    MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
+    MAKE_HEX_DOUBLE(+0x1.80000000000001p31, +0x180000000000001LL, -25), MAKE_HEX_DOUBLE(+0x1.8p31, +0x18LL, 27),
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp31, +0x17ffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
+    MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
+    +1000., +100., +4.0, +3.5, +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5,
+    MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
+    +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52), MAKE_HEX_DOUBLE(+0x1.0000000000001p0, +0x10000000000001LL, -52),
+    +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53), MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
+    +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54), MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
+    +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55), MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
+    +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
+    MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0, MAKE_HEX_DOUBLE(-0x1.ffffffffffffep62, -0x1ffffffffffffeLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp62, -0x1ffffffffffffcLL, 10), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.ffffffffffffep62, +0x1ffffffffffffeLL, 10), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp62, +0x1ffffffffffffcLL, 10),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10), MAKE_HEX_DOUBLE(-0x1.ffffffffffffep51, -0x1ffffffffffffeLL, -1),
+    MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp51, -0x1ffffffffffffcLL, -1), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp51, -0x1fffffffffffffLL, -1),
+    MAKE_HEX_DOUBLE(+0x1.ffffffffffffep51, +0x1ffffffffffffeLL, -1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp51, +0x1ffffffffffffcLL, -1),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp51, +0x1fffffffffffffLL, -1), MAKE_HEX_DOUBLE(-0x1.ffffffffffffep52, -0x1ffffffffffffeLL, 0),
+    MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp52, -0x1ffffffffffffcLL, 0), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp52, -0x1fffffffffffffLL, 0),
+    MAKE_HEX_DOUBLE(+0x1.ffffffffffffep52, +0x1ffffffffffffeLL, 0), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp52, +0x1ffffffffffffcLL, 0),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp52, +0x1fffffffffffffLL, 0), MAKE_HEX_DOUBLE(-0x1.ffffffffffffep53, -0x1ffffffffffffeLL, 1),
+    MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp53, -0x1ffffffffffffcLL, 1), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp53, -0x1fffffffffffffLL, 1),
+    MAKE_HEX_DOUBLE(+0x1.ffffffffffffep53, +0x1ffffffffffffeLL, 1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp53, +0x1ffffffffffffcLL, 1),
+    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp53, +0x1fffffffffffffLL, 1), MAKE_HEX_DOUBLE(-0x1.0000000000002p52, -0x10000000000002LL, 0),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p52, -0x10000000000001LL, 0), MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52),
+    MAKE_HEX_DOUBLE(+0x1.0000000000002p52, +0x10000000000002LL, 0), MAKE_HEX_DOUBLE(+0x1.0000000000001p52, +0x10000000000001LL, 0),
+    MAKE_HEX_DOUBLE(+0x1.0p52, +0x1LL, 52), MAKE_HEX_DOUBLE(-0x1.0000000000002p53, -0x10000000000002LL, 1),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p53, -0x10000000000001LL, 1), MAKE_HEX_DOUBLE(-0x1.0p53, -0x1LL, 53),
+    MAKE_HEX_DOUBLE(+0x1.0000000000002p53, +0x10000000000002LL, 1), MAKE_HEX_DOUBLE(+0x1.0000000000001p53, +0x10000000000001LL, 1),
+    MAKE_HEX_DOUBLE(+0x1.0p53, +0x1LL, 53), MAKE_HEX_DOUBLE(-0x1.0000000000002p54, -0x10000000000002LL, 2),
+    MAKE_HEX_DOUBLE(-0x1.0000000000001p54, -0x10000000000001LL, 2), MAKE_HEX_DOUBLE(-0x1.0p54, -0x1LL, 54),
+    MAKE_HEX_DOUBLE(+0x1.0000000000002p54, +0x10000000000002LL, 2), MAKE_HEX_DOUBLE(+0x1.0000000000001p54, +0x10000000000001LL, 2),
+    MAKE_HEX_DOUBLE(+0x1.0p54, +0x1LL, 54), MAKE_HEX_DOUBLE(-0x1.fffffffefffffp62, -0x1fffffffefffffLL, 10),
+    MAKE_HEX_DOUBLE(-0x1.ffffffffp62, -0x1ffffffffLL, 30), MAKE_HEX_DOUBLE(-0x1.ffffffff00001p62, -0x1ffffffff00001LL, 10),
+    MAKE_HEX_DOUBLE(0x1.fffffffefffffp62, 0x1fffffffefffffLL, 10), MAKE_HEX_DOUBLE(0x1.ffffffffp62, 0x1ffffffffLL, 30),
+    MAKE_HEX_DOUBLE(0x1.ffffffff00001p62, 0x1ffffffff00001LL, 10),
+};
+// clang-format on
 
-    // Round fractional values to integer in round towards nearest mode
-    if( fabsf(f) < MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23 ) )
-    {
-        volatile float x = f;
-        float magicVal = magic[ f < 0 ];
-
-#if defined( __SSE__ ) || defined (_WIN32)
-        // Defeat x87 based arithmetic, which cant do FTZ, and will round this incorrectly
-        __m128 v = _mm_set_ss( x );
-        __m128 m = _mm_set_ss( magicVal );
-        v = _mm_add_ss( v, m );
-        v = _mm_sub_ss( v, m );
-        _mm_store_ss( (float*) &x, v );
+
+// Windows (since long double got deprecated) sets the x87 to 53-bit precision
+// (that's x87 default state).  This causes problems with the tests that
+// convert long and ulong to float and double or otherwise deal with values
+// that need more precision than 53-bit. So, set the x87 to 64-bit precision.
+static inline void Force64BitFPUPrecision(void)
+{
+#if __MINGW32__
+    // The usual method is to use _controlfp as follows:
+    //     #include <float.h>
+    //     _controlfp(_PC_64, _MCW_PC);
+    //
+    // _controlfp is available on MinGW32 but not on MinGW64. Instead of having
+    // divergent code just use inline assembly which works for both.
+    unsigned short int orig_cw = 0;
+    unsigned short int new_cw = 0;
+    __asm__ __volatile__("fstcw %0" : "=m"(orig_cw));
+    new_cw = orig_cw | 0x0300; // set precision to 64-bit
+    __asm__ __volatile__("fldcw  %0" ::"m"(new_cw));
 #else
-        x += magicVal;
-        x -= magicVal;
+    /* Implement for other platforms if needed */
 #endif
-        f = x;
+}
+
+
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+int CalcRefValsPat<InType, OutType, InFP, OutFP>::check_result(void *test,
+                                                               uint32_t count,
+                                                               int vectorSize)
+{
+    const cl_uchar *a = (const cl_uchar *)gAllowZ;
+    // Returns 0 if all elements match gRef, else first mismatching index + 1.
+    if (std::is_integral<OutType>::value)
+    { // integer outputs: exact match, or zero where a[i] is set
+        const OutType *t = (const OutType *)test;
+        const OutType *c = (const OutType *)gRef;
+        for (uint32_t i = 0; i < count; i++)
+            if (t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (OutType)0))
+            {
+                size_t s = sizeof(OutType) * 2;
+                std::stringstream sstr;
+                sstr << "\nError for vector size %d found at 0x%8.8x:  *0x%"
+                     << s << "." << s << "x vs 0x%" << s << "." << s << "x\n";
+                vlog(sstr.str().c_str(), vectorSize, i, c[i], t[i]);
+                return i + 1;
+            }
+    }
+    else if (std::is_same<OutType, cl_float>::value)
+    {
+        // compare raw bit patterns through an integer view of the floats
+        const cl_uint *t = (const cl_uint *)test;
+        const cl_uint *c = (const cl_uint *)gRef;
+
+        for (uint32_t i = 0; i < count; i++)
+            if (t[i] != c[i] &&
+                // Allow NaNs to differ bitwise (NaN: magnitude > 0x7f800000)
+                !((t[i] & 0x7fffffffU) > 0x7f800000U
+                  && (c[i] & 0x7fffffffU) > 0x7f800000U)
+                && !(a[i] != (cl_uchar)0 && t[i] == (c[i] & 0x80000000U)))
+            {
+                vlog(
+                    "\nError for vector size %d found at 0x%8.8x:  *%a vs %a\n",
+                    vectorSize, i, ((OutType *)gRef)[i], ((OutType *)test)[i]);
+                return i + 1;
+            }
+    }
+    else
+    {
+        const cl_ulong *t = (const cl_ulong *)test;
+        const cl_ulong *c = (const cl_ulong *)gRef;
+
+        for (uint32_t i = 0; i < count; i++)
+            if (t[i] != c[i] &&
+                // Allow NaNs to differ bitwise (NaN: magnitude > 0x7ff0...0)
+                !((t[i] & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL
+                  && (c[i] & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL)
+                && !(a[i] != (cl_uchar)0
+                     && t[i] == (c[i] & 0x8000000000000000ULL)))
+            {
+                vlog(
+                    "\nError for vector size %d found at 0x%8.8x:  *%a vs %a\n",
+                    vectorSize, i, ((OutType *)gRef)[i], ((OutType *)test)[i]);
+                return i + 1;
+            }
     }
 
-    return (long) f;
+    return 0;
 }
 
-long long llrintf_clamped( float f );
-long long llrintf_clamped( float f )
+
+cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
 {
-    static const float magic[2] = { MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23), - MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23) };
+    if (0 == (x & (x - 1))) return x;
 
-    if( f >= -(float) LLONG_MIN )
-        return LLONG_MAX;
+    while (x & (x - 1)) x &= x - 1;
 
-    if( f <= (float) LLONG_MIN )
-        return LLONG_MIN;
+    return x + x;
+}
+
+
+cl_int CustomConversionsTest::Run()
+{
+    int startMinVectorSize = gMinVectorSize;
+    Type inType, outType;
+    RoundingMode round;
+    SaturationMode sat;
 
-    // Round fractional values to integer in round towards nearest mode
-    if( fabsf(f) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1L, 23) )
+    for (int i = 0; i < argCount; i++)
     {
-        volatile float x = f;
-        float magicVal = magic[ f < 0 ];
-#if defined( __SSE__ ) || defined (_WIN32)
-        // Defeat x87 based arithmetic, which cant do FTZ, and will round this incorrectly
-        __m128 v = _mm_set_ss( x );
-        __m128 m = _mm_set_ss( magicVal );
-        v = _mm_add_ss( v, m );
-        v = _mm_sub_ss( v, m );
-        _mm_store_ss( (float*) &x, v );
-#else
-        x += magicVal;
-        x -= magicVal;
-#endif
-        f = x;
+        if (conv_test::GetTestCase(argList[i], &outType, &inType, &sat, &round))
+        {
+            vlog_error("\n\t\t**** ERROR:  Unable to parse function name "
+                       "%s.  Skipping....  *****\n\n",
+                       argList[i]);
+            continue;
+        }
+
+        // skip double if we don't have it
+        if (!gTestDouble && (inType == kdouble || outType == kdouble))
+        {
+            if (gHasDouble)
+            {
+                vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
+                           gTypeNames[outType], gSaturationNames[sat],
+                           gRoundingModeNames[round], gTypeNames[inType]);
+                vlog("\t\tcl_khr_fp64 enabled, but double testing turned "
+                     "off.\n");
+            }
+            continue;
+        }
+
+        // skip longs on embedded
+        if (!gHasLong
+            && (inType == klong || outType == klong || inType == kulong
+                || outType == kulong))
+        {
+            continue;
+        }
+
+        // Skip the implicit converts if the rounding mode is not default or
+        // test is saturated
+        if (0 == startMinVectorSize)
+        {
+            if (sat || round != kDefaultRoundingMode)
+                gMinVectorSize = 1;
+            else
+                gMinVectorSize = 0;
+        }
+
+        IterOverSelectedTypes iter(typeIterator, *this, inType, outType);
+
+        iter.Run();
+
+        if (gFailCount)
+        {
+            vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
+                       gTypeNames[outType], gSaturationNames[sat],
+                       gRoundingModeNames[round], gTypeNames[inType]);
+        }
     }
 
-    return (long long) f;
+    return gFailCount;
+}
+
+
+ConversionsTest::ConversionsTest(cl_device_id device, cl_context context,
+                                 cl_command_queue queue)
+    : context(context), device(device), queue(queue), num_elements(0),
+      typeIterator({ cl_uchar(0), cl_char(0), cl_ushort(0), cl_short(0),
+                     cl_uint(0), cl_int(0), cl_float(0), cl_double(0),
+                     cl_ulong(0), cl_long(0) })
+{}
+
+
+cl_int ConversionsTest::Run()
+{
+    IterOverTypes iter(typeIterator, *this);
+
+    iter.Run();
+
+    return gFailCount;
 }
 
-long lrint_clamped( double f );
-long lrint_clamped( double f )
+
+cl_int ConversionsTest::SetUp(int elements)
 {
-    static const double magic[2] = { MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52), MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52) };
+    num_elements = elements;
+    return CL_SUCCESS;
+}
 
-    if( sizeof( long ) > 4 )
+
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+void ConversionsTest::TestTypesConversion(const Type &inType,
+                                          const Type &outType, int &testNumber)
+{
+    SaturationMode sat;
+    RoundingMode round;
+    int error;
+    int startMinVectorSize = gMinVectorSize;
+
+    // skip longs on embedded
+    if (!gHasLong
+        && (inType == klong || outType == klong || inType == kulong
+            || outType == kulong))
     {
-        if( f >= -(double) LONG_MIN )
-            return LONG_MAX;
+        return;
     }
-    else
+
+    for (sat = (SaturationMode)0; sat < kSaturationModeCount;
+         sat = (SaturationMode)(sat + 1))
     {
-        if( f >= LONG_MAX )
-            return LONG_MAX;
-    }
+        // skip illegal saturated conversions to float type
+        if (kSaturated == sat && (outType == kfloat || outType == kdouble))
+        {
+            continue;
+        }
+
+        for (round = (RoundingMode)0; round < kRoundingModeCount;
+             round = (RoundingMode)(round + 1))
+        {
+            if (++testNumber < gStartTestNumber)
+            {
+                continue;
+            }
+            else
+            {
+                if (gEndTestNumber > 0 && testNumber >= gEndTestNumber) return;
+            }
 
-    if( f <= (double) LONG_MIN )
-        return LONG_MIN;
+            vlog("%d) Testing convert_%sn%s%s( %sn ):\n", testNumber,
+                 gTypeNames[outType], gSaturationNames[sat],
+                 gRoundingModeNames[round], gTypeNames[inType]);
 
-    // Round fractional values to integer in round towards nearest mode
-    if( fabs(f) < MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52) )
-    {
-        volatile double x = f;
-        double magicVal = magic[ f < 0 ];
-#if defined( __SSE2__ ) || defined (_MSC_VER)
-        // Defeat x87 based arithmetic, which cant do FTZ, and will round this incorrectly
-        __m128d v = _mm_set_sd( x );
-        __m128d m = _mm_set_sd( magicVal );
-        v = _mm_add_sd( v, m );
-        v = _mm_sub_sd( v, m );
-        _mm_store_sd( (double*) &x, v );
-#else
-        x += magicVal;
-        x -= magicVal;
-#endif
-        f = x;
-    }
+            // skip double if we don't have it
+            if (!gTestDouble && (inType == kdouble || outType == kdouble))
+            {
+                if (gHasDouble)
+                {
+                    vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
+                               "FAILED ** \n",
+                               testNumber, gTypeNames[outType],
+                               gSaturationNames[sat], gRoundingModeNames[round],
+                               gTypeNames[inType]);
+                    vlog("\t\tcl_khr_fp64 enabled, but double "
+                         "testing turned off.\n");
+                }
+                continue;
+            }
 
-    return (long) f;
+            // Skip the implicit converts if the rounding mode is
+            // not default or test is saturated
+            if (0 == startMinVectorSize)
+            {
+                if (sat || round != kDefaultRoundingMode)
+                    gMinVectorSize = 1;
+                else
+                    gMinVectorSize = 0;
+            }
+
+            if ((error = DoTest<InType, OutType, InFP, OutFP>(outType, inType,
+                                                              sat, round)))
+            {
+                vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
+                           "FAILED ** \n",
+                           testNumber, gTypeNames[outType],
+                           gSaturationNames[sat], gRoundingModeNames[round],
+                           gTypeNames[inType]);
+            }
+        }
+    }
 }
 
-long long llrint_clamped( double f );
-long long llrint_clamped( double f )
+
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
+                            RoundingMode round)
 {
-    static const double magic[2] = { MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52), MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52) };
+#ifdef __APPLE__
+    cl_ulong wall_start = mach_absolute_time();
+#endif
 
-    if( f >= -(double) LLONG_MIN )
-        return LLONG_MAX;
+#if 0
+    uint64_t lastCase = 1ULL << (8 * gTypeSizes[inType]);
+#else
+    cl_uint threads = GetThreadCount();
+    uint64_t lastCase = 1000000ULL;
+#endif
 
-    if( f <= (double) LLONG_MIN )
-        return LLONG_MIN;
+    DataInitInfo info = { 0, 0, outType, inType, sat, round, threads };
+    DataInfoSpec<InType, OutType, InFP, OutFP> init_info(info);
+    WriteInputBufferInfo writeInputBufferInfo;
+    int vectorSize;
+    int error = 0;
+    uint64_t i;
 
-    // Round fractional values to integer in round towards nearest mode
-    if( fabs(f) < MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52) )
+    gTestCount++;
+    size_t blockCount =
+        BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]);
+    size_t step = blockCount;
+
+    for (i = 0; i < threads; i++)
     {
-        volatile double x = f;
-        double magicVal = magic[ f < 0 ];
-#if defined( __SSE2__ ) || defined (_MSC_VER)
-        // Defeat x87 based arithmetic, which cant do FTZ, and will round this incorrectly
-        __m128d v = _mm_set_sd( x );
-        __m128d m = _mm_set_sd( magicVal );
-        v = _mm_add_sd( v, m );
-        v = _mm_sub_sd( v, m );
-        _mm_store_sd( (double*) &x, v );
-#else
-        x += magicVal;
-        x -= magicVal;
-#endif
-        f = x;
+        init_info.mdv.emplace_back(MTdataHolder(gRandomSeed));
     }
 
-    return (long long) f;
-}
+    writeInputBufferInfo.outType = outType;
+    writeInputBufferInfo.inType = inType;
 
+    writeInputBufferInfo.calcInfo.resize(gMaxVectorSize);
+    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+    {
+        writeInputBufferInfo.calcInfo[vectorSize].reset(
+            new CalcRefValsPat<InType, OutType, InFP, OutFP>());
+        writeInputBufferInfo.calcInfo[vectorSize]->program =
+            conv_test::MakeProgram(
+                outType, inType, sat, round, vectorSize,
+                &writeInputBufferInfo.calcInfo[vectorSize]->kernel);
+        if (NULL == writeInputBufferInfo.calcInfo[vectorSize]->program)
+        {
+            gFailCount++;
+            return -1;
+        }
+        if (NULL == writeInputBufferInfo.calcInfo[vectorSize]->kernel)
+        {
+            gFailCount++;
+            vlog_error("\t\tFAILED -- Failed to create kernel.\n");
+            return -2;
+        }
 
-/*
-    Names created as:
-
-    #include <stdio.h>
+        writeInputBufferInfo.calcInfo[vectorSize]->parent =
+            &writeInputBufferInfo;
+        writeInputBufferInfo.calcInfo[vectorSize]->vectorSize = vectorSize;
+        writeInputBufferInfo.calcInfo[vectorSize]->result = -1;
+    }
 
-    const char *names[] = { "uchar", "char", "ushort", "short", "uint", "int", "float", "double", "ulong", "long" };
+    if (gSkipTesting) return error;
 
-    int main( void )
+    // Patch up rounding mode if default is RTZ
+    // We leave the part above in default rounding mode so that the right kernel
+    // is compiled.
+    if (std::is_same<OutType, cl_float>::value)
     {
+        if (round == kDefaultRoundingMode && gIsRTZ)
+            init_info.round = round = kRoundTowardZero;
+    }
 
-        int i,j;
+#if 0
+    // Figure out how many elements are in a work block
+    // we handle 64-bit types a bit differently.
+    if (8 * gTypeSizes[inType] > 32) lastCase = 0x100000000ULL;
+#endif
 
-        for( i = 0; i < sizeof( names ) / sizeof( names[0] ); i++ )
-            for( j = 0; j < sizeof( names ) / sizeof( names[0] ); j++ )
-            {
-                if( j == i )
-                    continue;
+    if (!gWimpyMode && gIsEmbedded)
+        step = blockCount * EMBEDDED_REDUCTION_FACTOR;
 
-                vlog( "void %s2%s( void *, void *);\n", names[i], names[j] );
-            }
+    if (gWimpyMode) step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
+    vlog("Testing... ");
+    fflush(stdout);
+    for (i = 0; i < (uint64_t)lastCase; i += step)
+    {
 
+        if (0 == (i & ((lastCase >> 3) - 1)))
+        {
+            vlog(".");
+            fflush(stdout);
+        }
 
-        return 0;
-    }
-*/
-
-static float my_fabsf( float x );
-static double my_fabs( double x );
-
-
-
-static void uchar2char( void *, void *);
-static void uchar2ushort( void *, void *);
-static void uchar2short( void *, void *);
-static void uchar2uint( void *, void *);
-static void uchar2int( void *, void *);
-static void uchar2float( void *, void *);
-static void uchar2double( void *, void *);
-static void uchar2ulong( void *, void *);
-static void uchar2long( void *, void *);
-static void char2uchar( void *, void *);
-static void char2ushort( void *, void *);
-static void char2short( void *, void *);
-static void char2uint( void *, void *);
-static void char2int( void *, void *);
-static void char2float( void *, void *);
-static void char2double( void *, void *);
-static void char2ulong( void *, void *);
-static void char2long( void *, void *);
-static void ushort2uchar( void *, void *);
-static void ushort2char( void *, void *);
-static void ushort2short( void *, void *);
-static void ushort2uint( void *, void *);
-static void ushort2int( void *, void *);
-static void ushort2float( void *, void *);
-static void ushort2double( void *, void *);
-static void ushort2ulong( void *, void *);
-static void ushort2long( void *, void *);
-static void short2uchar( void *, void *);
-static void short2char( void *, void *);
-static void short2ushort( void *, void *);
-static void short2uint( void *, void *);
-static void short2int( void *, void *);
-static void short2float( void *, void *);
-static void short2double( void *, void *);
-static void short2ulong( void *, void *);
-static void short2long( void *, void *);
-static void uint2uchar( void *, void *);
-static void uint2char( void *, void *);
-static void uint2ushort( void *, void *);
-static void uint2short( void *, void *);
-static void uint2int( void *, void *);
-static void uint2float( void *, void *);
-static void uint2double( void *, void *);
-static void uint2ulong( void *, void *);
-static void uint2long( void *, void *);
-static void int2uchar( void *, void *);
-static void int2char( void *, void *);
-static void int2ushort( void *, void *);
-static void int2short( void *, void *);
-static void int2uint( void *, void *);
-static void int2float( void *, void *);
-static void int2double( void *, void *);
-static void int2ulong( void *, void *);
-static void int2long( void *, void *);
-static void float2uchar( void *, void *);
-static void float2char( void *, void *);
-static void float2ushort( void *, void *);
-static void float2short( void *, void *);
-static void float2uint( void *, void *);
-static void float2int( void *, void *);
-static void float2double( void *, void *);
-static void float2ulong( void *, void *);
-static void float2long( void *, void *);
-static void double2uchar( void *, void *);
-static void double2char( void *, void *);
-static void double2ushort( void *, void *);
-static void double2short( void *, void *);
-static void double2uint( void *, void *);
-static void double2int( void *, void *);
-static void double2float( void *, void *);
-static void double2ulong( void *, void *);
-static void double2long( void *, void *);
-static void ulong2uchar( void *, void *);
-static void ulong2char( void *, void *);
-static void ulong2ushort( void *, void *);
-static void ulong2short( void *, void *);
-static void ulong2uint( void *, void *);
-static void ulong2int( void *, void *);
-static void ulong2float( void *, void *);
-static void ulong2double( void *, void *);
-static void ulong2long( void *, void *);
-static void long2uchar( void *, void *);
-static void long2char( void *, void *);
-static void long2ushort( void *, void *);
-static void long2short( void *, void *);
-static void long2uint( void *, void *);
-static void long2int( void *, void *);
-static void long2float( void *, void *);
-static void long2double( void *, void *);
-static void long2ulong( void *, void *);
-
-/*
-    Conversion list created as
-
-    #include <stdio.h>
-
-    const char *names[] = { "uchar", "char", "ushort", "short", "uint", "int", "float", "double", "ulong", "long" };
-
-    int main( void )
-    {
+        cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
+        writeInputBufferInfo.count = count;
 
-        int i,j;
+        // Create a user event to represent the status of the reference value
+        // computation completion
+        writeInputBufferInfo.calcReferenceValues =
+            clCreateUserEvent(gContext, &error);
+        if (error || NULL == writeInputBufferInfo.calcReferenceValues)
+        {
+            vlog_error("ERROR: Unable to create user event. (%d)\n", error);
+            gFailCount++;
+            return error;
+        }
 
-        for( i = 0; i < sizeof( names ) / sizeof( names[0] ); i++ )
+        // retain for consumption by MapOutputBufferComplete
+        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
+             vectorSize++)
         {
-            vlog( "{ " );
-            for( j = 0; j < sizeof( names ) / sizeof( names[0] ); j++ )
+            if ((error =
+                     clRetainEvent(writeInputBufferInfo.calcReferenceValues)))
             {
-                if( j == i )
-                    vlog( "          NULL, " );
-                else
-                {
-                    char s[64];
-                    sprintf( s, "%s2%s,", names[j], names[i] );
-                    vlog( "%15s ", s );
-                }
+                vlog_error("ERROR: Unable to retain user event. (%d)\n", error);
+                gFailCount++;
+                return error;
             }
-            vlog( "},\n" );
         }
 
-        return 0;
-    }
-
- */
-/*
-Convert gConversions[kTypeCount][kTypeCount] = {
-{           NULL,     char2uchar,   ushort2uchar,    short2uchar,     uint2uchar,      int2uchar,    float2uchar,   double2uchar,    ulong2uchar,     long2uchar, },
-{     uchar2char,           NULL,    ushort2char,     short2char,      uint2char,       int2char,     float2char,    double2char,     ulong2char,      long2char, },
-{   uchar2ushort,    char2ushort,           NULL,   short2ushort,    uint2ushort,     int2ushort,   float2ushort,  double2ushort,   ulong2ushort,    long2ushort, },
-{    uchar2short,     char2short,   ushort2short,           NULL,     uint2short,      int2short,    float2short,   double2short,    ulong2short,     long2short, },
-{     uchar2uint,      char2uint,    ushort2uint,     short2uint,           NULL,       int2uint,     float2uint,    double2uint,     ulong2uint,      long2uint, },
-{      uchar2int,       char2int,     ushort2int,      short2int,       uint2int,           NULL,      float2int,     double2int,      ulong2int,       long2int, },
-{    uchar2float,     char2float,   ushort2float,    short2float,     uint2float,      int2float,           NULL,   double2float,    ulong2float,     long2float, },
-{   uchar2double,    char2double,  ushort2double,   short2double,    uint2double,     int2double,   float2double,           NULL,   ulong2double,    long2double, },
-{    uchar2ulong,     char2ulong,   ushort2ulong,    short2ulong,     uint2ulong,      int2ulong,    float2ulong,   double2ulong,           NULL,     long2ulong, },
-{     uchar2long,      char2long,    ushort2long,     short2long,      uint2long,       int2long,     float2long,    double2long,     ulong2long,           NULL, } };
-*/
-
-static void uchar2char_sat( void *, void *);
-static void uchar2ushort_sat( void *, void *);
-static void uchar2short_sat( void *, void *);
-static void uchar2uint_sat( void *, void *);
-static void uchar2int_sat( void *, void *);
-static void uchar2float_sat( void *, void *);
-static void uchar2double_sat( void *, void *);
-static void uchar2ulong_sat( void *, void *);
-static void uchar2long_sat( void *, void *);
-static void char2uchar_sat( void *, void *);
-static void char2ushort_sat( void *, void *);
-static void char2short_sat( void *, void *);
-static void char2uint_sat( void *, void *);
-static void char2int_sat( void *, void *);
-static void char2float_sat( void *, void *);
-static void char2double_sat( void *, void *);
-static void char2ulong_sat( void *, void *);
-static void char2long_sat( void *, void *);
-static void ushort2uchar_sat( void *, void *);
-static void ushort2char_sat( void *, void *);
-static void ushort2short_sat( void *, void *);
-static void ushort2uint_sat( void *, void *);
-static void ushort2int_sat( void *, void *);
-static void ushort2float_sat( void *, void *);
-static void ushort2double_sat( void *, void *);
-static void ushort2ulong_sat( void *, void *);
-static void ushort2long_sat( void *, void *);
-static void short2uchar_sat( void *, void *);
-static void short2char_sat( void *, void *);
-static void short2ushort_sat( void *, void *);
-static void short2uint_sat( void *, void *);
-static void short2int_sat( void *, void *);
-static void short2float_sat( void *, void *);
-static void short2double_sat( void *, void *);
-static void short2ulong_sat( void *, void *);
-static void short2long_sat( void *, void *);
-static void uint2uchar_sat( void *, void *);
-static void uint2char_sat( void *, void *);
-static void uint2ushort_sat( void *, void *);
-static void uint2short_sat( void *, void *);
-static void uint2int_sat( void *, void *);
-static void uint2float_sat( void *, void *);
-static void uint2double_sat( void *, void *);
-static void uint2ulong_sat( void *, void *);
-static void uint2long_sat( void *, void *);
-static void int2uchar_sat( void *, void *);
-static void int2char_sat( void *, void *);
-static void int2ushort_sat( void *, void *);
-static void int2short_sat( void *, void *);
-static void int2uint_sat( void *, void *);
-static void int2float_sat( void *, void *);
-static void int2double_sat( void *, void *);
-static void int2ulong_sat( void *, void *);
-static void int2long_sat( void *, void *);
-static void float2uchar_sat( void *, void *);
-static void float2char_sat( void *, void *);
-static void float2ushort_sat( void *, void *);
-static void float2short_sat( void *, void *);
-static void float2uint_sat( void *, void *);
-static void float2int_sat( void *, void *);
-static void float2double_sat( void *, void *);
-static void float2ulong_sat( void *, void *);
-static void float2long_sat( void *, void *);
-static void double2uchar_sat( void *, void *);
-static void double2char_sat( void *, void *);
-static void double2ushort_sat( void *, void *);
-static void double2short_sat( void *, void *);
-static void double2uint_sat( void *, void *);
-static void double2int_sat( void *, void *);
-static void double2float_sat( void *, void *);
-static void double2ulong_sat( void *, void *);
-static void double2long_sat( void *, void *);
-static void ulong2uchar_sat( void *, void *);
-static void ulong2char_sat( void *, void *);
-static void ulong2ushort_sat( void *, void *);
-static void ulong2short_sat( void *, void *);
-static void ulong2uint_sat( void *, void *);
-static void ulong2int_sat( void *, void *);
-static void ulong2float_sat( void *, void *);
-static void ulong2double_sat( void *, void *);
-static void ulong2long_sat( void *, void *);
-static void long2uchar_sat( void *, void *);
-static void long2char_sat( void *, void *);
-static void long2ushort_sat( void *, void *);
-static void long2short_sat( void *, void *);
-static void long2uint_sat( void *, void *);
-static void long2int_sat( void *, void *);
-static void long2float_sat( void *, void *);
-static void long2double_sat( void *, void *);
-static void long2ulong_sat( void *, void *);
-/*
-    #include <stdio.h>
-
-    const char *names[] = { "uchar", "char", "ushort", "short", "uint", "int", "float", "double", "ulong", "long" };
-
-    int main( void )
-    {
+        // Create a user event to represent when the callbacks are done
+        // verifying correctness
+        writeInputBufferInfo.doneBarrier = clCreateUserEvent(gContext, &error);
+        if (error || NULL == writeInputBufferInfo.doneBarrier)
+        {
+            vlog_error("ERROR: Unable to create user event for barrier. (%d)\n",
+                       error);
+            gFailCount++;
+            return error;
+        }
 
-        int i,j;
+        // retain for use by the callback that calls this
+        if ((error = clRetainEvent(writeInputBufferInfo.doneBarrier)))
+        {
+            vlog_error("ERROR: Unable to retain user event doneBarrier. (%d)\n",
+                       error);
+            gFailCount++;
+            return error;
+        }
 
-        for( i = 0; i < sizeof( names ) / sizeof( names[0] ); i++ )
+        //      Call this in a multithreaded manner
+        cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
+        init_info.start = i;
+        init_info.size = count / chunks;
+        if (init_info.size < 16384)
         {
-            vlog( "{ " );
-            for( j = 0; j < sizeof( names ) / sizeof( names[0] ); j++ )
+            chunks = RoundUpToNextPowerOfTwo(threads);
+            init_info.size = count / chunks;
+            if (init_info.size < 16384)
             {
-                if( j == i )
-                    vlog( "             NULL, " );
-                else
-                {
-                    char s[64];
-                    sprintf( s, "%s2%s_sat,", names[j], names[i] );
-                    vlog( "%18s ", s );
-                }
+                init_info.size = count;
+                chunks = 1;
             }
-            vlog( "},\n" );
         }
 
-        return 0;
-    }
+        ThreadPool_Do(conv_test::InitData, chunks, &init_info);
 
-Convert gSaturatedConversions[kTypeCount][kTypeCount] = {
-{              NULL,    char2uchar_sat,  ushort2uchar_sat,   short2uchar_sat,    uint2uchar_sat,     int2uchar_sat,   float2uchar_sat,  double2uchar_sat,   ulong2uchar_sat,    long2uchar_sat, },
-{    uchar2char_sat,              NULL,   ushort2char_sat,    short2char_sat,     uint2char_sat,      int2char_sat,    float2char_sat,   double2char_sat,    ulong2char_sat,     long2char_sat, },
-{  uchar2ushort_sat,   char2ushort_sat,              NULL,  short2ushort_sat,   uint2ushort_sat,    int2ushort_sat,  float2ushort_sat, double2ushort_sat,  ulong2ushort_sat,   long2ushort_sat, },
-{   uchar2short_sat,    char2short_sat,  ushort2short_sat,              NULL,    uint2short_sat,     int2short_sat,   float2short_sat,  double2short_sat,   ulong2short_sat,    long2short_sat, },
-{    uchar2uint_sat,     char2uint_sat,   ushort2uint_sat,    short2uint_sat,              NULL,      int2uint_sat,    float2uint_sat,   double2uint_sat,    ulong2uint_sat,     long2uint_sat, },
-{     uchar2int_sat,      char2int_sat,    ushort2int_sat,     short2int_sat,      uint2int_sat,              NULL,     float2int_sat,    double2int_sat,     ulong2int_sat,      long2int_sat, },
-{   uchar2float_sat,    char2float_sat,  ushort2float_sat,   short2float_sat,    uint2float_sat,     int2float_sat,              NULL,  double2float_sat,   ulong2float_sat,    long2float_sat, },
-{  uchar2double_sat,   char2double_sat, ushort2double_sat,  short2double_sat,   uint2double_sat,    int2double_sat,  float2double_sat,              NULL,  ulong2double_sat,   long2double_sat, },
-{   uchar2ulong_sat,    char2ulong_sat,  ushort2ulong_sat,   short2ulong_sat,    uint2ulong_sat,     int2ulong_sat,   float2ulong_sat,  double2ulong_sat,              NULL,    long2ulong_sat, },
-{    uchar2long_sat,     char2long_sat,   ushort2long_sat,    short2long_sat,     uint2long_sat,      int2long_sat,    float2long_sat,   double2long_sat,    ulong2long_sat,              NULL, }
-};
-*/
+        // Copy the results to the device
+        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
+                                          count * gTypeSizes[inType], gIn, 0,
+                                          NULL, NULL)))
+        {
+            vlog_error("ERROR: clEnqueueWriteBuffer failed. (%d)\n", error);
+            gFailCount++;
+            return error;
+        }
 
-/*
-    #include <stdio.h>
+        // Call completion callback for the write, which will enqueue the rest
+        // of the work.
+        conv_test::WriteInputBufferComplete((void *)&writeInputBufferInfo);
 
-    const char *names[] = { "uchar", "char", "ushort", "short", "uint", "int", "float", "double", "ulong", "long" };
-    const char *types[] = { "uchar", "char", "ushort", "short", "uint", "int", "float", "double", "ulong", "llong" };
+        // Make sure the work is actually running, so we don't deadlock
+        if ((error = clFlush(gQueue)))
+        {
+            vlog_error("clFlush failed with error %d\n", error);
+            gFailCount++;
+            return error;
+        }
 
-    int main( void )
-    {
+        ThreadPool_Do(conv_test::PrepareReference, chunks, &init_info);
 
-        int i,j;
+        // signal we are done calculating the reference results
+        if ((error = clSetUserEventStatus(
+                 writeInputBufferInfo.calcReferenceValues, CL_COMPLETE)))
+        {
+            vlog_error(
+                "Error:  Failed to set user event status to CL_COMPLETE:  %d\n",
+                error);
+            gFailCount++;
+            return error;
+        }
 
-        for( i = 0; i < sizeof( names ) / sizeof( names[0] ); i++ )
-            for( j = 0; j < sizeof( names ) / sizeof( names[0] ); j++ )
-            {
-                if( j == i )
-                    continue;
+        // Wait for the event callbacks to finish verifying correctness.
+        if ((error = clWaitForEvents(
+                 1, (cl_event *)&writeInputBufferInfo.doneBarrier)))
+        {
+            vlog_error("Error:  Failed to wait for barrier:  %d\n", error);
+            gFailCount++;
+            return error;
+        }
+
+        if ((error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues)))
+        {
+            vlog_error("Error:  Failed to release calcReferenceValues:  %d\n",
+                       error);
+            gFailCount++;
+            return error;
+        }
+
+        if ((error = clReleaseEvent(writeInputBufferInfo.doneBarrier)))
+        {
+            vlog_error("Error:  Failed to release done barrier:  %d\n", error);
+            gFailCount++;
+            return error;
+        }
 
-                switch( i )
+        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
+             vectorSize++)
+        {
+            if ((error = writeInputBufferInfo.calcInfo[vectorSize]->result))
+            {
+                switch (inType)
                 {
-                    case 6: //float
-                        if( j == 7 )
-                            vlog( "void %s2%s( void *out, void *in){ ((%s*) out)[0] = (%s) ((%s*) in)[0]; }\n", names[i], names[i], names[j], types[j], types[i] );
-                        else
-                            vlog( "void %s2%s( void *out, void *in){ ((%s*) out)[0] = (%s) my_rintf(((%s*) in)[0]); }\n", names[i], names[i], names[j], types[j], types[i] );
+                    case kuchar:
+                    case kchar:
+                        vlog("Input value: 0x%2.2x ",
+                             ((unsigned char *)gIn)[error - 1]);
+                        break;
+                    case kushort:
+                    case kshort:
+                        vlog("Input value: 0x%4.4x ",
+                             ((unsigned short *)gIn)[error - 1]);
                         break;
-                    case 7: //double
-                        if( j == 6 )
-                            vlog( "void %s2%s( void *out, void *in){ ((%s*) out)[0] = (%s) ((%s*) in)[0]; }\n", names[i], names[i], names[j], types[j], types[i] );
-                        else
-                            vlog( "void %s2%s( void *out, void *in){ ((%s*) out)[0] = (%s) rint(((%s*) in)[0]); }\n", names[i], names[i], names[j], types[j], types[i] );
+                    case kuint:
+                    case kint:
+                        vlog("Input value: 0x%8.8x ",
+                             ((unsigned int *)gIn)[error - 1]);
+                        break;
+                    case kfloat:
+                        vlog("Input value: %a ", ((float *)gIn)[error - 1]);
+                        break;
+                    case kulong:
+                    case klong:
+                        vlog("Input value: 0x%16.16llx ",
+                             ((unsigned long long *)gIn)[error - 1]);
+                        break;
+                    case kdouble:
+                        vlog("Input value: %a ", ((double *)gIn)[error - 1]);
                         break;
                     default:
-                        vlog( "void %s2%s( void *out, void *in){ ((%s*) out)[0] = (%s)
-                        ((%s*) in)[0]; }\n", names[i], names[i], names[j], types[j], types[i] );
+                        vlog_error("Internal error at %s: %d\n", __FILE__,
+                                   __LINE__);
+                        abort();
                         break;
                 }
-            }
 
+                // tell the user which conversion it was.
+                if (0 == vectorSize)
+                    vlog(" (implicit scalar conversion from %s to %s)\n",
+                         gTypeNames[inType], gTypeNames[outType]);
+                else
+                    vlog(" (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType],
+                         sizeNames[vectorSize], gSaturationNames[sat],
+                         gRoundingModeNames[round], gTypeNames[inType],
+                         sizeNames[vectorSize]);
 
-        return 0;
+                gFailCount++;
+                return error;
+            }
+        }
     }
-*/
 
-float my_fabsf( float x )
-{
-    union{ cl_uint u; float f; }u;
-    u.f = x;
-    u.u &= 0x7fffffff;
-    return u.f;
-}
+    log_info("done.\n");
 
-double my_fabs( double x )
-{
-    union{ cl_ulong u; double f; }u;
-    u.f = x;
-    u.u &= 0x7fffffffffffffffULL;
-    return u.f;
-}
+    if (gTimeResults)
+    {
+        // Kick off tests for the various vector lengths
+        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
+             vectorSize++)
+        {
+            size_t workItemCount = blockCount / vectorSizes[vectorSize];
+            if (vectorSizes[vectorSize] * gTypeSizes[outType] < 4)
+                workItemCount /=
+                    4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);
+
+            double sum = 0.0;
+            double bestTime = INFINITY;
+            cl_uint k;
+            for (k = 0; k < PERF_LOOP_COUNT; k++)
+            {
+                uint64_t startTime = conv_test::GetTime();
+                if ((error = conv_test::RunKernel(
+                         writeInputBufferInfo.calcInfo[vectorSize]->kernel,
+                         gInBuffer, gOutBuffers[vectorSize], workItemCount)))
+                {
+                    gFailCount++;
+                    return error;
+                }
 
-static float my_rintf( float f );
-static float my_rintf( float f )
-{
-    static const float magic[2] = { MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23), - MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23) };
+                // Make sure OpenCL is done
+                if ((error = clFinish(gQueue)))
+                {
+                    vlog_error("Error %d at clFinish\n", error);
+                    return error;
+                }
 
-    // Round fractional values to integer in round towards nearest mode
-    if( fabsf(f) < MAKE_HEX_FLOAT( 0x1.0p23f, 0x1, 23 ) )
-    {
-        volatile float x = f;
-        float magicVal = magic[ f < 0 ];
-
-#if defined( __SSE__ )
-        // Defeat x87 based arithmetic, which cant do FTZ, and will round this incorrectly
-        __m128 v = _mm_set_ss( x );
-        __m128 m = _mm_set_ss( magicVal );
-        v = _mm_add_ss( v, m );
-        v = _mm_sub_ss( v, m );
-        _mm_store_ss( (float*) &x, v );
-#else
-        x += magicVal;
-        x -= magicVal;
-#endif
-        f = x;
+                uint64_t endTime = conv_test::GetTime();
+                double time = SubtractTime(endTime, startTime);
+                sum += time;
+                if (time < bestTime) bestTime = time;
+            }
+
+            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
+            double clocksPerOp = bestTime * (double)gDeviceFrequency
+                * gComputeDevices * gSimdSize * 1e6
+                / (workItemCount * vectorSizes[vectorSize]);
+            if (0 == vectorSize)
+                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
+                          "implicit convert %s -> %s", gTypeNames[inType],
+                          gTypeNames[outType]);
+            else
+                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
+                          "convert_%s%s%s%s( %s%s )", gTypeNames[outType],
+                          sizeNames[vectorSize], gSaturationNames[sat],
+                          gRoundingModeNames[round], gTypeNames[inType],
+                          sizeNames[vectorSize]);
+        }
     }
 
-    return f;
-}
+    if (gWimpyMode)
+        vlog("\tWimp pass");
+    else
+        vlog("\tpassed");
 
-static void uchar2char( void *out, void *in){ ((char*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2short( void *out, void *in){ ((short*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2int( void *out, void *in){ ((int*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2float( void *out, void *in)
-{
-    cl_uchar l = ((cl_uchar*) in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void uchar2double( void *out, void *in)
-{
-    cl_uchar l = ((cl_uchar*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void uchar2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_uchar*) in)[0]; }
-static void char2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = ((cl_char*) in)[0]; }
-static void char2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_char*) in)[0]; }
-static void char2short( void *out, void *in){ ((short*) out)[0] = ((cl_char*) in)[0]; }
-static void char2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_char*) in)[0]; }
-static void char2int( void *out, void *in){ ((int*) out)[0] = ((cl_char*) in)[0]; }
-static void char2float( void *out, void *in)
-{
-    cl_char l = ((cl_char*) in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void char2double( void *out, void *in)
-{
-    cl_char l = ((cl_char*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void char2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_char*) in)[0]; }
-static void char2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_char*) in)[0]; }
-static void ushort2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2char( void *out, void *in){ ((char*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2short( void *out, void *in){ ((short*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2int( void *out, void *in){ ((int*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2float( void *out, void *in)
-{
-    cl_ushort l = ((cl_ushort*) in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void ushort2double( void *out, void *in)
-{
-    cl_ushort l = ((cl_ushort*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void ushort2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_ushort*) in)[0]; }
-static void short2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = ((cl_short*) in)[0]; }
-static void short2char( void *out, void *in){ ((cl_char*) out)[0] = ((cl_short*) in)[0]; }
-static void short2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_short*) in)[0]; }
-static void short2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_short*) in)[0]; }
-static void short2int( void *out, void *in){ ((cl_int*) out)[0] = ((cl_short*) in)[0]; }
-static void short2float( void *out, void *in)
-{
-    cl_short l = ((cl_short*) in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void short2double( void *out, void *in)
-{
-    cl_short l = ((cl_short*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void short2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_short*) in)[0]; }
-static void short2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_short*) in)[0]; }
-static void uint2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2char( void *out, void *in){ ((cl_char*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2short( void *out, void *in){ ((short*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2int( void *out, void *in){ ((cl_int*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2float( void *out, void *in)
-{
-    // Use volatile to prevent optimization by Clang compiler
-    volatile cl_uint l = ((cl_uint *)in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void uint2double( void *out, void *in)
-{
-    cl_uint l = ((cl_uint*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void uint2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_uint*) in)[0]; }
-static void int2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = ((cl_int*) in)[0]; }
-static void int2char( void *out, void *in){ ((cl_char*) out)[0] = ((cl_int*) in)[0]; }
-static void int2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_int*) in)[0]; }
-static void int2short( void *out, void *in){ ((cl_short*) out)[0] = ((cl_int*) in)[0]; }
-static void int2uint( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_int*) in)[0]; }
-static void int2float( void *out, void *in)
-{
-    // Use volatile to prevent optimization by Clang compiler
-    volatile cl_int l = ((cl_int *)in)[0];
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void int2double( void *out, void *in)
-{
-    cl_int l = ((cl_int*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-}
-static void int2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_int*) in)[0]; }
-static void int2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_int*) in)[0]; }
-static void float2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2char( void *out, void *in){ ((cl_char*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2short( void *out, void *in){ ((cl_short*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2uint( void *out, void *in){ ((cl_uint*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2int( void *out, void *in){ ((cl_int*) out)[0] = my_rintf(((cl_float*) in)[0]); }
-static void float2double( void *out, void *in){ ((cl_double*) out)[0] = ((cl_float*) in)[0]; }
-static void float2ulong( void *out, void *in)
-{
-#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
-    // VS2005 (at least) on x86 uses fistp to store the float as a 64-bit int.
-    // However, fistp stores it as a signed int, and some of the test values won't
-    // fit into a signed int. (These test values are >= 2^63.) The result on VS2005
-    // is that these end up silently (at least by default settings) clamped to
-    // the max lowest ulong.
-    cl_float x = my_rintf(((cl_float *)in)[0]);
-    if (x >= 9223372036854775808.0f) {
-        x -= 9223372036854775808.0f;
-        ((cl_ulong*) out)[0] = x;
-        ((cl_ulong*) out)[0] += 9223372036854775808ULL;
-    } else {
-        ((cl_ulong*) out)[0] = x;
-    }
-#else
-    ((cl_ulong*) out)[0] = my_rintf(((cl_float*) in)[0]);
+#ifdef __APPLE__
+    // record the run time
+    vlog("\t(%f s)", 1e-9 * (mach_absolute_time() - wall_start));
 #endif
-}
+    vlog("\n\n");
+    fflush(stdout);
 
-static void float2long( void *out, void *in){ ((cl_long*) out)[0] =  llrint_clamped( ((cl_float*) in)[0] ); }
-static void double2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = rint(((cl_double*) in)[0]); }
-static void double2char( void *out, void *in){ ((cl_char*) out)[0] = rint(((cl_double*) in)[0]); }
-static void double2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = rint(((cl_double*) in)[0]); }
-static void double2short( void *out, void *in){ ((cl_short*) out)[0] = rint(((cl_double*) in)[0]); }
-static void double2uint( void *out, void *in){ ((cl_uint*) out)[0] = (cl_uint) rint(((cl_double*) in)[0]); }
-static void double2int( void *out, void *in){ ((cl_int*) out)[0] = (int) rint(((cl_double*) in)[0]); }
-static void double2float( void *out, void *in){ ((cl_float*) out)[0] = (float) ((cl_double*) in)[0]; }
-static void double2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = (cl_ulong) rint(((cl_double*) in)[0]); }
-static void double2long( void *out, void *in){ ((cl_long*) out)[0] = (cl_long) rint(((cl_double*) in)[0]); }
-static void ulong2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = (cl_uchar) ((cl_ulong*) in)[0]; }
-static void ulong2char( void *out, void *in){ ((cl_char*) out)[0] = (cl_char) ((cl_ulong*) in)[0]; }
-static void ulong2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = (cl_ushort) ((cl_ulong*) in)[0]; }
-static void ulong2short( void *out, void *in){ ((cl_short*) out)[0] = (cl_short)((cl_ulong*) in)[0]; }
-static void ulong2uint( void *out, void *in){ ((cl_uint*) out)[0] = (cl_uint) ((cl_ulong*) in)[0]; }
-static void ulong2int( void *out, void *in){ ((cl_int*) out)[0] = (cl_int) ((cl_ulong*) in)[0]; }
-static void ulong2float( void *out, void *in)
-{
-#if defined(_MSC_VER) && defined(_M_X64)
-    cl_ulong l = ((cl_ulong*) in)[0];
-    float result;
-    cl_long sl = ((cl_long)l < 0) ? (cl_long)((l >> 1) | (l & 1)) : (cl_long)l;
-    _mm_store_ss(&result, _mm_cvtsi64_ss(_mm_setzero_ps(), sl));
-    ((float*) out)[0] = (l == 0 ? 0.0f : (((cl_long)l < 0) ? result * 2.0f : result));
-#else
-    cl_ulong l = ((cl_ulong*) in)[0];
-#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
-    /* ARM VFP doesn't have hardware instruction for converting from 64-bit
-     * integer to float types, hence GCC ARM uses the floating-point emulation
-     * code despite which -mfloat-abi setting it is. But the emulation code in
-     * libgcc.a has only one rounding mode (round to nearest even in this case)
-     * and ignores the user rounding mode setting in hardware.
-     * As a result setting rounding modes in hardware won't give correct
-     * rounding results for type covert from 64-bit integer to float using GCC
-     * for ARM compiler so for testing different rounding modes, we need to use
-     * alternative reference function. ARM64 does have an instruction, however
-     * we cannot guarantee the compiler will use it.  On all ARM architechures
-     * use emulation to calculate reference.*/
-    ((float*) out)[0] = qcom_u64_2_f32(l, qcom_sat, qcom_rm);
-#else
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-#endif
-#endif
+    return error;
 }
-static void ulong2double( void *out, void *in)
-{
-#if defined(_MSC_VER)
-    cl_ulong l = ((cl_ulong*) in)[0];
-    double result;
 
-    cl_long sl = ((cl_long)l < 0) ? (cl_long)((l >> 1) | (l & 1)) : (cl_long)l;
-#if defined(_M_X64)
-    _mm_store_sd(&result, _mm_cvtsi64_sd(_mm_setzero_pd(), sl));
-#else
-    result = sl;
-#endif
-    ((double*) out)[0] = (l == 0 ? 0.0 : (((cl_long)l < 0) ? result * 2.0 : result));
-#else
-    // Use volatile to prevent optimization by Clang compiler
-    volatile cl_ulong l = ((cl_ulong *)in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
+#if !defined(__APPLE__)
+void memset_pattern4(void *dest, const void *src_pattern, size_t bytes);
 #endif
-}
-static void ulong2long( void *out, void *in){ ((cl_long*) out)[0] = ((cl_ulong*) in)[0]; }
-static void long2uchar( void *out, void *in){ ((cl_uchar*) out)[0] = (cl_uchar) ((cl_long*) in)[0]; }
-static void long2char( void *out, void *in){ ((cl_char*) out)[0] = (cl_char) ((cl_long*) in)[0]; }
-static void long2ushort( void *out, void *in){ ((cl_ushort*) out)[0] = (cl_ushort) ((cl_long*) in)[0]; }
-static void long2short( void *out, void *in){ ((cl_short*) out)[0] = (cl_short) ((cl_long*) in)[0]; }
-static void long2uint( void *out, void *in){ ((cl_uint*) out)[0] = (cl_uint) ((cl_long*) in)[0]; }
-static void long2int( void *out, void *in){ ((cl_int*) out)[0] = (cl_int) ((cl_long*) in)[0]; }
-static void long2float( void *out, void *in)
-{
-#if defined(_MSC_VER) && defined(_M_X64)
-    cl_long l = ((cl_long*) in)[0];
-    float result;
 
-    _mm_store_ss(&result, _mm_cvtsi64_ss(_mm_setzero_ps(), l));
-    ((float*) out)[0] = (l == 0 ? 0.0f : result);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-#else
-    cl_long l = ((cl_long*) in)[0];
-#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
-    /* ARM VFP doesn't have hardware instruction for converting from 64-bit
-     * integer to float types, hence GCC ARM uses the floating-point emulation
-     * code despite which -mfloat-abi setting it is. But the emulation code in
-     * libgcc.a has only one rounding mode (round to nearest even in this case)
-     * and ignores the user rounding mode setting in hardware.
-     * As a result setting rounding modes in hardware won't give correct
-     * rounding results for type covert from 64-bit integer to float using GCC
-     * for ARM compiler so for testing different rounding modes, we need to use
-     * alternative reference function. ARM64 does have an instruction, however
-     * we cannot guarantee the compiler will use it.  On all ARM architechures
-     * use emulation to calculate reference.*/
-    ((float*) out)[0] = (l == 0 ? 0.0f : qcom_s64_2_f32(l, qcom_sat, qcom_rm));
+#if defined(_MSC_VER)
+/* function is defined in "compat.h" */
 #else
-    ((float*) out)[0] = (l == 0 ? 0.0f : (float) l);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
-#endif
-#endif
-}
-static void long2double( void *out, void *in)
+double SubtractTime(uint64_t endTime, uint64_t startTime)
 {
-#if defined(_MSC_VER) && defined(_M_X64)
-    cl_long l = ((cl_long*) in)[0];
-    double result;
+    uint64_t diff = endTime - startTime;
+    static double conversion = 0.0;
 
-    _mm_store_sd(&result, _mm_cvtsi64_sd(_mm_setzero_pd(), l));
-    ((double*) out)[0] = (l == 0 ? 0.0 : result);        // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
+    if (0.0 == conversion)
+    {
+#if defined(__APPLE__)
+        mach_timebase_info_data_t info = { 0, 0 };
+        kern_return_t err = mach_timebase_info(&info);
+        if (0 == err)
+            conversion = 1e-9 * (double)info.numer / (double)info.denom;
 #else
-    cl_long l = ((cl_long*) in)[0];
-    ((double*) out)[0] = (l == 0 ? 0.0 : (double) l);      // Per IEEE-754-2008 5.4.1, 0's always convert to +0.0
+        // This function consumes output from GetTime() above, and converts the
+        // time to seconds.
+#warning need accurate ticks to seconds conversion factor here. Times are invalid.
 #endif
-}
-static void long2ulong( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_long*) in)[0]; }
-
-#define CLAMP( _lo, _x, _hi )   ( (_x) < (_lo) ? (_lo) : ((_x) > (_hi) ? (_hi) : (_x)))
-
-// Done by hand
-static void uchar2char_sat( void *out, void *in){ cl_uchar c = ((cl_uchar*) in)[0]; ((cl_char*) out)[0] = c > 0x7f ? 0x7f : c; }
-static void uchar2ushort_sat( void *out, void *in){ ((cl_ushort*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2short_sat( void *out, void *in){ ((cl_short*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2uint_sat( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2int_sat( void *out, void *in){ ((cl_int*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2float_sat( void *out, void *in){ ((cl_float*) out)[0] = my_fabsf( (cl_float) ((cl_uchar*) in)[0]); } // my_fabs workaround for <rdar://problem/5965527>
-static void uchar2double_sat( void *out, void *in){ ((cl_double*) out)[0] = my_fabs( (cl_double) ((cl_uchar*) in)[0]); } // my_fabs workaround for <rdar://problem/5965527>
-static void uchar2ulong_sat( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_uchar*) in)[0]; }
-static void uchar2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_uchar*) in)[0]; }
-static void char2uchar_sat( void *out, void *in){ cl_char c = ((cl_char*) in)[0]; ((cl_uchar*) out)[0] = c < 0 ? 0 : c; }
-static void char2ushort_sat( void *out, void *in){ cl_char c = ((cl_char*) in)[0]; ((cl_ushort*) out)[0] = c < 0 ? 0 : c; }
-static void char2short_sat( void *out, void *in){ ((cl_short*) out)[0] = ((cl_char*) in)[0]; }
-static void char2uint_sat( void *out, void *in){ cl_char c = ((cl_char*) in)[0]; ((cl_uint*) out)[0] = c < 0 ? 0 : c; }
-static void char2int_sat( void *out, void *in){ ((cl_int*) out)[0] = ((cl_char*) in)[0]; }
-static void char2float_sat( void *out, void *in){ ((cl_float*) out)[0] = ((cl_char*) in)[0]; }
-static void char2double_sat( void *out, void *in){ ((cl_double*) out)[0] = ((cl_char*) in)[0]; }
-static void char2ulong_sat( void *out, void *in){ cl_char c = ((cl_char*) in)[0]; ((cl_ulong*) out)[0] = c < 0 ? 0 : c; }
-static void char2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_char*) in)[0]; }
-static void ushort2uchar_sat( void *out, void *in){ cl_ushort u = ((cl_ushort*) in)[0]; ((cl_uchar*) out)[0] = u > 0xff ? 0xFF : u; }
-static void ushort2char_sat( void *out, void *in){ cl_ushort u = ((cl_ushort*) in)[0]; ((cl_char*) out)[0] = u > 0x7f ? 0x7F : u; }
-static void ushort2short_sat( void *out, void *in){ cl_ushort u = ((cl_ushort*) in)[0]; ((cl_short*) out)[0] = u > 0x7fff ? 0x7fFF : u; }
-static void ushort2uint_sat( void *out, void *in){ ((cl_uint*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2int_sat( void *out, void *in){ ((cl_int*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2float_sat( void *out, void *in){ ((cl_float*) out)[0] = my_fabsf((cl_float)((cl_ushort*) in)[0]); }     // my_fabs workaround for <rdar://problem/5965527>
-static void ushort2double_sat( void *out, void *in){ ((cl_double*) out)[0] = my_fabs( (cl_double) ((cl_ushort*) in)[0]); } // my_fabs workaround for <rdar://problem/5965527>
-static void ushort2ulong_sat( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_ushort*) in)[0]; }
-static void ushort2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_ushort*) in)[0]; }
-static void short2uchar_sat( void *out, void *in){ cl_short s = ((cl_short*) in)[0]; ((cl_uchar*) out)[0] = CLAMP( 0, s, CL_UCHAR_MAX ); }
-static void short2char_sat( void *out, void *in){ cl_short s = ((cl_short*) in)[0]; ((cl_char*) out)[0] = CLAMP( CL_CHAR_MIN, s, CL_CHAR_MAX ); }
-static void short2ushort_sat( void *out, void *in){ cl_short s = ((cl_short*) in)[0]; ((cl_ushort*) out)[0] = s < 0 ? 0 : s; }
-static void short2uint_sat( void *out, void *in){ cl_short s = ((cl_short*) in)[0]; ((cl_uint*) out)[0] = s < 0 ? 0 : s; }
-static void short2int_sat( void *out, void *in){ ((cl_int*) out)[0] = ((cl_short*) in)[0]; }
-static void short2float_sat( void *out, void *in){ ((cl_float*) out)[0] = ((cl_short*) in)[0]; }
-static void short2double_sat( void *out, void *in){ ((cl_double*) out)[0] = ((cl_short*) in)[0]; }
-static void short2ulong_sat( void *out, void *in){ cl_short s = ((cl_short*) in)[0]; ((cl_ulong*) out)[0] = s < 0 ? 0 : s; }
-static void short2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_short*) in)[0]; }
-static void uint2uchar_sat( void *out, void *in){ cl_uint u = ((cl_uint*) in)[0]; ((cl_uchar*) out)[0] = CLAMP( 0, u, CL_UCHAR_MAX); }
-static void uint2char_sat( void *out, void *in){  cl_uint u = ((cl_uint*) in)[0]; ((cl_char*) out)[0] = CLAMP( 0, u, CL_CHAR_MAX ); }
-static void uint2ushort_sat( void *out, void *in){  cl_uint u = ((cl_uint*) in)[0]; ((cl_ushort*) out)[0] = CLAMP( 0, u, CL_USHRT_MAX); }
-static void uint2short_sat( void *out, void *in){  cl_uint u = ((cl_uint*) in)[0]; ((cl_short*) out)[0] = CLAMP( 0, u, CL_SHRT_MAX); }
-static void uint2int_sat( void *out, void *in){  cl_uint u = ((cl_uint*) in)[0]; ((cl_int*) out)[0] = CLAMP( 0, u, CL_INT_MAX); }
-static void uint2float_sat( void *out, void *in){ ((cl_float*) out)[0] = my_fabsf( (cl_float) ((cl_uint*) in)[0] ); }  // my_fabs workaround for <rdar://problem/5965527>
-static void uint2double_sat( void *out, void *in){ ((cl_double*) out)[0] = my_fabs( (cl_double) ((cl_uint*) in)[0]); } // my_fabs workaround for <rdar://problem/5965527>
-static void uint2ulong_sat( void *out, void *in){ ((cl_ulong*) out)[0] = ((cl_uint*) in)[0]; }
-static void uint2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_uint*) in)[0]; }
-static void int2uchar_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_uchar*) out)[0] = CLAMP( 0, i, CL_UCHAR_MAX); }
-static void int2char_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_char*) out)[0] = CLAMP( CL_CHAR_MIN, i, CL_CHAR_MAX); }
-static void int2ushort_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_ushort*) out)[0] = CLAMP( 0, i, CL_USHRT_MAX); }
-static void int2short_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_short*) out)[0] = CLAMP( CL_SHRT_MIN, i, CL_SHRT_MAX); }
-static void int2uint_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_uint*) out)[0] = CLAMP( 0, i, CL_INT_MAX); }
-static void int2float_sat( void *out, void *in){ ((cl_float*) out)[0] = ((cl_int*) in)[0]; }
-static void int2double_sat( void *out, void *in){ ((cl_double*) out)[0] = ((cl_int*) in)[0]; }
-static void int2ulong_sat( void *out, void *in){ cl_int i = ((cl_int*) in)[0]; ((cl_ulong*) out)[0] = i < 0 ? 0 : i; }
-static void int2long_sat( void *out, void *in){ ((cl_long*) out)[0] = ((cl_int*) in)[0]; }
-static void float2uchar_sat( void *out, void *in){ ((cl_uchar*) out)[0] = CLAMP( 0, lrintf_clamped(((cl_float*) in)[0]), CL_UCHAR_MAX ); }
-static void float2char_sat( void *out, void *in){ ((cl_char*) out)[0] = CLAMP( CL_CHAR_MIN, lrintf_clamped(((cl_float*) in)[0]), CL_CHAR_MAX); }
-static void float2ushort_sat( void *out, void *in){ ((cl_ushort*) out)[0] = CLAMP( 0, lrintf_clamped(((cl_float*) in)[0]), CL_USHRT_MAX ); }
-static void float2short_sat( void *out, void *in){ ((cl_short*) out)[0] = CLAMP( CL_SHRT_MIN, lrintf_clamped(((cl_float*) in)[0]), CL_SHRT_MAX ); }
-static void float2uint_sat( void *out, void *in){ ((cl_uint*) out)[0] = (cl_uint) CLAMP( 0, llrintf_clamped(((cl_float*) in)[0]), CL_UINT_MAX ); }
-static void float2int_sat( void *out, void *in){ ((cl_int*) out)[0] = (cl_int) CLAMP( CL_INT_MIN, lrintf_clamped(((cl_float*) in)[0]), CL_INT_MAX ); }
-static void float2double_sat( void *out, void *in){ ((cl_double*) out)[0] = ((cl_float*) in)[0]; }
-static void float2ulong_sat( void *out, void *in)
-{
-#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
-    // VS2005 (at least) on x86 uses fistp to store the float as a 64-bit int.
-    // However, fistp stores it as a signed int, and some of the test values won't
-    // fit into a signed int. (These test values are >= 2^63.) The result on VS2005
-    // is that these end up silently (at least by default settings) clamped to
-    // the max lowest ulong.
-    cl_float x = my_rintf(((cl_float *)in)[0]);
-    if (x >= 18446744073709551616.0f) {         // 2^64
-        ((cl_ulong*) out)[0] = 0xFFFFFFFFFFFFFFFFULL;
-    } else if (x < 0) {
-        ((cl_ulong*) out)[0] = 0;
-    } else if (x >= 9223372036854775808.0f) {   // 2^63
-        x -= 9223372036854775808.0f;
-        ((cl_ulong*) out)[0] = x;
-        ((cl_ulong*) out)[0] += 9223372036854775808ULL;
-    } else {
-        ((cl_ulong*) out)[0] = x;
     }
-#else
-    float f = my_rintf(((float*) in)[0]); ((cl_ulong*) out)[0] = f >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64) ? 0xFFFFFFFFFFFFFFFFULL : f < 0 ? 0 : (cl_ulong) f;
-#endif
-}
-// The final cast used to be (cl_ulong) f, but on Linux (RHEL5 at least)
-// if f = -1.0f, then (cl_ulong) f = 0xffffffff, which clearly isn't right.
-// Switching it to (cl_long) f seems to fix that.
-static void float2long_sat( void *out, void *in){ float f = my_rintf(((float*) in)[0]); ((cl_long*) out)[0] = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63) ? 0x7FFFFFFFFFFFFFFFULL : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63) ? 0x8000000000000000LL : (cl_long) f; }
-static void double2uchar_sat( void *out, void *in){ ((cl_uchar*) out)[0] = CLAMP( 0, lrint_clamped(((cl_double*) in)[0]), CL_UCHAR_MAX ); }
-static void double2char_sat( void *out, void *in){ ((cl_char*) out)[0] = CLAMP( CL_CHAR_MIN, lrint_clamped(((cl_double*) in)[0]), CL_CHAR_MAX); }
-static void double2ushort_sat( void *out, void *in){ ((cl_ushort*) out)[0] = CLAMP( 0, lrint_clamped(((cl_double*) in)[0]), CL_USHRT_MAX ); }
-static void double2short_sat( void *out, void *in){ ((cl_short*) out)[0] = CLAMP( CL_SHRT_MIN, lrint_clamped(((cl_double*) in)[0]), CL_SHRT_MAX ); }
-static void double2uint_sat( void *out, void *in){ ((cl_uint*) out)[0] = (cl_uint) CLAMP( 0, llrint_clamped(((cl_double*) in)[0]), CL_UINT_MAX ); }
-static void double2int_sat( void *out, void *in){ ((cl_int*) out)[0] = (cl_int) CLAMP( CL_INT_MIN, lrint_clamped(((cl_double*) in)[0]), CL_INT_MAX ); }
-static void double2float_sat( void *out, void *in){ ((cl_float*) out)[0] = (cl_float) ((double*) in)[0]; }
-static void double2ulong_sat( void *out, void *in){ double f = rint(((double*) in)[0]); ((cl_ulong*) out)[0] = f >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64) ? 0xFFFFFFFFFFFFFFFFULL : f < 0 ? 0 : (cl_ulong) f; }
-static void double2long_sat( void *out, void *in){ double f = rint(((double*) in)[0]); ((cl_long*) out)[0] = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63) ? 0x7FFFFFFFFFFFFFFFULL : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63) ? 0x8000000000000000LL : (cl_long) f; }
-static void ulong2uchar_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_uchar*) out)[0] = CLAMP( 0, u, CL_UCHAR_MAX ); }
-static void ulong2char_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_char*) out)[0] = CLAMP( 0, u, CL_CHAR_MAX ); }
-static void ulong2ushort_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_ushort*) out)[0] = CLAMP( 0, u, CL_USHRT_MAX ); }
-static void ulong2short_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_short*) out)[0] = CLAMP( 0, u, CL_SHRT_MAX ); }
-static void ulong2uint_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_uint*) out)[0] = (cl_uint) CLAMP( 0, u, CL_UINT_MAX ); }
-static void ulong2int_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_int*) out)[0] = (cl_int) CLAMP( 0, u, CL_INT_MAX ); }
-static void ulong2float_sat( void *out, void *in){ ((float*) out)[0] = my_fabsf((float) ((cl_ulong*) in)[0]); }  // my_fabs workaround for <rdar://problem/5965527>
-static void ulong2double_sat( void *out, void *in){ ((double*) out)[0] = my_fabs( ((cl_ulong*) in)[0]); }        // my_fabs workaround for <rdar://problem/5965527>
-static void ulong2long_sat( void *out, void *in){ cl_ulong u = ((cl_ulong*) in)[0]; ((cl_long*) out)[0] = CLAMP( 0, u, CL_LONG_MAX ); }
-static void long2uchar_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_uchar*) out)[0] = CLAMP( 0, u, CL_UCHAR_MAX ); }
-static void long2char_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_char*) out)[0] = CLAMP( CL_CHAR_MIN, u, CL_CHAR_MAX ); }
-static void long2ushort_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_ushort*) out)[0] = CLAMP( 0, u, CL_USHRT_MAX ); }
-static void long2short_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_short*) out)[0] = CLAMP( CL_SHRT_MIN, u, CL_SHRT_MAX ); }
-static void long2uint_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_uint*) out)[0] = (cl_uint) CLAMP( 0, u, CL_UINT_MAX ); }
-static void long2int_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_int*) out)[0] = (int) CLAMP( CL_INT_MIN, u, CL_INT_MAX ); }
-static void long2float_sat( void *out, void *in){ ((float*) out)[0] = (float) ((cl_long*) in)[0]; }
-static void long2double_sat( void *out, void *in){ ((double*) out)[0] = ((cl_long*) in)[0]; }
-static void long2ulong_sat( void *out, void *in){ cl_long u = ((cl_long*) in)[0]; ((cl_ulong*) out)[0] = CLAMP( 0, u, CL_LONG_MAX ); }
-
-/*
-#include <stdio.h>
 
-char *ground[] = {   "",
-                                                            "_rte",
-                                                            "_rtp",
-                                                            "_rtn",
-                                                            "_rtz"
-                    };
-
-const char *gTypeNames[  ] = {
-                                            "uchar", "char",
-                                            "ushort", "short",
-                                            "uint",   "int",
-                                            "float", "double",
-                                            "ulong", "long"
-                                        };
+    // strictly speaking we should also be subtracting out timer latency here
+    return conversion * (double)diff;
+}
+#endif
 
+////////////////////////////////////////////////////////////////////////////////
 
-int main( void )
+static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
 {
-    int i, j;
-
-    for( i = 0; i < sizeof( gTypeNames ) / sizeof( gTypeNames[0] ); i++ )
-        for( j = 0; j < sizeof( ground ) / sizeof( ground[0] ); j++ )
-        {
-            vlog( "float clampf_%s%s( float );\n", gTypeNames[i], ground[j] );
-            vlog( "double clampd_%s%s( double );\n", gTypeNames[i], ground[j] );
-        }
-
-    return 0;
-
+    cl_uint i; // index shared by allow[] and x[]; both are read for entries 0..count-1
+    for (i = 0; i < count; ++i) // flags are OR-ed in, so bits already set in allow[i] are preserved
+        allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0); // ORs in 1 when bits 23..30 of x[i] are all clear (presumably a zero/denormal float bit pattern -- confirm with callers)
 }
-*/
-
-
-float clampf_uchar( float );
-double clampd_uchar( double );
-float clampf_uchar_rte( float );
-double clampd_uchar_rte( double );
-float clampf_uchar_rtp( float );
-double clampd_uchar_rtp( double );
-float clampf_uchar_rtn( float );
-double clampd_uchar_rtn( double );
-float clampf_uchar_rtz( float );
-double clampd_uchar_rtz( double );
-float clampf_char( float );
-double clampd_char( double );
-float clampf_char_rte( float );
-double clampd_char_rte( double );
-float clampf_char_rtp( float );
-double clampd_char_rtp( double );
-float clampf_char_rtn( float );
-double clampd_char_rtn( double );
-float clampf_char_rtz( float );
-double clampd_char_rtz( double );
-float clampf_ushort( float );
-double clampd_ushort( double );
-float clampf_ushort_rte( float );
-double clampd_ushort_rte( double );
-float clampf_ushort_rtp( float );
-double clampd_ushort_rtp( double );
-float clampf_ushort_rtn( float );
-double clampd_ushort_rtn( double );
-float clampf_ushort_rtz( float );
-double clampd_ushort_rtz( double );
-float clampf_short( float );
-double clampd_short( double );
-float clampf_short_rte( float );
-double clampd_short_rte( double );
-float clampf_short_rtp( float );
-double clampd_short_rtp( double );
-float clampf_short_rtn( float );
-double clampd_short_rtn( double );
-float clampf_short_rtz( float );
-double clampd_short_rtz( double );
-float clampf_uint( float );
-double clampd_uint( double );
-float clampf_uint_rte( float );
-double clampd_uint_rte( double );
-float clampf_uint_rtp( float );
-double clampd_uint_rtp( double );
-float clampf_uint_rtn( float );
-double clampd_uint_rtn( double );
-float clampf_uint_rtz( float );
-double clampd_uint_rtz( double );
-float clampf_int( float );
-double clampd_int( double );
-float clampf_int_rte( float );
-double clampd_int_rte( double );
-float clampf_int_rtp( float );
-double clampd_int_rtp( double );
-float clampf_int_rtn( float );
-double clampd_int_rtn( double );
-float clampf_int_rtz( float );
-double clampd_int_rtz( double );
-float clampf_float( float );
-double clampd_float( double );
-float clampf_float_rte( float );
-double clampd_float_rte( double );
-float clampf_float_rtp( float );
-double clampd_float_rtp( double );
-float clampf_float_rtn( float );
-double clampd_float_rtn( double );
-float clampf_float_rtz( float );
-double clampd_float_rtz( double );
-float clampf_double( float );
-double clampd_double( double );
-float clampf_double_rte( float );
-double clampd_double_rte( double );
-float clampf_double_rtp( float );
-double clampd_double_rtp( double );
-float clampf_double_rtn( float );
-double clampd_double_rtn( double );
-float clampf_double_rtz( float );
-double clampd_double_rtz( double );
-float clampf_ulong( float );
-double clampd_ulong( double );
-float clampf_ulong_rte( float );
-double clampd_ulong_rte( double );
-float clampf_ulong_rtp( float );
-double clampd_ulong_rtp( double );
-float clampf_ulong_rtn( float );
-double clampd_ulong_rtn( double );
-float clampf_ulong_rtz( float );
-double clampd_ulong_rtz( double );
-float clampf_long( float );
-double clampd_long( double );
-float clampf_long_rte( float );
-double clampd_long_rte( double );
-float clampf_long_rtp( float );
-double clampd_long_rtp( double );
-float clampf_long_rtn( float );
-double clampd_long_rtn( double );
-float clampf_long_rtz( float );
-double clampd_long_rtz( double );
-
-/*
-#include <stdio.h>
 
-char *ground[] = {   "",
-                                                            "_rte",
-                                                            "_rtp",
-                                                            "_rtn",
-                                                            "_rtz"
-                    };
 
-const char *gTypeNames[  ] = {
-                                            "uchar", "char",
-                                            "ushort", "short",
-                                            "uint",   "int",
-                                            "float", "double",
-                                            "ulong", "long"
-                                        };
+void MapResultValuesComplete(const std::unique_ptr<CalcRefValsBase> &ptr);
 
+void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
+                                             void *data);
 
-int main( void )
+// Note: May be called reentrantly
+void MapResultValuesComplete(const std::unique_ptr<CalcRefValsBase> &info)
 {
-    int i, j;
-
-    for( i = 0; i < sizeof( gTypeNames ) / sizeof( gTypeNames[0] ); i++ )
+    cl_int status;
+    // &info is handed to the callback below; the unique_ptr must outlive it.
+    cl_event calcReferenceValues = info->parent->calcReferenceValues;
+
+    // we know that the map is done, wait for the main thread to finish
+    // calculating the reference values
+    if ((status =
+             clSetEventCallback(calcReferenceValues, CL_COMPLETE,
+                                CalcReferenceValuesComplete, (void *)&info)))
     {
-        vlog( "{\t" );
-        for( j = 0; j < sizeof( ground ) / sizeof( ground[0] ); j++ )
-            vlog( "clampf_%s%s,\t", gTypeNames[i], ground[j] );
+        vlog_error("ERROR: clSetEventCallback failed in "
+                   "MapResultValuesComplete with status: %d\n",
+                   status);
+        gFailCount++; // not thread safe -- being lazy here
+    }
 
-        vlog( "\t},\n" );
+    // This thread no longer needs its reference to
+    // info->parent->calcReferenceValues, so release it.
+    if ((status = clReleaseEvent(calcReferenceValues)))
+    {
+        vlog_error("ERROR: clReleaseEvent(info->calcReferenceValues) failed "
+                   "with status: %d\n",
+                   status);
+        gFailCount++; // not thread safe -- being lazy here
     }
 
-    return 0;
+    // no need to flush since we didn't enqueue anything
 
+    // e was already released by WriteInputBufferComplete. It should be
+    // destroyed automatically soon after we exit.
 }
-*/
-clampf gClampFloat[ kTypeCount ][kRoundingModeCount] = {
-    {    clampf_uchar,    clampf_uchar_rte,    clampf_uchar_rtp,    clampf_uchar_rtn,    clampf_uchar_rtz,        },
-    {    clampf_char,    clampf_char_rte,    clampf_char_rtp,    clampf_char_rtn,    clampf_char_rtz,        },
-    {    clampf_ushort,    clampf_ushort_rte,    clampf_ushort_rtp,    clampf_ushort_rtn,    clampf_ushort_rtz,        },
-    {    clampf_short,    clampf_short_rte,    clampf_short_rtp,    clampf_short_rtn,    clampf_short_rtz,        },
-    {    clampf_uint,    clampf_uint_rte,    clampf_uint_rtp,    clampf_uint_rtn,    clampf_uint_rtz,        },
-    {    clampf_int,     clampf_int_rte,     clampf_int_rtp,     clampf_int_rtn,     clampf_int_rtz,         },
-    {    clampf_float,    clampf_float_rte,    clampf_float_rtp,    clampf_float_rtn,    clampf_float_rtz,        },
-    {    clampf_double,    clampf_double_rte,    clampf_double_rtp,    clampf_double_rtn,    clampf_double_rtz,        },
-    {    clampf_ulong,    clampf_ulong_rte,    clampf_ulong_rtp,    clampf_ulong_rtn,    clampf_ulong_rtz,        },
-    {    clampf_long,    clampf_long_rte,    clampf_long_rtp,    clampf_long_rtn,    clampf_long_rtz,        }
-};
-
-clampd gClampDouble[ kTypeCount ][kRoundingModeCount] = {
-    {    clampd_uchar,    clampd_uchar_rte,    clampd_uchar_rtp,    clampd_uchar_rtn,    clampd_uchar_rtz,        },
-    {    clampd_char,    clampd_char_rte,    clampd_char_rtp,    clampd_char_rtn,    clampd_char_rtz,        },
-    {    clampd_ushort,    clampd_ushort_rte,    clampd_ushort_rtp,    clampd_ushort_rtn,    clampd_ushort_rtz,        },
-    {    clampd_short,    clampd_short_rte,    clampd_short_rtp,    clampd_short_rtn,    clampd_short_rtz,        },
-    {    clampd_uint,    clampd_uint_rte,    clampd_uint_rtp,    clampd_uint_rtn,    clampd_uint_rtz,        },
-    {    clampd_int,     clampd_int_rte,     clampd_int_rtp,     clampd_int_rtn,     clampd_int_rtz,         },
-    {    clampd_float,    clampd_float_rte,    clampd_float_rtp,    clampd_float_rtn,    clampd_float_rtz,        },
-    {    clampd_double,    clampd_double_rte,    clampd_double_rtp,    clampd_double_rtn,    clampd_double_rtz,        },
-    {    clampd_ulong,    clampd_ulong_rte,    clampd_ulong_rtp,    clampd_ulong_rtn,    clampd_ulong_rtz,        },
-    {    clampd_long,    clampd_long_rte,    clampd_long_rtp,    clampd_long_rtn,    clampd_long_rtz,        }
-};
 
-#if defined (_WIN32)
-#define __attribute__(X)
-#endif
 
-static inline float fclamp( float lo, float v, float hi ) __attribute__ ((always_inline));
-static inline double dclamp( double lo, double v, double hi ) __attribute__ ((always_inline));
-
-static inline float fclamp( float lo, float v, float hi ){ v = v < lo ? lo : v; return v < hi ? v : hi; }
-static inline double dclamp( double lo, double v, double hi ){ v = v < lo ? lo : v; return v < hi ? v : hi; }
-
-// Clamp unsaturated inputs into range so we don't get test errors:
-float clampf_uchar( float f )       { return fclamp( -0.5f, f, 255.5f - 128.0f * FLT_EPSILON ); }
-double clampd_uchar( double f )     { return dclamp( -0.5, f, 255.5 - 128.0 * DBL_EPSILON ); }
-float clampf_uchar_rte( float f )   { return fclamp( -0.5f, f, 255.5f - 128.0f * FLT_EPSILON ); }
-double clampd_uchar_rte( double f ) { return dclamp( -0.5, f, 255.5 - 128.0 * DBL_EPSILON ); }
-float clampf_uchar_rtp( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, 255.0f ); }
-double clampd_uchar_rtp( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, 255.0 ); }
-float clampf_uchar_rtn( float f )   { return fclamp( -0.0f, f, 256.0f - 128.0f * FLT_EPSILON); }
-double clampd_uchar_rtn( double f ) { return dclamp( -0.0, f, 256.0 - 128.0 * DBL_EPSILON); }
-float clampf_uchar_rtz( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, 256.0f - 128.0f * FLT_EPSILON); }
-double clampd_uchar_rtz( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, 256.0 - 128.0f * DBL_EPSILON); }
-
-float clampf_char( float f )        { return fclamp( -128.5f, f, 127.5f - 64.f * FLT_EPSILON ); }
-double clampd_char( double f )      { return dclamp( -128.5, f, 127.5 - 64. * DBL_EPSILON ); }
-float clampf_char_rte( float f )    { return fclamp( -128.5f, f, 127.5f - 64.f * FLT_EPSILON ); }
-double clampd_char_rte( double f )  { return dclamp( -128.5, f, 127.5 - 64. * DBL_EPSILON ); }
-float clampf_char_rtp( float f )    { return fclamp( -129.0f + 128.f*FLT_EPSILON, f, 127.f ); }
-double clampd_char_rtp( double f )  { return dclamp( -129.0 + 128.*DBL_EPSILON, f, 127. ); }
-float clampf_char_rtn( float f )    { return fclamp( -128.0f, f, 128.f - 64.0f*FLT_EPSILON ); }
-double clampd_char_rtn( double f )  { return dclamp( -128.0, f, 128. - 64.0*DBL_EPSILON ); }
-float clampf_char_rtz( float f )    { return fclamp( -129.0f + 128.f*FLT_EPSILON, f, 128.f - 64.0f*FLT_EPSILON ); }
-double clampd_char_rtz( double f )  { return dclamp( -129.0 + 128.*DBL_EPSILON, f, 128. - 64.0*DBL_EPSILON ); }
-
-float clampf_ushort( float f )       { return fclamp( -0.5f, f, 65535.5f - 32768.0f * FLT_EPSILON ); }
-double clampd_ushort( double f )     { return dclamp( -0.5, f, 65535.5 - 32768.0 * DBL_EPSILON ); }
-float clampf_ushort_rte( float f )   { return fclamp( -0.5f, f, 65535.5f - 32768.0f * FLT_EPSILON ); }
-double clampd_ushort_rte( double f ) { return dclamp( -0.5, f, 65535.5 - 32768.0 * DBL_EPSILON ); }
-float clampf_ushort_rtp( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, 65535.0f ); }
-double clampd_ushort_rtp( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, 65535.0 ); }
-float clampf_ushort_rtn( float f )   { return fclamp( -0.0f, f, 65536.0f - 32768.0f * FLT_EPSILON); }
-double clampd_ushort_rtn( double f ) { return dclamp( -0.0, f, 65536.0 - 32768.0 * DBL_EPSILON); }
-float clampf_ushort_rtz( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, 65536.0f - 32768.0f * FLT_EPSILON); }
-double clampd_ushort_rtz( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, 65536.0 - 32768.0f * DBL_EPSILON); }
-
-float clampf_short( float f )        { return fclamp( -32768.5f, f, 32767.5f - 16384.f * FLT_EPSILON ); }
-double clampd_short( double f )      { return dclamp( -32768.5, f, 32767.5 - 16384. * DBL_EPSILON ); }
-float clampf_short_rte( float f )    { return fclamp( -32768.5f, f, 32767.5f - 16384.f * FLT_EPSILON ); }
-double clampd_short_rte( double f )  { return dclamp( -32768.5, f, 32767.5 - 16384. * DBL_EPSILON ); }
-float clampf_short_rtp( float f )    { return fclamp( -32769.0f + 32768.f*FLT_EPSILON, f, 32767.f ); }
-double clampd_short_rtp( double f )  { return dclamp( -32769.0 + 32768.*DBL_EPSILON, f, 32767. ); }
-float clampf_short_rtn( float f )    { return fclamp( -32768.0f, f, 32768.f - 16384.0f*FLT_EPSILON ); }
-double clampd_short_rtn( double f )  { return dclamp( -32768.0, f, 32768. - 16384.0*DBL_EPSILON ); }
-float clampf_short_rtz( float f )    { return fclamp( -32769.0f + 32768.f*FLT_EPSILON, f, 32768.f - 16384.0f*FLT_EPSILON ); }
-double clampd_short_rtz( double f )  { return dclamp( -32769.0 + 32768.*DBL_EPSILON, f, 32768. - 16384.0*DBL_EPSILON ); }
-
-float clampf_uint( float f )        { return fclamp( -0.5f, f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7) ); }
-double clampd_uint( double f )      { return dclamp( -0.5, f, CL_UINT_MAX + 0.5 - MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31) * DBL_EPSILON ); }
-float clampf_uint_rte( float f )    { return fclamp( -0.5f, f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7) ); }
-double clampd_uint_rte( double f )  { return dclamp( -0.5, f, CL_UINT_MAX + 0.5 - MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31) * DBL_EPSILON ); }
-float clampf_uint_rtp( float f )    { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7) ); }
-double clampd_uint_rtp( double f )  { return dclamp( -1.0 + DBL_EPSILON/2.0, f, CL_UINT_MAX ); }
-float clampf_uint_rtn( float f )    { return fclamp( -0.0f, f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)); }
-double clampd_uint_rtn( double f )  { return dclamp( -0.0, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp31, 0x1fffffffffffffLL, -21) ); }
-float clampf_uint_rtz( float f )    { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)); }
-double clampd_uint_rtz( double f )  { return dclamp( -1.0 + DBL_EPSILON/2.0, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp31, 0x1fffffffffffffLL, -21)); }
-
-float clampf_int( float f )         { return fclamp( INT_MIN, f, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) ); }
-double clampd_int( double f )       { return dclamp( INT_MIN - 0.5, f, CL_INT_MAX + 0.5 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * DBL_EPSILON ); }
-float clampf_int_rte( float f )     { return fclamp( INT_MIN, f, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) ); }
-double clampd_int_rte( double f )   { return dclamp( INT_MIN - 0.5, f, CL_INT_MAX + 0.5 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * DBL_EPSILON ); }
-float clampf_int_rtp( float f )     { return fclamp( INT_MIN, f, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) ); }
-double clampd_int_rtp( double f )   { return dclamp( INT_MIN - 1.0 + DBL_EPSILON * MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31), f, CL_INT_MAX ); }
-float clampf_int_rtn( float f )     { return fclamp( INT_MIN, f, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) ); }
-double clampd_int_rtn( double f )   { return dclamp( INT_MIN, f, CL_INT_MAX + 1.0 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * DBL_EPSILON ); }
-float clampf_int_rtz( float f )     { return fclamp( INT_MIN, f, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) ); }
-double clampd_int_rtz( double f )   { return dclamp( INT_MIN - 1.0 + DBL_EPSILON * MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31), f, CL_INT_MAX + 1.0 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * DBL_EPSILON ); }
-
-float clampf_float( float f ){ return f; }
-double clampd_float( double f ){ return f; }
-float clampf_float_rte( float f ){ return f; }
-double clampd_float_rte( double f ){ return f; }
-float clampf_float_rtp( float f ){ return f; }
-double clampd_float_rtp( double f ){ return f; }
-float clampf_float_rtn( float f ){ return f; }
-double clampd_float_rtn( double f ){ return f; }
-float clampf_float_rtz( float f ){ return f; }
-double clampd_float_rtz( double f ){ return f; }
-
-float clampf_double( float f ){ return f; }
-double clampd_double( double f ){ return f; }
-float clampf_double_rte( float f ){ return f; }
-double clampd_double_rte( double f ){ return f; }
-float clampf_double_rtp( float f ){ return f; }
-double clampd_double_rtp( double f ){ return f; }
-float clampf_double_rtn( float f ){ return f; }
-double clampd_double_rtn( double f ){ return f; }
-float clampf_double_rtz( float f ){ return f; }
-double clampd_double_rtz( double f ){ return f; }
-
-float clampf_ulong( float f )       { return fclamp( -0.5f, f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) ); }
-double clampd_ulong( double f )     { return dclamp( -0.5, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) ); }
-float clampf_ulong_rte( float f )   { return fclamp( -0.5f, f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) ); }
-double clampd_ulong_rte( double f ) { return dclamp( -0.5, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) ); }
-float clampf_ulong_rtp( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) ); }
-double clampd_ulong_rtp( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) ); }
-float clampf_ulong_rtn( float f )   { return fclamp( -0.0f, f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) ); }
-double clampd_ulong_rtn( double f ) { return dclamp( -0.0, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) ); }
-float clampf_ulong_rtz( float f )   { return fclamp( -1.0f + FLT_EPSILON/2.0f, f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) ); }
-double clampd_ulong_rtz( double f ) { return dclamp( -1.0 + DBL_EPSILON/2.0, f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) ); }
-
-float clampf_long( float f )        { return fclamp( MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), f, MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38) ); }
-double clampd_long( double f )      { return dclamp( MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10) ); }
-float clampf_long_rte( float f )    { return fclamp( MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), f, MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38) ); }
-double clampd_long_rte( double f )  { return dclamp( MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10) ); }
-float clampf_long_rtp( float f )    { return fclamp( MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), f, MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38) ); }
-double clampd_long_rtp( double f )  { return dclamp( MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10) ); }
-float clampf_long_rtn( float f )    { return fclamp( MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), f, MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38) ); }
-double clampd_long_rtn( double f )  { return dclamp( MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10) ); }
-float clampf_long_rtz( float f )    { return fclamp( MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), f, MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38) ); }
-double clampd_long_rtz( double f )  { return dclamp( MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10) ); }
-
-#pragma mark -
-
-int alwaysPass( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int alwaysFail( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_uchar( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_char( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_ushort( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_short( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_uint( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_int( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_ulong( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_long( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_float( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-int check_double( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
-
-void init_uchar( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_char( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_ushort( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_short( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_uint( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_int( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_float( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_double( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_ulong( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-void init_long( void *dest, SaturationMode, RoundingMode, Type destType, uint64_t start, int count, MTdata d );
-
-InitDataFunc gInitFunctions[ kTypeCount ] = {
-                                                init_uchar, init_char,
-                                                init_ushort, init_short,
-                                                init_uint, init_int,
-                                                init_float, init_double,
-                                                init_ulong, init_long
-                                            };
-
-
-CheckResults gCheckResults[ kTypeCount ] = {
-                                                check_uchar, check_char, check_ushort, check_short, check_uint,
-                                                check_int, check_float, check_double, check_ulong, check_long
-                                            };
-#if !defined (__APPLE__)
-#define UNUSED
-#else
-#define UNUSED  __attribute__((unused))
-#endif
+void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
+                                             void *data)
+{
+    std::unique_ptr<CalcRefValsBase> &info =
+        *(std::unique_ptr<CalcRefValsBase> *)data;
+
+    cl_uint vectorSize = info->vectorSize;
+    cl_uint count = info->parent->count;
+    Type outType =
+        info->parent->outType; // the data type of the conversion result
+    Type inType = info->parent->inType; // the data type of the conversion input
+    size_t j;
+    cl_int error;
+    cl_event doneBarrier = info->parent->doneBarrier;
+
+    // report spurious error condition
+    if (CL_SUCCESS != status)
+    {
+        vlog_error("ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n",
+                   status);
+        gFailCount++; // lazy about thread safety here
+        return;
+    }
 
-int alwaysPass( void UNUSED *out1, void UNUSED *out2, void UNUSED *allowZ, uint32_t UNUSED count, int UNUSED vectorSize){ return 0; }
-int alwaysFail( void UNUSED *out1, void UNUSED *out2, void UNUSED *allowZ, uint32_t UNUSED count, int UNUSED vectorSize ){ return -1; }
+    // Now we know that both results have been mapped back from the device, and
+    // the main thread is done calculating the reference results. It is now time
+    // to check the results.
 
-int check_uchar( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_uchar *t = (const cl_uchar*)test;
-    const cl_uchar *c = (const cl_uchar*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
+    // verify results
+    void *mapped = info->p;
 
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_uchar)0))
+    // Patch up NaN-to-integer conversions to zero -- a NaN input may be
+    // converted to any integer
+    if (outType != kfloat && outType != kdouble)
+    {
+        if (inType == kfloat)
         {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%2.2x vs 0x%2.2x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
+            float *inp = (float *)gIn;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]))
+                    memset((char *)mapped + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
+            }
         }
-
-    return 0;
-}
-
-int check_char( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_char *t = (const cl_char*)test;
-    const cl_char *c = (const cl_char*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_char)0))
+        if (inType == kdouble)
+        {
+            double *inp = (double *)gIn;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]))
+                    memset((char *)mapped + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
+            }
+        }
+    }
+    else if (inType == kfloat || inType == kdouble)
+    { // outtype and intype is float or double.  NaN conversions for float <->
+      // double can be any NaN
+        if (inType == kfloat && outType == kdouble)
         {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%2.2x vs 0x%2.2x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
+            float *inp = (float *)gIn;
+            double *outp = (double *)mapped;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
+            }
         }
+        if (inType == kdouble && outType == kfloat)
+        {
+            double *inp = (double *)gIn;
+            float *outp = (float *)mapped;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
+            }
+        }
+    }
 
-    return 0;
-}
+    if (memcmp(mapped, gRef, count * gTypeSizes[outType]))
+        info->result =
+            info->check_result(mapped, count, vectorSizes[vectorSize]);
+    else
+        info->result = 0;
 
-int check_ushort( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_ushort *t = (const cl_ushort*)test;
-    const cl_ushort *c = (const cl_ushort*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
+    // Fill the output buffer with junk and release it
+    {
+        cl_uint pattern = 0xffffdead;
+        memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]);
+        if ((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[vectorSize],
+                                             mapped, 0, NULL, NULL)))
+        {
+            vlog_error("ERROR: clEnqueueUnmapMemObject failed in "
+                       "CalcReferenceValuesComplete  (%d)\n",
+                       error);
+            gFailCount++;
+        }
+    }
 
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_ushort)0))
+    if (1 == ThreadPool_AtomicAdd(&info->parent->barrierCount, -1))
+    {
+        if ((status = clSetUserEventStatus(doneBarrier, CL_COMPLETE)))
         {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%4.4x vs 0x%4.4x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
+            vlog_error("ERROR: clSetUserEventStatus failed in "
+                       "CalcReferenceValuesComplete (err: %d). We're probably "
+                       "going to deadlock.\n",
+                       status);
+            gFailCount++;
+            return;
         }
 
-    return 0;
+        if ((status = clReleaseEvent(doneBarrier)))
+        {
+            vlog_error("ERROR: clReleaseEvent failed in "
+                       "CalcReferenceValuesComplete (err: %d).\n",
+                       status);
+            gFailCount++;
+            return;
+        }
+    }
+    // e was already released by WriteInputBufferComplete. It should be
+    // destroyed automatically soon after all the calls to
+    // CalcReferenceValuesComplete exit.
 }
 
-int check_short( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_short *t = (const cl_short*)test;
-    const cl_short *c = (const cl_short*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
+//
 
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_short)0))
-        {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%4.4x vs 0x%4.4x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
-        }
+namespace conv_test {
 
-    return 0;
-}
+////////////////////////////////////////////////////////////////////////////////
 
-int check_uint( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
+cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
 {
-    const cl_uint *t = (const cl_uint*)test;
-    const cl_uint *c = (const cl_uint*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
+    DataInitBase *info = (DataInitBase *)p;
 
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_uint)0))
-        {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%8.8x vs 0x%8.8x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
-        }
+    info->init(job_id, thread_id);
 
-    return 0;
+    return CL_SUCCESS;
 }
 
-int check_int( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
+////////////////////////////////////////////////////////////////////////////////
+
+cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
 {
-    const cl_int *t = (const cl_int*)test;
-    const cl_int *c = (const cl_int*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
+    DataInitBase *info = (DataInitBase *)p;
 
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_int)0))
-        {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%8.8x vs 0x%8.8x\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
-        }
+    cl_uint count = info->size;
+    Type inType = info->inType;
+    Type outType = info->outType;
+    RoundingMode round = info->round;
+    size_t j;
 
-    return 0;
-}
+    Force64BitFPUPrecision();
 
-int check_ulong( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_ulong *t = (const cl_ulong*)test;
-    const cl_ulong *c = (const cl_ulong*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
+    void *s = (cl_uchar *)gIn + job_id * count * gTypeSizes[info->inType];
+    void *a = (cl_uchar *)gAllowZ + job_id * count;
+    void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType];
 
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_ulong)0))
+
+    if (outType != inType)
+    {
+        // create the reference while we wait
+#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
+        /* ARM VFP has no hardware instruction for converting 64-bit integers
+         * to floating-point types, so GCC for ARM uses the floating-point
+         * emulation code regardless of the -mfloat-abi setting. But the
+         * emulation code in libgcc.a has only one rounding mode (round to
+         * nearest even in this case) and ignores the rounding mode set in
+         * hardware. As a result, setting rounding modes in hardware won't
+         * give correctly rounded results for conversions from 64-bit integer
+         * to float when built with GCC for ARM, so to test different rounding
+         * modes we need an alternative reference function. ARM64 does have
+         * such an instruction, but we cannot guarantee the compiler uses it.
+         * On all ARM architectures, use emulation to compute the reference. */
+        switch (round)
         {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%16.16llx vs 0x%16.16llx\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
+            /* Conversions to floating-point types use the current rounding
+             * mode. The only default floating-point rounding mode supported is
+             * round to nearest even, i.e. the current rounding mode will be
+             * _rte for floating-point types. */
+            case kDefaultRoundingMode: qcom_rm = qcomRTE; break;
+            case kRoundToNearestEven: qcom_rm = qcomRTE; break;
+            case kRoundUp: qcom_rm = qcomRTP; break;
+            case kRoundDown: qcom_rm = qcomRTN; break;
+            case kRoundTowardZero: qcom_rm = qcomRTZ; break;
+            default:
+                vlog_error("ERROR: undefined rounding mode %d\n", round);
+                break;
         }
+        qcom_sat = info->sat;
+#endif
 
-    return 0;
-}
+        RoundingMode oldRound = set_round(round, outType);
 
-int check_long( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_long *t = (const cl_long*)test;
-    const cl_long *c = (const cl_long*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
+        if (info->sat)
+            info->conv_array_sat(d, s, count);
+        else
+            info->conv_array(d, s, count);
 
-    for( i = 0; i < count; i++ )
-        if( t[i] != c[i] && !(a[i] != (cl_uchar)0 && t[i] == (cl_long)0))
+        set_round(oldRound, outType);
+
+        // Decide if we allow a zero result in addition to the correctly rounded
+        // one
+        memset(a, 0, count);
+        if (gForceFTZ)
         {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *0x%16.16llx vs 0x%16.16llx\n", vectorSize, i, c[i], t[i] );
-            return i + 1;
+            if (inType == kfloat || outType == kfloat)
+                setAllowZ((uint8_t *)a, (uint32_t *)s, count);
         }
+    }
+    else
+    {
+        // Copy the input to the reference
+        memcpy(d, s, info->size * gTypeSizes[inType]);
+    }
 
-    return 0;
-}
-
-int check_float( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_uint *t = (const cl_uint*)test;
-    const cl_uint *c = (const cl_uint*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if (t[i] != c[i] &&
-            // Allow nan's to be binary different
-            !((t[i] & 0x7fffffffU) > 0x7f800000U &&
-              (c[i] & 0x7fffffffU) > 0x7f800000U) &&
-            !(a[i] != (cl_uchar)0 &&
-              t[i] == (c[i] & 0x80000000U))) {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *%a vs %a\n",
-                    vectorSize, i, ((float*)correct)[i], ((float*)test)[i] );
-            return i + 1;
+    // Patch up NaN conversions to integer to zero -- a NaN can be converted to
+    // any integer
+    if (info->outType != kfloat && info->outType != kdouble)
+    {
+        if (inType == kfloat)
+        {
+            float *inp = (float *)s;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]))
+                    memset((char *)d + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
+            }
+        }
+        if (inType == kdouble)
+        {
+            double *inp = (double *)s;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j]))
+                    memset((char *)d + j * gTypeSizes[outType], 0,
+                           gTypeSizes[outType]);
+            }
+        }
+    }
+    else if (inType == kfloat || inType == kdouble)
+    { // outType and inType are both float or double.  NaN conversions for
+      // float <-> double can be any NaN
+        if (inType == kfloat && outType == kdouble)
+        {
+            float *inp = (float *)s;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j])) ((double *)d)[j] = NAN;
+            }
         }
+        if (inType == kdouble && outType == kfloat)
+        {
+            double *inp = (double *)s;
+            for (j = 0; j < count; j++)
+            {
+                if (isnan(inp[j])) ((float *)d)[j] = NAN;
+            }
+        }
+    }
 
-    return 0;
+    return CL_SUCCESS;
 }
 
-int check_double( void *test, void *correct, void *allowZ, uint32_t count, int vectorSize )
-{
-    const cl_ulong *t = (const cl_ulong*)test;
-    const cl_ulong *c = (const cl_ulong*)correct;
-    const cl_uchar *a = (const cl_uchar*)allowZ;
-    uint32_t i;
-
-    for( i = 0; i < count; i++ )
-        if (t[i] != c[i] &&
-            // Allow nan's to be binary different
-            !((t[i] & 0x7fffffffffffffffULL) > 0x7ff0000000000000ULL &&
-              (c[i] & 0x7fffffffffffffffULL) > 0x7f80000000000000ULL) &&
-            !(a[i] != (cl_uchar)0 &&
-              t[i] == (c[i] & 0x8000000000000000ULL))) {
-            vlog( "\nError for vector size %d found at 0x%8.8x:  *%a vs %a\n",
-                  vectorSize, i, ((double*)correct)[i], ((double*)test)[i] );
-            return i + 1;
-        }
+////////////////////////////////////////////////////////////////////////////////
 
+uint64_t GetTime(void)
+{
+#if defined(__APPLE__)
+    return mach_absolute_time();
+#elif defined(_MSC_VER)
+    return ReadTime();
+#else
+    // No high precision timer is used on this platform; the Apple path relies
+    // on mach_absolute_time, which has precision < 1 microsecond.
+#warning need accurate clock here.  Times are invalid.
     return 0;
+#endif
 }
 
+////////////////////////////////////////////////////////////////////////////////
 
-void init_uchar( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata UNUSED d )
+// Note: not called reentrantly
+void WriteInputBufferComplete(void *data)
 {
-    cl_uchar *o = (cl_uchar *)out;
-    int i;
+    cl_int status;
+    WriteInputBufferInfo *info = (WriteInputBufferInfo *)data;
+    cl_uint count = info->count;
+    int vectorSize;
 
-    for( i = 0; i < count; i++ )
-        o[i] = start++;
-}
+    info->barrierCount = gMaxVectorSize - gMinVectorSize;
 
-void init_char( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata UNUSED d )
-{
-    char *o = (char *)out;
-    int i;
+    // Now that the input buffer write is complete, run the kernel for each
+    // vector size and map its output buffer (blocking) to read the results.
+    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+    {
+        size_t workItemCount =
+            (count + vectorSizes[vectorSize] - 1) / (vectorSizes[vectorSize]);
 
-    for( i = 0; i < count; i++ )
-        o[i] = start++;
-}
+        if ((status = conv_test::RunKernel(info->calcInfo[vectorSize]->kernel,
+                                           gInBuffer, gOutBuffers[vectorSize],
+                                           workItemCount)))
+        {
+            gFailCount++;
+            return;
+        }
 
-void init_ushort( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata UNUSED d )
-{
-    cl_ushort *o = (cl_ushort *)out;
-    int i;
+        info->calcInfo[vectorSize]->p = clEnqueueMapBuffer(
+            gQueue, gOutBuffers[vectorSize], CL_TRUE,
+            CL_MAP_READ | CL_MAP_WRITE, 0, count * gTypeSizes[info->outType], 0,
+            NULL, NULL, &status);
+        {
+            if (status)
+            {
+                vlog_error("ERROR: WriteInputBufferComplete calback failed "
+                           "with status: %d\n",
+                           status);
+                gFailCount++;
+                return;
+            }
+        }
+    }
 
-    for( i = 0; i < count; i++ )
-        o[i] = start++;
-}
+    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
+    {
+        MapResultValuesComplete(info->calcInfo[vectorSize]);
+    }
 
-void init_short( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, UNUSED Type destType, uint64_t start, int count, MTdata UNUSED d )
-{
-    short *o = (short *)out;
-    int i;
+    // Make sure the work starts moving -- otherwise we may deadlock
+    if ((status = clFlush(gQueue)))
+    {
+        vlog_error(
+            "ERROR: WriteInputBufferComplete calback failed with status: %d\n",
+            status);
+        gFailCount++;
+        return;
+    }
 
-    for( i = 0; i < count; i++ )
-        o[i] = start++;
+    // e was already released by the main thread. It should be destroyed
+    // automatically soon after we exit.
 }
 
-void init_uint( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata d )
+////////////////////////////////////////////////////////////////////////////////
+
+cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
+                       RoundingMode round, int vectorSize, cl_kernel *outKernel)
 {
-    static const unsigned int specialValuesUInt[] = {
-    INT_MIN, INT_MIN + 1, INT_MIN + 2,
-    -(1<<30)-3,-(1<<30)-2,-(1<<30)-1, -(1<<30), -(1<<30)+1, -(1<<30)+2, -(1<<30)+3,
-    -(1<<24)-3,-(1<<24)-2,-(1<<24)-1, -(1<<24), -(1<<24)+1, -(1<<24)+2, -(1<<24)+3,
-    -(1<<23)-3,-(1<<23)-2,-(1<<23)-1, -(1<<23), -(1<<23)+1, -(1<<23)+2, -(1<<23)+3,
-    -(1<<22)-3,-(1<<22)-2,-(1<<22)-1, -(1<<22), -(1<<22)+1, -(1<<22)+2, -(1<<22)+3,
-    -(1<<21)-3,-(1<<21)-2,-(1<<21)-1, -(1<<21), -(1<<21)+1, -(1<<21)+2, -(1<<21)+3,
-    -(1<<16)-3,-(1<<16)-2,-(1<<16)-1, -(1<<16), -(1<<16)+1, -(1<<16)+2, -(1<<16)+3,
-    -(1<<15)-3,-(1<<15)-2,-(1<<15)-1, -(1<<15), -(1<<15)+1, -(1<<15)+2, -(1<<15)+3,
-    -(1<<8)-3,-(1<<8)-2,-(1<<8)-1, -(1<<8), -(1<<8)+1, -(1<<8)+2, -(1<<8)+3,
-    -(1<<7)-3,-(1<<7)-2,-(1<<7)-1, -(1<<7), -(1<<7)+1, -(1<<7)+2, -(1<<7)+3,
-    -4, -3, -2, -1, 0, 1, 2, 3, 4,
-    (1<<7)-3,(1<<7)-2,(1<<7)-1, (1<<7), (1<<7)+1, (1<<7)+2, (1<<7)+3,
-    (1<<8)-3,(1<<8)-2,(1<<8)-1, (1<<8), (1<<8)+1, (1<<8)+2, (1<<8)+3,
-    (1<<15)-3,(1<<15)-2,(1<<15)-1, (1<<15), (1<<15)+1, (1<<15)+2, (1<<15)+3,
-    (1<<16)-3,(1<<16)-2,(1<<16)-1, (1<<16), (1<<16)+1, (1<<16)+2, (1<<16)+3,
-    (1<<21)-3,(1<<21)-2,(1<<21)-1, (1<<21), (1<<21)+1, (1<<21)+2, (1<<21)+3,
-    (1<<22)-3,(1<<22)-2,(1<<22)-1, (1<<22), (1<<22)+1, (1<<22)+2, (1<<22)+3,
-    (1<<23)-3,(1<<23)-2,(1<<23)-1, (1<<23), (1<<23)+1, (1<<23)+2, (1<<23)+3,
-    (1<<24)-3,(1<<24)-2,(1<<24)-1, (1<<24), (1<<24)+1, (1<<24)+2, (1<<24)+3,
-    (1<<30)-3,(1<<30)-2,(1<<30)-1, (1<<30), (1<<30)+1, (1<<30)+2, (1<<30)+3,
-    INT_MAX-3, INT_MAX-2, INT_MAX-1, INT_MAX, // 0x80000000, 0x80000001 0x80000002 already covered above
-    UINT_MAX-3, UINT_MAX-2, UINT_MAX-1, UINT_MAX
-    };
-
-    cl_uint *o = (cl_uint *)out;
-    int i;
+    cl_program program;
+    char testName[256];
+    int error = 0;
 
-    for( i = 0; i < count; i++) {
-    if( gIsEmbedded )
-        o[i] = (cl_uint) genrand_int32(d);
-    else
-        o[i] = (cl_uint)i + start;
-    }
+    std::ostringstream source;
+    if (outType == kdouble || inType == kdouble)
+        source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
 
-    if( 0 == start )
+    // Create the program. This is a bit complicated because we are trying to
+    // avoid byte and short stores.
+    if (0 == vectorSize)
     {
-    size_t tableSize = sizeof( specialValuesUInt );
-    if( sizeof( cl_uint) * count < tableSize )
-        tableSize = sizeof( cl_uint) * count;
-    memcpy( (char*)(o + i) - tableSize, specialValuesUInt, tableSize );
+        // Create the type names.
+        char inName[32];
+        char outName[32];
+        strncpy(inName, gTypeNames[inType], sizeof(inName));
+        strncpy(outName, gTypeNames[outType], sizeof(outName));
+        sprintf(testName, "test_implicit_%s_%s", outName, inName);
+
+        source << "__kernel void " << testName << "( __global " << inName
+               << " *src, __global " << outName << " *dest )\n";
+        source << "{\n";
+        source << "   size_t i = get_global_id(0);\n";
+        source << "   dest[i] =  src[i];\n";
+        source << "}\n";
+
+        vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType],
+             gTypeNames[outType]);
+        fflush(stdout);
     }
-}
+    else
+    {
+        int vectorSizetmp = vectorSizes[vectorSize];
 
-void init_int( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata d )
-{
-    static const unsigned int specialValuesInt[] = {
-    INT_MIN, INT_MIN + 1, INT_MIN + 2,
-    -(1<<30)-3,-(1<<30)-2,-(1<<30)-1, -(1<<30), -(1<<30)+1, -(1<<30)+2, -(1<<30)+3,
-    -(1<<24)-3,-(1<<24)-2,-(1<<24)-1, -(1<<24), -(1<<24)+1, -(1<<24)+2, -(1<<24)+3,
-    -(1<<23)-3,-(1<<23)-2,-(1<<23)-1, -(1<<23), -(1<<23)+1, -(1<<23)+2, -(1<<23)+3,
-    -(1<<22)-3,-(1<<22)-2,-(1<<22)-1, -(1<<22), -(1<<22)+1, -(1<<22)+2, -(1<<22)+3,
-    -(1<<21)-3,-(1<<21)-2,-(1<<21)-1, -(1<<21), -(1<<21)+1, -(1<<21)+2, -(1<<21)+3,
-    -(1<<16)-3,-(1<<16)-2,-(1<<16)-1, -(1<<16), -(1<<16)+1, -(1<<16)+2, -(1<<16)+3,
-    -(1<<15)-3,-(1<<15)-2,-(1<<15)-1, -(1<<15), -(1<<15)+1, -(1<<15)+2, -(1<<15)+3,
-    -(1<<8)-3,-(1<<8)-2,-(1<<8)-1, -(1<<8), -(1<<8)+1, -(1<<8)+2, -(1<<8)+3,
-    -(1<<7)-3,-(1<<7)-2,-(1<<7)-1, -(1<<7), -(1<<7)+1, -(1<<7)+2, -(1<<7)+3,
-    -4, -3, -2, -1, 0, 1, 2, 3, 4,
-    (1<<7)-3,(1<<7)-2,(1<<7)-1, (1<<7), (1<<7)+1, (1<<7)+2, (1<<7)+3,
-    (1<<8)-3,(1<<8)-2,(1<<8)-1, (1<<8), (1<<8)+1, (1<<8)+2, (1<<8)+3,
-    (1<<15)-3,(1<<15)-2,(1<<15)-1, (1<<15), (1<<15)+1, (1<<15)+2, (1<<15)+3,
-    (1<<16)-3,(1<<16)-2,(1<<16)-1, (1<<16), (1<<16)+1, (1<<16)+2, (1<<16)+3,
-    (1<<21)-3,(1<<21)-2,(1<<21)-1, (1<<21), (1<<21)+1, (1<<21)+2, (1<<21)+3,
-    (1<<22)-3,(1<<22)-2,(1<<22)-1, (1<<22), (1<<22)+1, (1<<22)+2, (1<<22)+3,
-    (1<<23)-3,(1<<23)-2,(1<<23)-1, (1<<23), (1<<23)+1, (1<<23)+2, (1<<23)+3,
-    (1<<24)-3,(1<<24)-2,(1<<24)-1, (1<<24), (1<<24)+1, (1<<24)+2, (1<<24)+3,
-    (1<<30)-3,(1<<30)-2,(1<<30)-1, (1<<30), (1<<30)+1, (1<<30)+2, (1<<30)+3,
-    INT_MAX-3, INT_MAX-2, INT_MAX-1, INT_MAX, // 0x80000000, 0x80000001 0x80000002 already covered above
-    UINT_MAX-3, UINT_MAX-2, UINT_MAX-1, UINT_MAX
-    };
-
-    int *o = (int *)out;
-    int i;
+        // Create the type names.
+        char convertString[128];
+        char inName[32];
+        char outName[32];
+        switch (vectorSizetmp)
+        {
+            case 1:
+                strncpy(inName, gTypeNames[inType], sizeof(inName));
+                strncpy(outName, gTypeNames[outType], sizeof(outName));
+                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
+                         outName, gSaturationNames[sat],
+                         gRoundingModeNames[round]);
+                snprintf(testName, 256, "test_%s_%s", convertString, inName);
+                vlog("Building %s( %s ) test\n", convertString, inName);
+                break;
+            case 3:
+                strncpy(inName, gTypeNames[inType], sizeof(inName));
+                strncpy(outName, gTypeNames[outType], sizeof(outName));
+                snprintf(convertString, sizeof(convertString),
+                         "convert_%s3%s%s", outName, gSaturationNames[sat],
+                         gRoundingModeNames[round]);
+                snprintf(testName, 256, "test_%s_%s3", convertString, inName);
+                vlog("Building %s( %s3 ) test\n", convertString, inName);
+                break;
+            default:
+                snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType],
+                         vectorSizetmp);
+                snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType],
+                         vectorSizetmp);
+                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
+                         outName, gSaturationNames[sat],
+                         gRoundingModeNames[round]);
+                snprintf(testName, 256, "test_%s_%s", convertString, inName);
+                vlog("Building %s( %s ) test\n", convertString, inName);
+                break;
+        }
+        fflush(stdout);
 
-    for( i = 0; i < count; i++ ) {
-    if( gIsEmbedded ) {
-        o[i] = (int) genrand_int32(d);
-    }
-    else {
-        o[i] = (int) i + start;
-    }
+        if (vectorSizetmp == 3)
+        {
+            source << "__kernel void " << testName << "( __global " << inName
+                   << " *src, __global " << outName << " *dest )\n";
+            source << "{\n";
+            source << "   size_t i = get_global_id(0);\n";
+            source << "   if( i + 1 < get_global_size(0))\n";
+            source << "       vstore3( " << convertString
+                   << "( vload3( i, src)), i, dest );\n";
+            source << "   else\n";
+            source << "   {\n";
+            source << "       " << inName << "3 in;\n";
+            source << "       " << outName << "3 out;\n";
+            source << "       if( 0 == (i & 1) )\n";
+            source << "           in.y = src[3*i+1];\n";
+            source << "       in.x = src[3*i];\n";
+            source << "       out = " << convertString << "( in ); \n";
+            source << "       dest[3*i] = out.x;\n";
+            source << "       if( 0 == (i & 1) )\n";
+            source << "           dest[3*i+1] = out.y;\n";
+            source << "   }\n";
+            source << "}\n";
+        }
+        else
+        {
+            source << "__kernel void " << testName << "( __global " << inName
+                   << " *src, __global " << outName << " *dest )\n";
+            source << "{\n";
+            source << "   size_t i = get_global_id(0);\n";
+            source << "   dest[i] = " << convertString << "( src[i] );\n";
+            source << "}\n";
+        }
     }
+    *outKernel = NULL;
 
-    if( 0 == start )
+    const char *flags = NULL;
+    if (gForceFTZ) flags = "-cl-denorms-are-zero";
+
+    // build it
+    std::string sourceString = source.str();
+    const char *programSource = sourceString.c_str();
+    error = create_single_kernel_helper(gContext, &program, outKernel, 1,
+                                        &programSource, testName, flags);
+    if (error)
     {
-    size_t tableSize = sizeof( specialValuesInt );
-    if( sizeof( int) * count < tableSize )
-        tableSize = sizeof( int) * count;
-    memcpy( (char*)(o + i) - tableSize, specialValuesInt, tableSize );
+        vlog_error("Failed to build kernel/program (err = %d).\n", error);
+        return NULL;
     }
+
+    return program;
 }
 
-void init_float( void *out, SaturationMode sat, RoundingMode round, Type destType, uint64_t start, int count, MTdata d )
+//
+
+int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount)
 {
-    static const float specialValuesFloat[] = {
-    -NAN, -INFINITY, -FLT_MAX, MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40), MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64), MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39), MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39), MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8), MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32), MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7), MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7), MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31), MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6), -1000.f, -100.f, -4.0f, -3.5f,
-    -3.0f, MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23), -2.5f, MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23), -2.0f, MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24), -1.5f, MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24), -1.0f, MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25), -0.5f, MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26), MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26), -0.25f, MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150), -FLT_MIN, MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150), MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150), MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150), MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150), MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150), MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
-    MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150), MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150), MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150), MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150), -0.0f,
-    +NAN, +INFINITY, +FLT_MAX, MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40), MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64), MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39), MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39), MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63), MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
-    MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8), MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32), MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7), MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7), MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31), MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6), +1000.f, +100.f, +4.0f, +3.5f,
-    +3.0f, MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23), 2.5f, MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),+2.0f, MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24), 1.5f, MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24), MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24), +1.0f, MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
-    MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25), +0.5f, MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26), MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26), +0.25f, MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
-    MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150), +FLT_MIN, MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150), MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150), MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150), MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150), MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150), MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
-    MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150), MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150), MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150), MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150), +0.0f
-    };
-
-    cl_uint *o = (cl_uint *)out;
-    int i;
+    // The global dimensions are just the blockCount to execute since we haven't
+    // set up multiple queues for multiple devices.
+    int error;
 
-    for( i = 0; i < count; i++ ) {
-    if( gIsEmbedded )
-        o[i] = (cl_uint) genrand_int32(d);
-    else
-        o[i] = (cl_uint) i + start;
-    }
+    error = clSetKernelArg(kernel, 0, sizeof(inBuf), &inBuf);
+    error |= clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf);
 
-    if( 0 == start )
+    if (error)
     {
-    size_t tableSize = sizeof( specialValuesFloat );
-    if( sizeof( float) * count < tableSize )
-        tableSize = sizeof( float) * count;
-    memcpy( (char*)(o + i) - tableSize, specialValuesFloat, tableSize );
+        vlog_error("FAILED -- could not set kernel args (%d)\n", error);
+        return error;
     }
 
-    if( kUnsaturated == sat )
+    if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount,
+                                        NULL, 0, NULL, NULL)))
     {
-        clampf func = gClampFloat[ destType ][round];
-        float *f = (float *)out;
-
-        for( i = 0; i < count; i++ )
-            f[i] = func( f[i] );
+        vlog_error("FAILED -- could not execute kernel (%d)\n", error);
+        return error;
     }
-}
-
-// used to convert a bucket of bits into a search pattern through double
-static inline double DoubleFromUInt32( uint32_t bits );
-static inline double DoubleFromUInt32( uint32_t bits )
-{
-    union{ uint64_t u; double d;} u;
-
-    // split 0x89abcdef to 0x89abc00000000def
-    u.u = bits & 0xfffU;
-    u.u |= (uint64_t) (bits & ~0xfffU) << 32;
 
-    // sign extend the leading bit of def segment as sign bit so that the middle region consists of either all 1s or 0s
-    u.u -= (bits & 0x800U) << 1;
-
-    // return result
-    return u.d;
+    return 0;
 }
 
-// A table of more difficult cases to get right
-static const double specialValuesDouble[] = {
-    -NAN, -INFINITY, -DBL_MAX, MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12), MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(-0x1.80000000000001p64, -0x180000000000001LL, 8),
-    MAKE_HEX_DOUBLE(-0x1.8p64, -0x18LL, 60), MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp64, -0x17ffffffffffffLL, 12),     MAKE_HEX_DOUBLE(-0x1.80000000000001p63, -0x180000000000001LL, 7), MAKE_HEX_DOUBLE(-0x1.8p63, -0x18LL, 59), MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp63, -0x17ffffffffffffLL, 11),
-     MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11), MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), MAKE_HEX_DOUBLE(-0x1.80000000000001p32, -0x180000000000001LL, -24), MAKE_HEX_DOUBLE(-0x1.8p32, -0x18LL, 28), MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp32, -0x17ffffffffffffLL, -20),
-    MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8), MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.80000000000001p31, -0x180000000000001LL, -25), MAKE_HEX_DOUBLE(-0x1.8p31, -0x18LL, 27), MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp31, -0x17ffffffffffffLL, -21), MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21), MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22), -1000., -100.,  -4.0, -3.5,
-    -3.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51), -2.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51), -2.0, MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52), -1.5, MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), -1.0, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53), -0.5, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54), -0.25, MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074), -DBL_MIN, MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074), -0.0,
-
-    MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11), MAKE_HEX_DOUBLE(0x1.80000000000001p63, 0x180000000000001LL, 7), MAKE_HEX_DOUBLE(0x1.8p63, 0x18LL, 59), MAKE_HEX_DOUBLE(0x1.7ffffffffffffp63, 0x17ffffffffffffLL, 11),  MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11), MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-     MAKE_HEX_DOUBLE(+0x1.80000000000001p32, +0x180000000000001LL, -24), MAKE_HEX_DOUBLE(+0x1.8p32, +0x18LL, 28), MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp32, +0x17ffffffffffffLL, -20),
-    MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8), MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.80000000000001p31, +0x180000000000001LL, -25), MAKE_HEX_DOUBLE(+0x1.8p31, +0x18LL, 27), MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp31, +0x17ffffffffffffLL, -21), MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21), MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22), +1000., +100.,  +4.0, +3.5,
-    +3.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51), +2.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51), +2.0, MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52), +1.5, MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52), +1.0, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53), +0.5, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),  MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54), +0.25, MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
-    MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074), +DBL_MIN, MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074), MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074), MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
-    MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074), MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074), +0.0,
-
-    MAKE_HEX_DOUBLE(-0x1.ffffffffffffep62, -0x1ffffffffffffeLL, 10), MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp62, -0x1ffffffffffffcLL, 10), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10), MAKE_HEX_DOUBLE(+0x1.ffffffffffffep62, +0x1ffffffffffffeLL, 10), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp62, +0x1ffffffffffffcLL, 10), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
-    MAKE_HEX_DOUBLE(-0x1.ffffffffffffep51, -0x1ffffffffffffeLL, -1), MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp51, -0x1ffffffffffffcLL, -1), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp51, -0x1fffffffffffffLL, -1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffep51, +0x1ffffffffffffeLL, -1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp51, +0x1ffffffffffffcLL, -1), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp51, +0x1fffffffffffffLL, -1),
-    MAKE_HEX_DOUBLE(-0x1.ffffffffffffep52, -0x1ffffffffffffeLL, 0), MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp52, -0x1ffffffffffffcLL, 0), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp52, -0x1fffffffffffffLL, 0), MAKE_HEX_DOUBLE(+0x1.ffffffffffffep52, +0x1ffffffffffffeLL, 0), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp52, +0x1ffffffffffffcLL, 0), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp52, +0x1fffffffffffffLL, 0),
-    MAKE_HEX_DOUBLE(-0x1.ffffffffffffep53, -0x1ffffffffffffeLL, 1), MAKE_HEX_DOUBLE(-0x1.ffffffffffffcp53, -0x1ffffffffffffcLL, 1), MAKE_HEX_DOUBLE(-0x1.fffffffffffffp53, -0x1fffffffffffffLL, 1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffep53, +0x1ffffffffffffeLL, 1), MAKE_HEX_DOUBLE(+0x1.ffffffffffffcp53, +0x1ffffffffffffcLL, 1), MAKE_HEX_DOUBLE(+0x1.fffffffffffffp53, +0x1fffffffffffffLL, 1),
-    MAKE_HEX_DOUBLE(-0x1.0000000000002p52, -0x10000000000002LL, 0), MAKE_HEX_DOUBLE(-0x1.0000000000001p52, -0x10000000000001LL, 0), MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52), MAKE_HEX_DOUBLE(+0x1.0000000000002p52, +0x10000000000002LL, 0), MAKE_HEX_DOUBLE(+0x1.0000000000001p52, +0x10000000000001LL, 0), MAKE_HEX_DOUBLE(+0x1.0p52, +0x1LL, 52),
-    MAKE_HEX_DOUBLE(-0x1.0000000000002p53, -0x10000000000002LL, 1), MAKE_HEX_DOUBLE(-0x1.0000000000001p53, -0x10000000000001LL, 1), MAKE_HEX_DOUBLE(-0x1.0p53, -0x1LL, 53), MAKE_HEX_DOUBLE(+0x1.0000000000002p53, +0x10000000000002LL, 1), MAKE_HEX_DOUBLE(+0x1.0000000000001p53, +0x10000000000001LL, 1), MAKE_HEX_DOUBLE(+0x1.0p53, +0x1LL, 53),
-    MAKE_HEX_DOUBLE(-0x1.0000000000002p54, -0x10000000000002LL, 2), MAKE_HEX_DOUBLE(-0x1.0000000000001p54, -0x10000000000001LL, 2), MAKE_HEX_DOUBLE(-0x1.0p54, -0x1LL, 54), MAKE_HEX_DOUBLE(+0x1.0000000000002p54, +0x10000000000002LL, 2), MAKE_HEX_DOUBLE(+0x1.0000000000001p54, +0x10000000000001LL, 2), MAKE_HEX_DOUBLE(+0x1.0p54, +0x1LL, 54),
-    MAKE_HEX_DOUBLE(-0x1.fffffffefffffp62, -0x1fffffffefffffLL, 10), MAKE_HEX_DOUBLE(-0x1.ffffffffp62, -0x1ffffffffLL, 30), MAKE_HEX_DOUBLE(-0x1.ffffffff00001p62, -0x1ffffffff00001LL, 10), MAKE_HEX_DOUBLE(0x1.fffffffefffffp62, 0x1fffffffefffffLL, 10), MAKE_HEX_DOUBLE(0x1.ffffffffp62, 0x1ffffffffLL, 30), MAKE_HEX_DOUBLE(0x1.ffffffff00001p62, 0x1ffffffff00001LL, 10),
-};
 
-
-void init_double( void *out, SaturationMode sat, RoundingMode round, Type destType, uint64_t start, int count, MTdata UNUSED d )
+int GetTestCase(const char *name, Type *outType, Type *inType,
+                SaturationMode *sat, RoundingMode *round)
 {
-    double *o = (double*)out;
     int i;
 
-    for( i = 0; i < count; i++ )
-    {
-        uint64_t z = i + start;
-        o[i] = DoubleFromUInt32( (uint32_t) z ^ (uint32_t) (z >> 32));
-    }
+    // Find the return type
+    for (i = 0; i < kTypeCount; i++)
+        if (name == strstr(name, gTypeNames[i]))
+        {
+            *outType = (Type)i;
+            name += strlen(gTypeNames[i]);
 
-    if( 0 == start )
-    {
-        size_t tableSize = sizeof( specialValuesDouble );
-        if( sizeof( cl_double) * count < tableSize )
-            tableSize = sizeof( cl_double) * count;
-        memcpy( (char*)(o + i) - tableSize, specialValuesDouble, tableSize );
-    }
+            break;
+        }
 
-    if( 0 == sat )
-    {
-        clampd func = gClampDouble[ destType ][round];
+    if (i == kTypeCount) return -1;
 
-        for( i = 0; i < count; i++ )
-            o[i] = func( o[i] );
-    }
-}
+    // Check to see if _sat appears next
+    *sat = (SaturationMode)0;
+    for (i = 1; i < kSaturationModeCount; i++)
+        if (name == strstr(name, gSaturationNames[i]))
+        {
+            *sat = (SaturationMode)i;
+            name += strlen(gSaturationNames[i]);
+            break;
+        }
 
-cl_ulong random64( MTdata d )
-{
-    return (cl_ulong) genrand_int32(d) | ((cl_ulong) genrand_int32(d) << 32);
-}
+    *round = (RoundingMode)0;
+    for (i = 1; i < kRoundingModeCount; i++)
+        if (name == strstr(name, gRoundingModeNames[i]))
+        {
+            *round = (RoundingMode)i;
+            name += strlen(gRoundingModeNames[i]);
+            break;
+        }
 
-void init_ulong( void *out, SaturationMode UNUSED sat, RoundingMode UNUSED round, Type UNUSED destType, uint64_t start, int count, MTdata d )
-{
-    cl_ulong *o = (cl_ulong *)out;
-    cl_ulong i, j, k;
+    if (*name != '_') return -2;
+    name++;
 
-    i = 0;
-    if( start == 0 )
-    {
-        //Try various powers of two
-        for( j = 0; j < (cl_ulong) count && j < 8 * sizeof(cl_ulong); j++ )
-            o[j] = (cl_ulong) 1 << j;
-        i = j;
-
-        // try the complement of those
-        for( j = 0; i < (cl_ulong) count && j < 8 * sizeof(cl_ulong); j++ )
-            o[i++] = ~((cl_ulong) 1 << j);
-
-        //Try various negative powers of two
-        for( j = 0; i < (cl_ulong) count && j < 8 * sizeof(cl_ulong); j++ )
-            o[i++] = (cl_ulong) 0xFFFFFFFFFFFFFFFEULL << j;
-
-        //try various powers of two plus 1, shifted by various amounts
-        for( j = 0; i < (cl_ulong)count && j < 8 * sizeof(cl_ulong); j++ )
-            for( k = 0; i < (cl_ulong)count && k < 8 * sizeof(cl_ulong) - j; k++ )
-                o[i++] = (((cl_ulong) 1 << j) + 1) << k;
-
-        //try various powers of two minus 1
-        for( j = 0; i < (cl_ulong)count && j < 8 * sizeof(cl_ulong); j++ )
-            for( k = 0; i < (cl_ulong)count && k < 8 * sizeof(cl_ulong) - j; k++ )
-                o[i++] = (((cl_ulong) 1 << j) - 1) << k;
-
-        // Other patterns
-        cl_ulong pattern[] = { 0x3333333333333333ULL, 0x5555555555555555ULL, 0x9999999999999999ULL, 0x6666666666666666ULL, 0xccccccccccccccccULL, 0xaaaaaaaaaaaaaaaaULL };
-        cl_ulong mask[] = { 0xffffffffffffffffULL, 0xff00ff00ff00ff00ULL, 0xffff0000ffff0000ULL, 0xffffffff00000000ULL };
-        for( j = 0; i < (cl_ulong) count && j < sizeof(pattern) / sizeof( pattern[0]); j++ )
-            for( k = 0; i + 2 <= (cl_ulong) count && k < sizeof(mask) / sizeof( mask[0]); k++ )
-            {
-                o[i++] = pattern[j] & mask[k];
-                o[i++] = pattern[j] & ~mask[k];
-            }
-    }
+    for (i = 0; i < kTypeCount; i++)
+        if (name == strstr(name, gTypeNames[i]))
+        {
+            *inType = (Type)i;
+            name += strlen(gTypeNames[i]);
 
-    for( ; i < (cl_ulong) count; i++ )
-        o[i] = random64(d);
-}
+            break;
+        }
 
-void init_long( void *out, SaturationMode sat, RoundingMode round, Type destType, uint64_t start, int count, MTdata d )
-{
-    init_ulong( out, sat, round, destType, start, count, d );
-}
+    if (i == kTypeCount) return -3;
 
-// ======
-
-void uchar2uchar_many( void *out, void *in, size_t n);
-void uchar2uchar_sat_many( void *out, void *in, size_t n);
-void char2uchar_many( void *out, void *in, size_t n);
-void char2uchar_sat_many( void *out, void *in, size_t n);
-void ushort2uchar_many( void *out, void *in, size_t n);
-void ushort2uchar_sat_many( void *out, void *in, size_t n);
-void short2uchar_many( void *out, void *in, size_t n);
-void short2uchar_sat_many( void *out, void *in, size_t n);
-void uint2uchar_many( void *out, void *in, size_t n);
-void uint2uchar_sat_many( void *out, void *in, size_t n);
-void int2uchar_many( void *out, void *in, size_t n);
-void int2uchar_sat_many( void *out, void *in, size_t n);
-void float2uchar_many( void *out, void *in, size_t n);
-void float2uchar_sat_many( void *out, void *in, size_t n);
-void double2uchar_many( void *out, void *in, size_t n);
-void double2uchar_sat_many( void *out, void *in, size_t n);
-void ulong2uchar_many( void *out, void *in, size_t n);
-void ulong2uchar_sat_many( void *out, void *in, size_t n);
-void long2uchar_many( void *out, void *in, size_t n);
-void long2uchar_sat_many( void *out, void *in, size_t n);
-void uchar2char_many( void *out, void *in, size_t n);
-void uchar2char_sat_many( void *out, void *in, size_t n);
-void char2char_many( void *out, void *in, size_t n);
-void char2char_sat_many( void *out, void *in, size_t n);
-void ushort2char_many( void *out, void *in, size_t n);
-void ushort2char_sat_many( void *out, void *in, size_t n);
-void short2char_many( void *out, void *in, size_t n);
-void short2char_sat_many( void *out, void *in, size_t n);
-void uint2char_many( void *out, void *in, size_t n);
-void uint2char_sat_many( void *out, void *in, size_t n);
-void int2char_many( void *out, void *in, size_t n);
-void int2char_sat_many( void *out, void *in, size_t n);
-void float2char_many( void *out, void *in, size_t n);
-void float2char_sat_many( void *out, void *in, size_t n);
-void double2char_many( void *out, void *in, size_t n);
-void double2char_sat_many( void *out, void *in, size_t n);
-void ulong2char_many( void *out, void *in, size_t n);
-void ulong2char_sat_many( void *out, void *in, size_t n);
-void long2char_many( void *out, void *in, size_t n);
-void long2char_sat_many( void *out, void *in, size_t n);
-void uchar2ushort_many( void *out, void *in, size_t n);
-void uchar2ushort_sat_many( void *out, void *in, size_t n);
-void char2ushort_many( void *out, void *in, size_t n);
-void char2ushort_sat_many( void *out, void *in, size_t n);
-void ushort2ushort_many( void *out, void *in, size_t n);
-void ushort2ushort_sat_many( void *out, void *in, size_t n);
-void short2ushort_many( void *out, void *in, size_t n);
-void short2ushort_sat_many( void *out, void *in, size_t n);
-void uint2ushort_many( void *out, void *in, size_t n);
-void uint2ushort_sat_many( void *out, void *in, size_t n);
-void int2ushort_many( void *out, void *in, size_t n);
-void int2ushort_sat_many( void *out, void *in, size_t n);
-void float2ushort_many( void *out, void *in, size_t n);
-void float2ushort_sat_many( void *out, void *in, size_t n);
-void double2ushort_many( void *out, void *in, size_t n);
-void double2ushort_sat_many( void *out, void *in, size_t n);
-void ulong2ushort_many( void *out, void *in, size_t n);
-void ulong2ushort_sat_many( void *out, void *in, size_t n);
-void long2ushort_many( void *out, void *in, size_t n);
-void long2ushort_sat_many( void *out, void *in, size_t n);
-void uchar2short_many( void *out, void *in, size_t n);
-void uchar2short_sat_many( void *out, void *in, size_t n);
-void char2short_many( void *out, void *in, size_t n);
-void char2short_sat_many( void *out, void *in, size_t n);
-void ushort2short_many( void *out, void *in, size_t n);
-void ushort2short_sat_many( void *out, void *in, size_t n);
-void short2short_many( void *out, void *in, size_t n);
-void short2short_sat_many( void *out, void *in, size_t n);
-void uint2short_many( void *out, void *in, size_t n);
-void uint2short_sat_many( void *out, void *in, size_t n);
-void int2short_many( void *out, void *in, size_t n);
-void int2short_sat_many( void *out, void *in, size_t n);
-void float2short_many( void *out, void *in, size_t n);
-void float2short_sat_many( void *out, void *in, size_t n);
-void double2short_many( void *out, void *in, size_t n);
-void double2short_sat_many( void *out, void *in, size_t n);
-void ulong2short_many( void *out, void *in, size_t n);
-void ulong2short_sat_many( void *out, void *in, size_t n);
-void long2short_many( void *out, void *in, size_t n);
-void long2short_sat_many( void *out, void *in, size_t n);
-void uchar2uint_many( void *out, void *in, size_t n);
-void uchar2uint_sat_many( void *out, void *in, size_t n);
-void char2uint_many( void *out, void *in, size_t n);
-void char2uint_sat_many( void *out, void *in, size_t n);
-void ushort2uint_many( void *out, void *in, size_t n);
-void ushort2uint_sat_many( void *out, void *in, size_t n);
-void short2uint_many( void *out, void *in, size_t n);
-void short2uint_sat_many( void *out, void *in, size_t n);
-void uint2uint_many( void *out, void *in, size_t n);
-void uint2uint_sat_many( void *out, void *in, size_t n);
-void int2uint_many( void *out, void *in, size_t n);
-void int2uint_sat_many( void *out, void *in, size_t n);
-void float2uint_many( void *out, void *in, size_t n);
-void float2uint_sat_many( void *out, void *in, size_t n);
-void double2uint_many( void *out, void *in, size_t n);
-void double2uint_sat_many( void *out, void *in, size_t n);
-void ulong2uint_many( void *out, void *in, size_t n);
-void ulong2uint_sat_many( void *out, void *in, size_t n);
-void long2uint_many( void *out, void *in, size_t n);
-void long2uint_sat_many( void *out, void *in, size_t n);
-void uchar2int_many( void *out, void *in, size_t n);
-void uchar2int_sat_many( void *out, void *in, size_t n);
-void char2int_many( void *out, void *in, size_t n);
-void char2int_sat_many( void *out, void *in, size_t n);
-void ushort2int_many( void *out, void *in, size_t n);
-void ushort2int_sat_many( void *out, void *in, size_t n);
-void short2int_many( void *out, void *in, size_t n);
-void short2int_sat_many( void *out, void *in, size_t n);
-void uint2int_many( void *out, void *in, size_t n);
-void uint2int_sat_many( void *out, void *in, size_t n);
-void int2int_many( void *out, void *in, size_t n);
-void int2int_sat_many( void *out, void *in, size_t n);
-void float2int_many( void *out, void *in, size_t n);
-void float2int_sat_many( void *out, void *in, size_t n);
-void double2int_many( void *out, void *in, size_t n);
-void double2int_sat_many( void *out, void *in, size_t n);
-void ulong2int_many( void *out, void *in, size_t n);
-void ulong2int_sat_many( void *out, void *in, size_t n);
-void long2int_many( void *out, void *in, size_t n);
-void long2int_sat_many( void *out, void *in, size_t n);
-void uchar2float_many( void *out, void *in, size_t n);
-void uchar2float_sat_many( void *out, void *in, size_t n);
-void char2float_many( void *out, void *in, size_t n);
-void char2float_sat_many( void *out, void *in, size_t n);
-void ushort2float_many( void *out, void *in, size_t n);
-void ushort2float_sat_many( void *out, void *in, size_t n);
-void short2float_many( void *out, void *in, size_t n);
-void short2float_sat_many( void *out, void *in, size_t n);
-void uint2float_many( void *out, void *in, size_t n);
-void uint2float_sat_many( void *out, void *in, size_t n);
-void int2float_many( void *out, void *in, size_t n);
-void int2float_sat_many( void *out, void *in, size_t n);
-void float2float_many( void *out, void *in, size_t n);
-void float2float_sat_many( void *out, void *in, size_t n);
-void double2float_many( void *out, void *in, size_t n);
-void double2float_sat_many( void *out, void *in, size_t n);
-void ulong2float_many( void *out, void *in, size_t n);
-void ulong2float_sat_many( void *out, void *in, size_t n);
-void long2float_many( void *out, void *in, size_t n);
-void long2float_sat_many( void *out, void *in, size_t n);
-void uchar2double_many( void *out, void *in, size_t n);
-void uchar2double_sat_many( void *out, void *in, size_t n);
-void char2double_many( void *out, void *in, size_t n);
-void char2double_sat_many( void *out, void *in, size_t n);
-void ushort2double_many( void *out, void *in, size_t n);
-void ushort2double_sat_many( void *out, void *in, size_t n);
-void short2double_many( void *out, void *in, size_t n);
-void short2double_sat_many( void *out, void *in, size_t n);
-void uint2double_many( void *out, void *in, size_t n);
-void uint2double_sat_many( void *out, void *in, size_t n);
-void int2double_many( void *out, void *in, size_t n);
-void int2double_sat_many( void *out, void *in, size_t n);
-void float2double_many( void *out, void *in, size_t n);
-void float2double_sat_many( void *out, void *in, size_t n);
-void double2double_many( void *out, void *in, size_t n);
-void double2double_sat_many( void *out, void *in, size_t n);
-void ulong2double_many( void *out, void *in, size_t n);
-void ulong2double_sat_many( void *out, void *in, size_t n);
-void long2double_many( void *out, void *in, size_t n);
-void long2double_sat_many( void *out, void *in, size_t n);
-void uchar2ulong_many( void *out, void *in, size_t n);
-void uchar2ulong_sat_many( void *out, void *in, size_t n);
-void char2ulong_many( void *out, void *in, size_t n);
-void char2ulong_sat_many( void *out, void *in, size_t n);
-void ushort2ulong_many( void *out, void *in, size_t n);
-void ushort2ulong_sat_many( void *out, void *in, size_t n);
-void short2ulong_many( void *out, void *in, size_t n);
-void short2ulong_sat_many( void *out, void *in, size_t n);
-void uint2ulong_many( void *out, void *in, size_t n);
-void uint2ulong_sat_many( void *out, void *in, size_t n);
-void int2ulong_many( void *out, void *in, size_t n);
-void int2ulong_sat_many( void *out, void *in, size_t n);
-void float2ulong_many( void *out, void *in, size_t n);
-void float2ulong_sat_many( void *out, void *in, size_t n);
-void double2ulong_many( void *out, void *in, size_t n);
-void double2ulong_sat_many( void *out, void *in, size_t n);
-void ulong2ulong_many( void *out, void *in, size_t n);
-void ulong2ulong_sat_many( void *out, void *in, size_t n);
-void long2ulong_many( void *out, void *in, size_t n);
-void long2ulong_sat_many( void *out, void *in, size_t n);
-void uchar2long_many( void *out, void *in, size_t n);
-void uchar2long_sat_many( void *out, void *in, size_t n);
-void char2long_many( void *out, void *in, size_t n);
-void char2long_sat_many( void *out, void *in, size_t n);
-void ushort2long_many( void *out, void *in, size_t n);
-void ushort2long_sat_many( void *out, void *in, size_t n);
-void short2long_many( void *out, void *in, size_t n);
-void short2long_sat_many( void *out, void *in, size_t n);
-void uint2long_many( void *out, void *in, size_t n);
-void uint2long_sat_many( void *out, void *in, size_t n);
-void int2long_many( void *out, void *in, size_t n);
-void int2long_sat_many( void *out, void *in, size_t n);
-void float2long_many( void *out, void *in, size_t n);
-void float2long_sat_many( void *out, void *in, size_t n);
-void double2long_many( void *out, void *in, size_t n);
-void double2long_sat_many( void *out, void *in, size_t n);
-void ulong2long_many( void *out, void *in, size_t n);
-void ulong2long_sat_many( void *out, void *in, size_t n);
-void long2long_many( void *out, void *in, size_t n);
-void long2long_sat_many( void *out, void *in, size_t n);
-
-void uchar2uchar_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_uchar )); }
-void uchar2uchar_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_uchar )); }
-void char2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_char)); }}
-void char2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_char)); }}
-void ushort2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_ushort)); }}
-void short2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_short)); }}
-void short2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_short)); }}
-void uint2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_uint)); }}
-void uint2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_uint)); }}
-void int2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_int)); }}
-void int2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_int)); }}
-void float2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_float)); }}
-void float2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_float)); }}
-void double2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_double)); }}
-void double2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_double)); }}
-void ulong2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_ulong)); }}
-void long2uchar_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2uchar( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_long)); }}
-void long2uchar_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2uchar_sat( (char*) out + i * sizeof(cl_uchar), (char*) in + i * sizeof(cl_long)); }}
-void uchar2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_uchar)); }}
-void char2char_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_char )); }
-void char2char_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_char )); }
-void ushort2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_ushort)); }}
-void short2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_short)); }}
-void short2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_short)); }}
-void uint2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_uint)); }}
-void uint2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_uint)); }}
-void int2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_int)); }}
-void int2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_int)); }}
-void float2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_float)); }}
-void float2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_float)); }}
-void double2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_double)); }}
-void double2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_double)); }}
-void ulong2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_ulong)); }}
-void long2char_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2char( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_long)); }}
-void long2char_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2char_sat( (char*) out + i * sizeof(cl_char), (char*) in + i * sizeof(cl_long)); }}
-void uchar2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_uchar)); }}
-void char2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_char)); }}
-void char2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_char)); }}
-void ushort2ushort_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_ushort )); }
-void ushort2ushort_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_ushort )); }
-void short2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_short)); }}
-void short2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_short)); }}
-void uint2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_uint)); }}
-void uint2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_uint)); }}
-void int2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_int)); }}
-void int2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_int)); }}
-void float2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_float)); }}
-void float2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_float)); }}
-void double2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_double)); }}
-void double2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_double)); }}
-void ulong2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_ulong)); }}
-void long2ushort_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2ushort( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_long)); }}
-void long2ushort_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2ushort_sat( (char*) out + i * sizeof(cl_ushort), (char*) in + i * sizeof(cl_long)); }}
-void uchar2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_uchar)); }}
-void char2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_char)); }}
-void char2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_char)); }}
-void ushort2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_ushort)); }}
-void short2short_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_short )); }
-void short2short_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_short )); }
-void uint2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_uint)); }}
-void uint2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_uint)); }}
-void int2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_int)); }}
-void int2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_int)); }}
-void float2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_float)); }}
-void float2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_float)); }}
-void double2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_double)); }}
-void double2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_double)); }}
-void ulong2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_ulong)); }}
-void long2short_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2short( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_long)); }}
-void long2short_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2short_sat( (char*) out + i * sizeof(cl_short), (char*) in + i * sizeof(cl_long)); }}
-void uchar2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_uchar)); }}
-void char2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_char)); }}
-void char2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_char)); }}
-void ushort2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_ushort)); }}
-void short2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_short)); }}
-void short2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_short)); }}
-void uint2uint_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_uint )); }
-void uint2uint_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_uint )); }
-void int2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_int)); }}
-void int2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_int)); }}
-void float2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_float)); }}
-void float2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_float)); }}
-void double2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_double)); }}
-void double2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_double)); }}
-void ulong2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_ulong)); }}
-void long2uint_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2uint( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_long)); }}
-void long2uint_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2uint_sat( (char*) out + i * sizeof(cl_uint), (char*) in + i * sizeof(cl_long)); }}
-void uchar2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_uchar)); }}
-void char2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_char)); }}
-void char2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_char)); }}
-void ushort2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_ushort)); }}
-void short2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_short)); }}
-void short2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_short)); }}
-void uint2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_uint)); }}
-void uint2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_uint)); }}
-void int2int_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_int )); }
-void int2int_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_int )); }
-void float2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_float)); }}
-void float2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_float)); }}
-void double2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_double)); }}
-void double2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_double)); }}
-void ulong2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_ulong)); }}
-void long2int_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2int( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_long)); }}
-void long2int_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2int_sat( (char*) out + i * sizeof(cl_int), (char*) in + i * sizeof(cl_long)); }}
-void uchar2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_uchar)); }}
-void char2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_char)); }}
-void char2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_char)); }}
-void ushort2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_ushort)); }}
-void short2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_short)); }}
-void short2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_short)); }}
-void uint2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_uint)); }}
-void uint2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_uint)); }}
-void int2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_int)); }}
-void int2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_int)); }}
-void float2float_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_float )); }
-void float2float_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_float )); }
-void double2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_double)); }}
-void double2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_double)); }}
-void ulong2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_ulong)); }}
-void long2float_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2float( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_long)); }}
-void long2float_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2float_sat( (char*) out + i * sizeof(cl_float), (char*) in + i * sizeof(cl_long)); }}
-void uchar2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_uchar)); }}
-void char2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_char)); }}
-void char2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_char)); }}
-void ushort2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_ushort)); }}
-void short2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_short)); }}
-void short2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_short)); }}
-void uint2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_uint)); }}
-void uint2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_uint)); }}
-void int2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_int)); }}
-void int2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_int)); }}
-void float2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_float)); }}
-void float2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_float)); }}
-void double2double_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_double )); }
-void double2double_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_double )); }
-void ulong2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_ulong)); }}
-void long2double_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2double( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_long)); }}
-void long2double_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2double_sat( (char*) out + i * sizeof(cl_double), (char*) in + i * sizeof(cl_long)); }}
-void uchar2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_uchar)); }}
-void char2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_char)); }}
-void char2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_char)); }}
-void ushort2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_ushort)); }}
-void short2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_short)); }}
-void short2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_short)); }}
-void uint2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_uint)); }}
-void uint2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_uint)); }}
-void int2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_int)); }}
-void int2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_int)); }}
-void float2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_float)); }}
-void float2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_float)); }}
-void double2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_double)); }}
-void double2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_double)); }}
-void ulong2ulong_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_ulong )); }
-void ulong2ulong_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_ulong )); }
-void long2ulong_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2ulong( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_long)); }}
-void long2ulong_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ long2ulong_sat( (char*) out + i * sizeof(cl_ulong), (char*) in + i * sizeof(cl_long)); }}
-void uchar2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_uchar)); }}
-void uchar2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uchar2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_uchar)); }}
-void char2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_char)); }}
-void char2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ char2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_char)); }}
-void ushort2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_ushort)); }}
-void ushort2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ushort2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_ushort)); }}
-void short2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_short)); }}
-void short2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ short2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_short)); }}
-void uint2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_uint)); }}
-void uint2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ uint2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_uint)); }}
-void int2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_int)); }}
-void int2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ int2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_int)); }}
-void float2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_float)); }}
-void float2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ float2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_float)); }}
-void double2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_double)); }}
-void double2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ double2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_double)); }}
-void ulong2long_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2long( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_ulong)); }}
-void ulong2long_sat_many( void *out, void *in, size_t n){size_t i; for( i = 0; i < n; i++){ ulong2long_sat( (char*) out + i * sizeof(cl_long), (char*) in + i * sizeof(cl_ulong)); }}
-void long2long_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_long )); }
-void long2long_sat_many( void *out, void *in, size_t n){ memcpy( out, in, n * sizeof( cl_long )); }
-
-Convert gSaturatedConversions[kTypeCount][kTypeCount] = {
-    {    uchar2uchar_sat_many,    char2uchar_sat_many,    ushort2uchar_sat_many,    short2uchar_sat_many,    uint2uchar_sat_many,    int2uchar_sat_many,    float2uchar_sat_many,    double2uchar_sat_many,    ulong2uchar_sat_many,    long2uchar_sat_many,     },
-    {    uchar2char_sat_many,    char2char_sat_many,    ushort2char_sat_many,    short2char_sat_many,    uint2char_sat_many,    int2char_sat_many,    float2char_sat_many,    double2char_sat_many,    ulong2char_sat_many, long2char_sat_many,     },
-    {    uchar2ushort_sat_many,    char2ushort_sat_many,    ushort2ushort_sat_many,    short2ushort_sat_many,    uint2ushort_sat_many,    int2ushort_sat_many,    float2ushort_sat_many,    double2ushort_sat_many,    ulong2ushort_sat_many,    long2ushort_sat_many,     },
-    {    uchar2short_sat_many,    char2short_sat_many,    ushort2short_sat_many,    short2short_sat_many,    uint2short_sat_many,    int2short_sat_many,    float2short_sat_many,    double2short_sat_many,    ulong2short_sat_many,    long2short_sat_many,     },
-    {    uchar2uint_sat_many,    char2uint_sat_many,    ushort2uint_sat_many,    short2uint_sat_many,    uint2uint_sat_many,    int2uint_sat_many,    float2uint_sat_many,    double2uint_sat_many,    ulong2uint_sat_many, long2uint_sat_many,     },
-    {    uchar2int_sat_many,    char2int_sat_many,    ushort2int_sat_many,    short2int_sat_many,    uint2int_sat_many,    int2int_sat_many,    float2int_sat_many,    double2int_sat_many,    ulong2int_sat_many,long2int_sat_many,     },
-    {    uchar2float_sat_many,    char2float_sat_many,    ushort2float_sat_many,    short2float_sat_many,    uint2float_sat_many,    int2float_sat_many,    float2float_sat_many,    double2float_sat_many,    ulong2float_sat_many,    long2float_sat_many,     },
-    {    uchar2double_sat_many,    char2double_sat_many,    ushort2double_sat_many,    short2double_sat_many,    uint2double_sat_many,    int2double_sat_many,    float2double_sat_many,    double2double_sat_many,    ulong2double_sat_many,    long2double_sat_many,     },
-    {    uchar2ulong_sat_many,    char2ulong_sat_many,    ushort2ulong_sat_many,    short2ulong_sat_many,    uint2ulong_sat_many,    int2ulong_sat_many,    float2ulong_sat_many,    double2ulong_sat_many,    ulong2ulong_sat_many,    long2ulong_sat_many,     },
-    {    uchar2long_sat_many,    char2long_sat_many,    ushort2long_sat_many,    short2long_sat_many,    uint2long_sat_many,    int2long_sat_many,    float2long_sat_many,    double2long_sat_many,    ulong2long_sat_many, long2long_sat_many,     },
-};
+    if (*name != '\0') return -4;
 
-Convert gConversions[kTypeCount][kTypeCount] = {
-    {    uchar2uchar_many,    char2uchar_many,    ushort2uchar_many,    short2uchar_many,    uint2uchar_many,    int2uchar_many,    float2uchar_many,    double2uchar_many,    ulong2uchar_many,    long2uchar_many,     },
-    {    uchar2char_many,    char2char_many,    ushort2char_many,    short2char_many,    uint2char_many,    int2char_many,    float2char_many,    double2char_many,    ulong2char_many,    long2char_many,     },
-    {    uchar2ushort_many,    char2ushort_many,    ushort2ushort_many,    short2ushort_many,    uint2ushort_many,    int2ushort_many,    float2ushort_many,    double2ushort_many,    ulong2ushort_many,    long2ushort_many,     },
-    {    uchar2short_many,    char2short_many,    ushort2short_many,    short2short_many,    uint2short_many,    int2short_many,    float2short_many,    double2short_many,    ulong2short_many,    long2short_many,     },
-    {    uchar2uint_many,    char2uint_many,    ushort2uint_many,    short2uint_many,    uint2uint_many,    int2uint_many,    float2uint_many,    double2uint_many,    ulong2uint_many,    long2uint_many,     },
-    {    uchar2int_many,    char2int_many,    ushort2int_many,    short2int_many,    uint2int_many,    int2int_many,    float2int_many,    double2int_many,    ulong2int_many,    long2int_many,     },
-    {    uchar2float_many,    char2float_many,    ushort2float_many,    short2float_many,    uint2float_many,    int2float_many,    float2float_many,    double2float_many,    ulong2float_many,    long2float_many,     },
-    {    uchar2double_many,    char2double_many,    ushort2double_many,    short2double_many,    uint2double_many,    int2double_many,    float2double_many,    double2double_many,    ulong2double_many,    long2double_many,     },
-    {    uchar2ulong_many,    char2ulong_many,    ushort2ulong_many,    short2ulong_many,    uint2ulong_many,    int2ulong_many,    float2ulong_many,    double2ulong_many,    ulong2ulong_many,    long2ulong_many,     },
-    {    uchar2long_many,    char2long_many,    ushort2long_many,    short2long_many,    uint2long_many,    int2long_many,    float2long_many,    double2long_many,    ulong2long_many,    long2long_many,     },
-};
+    return 0;
+}
+
+} // namespace conv_test
diff --git a/test_conformance/conversions/basic_test_conversions.h b/test_conformance/conversions/basic_test_conversions.h
index ab887afdd..c1d284ec2 100644
--- a/test_conformance/conversions/basic_test_conversions.h
+++ b/test_conformance/conversions/basic_test_conversions.h
@@ -16,8 +16,6 @@
 #ifndef BASIC_TEST_CONVERSIONS_H
 #define BASIC_TEST_CONVERSIONS_H
 
-#include "harness/compat.h"
-
 #if !defined(_WIN32)
 #include <unistd.h>
 #endif
@@ -32,23 +30,32 @@
     #include <CL/opencl.h>
 #endif
 
+
 #include "harness/mt19937.h"
+#include "harness/testHarness.h"
+#include "harness/typeWrappers.h"
 
-typedef void (*Convert)( void *dest, void *src, size_t );
+#include <memory>
+#include <tuple>
+#include <vector>
 
-#define kVectorSizeCount    6
-#define kMaxVectorSize      16
+#include "conversions_data_info.h"
 
-typedef enum
-{
-    kUnsaturated = 0,
-    kSaturated,
+// typedef void (*Convert)( void *dest, void *src, size_t );
 
-    kSaturationModeCount
-}SaturationMode;
+#define kVectorSizeCount 6
+#define kMaxVectorSize 16
+#define kPageSize 4096
 
-extern Convert gConversions[kTypeCount][kTypeCount];                // [dest format][source format]
-extern Convert gSaturatedConversions[kTypeCount][kTypeCount];       // [dest format][source format]
+#define BUFFER_SIZE (1024 * 1024)
+#define EMBEDDED_REDUCTION_FACTOR 16
+#define PERF_LOOP_COUNT 100
+
+
+// Retired conversion tables (their definitions are removed from
+// basic_test_conversions.cpp); both were indexed [dest format][source format]:
+// extern Convert gConversions[kTypeCount][kTypeCount];
+// extern Convert gSaturatedConversions[kTypeCount][kTypeCount];
 extern const char *gTypeNames[ kTypeCount ];
 extern const char *gRoundingModeNames[ kRoundingModeCount ];        // { "", "_rte", "_rtp", "_rtn", "_rtz" }
 extern const char *gSaturationNames[ kSaturationModeCount ];        // { "", "_sat" }
@@ -68,5 +75,354 @@ extern InitDataFunc gInitFunctions[ kTypeCount ];
 typedef int (*CheckResults)( void *out1, void *out2, void *allowZ, uint32_t count, int vectorSize );
 extern CheckResults gCheckResults[ kTypeCount ];
 
+#define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)
+
+extern MTdata gMTdata;
+extern cl_command_queue gQueue;
+extern cl_context gContext;
+extern cl_mem gInBuffer;
+extern cl_mem gOutBuffers[];
+extern int gHasDouble;
+extern int gTestDouble;
+extern int gWimpyMode;
+extern int gWimpyReductionFactor;
+extern int gSkipTesting;
+extern int gMinVectorSize;
+extern int gMaxVectorSize;
+extern int gForceFTZ;
+extern int gTimeResults;
+extern int gReportAverageTimes;
+extern int gStartTestNumber;
+extern int gEndTestNumber;
+extern int gIsRTZ;
+extern void *gIn;
+extern void *gRef;
+extern void *gAllowZ;
+extern void *gOut[];
+
+extern const char **argList;
+extern int argCount;
+
+extern const char *sizeNames[];
+extern int vectorSizes[];
+
+extern size_t gComputeDevices;
+extern uint32_t gDeviceFrequency;
+
+
+namespace conv_test {
+
+cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
+                       RoundingMode round, int vectorSize,
+                       cl_kernel *outKernel);
+
+int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf, size_t blockCount);
+// NOTE(review): presumably 0 on success, negative on a bad name - confirm.
+int GetTestCase(const char *name, Type *outType, Type *inType,
+                SaturationMode *sat, RoundingMode *round);
+
+cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p);
+cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p);
+uint64_t GetTime(void);
+
+void WriteInputBufferComplete(void *);
+void *FlushToZero(void);
+void UnFlushToZero(void *);
+} // namespace conv_test
+
+
+struct CalcRefValsBase
+{
+    // Owned via std::unique_ptr<CalcRefValsBase>: destruction must be virtual.
+    virtual ~CalcRefValsBase() = default;
+    virtual int check_result(void *, uint32_t, int) { return 0; } // base: 0
+    struct WriteInputBufferInfo *parent = nullptr; // back-pointer to parent
+    clKernelWrapper kernel; // the kernel for this vector size
+    clProgramWrapper program; // the program for this vector size
+    cl_uint vectorSize = 0; // the vector size for this callback chain
+    void *p = nullptr; // mapped result data for this vector size
+    cl_int result = 0;
+};
+
+
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+struct CalcRefValsPat : CalcRefValsBase // typed variant per In/Out type pair
+{
+    int check_result(void *, uint32_t, int) override; // no inline body here
+};
+
+
+// Bookkeeping shared by the steps of one conversion test: events, element
+// count, the in/out type tags and the per-entry CalcRefValsBase records.
+struct WriteInputBufferInfo
+{
+    // user event which signals when main thread is done calculating
+    // reference values
+    volatile cl_event calcReferenceValues = nullptr;
+    // user event which signals when worker threads are done
+    volatile cl_event doneBarrier = nullptr;
+    // the number of elements in the array
+    cl_uint count = 0;
+    // the data type of the conversion result
+    Type outType = kuchar;
+    // the data type of the conversion input
+    Type inType = kuchar;
+    volatile int barrierCount = 0;
+
+    std::vector<std::unique_ptr<CalcRefValsBase>> calcInfo;
+};
+
+
+// Must be aligned with Type enums!
+using TypeIter = std::tuple<cl_uchar, cl_char, cl_ushort, cl_short, cl_uint,
+                            cl_int, cl_float, cl_double, cl_ulong, cl_long>;
+// isTypeFp[i] flags whether element i of TypeIter is a floating-point type.
+constexpr bool isTypeFp[] = { 0, 0, 0, 0, 0, 0, 1, 1, 0, 0 };
+
+
+// Test fixture for the conversions test: holds an OpenCL device, context and
+// queue (matching its constructor arguments), an element count and a TypeIter.
+struct ConversionsTest
+{
+    ConversionsTest(cl_device_id device, cl_context context,
+                    cl_command_queue queue);
+    // Set-up hook; MakeAndRunTest calls it with num_elements before Run().
+    virtual cl_int SetUp(int elements);
+
+    // Test body returning an OpenCL error code
+    virtual cl_int Run();
+    // NOTE(review): presumably runs one outType/inType/sat/round case - confirm.
+    template <typename InType, typename OutType, bool InFP, bool OutFP>
+    int DoTest(Type outType, Type inType, SaturationMode sat,
+               RoundingMode round);
+    // IterOverTypes calls this with InFP/OutFP = isTypeFp[In]/isTypeFp[Out].
+    template <typename InType, typename OutType, bool InFP, bool OutFP>
+    void TestTypesConversion(const Type &inType, const Type &outType, int &tn);
+
+protected:
+    cl_context context;
+    cl_device_id device;
+    cl_command_queue queue;
+
+    size_t num_elements; // NOTE(review): presumably set by SetUp - confirm
+
+    TypeIter typeIterator; // tuple of the cl_* host types, see TypeIter
+};
+
+
+// Fixture variant that is built from the same arguments as ConversionsTest
+// but declares its own Run() override (no inline body in this header).
+struct CustomConversionsTest : ConversionsTest
+{
+    // Same (device, context, queue) construction as the base fixture.
+    using ConversionsTest::ConversionsTest;
+
+    cl_int Run() override;
+};
+
+
+template <class T>
+int MakeAndRunTest(cl_device_id device, cl_context context,
+                   cl_command_queue queue, int num_elements)
+{
+    T fixture(device, context, queue);
+    // Run the fixture's SetUp step before the test body.
+    cl_int setup_status = fixture.SetUp(num_elements);
+    // NOTE(review): harness macro; presumably returns TEST_FAIL on error.
+    test_error_ret(setup_status, "Error in test initialization", TEST_FAIL);
+    return fixture.Run();
+}
+
+
+// Checks whether a runtime Type tag and a compile-time host type agree.
+struct TestType
+{
+    // True when T is exactly the cl_* type paired with `in` below; any tag
+    // other than the ten listed enumerators yields false.
+    template <typename T> bool testType(Type in)
+    {
+        if (in == kuchar) return std::is_same<cl_uchar, T>::value;
+        if (in == kchar) return std::is_same<cl_char, T>::value;
+        if (in == kushort) return std::is_same<cl_ushort, T>::value;
+        if (in == kshort) return std::is_same<cl_short, T>::value;
+        if (in == kuint) return std::is_same<cl_uint, T>::value;
+        if (in == kint) return std::is_same<cl_int, T>::value;
+        if (in == kfloat) return std::is_same<cl_float, T>::value;
+        if (in == kdouble) return std::is_same<cl_double, T>::value;
+        if (in == kulong) return std::is_same<cl_ulong, T>::value;
+        if (in == klong) return std::is_same<cl_long, T>::value;
+        return false;
+    }
+};
+
+
+// Helper structures to iterate over all tuple attributes of different types
+struct IterOverTypes : public TestType
+{
+    IterOverTypes(const TypeIter &typeIter, ConversionsTest &test)
+        : inType((Type)0), outType((Type)0), typeIter(typeIter), test(test),
+          testNumber(-1)
+    {} // both type tags start at enumerator 0; testNumber starts at -1
+
+    void Run() { for_each_out_elem(typeIter); } // starts at tuple slot 0
+
+protected:
+    ////////////////////////////////////////////////////////////////////////////////////////
+
+    template <std::size_t Out = 0, typename OutType>
+    void iterate_out_type(const OutType &t)
+    {
+        for_each_in_elem<0, Out, OutType>(typeIter);
+        outType = (Type)(outType + 1);
+        inType = (Type)0;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+
+    template <std::size_t In, std::size_t Out, typename OutType,
+              typename InType>
+    void iterate_in_type(const InType &t)
+    {
+        if (!testType<InType>(inType)) vlog_error("Unexpected data type!\n");
+
+        if (!testType<OutType>(outType)) vlog_error("Unexpected data type!\n");
+
+        // run the conversions
+        test.TestTypesConversion<InType, OutType, isTypeFp[In], isTypeFp[Out]>(
+            inType, outType, testNumber);
+        inType = (Type)(inType + 1);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+
+    template <std::size_t Out = 0, typename... Tp>
+    inline typename std::enable_if<Out == sizeof...(Tp), void>::type
+    for_each_out_elem(
+        const std::tuple<Tp...> &) // Unused arguments are given no names.
+    {}
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+
+    template <std::size_t Out = 0, typename... Tp>
+        inline typename std::enable_if < Out<sizeof...(Tp), void>::type
+        for_each_out_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_out_type<Out>(std::get<Out>(t));
+        for_each_out_elem<Out + 1, Tp...>(t);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+
+    template <std::size_t In = 0, std::size_t Out, typename OutType,
+              typename... Tp>
+    inline typename std::enable_if<In == sizeof...(Tp), void>::type
+    for_each_in_elem(
+        const std::tuple<Tp...> &) // Unused arguments are given no names.
+    {}
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+
+    template <std::size_t In = 0, std::size_t Out, typename OutType,
+              typename... Tp>
+        inline typename std::enable_if < In<sizeof...(Tp), void>::type
+        for_each_in_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_in_type<In, Out, OutType>(std::get<In>(t));
+        for_each_in_elem<In + 1, Out, OutType, Tp...>(t);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+
+protected:
+    Type inType;
+    Type outType;
+    const TypeIter &typeIter;
+    ConversionsTest &test;
+    int testNumber;
+};
+
+
+// Helper that runs the conversion test only for one selected in/out type pair
+struct IterOverSelectedTypes : public TestType
+{
+    IterOverSelectedTypes(const TypeIter &typeIter, ConversionsTest &test,
+                          const Type &in, const Type &out)
+        : inType(in), outType(out), typeIter(typeIter), test(test),
+          testNumber(-1)
+    {}
+
+    void Run() { for_each_out_elem(typeIter); } // entry point: scan all pairs
+
+protected:
+    ////////////////////////////////////////////////////////////////////////////////////////
+    // For one output type: scan every input type; the selected tags are never modified.
+    template <std::size_t Out = 0, typename OutType>
+    void iterate_out_type(const OutType &t)
+    {
+        for_each_in_elem<0, Out, OutType>(typeIter);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+    // Runs the test only when InType/OutType match the selected tags; otherwise does nothing.
+    template <std::size_t In, std::size_t Out, typename OutType,
+              typename InType>
+    void iterate_in_type(const InType &t)
+    {
+        if (testType<InType>(inType) && testType<OutType>(outType))
+        {
+            // run the conversions
+            test.TestTypesConversion<InType, OutType, isTypeFp[In],
+                                     isTypeFp[Out]>(inType, outType,
+                                                    testNumber);
+        }
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+    // End of the output scan: selected when Out equals the tuple size; does nothing.
+    template <std::size_t Out = 0, typename... Tp>
+    inline typename std::enable_if<Out == sizeof...(Tp), void>::type
+    for_each_out_elem(const std::tuple<Tp...> &)
+    {}
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+    // Output scan step: handles tuple element Out, then recurses with Out + 1.
+    template <std::size_t Out = 0, typename... Tp>
+        inline typename std::enable_if < Out<sizeof...(Tp), void>::type
+        for_each_out_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_out_type<Out>(std::get<Out>(t));
+        for_each_out_elem<Out + 1, Tp...>(t);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+    // End of the input scan: selected when In equals the tuple size; does nothing.
+    template <std::size_t In = 0, std::size_t Out, typename OutType,
+              typename... Tp>
+    inline typename std::enable_if<In == sizeof...(Tp), void>::type
+    for_each_in_elem(const std::tuple<Tp...> &)
+    {}
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+    // Input scan step: handles tuple element In for a fixed Out, then recurses with In + 1.
+    template <std::size_t In = 0, std::size_t Out, typename OutType,
+              typename... Tp>
+        inline typename std::enable_if < In<sizeof...(Tp), void>::type
+        for_each_in_elem(const std::tuple<Tp...> &t)
+    {
+        iterate_in_type<In, Out, OutType>(std::get<In>(t));
+        for_each_in_elem<In + 1, Out, OutType, Tp...>(t);
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////////////
+    // State below is fixed at construction and read by every step.
+protected:
+    Type inType; // selected input tag
+    Type outType; // selected output tag
+    const TypeIter &typeIter; // tuple whose element types are scanned
+    ConversionsTest &test; // fixture that executes the conversion test
+    int testNumber; // initialised to -1 and passed to TestTypesConversion
+};
+
+
 #endif /* BASIC_TEST_CONVERSIONS_H */
 
diff --git a/test_conformance/conversions/conversions_data_info.h b/test_conformance/conversions/conversions_data_info.h
new file mode 100644
index 000000000..a4e9c9689
--- /dev/null
+++ b/test_conformance/conversions/conversions_data_info.h
@@ -0,0 +1,812 @@
+//
+// Copyright (c) 2023 The Khronos Group Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef CONVERSIONS_DATA_INFO_H
+#define CONVERSIONS_DATA_INFO_H
+
+#if defined(__APPLE__)
+#include <OpenCL/opencl.h>
+#else
+#include <CL/opencl.h>
+#endif
+
+#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
+#include "fplib.h"
+extern bool qcom_sat;
+extern roundingMode qcom_rm;
+#endif
+
+#include "harness/mt19937.h"
+#include "harness/rounding_mode.h"
+#include "harness/typeWrappers.h"
+
+#include <vector>
+
+#if defined(__linux__)
+#include <sys/param.h>
+#include <libgen.h>
+#endif
+
+extern size_t gTypeSizes[kTypeCount];
+extern void *gIn;
+
+// Saturation selector; kSaturationModeCount is the number of real modes.
+enum SaturationMode
+{
+    kUnsaturated = 0,
+    kSaturated = 1,
+
+    kSaturationModeCount = 2
+};
+
+// Plain parameter bundle describing the input data of one conversion job.
+struct DataInitInfo
+{
+    cl_ulong start; // first value of the generated input sequence (see init())
+    cl_uint size; // number of elements init() generates per job
+    Type outType;
+    Type inType; // indexes gTypeSizes when init() locates the job's slice of gIn
+    SaturationMode sat;
+    RoundingMode round;
+    cl_uint threads;
+
+
+    static std::vector<uint32_t> specialValuesUInt; // init() copies these over the tail of int/uint input when start == 0
+    static std::vector<float> specialValuesFloat;
+    static std::vector<double> specialValuesDouble;
+};
+
+struct DataInitBase : public DataInitInfo // polymorphic base; every hook is a no-op
+{
+    DataInitBase(const DataInitInfo &agg): DataInitInfo(agg) {}
+    virtual ~DataInitBase() = default; // derived objects may be deleted via DataInitBase*
+    virtual void conv_array(void *out, void *in, size_t n) {} // plain conversion of n items
+    virtual void conv_array_sat(void *out, void *in, size_t n) {} // saturating variant
+    virtual void init(const cl_uint &, const cl_uint &) {} // input data generation hook
+};
+
+// Reference converter for one InType -> OutType combination.
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+struct DataInfoSpec : public DataInitBase
+{
+
+    DataInfoSpec(const DataInitInfo &agg);
+
+    // scalar rounding helpers used by the reference conversions
+    float round_to_int(float f);
+    long long round_to_int_and_clamp(double d);
+
+    OutType absolute(const OutType &x);
+
+    // single-element reference conversions (plain and saturating)
+    void conv(OutType *out, InType *in);
+    void conv_sat(OutType *out, InType *in);
+
+    // pair of limits assigned per OutType by the constructor
+    std::pair<OutType, OutType> ranges;
+
+    // table of input clamping bounds, one pair per rounding type
+    std::vector<std::pair<InType, InType>> clamp_ranges;
+
+    std::vector<MTdataHolder> mdv; // generator states; init() indexes by thread id
+
+    // Applies conv() to each of the n elements of in, writing into out.
+    void conv_array(void *out, void *in, size_t n) override
+    {
+        OutType *const dst = (OutType *)out;
+        for (size_t k = 0; k != n; ++k) conv(dst + k, (InType *)in + k);
+    }
+
+    // Applies conv_sat() to each of the n elements of in, writing into out.
+    void conv_array_sat(void *out, void *in, size_t n) override
+    {
+        OutType *const dst = (OutType *)out;
+        for (size_t k = 0; k != n; ++k) conv_sat(dst + k, (InType *)in + k);
+    }
+
+    // Declared here, defined out of line.
+    void init(const cl_uint &, const cl_uint &) override;
+    InType clamp(const InType &);
+    // Raises v to at least lo, then returns v if it is below hi, else hi.
+    inline float fclamp(float lo, float v, float hi)
+    {
+        if (v < lo) v = lo;
+        return (v < hi) ? v : hi;
+    }
+    // Double-precision counterpart of fclamp with the same comparisons.
+    inline double dclamp(double lo, double v, double hi)
+    {
+        if (v < lo) v = lo;
+        return (v < hi) ? v : hi;
+    }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////
+// Picks the OutType limits and, for floating-point input to integer output, the clamp bounds per rounding type.
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+DataInfoSpec<InType, OutType, InFP, OutFP>::DataInfoSpec(
+    const DataInitInfo &agg)
+    : DataInitBase(agg), mdv(0)
+{
+    if (std::is_same<cl_float, OutType>::value)
+        ranges = std::make_pair(CL_FLT_MIN, CL_FLT_MAX);
+    else if (std::is_same<cl_double, OutType>::value)
+        ranges = std::make_pair(CL_DBL_MIN, CL_DBL_MAX);
+    else if (std::is_same<cl_uchar, OutType>::value)
+        ranges = std::make_pair(0, CL_UCHAR_MAX);
+    else if (std::is_same<cl_char, OutType>::value)
+        ranges = std::make_pair(CL_CHAR_MIN, CL_CHAR_MAX);
+    else if (std::is_same<cl_ushort, OutType>::value)
+        ranges = std::make_pair(0, CL_USHRT_MAX);
+    else if (std::is_same<cl_short, OutType>::value)
+        ranges = std::make_pair(CL_SHRT_MIN, CL_SHRT_MAX);
+    else if (std::is_same<cl_uint, OutType>::value)
+        ranges = std::make_pair(0, CL_UINT_MAX);
+    else if (std::is_same<cl_int, OutType>::value)
+        ranges = std::make_pair(CL_INT_MIN, CL_INT_MAX);
+    else if (std::is_same<cl_ulong, OutType>::value)
+        ranges = std::make_pair(0, CL_ULONG_MAX);
+    else if (std::is_same<cl_long, OutType>::value)
+        ranges = std::make_pair(CL_LONG_MIN, CL_LONG_MAX);
+
+    InType outMin = ((InType)ranges.first); // OutType limits expressed in InType
+    InType outMax = ((InType)ranges.second);
+
+    // clang-format off
+    // for readability sake keep this section unformatted
+    if (std::is_floating_point<InType>::value)
+    { // from float/double
+        InType eps = std::is_same<InType, cl_float>::value ? (InType) FLT_EPSILON : (InType) DBL_EPSILON;
+        if (std::is_integral<OutType>::value)
+        { // to char/uchar/short/ushort/int/uint/long/ulong
+            if (sizeof(OutType)<=sizeof(cl_short))
+            { // to char/uchar/short/ushort
+                clamp_ranges=
+                {{outMin-0.5f, outMax + 0.5f - outMax * 0.5f * eps},
+                  {outMin-0.5f, outMax + 0.5f - outMax * 0.5f * eps},
+                  {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, outMax-1.f},
+                  {outMin-0.0f, outMax - outMax * 0.5f * eps },
+                  {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, outMax - outMax * 0.5f * eps}};
+            }
+            else if (std::is_same<InType, cl_float>::value)
+            { // from float
+                if (std::is_same<OutType, cl_uint>::value)
+                { // to uint
+                    clamp_ranges=
+                    { {outMin-0.5f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)},
+                      {outMin-0.5f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)},
+                      {outMin-1.0f+0.5f*eps, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)},
+                      {outMin-0.0f, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7) },
+                      {outMin-1.0f+0.5f*eps, MAKE_HEX_FLOAT(0x1.fffffep31f, 0x1fffffeL, 7)}};
+                }
+                else if (std::is_same<OutType, cl_int>::value)
+                { // to int
+                    clamp_ranges=
+                    { {outMin, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6)},
+                      {outMin, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6)},
+                      {outMin, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6)},
+                      {outMin, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6) },
+                      {outMin, MAKE_HEX_FLOAT(0x1.fffffep30f, 0x1fffffeL, 6)}};
+                }
+                else if (std::is_same<OutType, cl_ulong>::value)
+                { // to ulong
+                    clamp_ranges=
+                    {{outMin-0.5f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39)},
+                      {outMin-0.5f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39)},
+                      {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39)},
+                      {outMin-0.0f, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39) },
+                      {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, MAKE_HEX_FLOAT(0x1.fffffep63f, 0x1fffffeL, 39)}};
+                }
+                else if (std::is_same<OutType, cl_long>::value)
+                { // to long
+                    clamp_ranges=
+                    { {MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38)},
+                      {MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38)},
+                      {MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38)},
+                      {MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38)},
+                      {MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63), MAKE_HEX_FLOAT(0x1.fffffep62f, 0x1fffffeL, 38)}};
+                }
+            }
+            else
+            { // from double
+                if (std::is_same<OutType, cl_uint>::value)
+                { // to uint
+                    clamp_ranges=
+                    { {outMin-0.5f, outMax + 0.5 - MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31) * eps},
+                      {outMin-0.5f, outMax + 0.5 - MAKE_HEX_DOUBLE(0x1.0p31, 0x1LL, 31) * eps},
+                      {outMin-1.0f+0.5f*eps, outMax},
+                      {outMin-0.0f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp31, 0x1fffffffffffffLL, -21) },
+                      {outMin-1.0f+0.5f*eps, MAKE_HEX_DOUBLE(0x1.fffffffffffffp31, 0x1fffffffffffffLL, -21)}};
+                }
+                else if (std::is_same<OutType, cl_int>::value)
+                { // to int
+                    clamp_ranges=
+                    { {outMin-0.5f, outMax + 0.5 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * eps},
+                      {outMin-0.5f, outMax + 0.5 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * eps},
+                      {outMin-1.0f+outMax*eps, outMax},
+                      {outMin-0.0f, outMax + 1.0 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * eps },
+                      {outMin-1.0f+outMax*eps, outMax + 1.0 - MAKE_HEX_DOUBLE(0x1.0p30, 0x1LL, 30) * eps}};
+                }
+                else if (std::is_same<OutType, cl_ulong>::value)
+                { // to ulong
+                    clamp_ranges=
+                    {{outMin-0.5f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11)},
+                      {outMin-0.5f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11)},
+                      {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11)},
+                      {outMin-0.0f, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11) },
+                      {outMin-1.0f+(std::is_signed<OutType>::value?outMax:0.5f)*eps, MAKE_HEX_DOUBLE(0x1.fffffffffffffp63, 0x1fffffffffffffLL, 11)}};
+                }
+                else if (std::is_same<OutType, cl_long>::value)
+                { // to long
+                    clamp_ranges=
+                    { {MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10)},
+                      {MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10)},
+                      {MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10)},
+                      {MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10)},
+                      {MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63), MAKE_HEX_DOUBLE(0x1.fffffffffffffp62, 0x1fffffffffffffLL, 10)}};
+                }
+            }
+        }
+    }
+    // clang-format on
+}
+
+////////////////////////////////////////////////////////////////////////////////////////
+// Rounds f to an integral float by adding then subtracting a sign-matched 2^23; |f| >= 2^23 is returned as is.
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+float DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int(float f)
+{
+    static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23),
+                                    -MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23) };
+
+    // Round fractional values to integer in round towards nearest mode
+    if (fabsf(f) < MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23))
+    {
+        volatile float x = f;
+        float magicVal = magic[f < 0]; // +2^23 when f >= 0, -2^23 when f < 0
+
+#if defined(__SSE__)
+        // Defeat x87 based arithmetic, which can't do FTZ, and will round this
+        // incorrectly
+        __m128 v = _mm_set_ss(x);
+        __m128 m = _mm_set_ss(magicVal);
+        v = _mm_add_ss(v, m);
+        v = _mm_sub_ss(v, m);
+        _mm_store_ss((float *)&x, v);
+#else
+        x += magicVal;
+        x -= magicVal;
+#endif
+        f = x;
+    }
+    return f;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////
+// Saturates f to the long long range; otherwise rounds it with a sign-matched 2^52 add/subtract and casts.
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+long long
+DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int_and_clamp(double f)
+{
+    static const double magic[2] = { MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52),
+                                     MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52) };
+
+    if (f >= -(double)LLONG_MIN) return LLONG_MAX; // too large: saturate high
+
+    if (f <= (double)LLONG_MIN) return LLONG_MIN; // too small: saturate low
+
+    // Round fractional values to integer in round towards nearest mode
+    if (fabs(f) < MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52))
+    {
+        volatile double x = f;
+        double magicVal = magic[f < 0]; // +2^52 when f >= 0, -2^52 when f < 0
+#if defined(__SSE2__) || defined(_MSC_VER)
+        // Defeat x87 based arithmetic, which can't do FTZ, and will round this
+        // incorrectly
+        __m128d v = _mm_set_sd(x);
+        __m128d m = _mm_set_sd(magicVal);
+        v = _mm_add_sd(v, m);
+        v = _mm_sub_sd(v, m);
+        _mm_store_sd((double *)&x, v);
+#else
+        x += magicVal;
+        x -= magicVal;
+#endif
+        f = x;
+    }
+    return (long long)f;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////
+// Returns x with its sign bit cleared for float/double OutType; logs an error for other types.
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+OutType DataInfoSpec<InType, OutType, InFP, OutFP>::absolute(const OutType &x)
+{
+    union {
+        cl_uint u32; // bit view used when OutType is float
+        cl_ulong u64; // bit view for double: a 32-bit member cannot hold the 64-bit mask
+        OutType f;
+    } u;
+    u.f = x;
+    if (std::is_same<OutType, float>::value)
+        u.u32 &= 0x7fffffff;
+    else if (std::is_same<OutType, double>::value)
+        u.u64 &= 0x7fffffffffffffffULL;
+    else
+        log_error("Unexpected argument type of DataInfoSpec::absolute");
+    return u.f;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////
+// Non-saturating reference conversion of one element *in into *out; the branch depends on InType/OutType.
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
+{
+    if (std::is_same<cl_float, InType>::value)
+    {
+        cl_float inVal = *in;
+
+        if (std::is_floating_point<OutType>::value)
+        {
+            *out = (OutType)inVal;
+        }
+        else if (std::is_same<cl_ulong, OutType>::value)
+        {
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+            // VS2005 (at least) on x86 uses fistp to store the float as a
+            // 64-bit int. However, fistp stores it as a signed int, and some of
+            // the test values won't fit into a signed int. (These test values
+            // are >= 2^63.) The result on VS2005 is that these end up silently
+            // (at least by default settings) clamped to the max lowest ulong.
+            cl_float x = round_to_int(inVal);
+            if (x >= 9223372036854775808.0f)
+            {
+                x -= 9223372036854775808.0f;
+                ((cl_ulong *)out)[0] = x;
+                ((cl_ulong *)out)[0] += 9223372036854775808ULL;
+            }
+            else
+            {
+                ((cl_ulong *)out)[0] = x;
+            }
+#else
+            *out = round_to_int(inVal);
+#endif
+        }
+        else if (std::is_same<cl_long, OutType>::value)
+        {
+            *out = round_to_int_and_clamp(inVal);
+        }
+        else
+            *out = round_to_int(inVal);
+    }
+    else if (std::is_same<cl_double, InType>::value)
+    {
+        if (std::is_same<cl_float, OutType>::value)
+            *out = (OutType)*in;
+        else
+            *out = rint(*in);
+    }
+    else if (std::is_same<cl_ulong, InType>::value
+             || std::is_same<cl_long, InType>::value)
+    {
+        if (std::is_same<cl_double, OutType>::value)
+        {
+#if defined(_MSC_VER)
+            cl_ulong l = ((cl_ulong *)in)[0];
+            double result;
+
+            if (std::is_same<cl_ulong, InType>::value)
+            {
+                cl_long sl = ((cl_long)l < 0) ? (cl_long)((l >> 1) | (l & 1))
+                                              : (cl_long)l;
+#if defined(_M_X64)
+                _mm_store_sd(&result, _mm_cvtsi64_sd(_mm_setzero_pd(), sl));
+#else
+                result = sl;
+#endif
+                ((double *)out)[0] =
+                    (l == 0 ? 0.0 : (((cl_long)l < 0) ? result * 2.0 : result));
+            }
+            else
+            {
+                _mm_store_sd(&result, _mm_cvtsi64_sd(_mm_setzero_pd(), l));
+                ((double *)out)[0] =
+                    (l == 0 ? 0.0 : result); // Per IEEE-754-2008 5.4.1, 0's
+                                             // always convert to +0.0
+            }
+#else
+            *out = (*in == 0 ? 0.0 : (OutType)*in);
+#endif
+        }
+        else if (std::is_same<cl_float, OutType>::value)
+        {
+            cl_float outVal = 0.f;
+
+#if defined(_MSC_VER) && defined(_M_X64)
+            cl_ulong l = ((cl_ulong *)in)[0];
+            float result;
+            if (std::is_same<cl_ulong, InType>::value)
+            {
+                cl_long sl = ((cl_long)l < 0) ? (cl_long)((l >> 1) | (l & 1))
+                                              : (cl_long)l;
+                _mm_store_ss(&result, _mm_cvtsi64_ss(_mm_setzero_ps(), sl));
+                outVal = (l == 0 ? 0.0f
+                                 : (((cl_long)l < 0) ? result * 2.0f : result));
+            }
+            else
+            {
+                _mm_store_ss(&result, _mm_cvtsi64_ss(_mm_setzero_ps(), l));
+                outVal = (l == 0 ? 0.0f : result); // Per IEEE-754-2008 5.4.1,
+                                                   // 0's always convert to +0.0
+            }
+#else
+            InType l = ((InType *)in)[0];
+#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
+            /* ARM VFP doesn't have hardware instruction for converting from
+             * 64-bit integer to float types, hence GCC ARM uses the
+             * floating-point emulation code despite which -mfloat-abi setting
+             * it is. But the emulation code in libgcc.a has only one rounding
+             * mode (round to nearest even in this case) and ignores the user
+             * rounding mode setting in hardware. As a result setting rounding
+             * modes in hardware won't give correct rounding results for type
+             * convert from 64-bit integer to float using GCC for ARM compiler so
+             * for testing different rounding modes, we need to use alternative
+             * reference function. ARM64 does have an instruction, however we
+             * cannot guarantee the compiler will use it.  On all ARM
+             * architectures use emulation to calculate reference.*/
+            if (std::is_same<cl_ulong, InType>::value)
+                outVal = qcom_u64_2_f32(l, qcom_sat, qcom_rm);
+            else
+                outVal = (l == 0 ? 0.0f : qcom_s64_2_f32(l, qcom_sat, qcom_rm));
+#else
+            outVal = (l == 0 ? 0.0f : (float)l); // Per IEEE-754-2008 5.4.1, 0's
+                                                 // always convert to +0.0
+#endif
+#endif
+
+            *out = outVal;
+        }
+        else
+        {
+            *out = (OutType)*in;
+        }
+    }
+    else
+    {
+        if (std::is_same<cl_float, OutType>::value)
+            *out = (*in == 0 ? 0.f : *in); // Per IEEE-754-2008 5.4.1, 0's
+                                           // always convert to +0.0
+        else if (std::is_same<cl_double, OutType>::value)
+            *out = (*in == 0 ? 0.0 : *in);
+        else
+            *out = (OutType)*in;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////
+// Limits _x to [_lo, _hi]; arguments are pasted textually, so _x can be evaluated more than once.
+#define CLAMP(_lo, _x, _hi)                                                    \
+    ((_x) < (_lo) ? (_lo) : ((_x) > (_hi) ? (_hi) : (_x)))
+
+////////////////////////////////////////////////////////////////////////////////////////
+// Saturating reference conversion of one element: for integer OutType, out-of-range input clamps to its limits.
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+void DataInfoSpec<InType, OutType, InFP, OutFP>::conv_sat(OutType *out,
+                                                          InType *in)
+{
+    if (std::is_floating_point<InType>::value)
+    {
+        if (std::is_floating_point<OutType>::value)
+        { // in float/double, out float/double
+            *out = (OutType)(*in);
+        }
+        else if ((std::is_same<InType, cl_float>::value)
+                 && std::is_same<cl_ulong, OutType>::value)
+        {
+            cl_float x = round_to_int(*in);
+
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+            // VS2005 (at least) on x86 uses fistp to store the float as a
+            // 64-bit int. However, fistp stores it as a signed int, and some of
+            // the test values won't fit into a signed int. (These test values
+            // are >= 2^63.) The result on VS2005 is that these end up silently
+            // (at least by default settings) clamped to the max lowest ulong.
+            if (x >= 18446744073709551616.0f)
+            { // 2^64
+                *out = 0xFFFFFFFFFFFFFFFFULL;
+            }
+            else if (x < 0)
+            {
+                *out = 0;
+            }
+            else if (x >= 9223372036854775808.0f)
+            { // 2^63
+                x -= 9223372036854775808.0f;
+                *out = x;
+                *out += 9223372036854775808ULL;
+            }
+            else
+            {
+                *out = x;
+            }
+#else
+            *out = x >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64)
+                ? 0xFFFFFFFFFFFFFFFFULL
+                : x < 0 ? 0 : (OutType)x;
+#endif
+        }
+        else if ((std::is_same<InType, cl_float>::value)
+                 && std::is_same<cl_long, OutType>::value)
+        {
+            cl_float f = round_to_int(*in);
+            *out = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63)
+                ? 0x7FFFFFFFFFFFFFFFULL
+                : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63)
+                    ? 0x8000000000000000LL
+                    : (OutType)f;
+        }
+        else if (std::is_same<InType, cl_double>::value
+                 && std::is_same<cl_ulong, OutType>::value)
+        {
+            InType f = rint(*in);
+            *out = f >= MAKE_HEX_DOUBLE(0x1.0p64, 0x1LL, 64)
+                ? 0xFFFFFFFFFFFFFFFFULL
+                : f < 0 ? 0 : (OutType)f;
+        }
+        else if (std::is_same<InType, cl_double>::value
+                 && std::is_same<cl_long, OutType>::value)
+        {
+            InType f = rint(*in);
+            *out = f >= MAKE_HEX_DOUBLE(0x1.0p63, 0x1LL, 63)
+                ? 0x7FFFFFFFFFFFFFFFULL
+                : f < MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63)
+                    ? 0x8000000000000000LL
+                    : (OutType)f;
+        }
+        else
+        { // in float/double, out char/uchar/short/ushort/int/uint
+            *out =
+                CLAMP(ranges.first, round_to_int_and_clamp(*in), ranges.second);
+        }
+    }
+    else if (std::is_integral<InType>::value
+             && std::is_integral<OutType>::value)
+    {
+        {
+            if ((std::is_signed<InType>::value
+                 && std::is_signed<OutType>::value)
+                || (!std::is_signed<InType>::value
+                    && !std::is_signed<OutType>::value))
+            { // same signedness: only a narrowing conversion needs clamping
+                if (sizeof(InType) <= sizeof(OutType))
+                {
+                    *out = (OutType)*in;
+                }
+                else
+                {
+                    *out = CLAMP(ranges.first, *in, ranges.second);
+                }
+            }
+            else
+            { // mixed signed/unsigned types
+                if (sizeof(InType) < sizeof(OutType))
+                {
+                    *out = (!std::is_signed<InType>::value)
+                        ? (OutType)*in
+                        : CLAMP(0, *in, ranges.second); // *in < 0 ? 0 : *in
+                }
+                else
+                { // bigger/equal mixed signed/unsigned types - always clamp
+                    *out = CLAMP(0, *in, ranges.second);
+                }
+            }
+        }
+    }
+    else
+    { // InType integral, OutType floating
+        *out = std::is_signed<InType>::value ? (OutType)*in
+                                             : absolute((OutType)*in);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+void DataInfoSpec<InType, OutType, InFP, OutFP>::init(const cl_uint &job_id,
+                                                      const cl_uint &thread_id)
+{ // Fill this job's slice of gIn with InType test inputs; pattern depends on InType
+    uint64_t ulStart = start; // NOTE(review): removed InitData passed start + job_id * size; confirm no per-job offset is needed here
+    void *pIn = (char *)gIn + job_id * size * gTypeSizes[inType]; // start of this job's slice
+
+    if (std::is_integral<InType>::value)
+    {
+        InType *o = (InType *)pIn;
+        if (sizeof(InType) <= sizeof(cl_short))
+        { // char/uchar/ushort/short: consecutive values from ulStart, converted to InType
+            for (int i = 0; i < size; i++) o[i] = ulStart++;
+        }
+        else if (sizeof(InType) <= sizeof(cl_int))
+        { // int/uint: random when gIsEmbedded is set, otherwise consecutive from ulStart
+            int i = 0;
+            if (gIsEmbedded)
+                for (i = 0; i < size; i++)
+                    o[i] = (InType)genrand_int32(mdv[thread_id]);
+            else
+                for (i = 0; i < size; i++) o[i] = (InType)i + ulStart;
+
+            if (0 == ulStart) // start == 0: overwrite the tail of the slice with special values
+            {
+                size_t tableSize = specialValuesUInt.size()
+                    * sizeof(decltype(specialValuesUInt)::value_type);
+                if (sizeof(InType) * size < tableSize) // never copy more than the slice holds
+                    tableSize = sizeof(InType) * size;
+                memcpy((char *)(o + i) - tableSize, &specialValuesUInt.front(),
+                       tableSize); // i == size here, so the copy ends at the slice end
+            }
+        }
+        else
+        { // long/ulong: fixed bit patterns when start == 0, then random 64-bit fill
+            cl_ulong *o = (cl_ulong *)pIn; // shadows the outer o; values are written as raw cl_ulong bits
+            cl_ulong i, j, k;
+
+            i = 0; // next free slot; every loop below also stops once i reaches size
+            if (ulStart == 0)
+            {
+                // Single set bit: 1 << j for each bit position j
+                for (j = 0; j < (cl_ulong)size && j < 8 * sizeof(cl_ulong); j++)
+                    o[j] = (cl_ulong)1 << j;
+                i = j;
+
+                // Complement of those: every bit set except bit j
+                for (j = 0; i < (cl_ulong)size && j < 8 * sizeof(cl_ulong); j++)
+                    o[i++] = ~((cl_ulong)1 << j);
+
+                // Try various negative powers of two
+                for (j = 0; i < (cl_ulong)size && j < 8 * sizeof(cl_ulong); j++)
+                    o[i++] = (cl_ulong)0xFFFFFFFFFFFFFFFEULL << j;
+
+                // try various powers of two plus 1, shifted by various amounts
+                for (j = 0; i < (cl_ulong)size && j < 8 * sizeof(cl_ulong); j++)
+                    for (k = 0;
+                         i < (cl_ulong)size && k < 8 * sizeof(cl_ulong) - j;
+                         k++)
+                        o[i++] = (((cl_ulong)1 << j) + 1) << k;
+
+                // try various powers of two minus 1, shifted by various amounts
+                for (j = 0; i < (cl_ulong)size && j < 8 * sizeof(cl_ulong); j++)
+                    for (k = 0;
+                         i < (cl_ulong)size && k < 8 * sizeof(cl_ulong) - j;
+                         k++)
+                        o[i++] = (((cl_ulong)1 << j) - 1) << k;
+
+                // Other patterns: each pattern combined with each mask and its complement
+                cl_ulong pattern[] = {
+                    0x3333333333333333ULL, 0x5555555555555555ULL,
+                    0x9999999999999999ULL, 0x6666666666666666ULL,
+                    0xccccccccccccccccULL, 0xaaaaaaaaaaaaaaaaULL
+                };
+                cl_ulong mask[] = { 0xffffffffffffffffULL,
+                                    0xff00ff00ff00ff00ULL,
+                                    0xffff0000ffff0000ULL,
+                                    0xffffffff00000000ULL };
+                for (j = 0; i < (cl_ulong)size
+                     && j < sizeof(pattern) / sizeof(pattern[0]);
+                     j++)
+                    for (k = 0; i + 2 <= (cl_ulong)size
+                         && k < sizeof(mask) / sizeof(mask[0]);
+                         k++)
+                    { // two writes per iteration, hence the i + 2 bound above
+                        o[i++] = pattern[j] & mask[k];
+                        o[i++] = pattern[j] & ~mask[k];
+                    }
+            }
+
+            auto &md = mdv[thread_id]; // generator state selected by thread_id
+            for (; i < (cl_ulong)size; i++) // remaining slots: two 32-bit draws per value
+                o[i] = (cl_ulong)genrand_int32(md)
+                    | ((cl_ulong)genrand_int32(md) << 32);
+        }
+    } // integrals
+    else if (std::is_same<InType, cl_float>::value)
+    { // float inputs are first written as raw 32-bit patterns through a cl_uint pointer
+        cl_uint *o = (cl_uint *)pIn;
+        int i;
+
+        if (gIsEmbedded)
+            for (i = 0; i < size; i++)
+                o[i] = (cl_uint)genrand_int32(mdv[thread_id]);
+        else
+            for (i = 0; i < size; i++) o[i] = (cl_uint)i + ulStart;
+
+        if (0 == ulStart) // start == 0: overwrite the tail of the slice with special values
+        {
+            size_t tableSize = specialValuesFloat.size()
+                * sizeof(decltype(specialValuesFloat)::value_type);
+            if (sizeof(InType) * size < tableSize) // never copy more than the slice holds
+                tableSize = sizeof(InType) * size;
+            memcpy((char *)(o + i) - tableSize, &specialValuesFloat.front(),
+                   tableSize);
+        }
+
+        if (kUnsaturated == sat) // clamp() is applied only for non-saturating conversions
+        {
+            InType *f = (InType *)pIn; // same buffer, now read back as InType values
+            for (i = 0; i < size; i++) f[i] = clamp(f[i]);
+        }
+    }
+    else if (std::is_same<InType, cl_double>::value)
+    { // double inputs are built from a 32-bit value spread across the 64-bit pattern
+        InType *o = (InType *)pIn;
+        int i = 0;
+
+        union { // reinterprets the assembled 64-bit pattern as an InType value
+            uint64_t u;
+            InType d;
+        } u;
+
+        for (i = 0; i < size; i++)
+        {
+            uint64_t z = i + ulStart;
+
+            uint32_t bits = ((uint32_t)z ^ (uint32_t)(z >> 32)); // fold z down to 32 bits
+            // split 0x89abcdef to 0x89abc00000000def
+            u.u = bits & 0xfffU;
+            u.u |= (uint64_t)(bits & ~0xfffU) << 32;
+            // sign extend the leading bit of def segment as sign bit so that
+            // the middle region consists of either all 1s or 0s
+            u.u -= (bits & 0x800U) << 1;
+            o[i] = u.d;
+        }
+
+        if (0 == ulStart) // start == 0: overwrite the tail of the slice with special values
+        {
+            size_t tableSize = specialValuesDouble.size()
+                * sizeof(decltype(specialValuesDouble)::value_type);
+            if (sizeof(InType) * size < tableSize) // never copy more than the slice holds
+                tableSize = sizeof(InType) * size;
+            memcpy((char *)(o + i) - tableSize, &specialValuesDouble.front(),
+                   tableSize);
+        }
+
+        if (0 == sat) // NOTE(review): presumably the same test as kUnsaturated == sat in the float branch - confirm
+            for (i = 0; i < size; i++) o[i] = clamp(o[i]);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename InType, typename OutType, bool InFP, bool OutFP>
+InType DataInfoSpec<InType, OutType, InFP, OutFP>::clamp(const InType &in)
+{ // Route float/double inputs through fclamp/dclamp when OutType is integral
+    if (std::is_integral<OutType>::value)
+    {
+        if (std::is_same<InType, cl_float>::value)
+        {
+            return fclamp(clamp_ranges[round].first, in,
+                          clamp_ranges[round].second); // bounds are selected by the rounding mode
+        }
+        else if (std::is_same<InType, cl_double>::value)
+        {
+            return dclamp(clamp_ranges[round].first, in,
+                          clamp_ranges[round].second); // bounds are selected by the rounding mode
+        }
+    }
+    return in; // every other InType/OutType combination passes through unchanged
+}
+
+////////////////////////////////////////////////////////////////////////////////////////
+
+#endif /* CONVERSIONS_DATA_INFO_H */
diff --git a/test_conformance/conversions/fplib.h b/test_conformance/conversions/fplib.h
index 534550a32..c69b1e891 100644
--- a/test_conformance/conversions/fplib.h
+++ b/test_conformance/conversions/fplib.h
@@ -13,6 +13,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
+#ifndef CONVERSIONS_FPLIB_H
+#define CONVERSIONS_FPLIB_H
+
 #include <stdbool.h>
 #include <stdint.h>
 
@@ -28,3 +31,5 @@ typedef enum
 
 float qcom_u64_2_f32(uint64_t data, bool sat, roundingMode rnd);
 float qcom_s64_2_f32(int64_t data, bool sat, roundingMode rnd);
+
+#endif /* CONVERSIONS_FPLIB_H */
diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp
index 2ee05463c..08fcdb473 100644
--- a/test_conformance/conversions/test_conversions.cpp
+++ b/test_conformance/conversions/test_conversions.cpp
@@ -13,12 +13,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/compat.h"
 #include "harness/rounding_mode.h"
 #include "harness/ThreadPool.h"
 #include "harness/testHarness.h"
-#include "harness/kernelHelpers.h"
 #include "harness/parseParameters.h"
+#include "harness/mt19937.h"
+
 #if defined(__APPLE__)
 #include <sys/sysctl.h>
 #endif
@@ -33,7 +33,6 @@
 #include <libgen.h>
 #endif
 
-#include "mingw_compat.h"
 #if defined(__MINGW32__)
 #include <sys/param.h>
 #endif
@@ -49,278 +48,75 @@
 #include <time.h>
 
 #include <algorithm>
+#include <type_traits>
+#include <vector>
 
 #include "Sleep.h"
-#include "basic_test_conversions.h"
 
-#if (defined(_WIN32) && defined(_MSC_VER))
-// need for _controlfp_s and rouinding modes in RoundingMode
-#include "harness/testHarness.h"
-#endif
-
-#pragma mark -
-#pragma mark globals
-
-#define BUFFER_SIZE (1024 * 1024)
-#define kPageSize 4096
-#define EMBEDDED_REDUCTION_FACTOR 16
-#define PERF_LOOP_COUNT 100
+#include "basic_test_conversions.h"
+#include <limits.h>
+#include <string.h>
 
-#define kCallStyleCount (kVectorSizeCount + 1 /* for implicit scalar */)
+#include "harness/mt19937.h"
 
 #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
 #include "fplib.h"
-extern bool qcom_sat;
-extern roundingMode qcom_rm;
 #endif
 
-const char **argList = NULL;
-int argCount = 0;
-cl_context gContext = NULL;
-cl_command_queue gQueue = NULL;
-char appName[64] = "ctest";
-int gStartTestNumber = -1;
-int gEndTestNumber = 0;
-#if defined(__APPLE__)
-int gTimeResults = 1;
-#else
-int gTimeResults = 0;
+#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
+/* Rounding modes and saturation for use with qcom 64 bit to float conversion
+ * library */
+bool qcom_sat;
+roundingMode qcom_rm;
 #endif
-int gReportAverageTimes = 0;
-void *gIn = NULL;
-void *gRef = NULL;
-void *gAllowZ = NULL;
-void *gOut[kCallStyleCount] = { NULL };
-cl_mem gInBuffer;
-cl_mem gOutBuffers[kCallStyleCount];
-size_t gComputeDevices = 0;
-uint32_t gDeviceFrequency = 0;
-int gWimpyMode = 0;
-int gWimpyReductionFactor = 128;
-int gSkipTesting = 0;
-int gForceFTZ = 0;
-int gMultithread = 1;
-int gIsRTZ = 0;
-uint32_t gSimdSize = 1;
-int gHasDouble = 0;
-int gTestDouble = 1;
-const char *sizeNames[] = { "", "", "2", "3", "4", "8", "16" };
-const int vectorSizes[] = { 1, 1, 2, 3, 4, 8, 16 };
-int gMinVectorSize = 0;
-int gMaxVectorSize = sizeof(vectorSizes) / sizeof(vectorSizes[0]);
-static MTdata gMTdata;
-
-#pragma mark -
-#pragma mark Declarations
+
 
 static int ParseArgs(int argc, const char **argv);
 static void PrintUsage(void);
 test_status InitCL(cl_device_id device);
-static int GetTestCase(const char *name, Type *outType, Type *inType,
-                       SaturationMode *sat, RoundingMode *round);
-static int DoTest(cl_device_id device, Type outType, Type inType,
-                  SaturationMode sat, RoundingMode round, MTdata d);
-static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
-                              RoundingMode round, int vectorSize,
-                              cl_kernel *outKernel);
-static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
-                     size_t blockCount);
-
-void *FlushToZero(void);
-void UnFlushToZero(void *);
-
-// Windows (since long double got deprecated) sets the x87 to 53-bit precision
-// (that's x87 default state).  This causes problems with the tests that
-// convert long and ulong to float and double or otherwise deal with values
-// that need more precision than 53-bit. So, set the x87 to 64-bit precision.
-static inline void Force64BitFPUPrecision(void)
-{
-#if __MINGW32__
-    // The usual method is to use _controlfp as follows:
-    //     #include <float.h>
-    //     _controlfp(_PC_64, _MCW_PC);
-    //
-    // _controlfp is available on MinGW32 but not on MinGW64. Instead of having
-    // divergent code just use inline assembly which works for both.
-    unsigned short int orig_cw = 0;
-    unsigned short int new_cw = 0;
-    __asm__ __volatile__("fstcw %0" : "=m"(orig_cw));
-    new_cw = orig_cw | 0x0300; // set precision to 64-bit
-    __asm__ __volatile__("fldcw  %0" ::"m"(new_cw));
-#else
-    /* Implement for other platforms if needed */
-#endif
-}
 
-int test_conversions(cl_device_id device, cl_context context,
-                     cl_command_queue queue, int num_elements)
-{
-    int error, i, testNumber = -1;
-    int startMinVectorSize = gMinVectorSize;
-    Type inType, outType;
-    RoundingMode round;
-    SaturationMode sat;
 
-    if (argCount)
-    {
-        for (i = 0; i < argCount; i++)
-        {
-            if (GetTestCase(argList[i], &outType, &inType, &sat, &round))
-            {
-                vlog_error("\n\t\t**** ERROR:  Unable to parse function name "
-                           "%s.  Skipping....  *****\n\n",
-                           argList[i]);
-                continue;
-            }
+const char *gTypeNames[kTypeCount] = { "uchar", "char", "ushort", "short",
+                                       "uint",  "int",  "float",  "double",
+                                       "ulong", "long" }; // OpenCL C type names, indexed by Type
 
-            // skip double if we don't have it
-            if (!gTestDouble && (inType == kdouble || outType == kdouble))
-            {
-                if (gHasDouble)
-                {
-                    vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
-                               gTypeNames[outType], gSaturationNames[sat],
-                               gRoundingModeNames[round], gTypeNames[inType]);
-                    vlog("\t\tcl_khr_fp64 enabled, but double testing turned "
-                         "off.\n");
-                }
+const char *gRoundingModeNames[kRoundingModeCount] = { "", "_rte", "_rtp",
+                                                       "_rtn", "_rtz" };
 
-                continue;
-            }
+const char *gSaturationNames[2] = { "", "_sat" };
 
-            // skip longs on embedded
-            if (!gHasLong
-                && (inType == klong || outType == klong || inType == kulong
-                    || outType == kulong))
-            {
-                continue;
-            }
+size_t gTypeSizes[kTypeCount] = { // host byte sizes, listed in the same order as gTypeNames
+    sizeof(cl_uchar), sizeof(cl_char), sizeof(cl_ushort), sizeof(cl_short),
+    sizeof(cl_uint),  sizeof(cl_int),  sizeof(cl_float),  sizeof(cl_double),
+    sizeof(cl_ulong), sizeof(cl_long),
+};
 
-            // Skip the implicit converts if the rounding mode is not default or
-            // test is saturated
-            if (0 == startMinVectorSize)
-            {
-                if (sat || round != kDefaultRoundingMode)
-                    gMinVectorSize = 1;
-                else
-                    gMinVectorSize = 0;
-            }
+char appName[64] = "ctest";
+int gMultithread = 1;
 
-            if ((error = DoTest(device, outType, inType, sat, round, gMTdata)))
-            {
-                vlog_error("\t *** convert_%sn%s%s( %sn ) FAILED ** \n",
-                           gTypeNames[outType], gSaturationNames[sat],
-                           gRoundingModeNames[round], gTypeNames[inType]);
-            }
-        }
+
+int test_conversions(cl_device_id device, cl_context context,
+                     cl_command_queue queue, int num_elements)
+{
+    if (argCount)
+    {
+        return MakeAndRunTest<CustomConversionsTest>(device, context, queue,
+                                                     num_elements);
     }
     else
     {
-        for (outType = (Type)0; outType < kTypeCount;
-             outType = (Type)(outType + 1))
-        {
-            for (inType = (Type)0; inType < kTypeCount;
-                 inType = (Type)(inType + 1))
-            {
-                // skip longs on embedded
-                if (!gHasLong
-                    && (inType == klong || outType == klong || inType == kulong
-                        || outType == kulong))
-                {
-                    continue;
-                }
-
-                for (sat = (SaturationMode)0; sat < kSaturationModeCount;
-                     sat = (SaturationMode)(sat + 1))
-                {
-                    // skip illegal saturated conversions to float type
-                    if (kSaturated == sat
-                        && (outType == kfloat || outType == kdouble))
-                    {
-                        continue;
-                    }
-
-                    for (round = (RoundingMode)0; round < kRoundingModeCount;
-                         round = (RoundingMode)(round + 1))
-                    {
-                        if (++testNumber < gStartTestNumber)
-                        {
-                            //     vlog( "%d) skipping convert_%sn%s%s( %sn
-                            //     )\n", testNumber, gTypeNames[ outType ],
-                            //     gSaturationNames[ sat ],
-                            //     gRoundingModeNames[round], gTypeNames[inType]
-                            //     );
-                            continue;
-                        }
-                        else
-                        {
-                            if (gEndTestNumber > 0
-                                && testNumber >= gEndTestNumber)
-                            {
-                                goto exit;
-                            }
-                        }
-
-                        vlog("%d) Testing convert_%sn%s%s( %sn ):\n",
-                             testNumber, gTypeNames[outType],
-                             gSaturationNames[sat], gRoundingModeNames[round],
-                             gTypeNames[inType]);
-
-                        // skip double if we don't have it
-                        if (!gTestDouble
-                            && (inType == kdouble || outType == kdouble))
-                        {
-                            if (gHasDouble)
-                            {
-                                vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
-                                           "FAILED ** \n",
-                                           testNumber, gTypeNames[outType],
-                                           gSaturationNames[sat],
-                                           gRoundingModeNames[round],
-                                           gTypeNames[inType]);
-                                vlog("\t\tcl_khr_fp64 enabled, but double "
-                                     "testing turned off.\n");
-                            }
-                            continue;
-                        }
-
-                        // Skip the implicit converts if the rounding mode is
-                        // not default or test is saturated
-                        if (0 == startMinVectorSize)
-                        {
-                            if (sat || round != kDefaultRoundingMode)
-                                gMinVectorSize = 1;
-                            else
-                                gMinVectorSize = 0;
-                        }
-
-                        if ((error = DoTest(device, outType, inType, sat, round,
-                                            gMTdata)))
-                        {
-                            vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
-                                       "FAILED ** \n",
-                                       testNumber, gTypeNames[outType],
-                                       gSaturationNames[sat],
-                                       gRoundingModeNames[round],
-                                       gTypeNames[inType]);
-                        }
-                    }
-                }
-            }
-        }
+        return MakeAndRunTest<ConversionsTest>(device, context, queue,
+                                               num_elements);
     }
-
-exit:
-    return gFailCount;
 }
 
+
 test_definition test_list[] = {
     ADD_TEST(conversions),
 };
 
 const int test_num = ARRAY_SIZE(test_list);
 
-#pragma mark -
 
 int main(int argc, const char **argv)
 {
@@ -378,8 +174,6 @@ int main(int argc, const char **argv)
     return ret;
 }
 
-#pragma mark -
-#pragma mark setup
 
 static int ParseArgs(int argc, const char **argv)
 {
@@ -509,7 +303,7 @@ static int ParseArgs(int argc, const char **argv)
         gWimpyMode = 1;
     }
 
-    vlog( "\n" );
+    vlog("\n");
 
     PrintArch();
 
@@ -526,6 +320,7 @@ static int ParseArgs(int argc, const char **argv)
     return 0;
 }
 
+
 static void PrintUsage(void)
 {
     int i;
@@ -564,63 +359,6 @@ static void PrintUsage(void)
 }
 
 
-static int GetTestCase(const char *name, Type *outType, Type *inType,
-                       SaturationMode *sat, RoundingMode *round)
-{
-    int i;
-
-    // Find the return type
-    for (i = 0; i < kTypeCount; i++)
-        if (name == strstr(name, gTypeNames[i]))
-        {
-            *outType = (Type)i;
-            name += strlen(gTypeNames[i]);
-
-            break;
-        }
-
-    if (i == kTypeCount) return -1;
-
-    // Check to see if _sat appears next
-    *sat = (SaturationMode)0;
-    for (i = 1; i < kSaturationModeCount; i++)
-        if (name == strstr(name, gSaturationNames[i]))
-        {
-            *sat = (SaturationMode)i;
-            name += strlen(gSaturationNames[i]);
-            break;
-        }
-
-    *round = (RoundingMode)0;
-    for (i = 1; i < kRoundingModeCount; i++)
-        if (name == strstr(name, gRoundingModeNames[i]))
-        {
-            *round = (RoundingMode)i;
-            name += strlen(gRoundingModeNames[i]);
-            break;
-        }
-
-    if (*name != '_') return -2;
-    name++;
-
-    for (i = 0; i < kTypeCount; i++)
-        if (name == strstr(name, gTypeNames[i]))
-        {
-            *inType = (Type)i;
-            name += strlen(gTypeNames[i]);
-
-            break;
-        }
-
-    if (i == kTypeCount) return -3;
-
-    if (*name != '\0') return -4;
-
-    return 0;
-}
-
-#pragma mark -
-#pragma mark OpenCL
 
 test_status InitCL(cl_device_id device)
 {
@@ -678,6 +416,20 @@ test_status InitCL(cl_device_id device)
     }
     gTestDouble &= gHasDouble;
 
+    // detect whether profile of the device is embedded
+    char profile[1024] = "";
+    if ((error = clGetDeviceInfo(device, CL_DEVICE_PROFILE, sizeof(profile),
+                                 profile, NULL)))
+    {
+        vlog_error("clGetDeviceInfo failed. (%d)\n", error);
+        return TEST_FAIL;
+    }
+    else if (strstr(profile, "EMBEDDED_PROFILE"))
+    {
+        gIsEmbedded = 1;
+        if (!is_extension_available(device, "cles_khr_int64")) gHasLong = 0;
+    }
+
     gContext = clCreateContext(NULL, 1, &device, notify_callback, NULL, &error);
     if (NULL == gContext || error)
     {
@@ -726,10 +478,8 @@ test_status InitCL(cl_device_id device)
         }
     }
 
-
     gMTdata = init_genrand(gRandomSeed);
 
-
     char c[1024];
     static const char *no_yes[] = { "NO", "YES" };
     vlog("\nCompute Device info:\n");
@@ -760,977 +510,4 @@ test_status InitCL(cl_device_id device)
     return TEST_PASS;
 }
 
-static int RunKernel(cl_kernel kernel, void *inBuf, void *outBuf,
-                     size_t blockCount)
-{
-    // The global dimensions are just the blockCount to execute since we haven't
-    // set up multiple queues for multiple devices.
-    int error;
-
-    error = clSetKernelArg(kernel, 0, sizeof(inBuf), &inBuf);
-    error |= clSetKernelArg(kernel, 1, sizeof(outBuf), &outBuf);
-
-    if (error)
-    {
-        vlog_error("FAILED -- could not set kernel args (%d)\n", error);
-        return error;
-    }
-
-    if ((error = clEnqueueNDRangeKernel(gQueue, kernel, 1, NULL, &blockCount,
-                                        NULL, 0, NULL, NULL)))
-    {
-        vlog_error("FAILED -- could not execute kernel (%d)\n", error);
-        return error;
-    }
-
-    return 0;
-}
-
-#if defined(__APPLE__)
-#include <mach/mach_time.h>
-#endif
-
-uint64_t GetTime(void);
-uint64_t GetTime(void)
-{
-#if defined(__APPLE__)
-    return mach_absolute_time();
-#elif defined(_MSC_VER)
-    return ReadTime();
-#else
-    // mach_absolute_time is a high precision timer with precision < 1
-    // microsecond.
-#warning need accurate clock here.  Times are invalid.
-    return 0;
-#endif
-}
-
-
-#if defined(_MSC_VER)
-/* function is defined in "compat.h" */
-#else
-double SubtractTime(uint64_t endTime, uint64_t startTime);
-double SubtractTime(uint64_t endTime, uint64_t startTime)
-{
-    uint64_t diff = endTime - startTime;
-    static double conversion = 0.0;
-
-    if (0.0 == conversion)
-    {
-#if defined(__APPLE__)
-        mach_timebase_info_data_t info = { 0, 0 };
-        kern_return_t err = mach_timebase_info(&info);
-        if (0 == err)
-            conversion = 1e-9 * (double)info.numer / (double)info.denom;
-#else
-        // This function consumes output from GetTime() above, and converts the
-        // time to secionds.
-#warning need accurate ticks to seconds conversion factor here. Times are invalid.
-#endif
-    }
-
-    // strictly speaking we should also be subtracting out timer latency here
-    return conversion * (double)diff;
-}
-#endif
-
-typedef struct CalcReferenceValuesInfo
-{
-    struct WriteInputBufferInfo
-        *parent; // pointer back to the parent WriteInputBufferInfo struct
-    cl_kernel kernel; // the kernel for this vector size
-    cl_program program; // the program for this vector size
-    cl_uint vectorSize; // the vector size for this callback chain
-    void *p; // the pointer to mapped result data for this vector size
-    cl_int result;
-} CalcReferenceValuesInfo;
-
-typedef struct WriteInputBufferInfo
-{
-    volatile cl_event
-        calcReferenceValues; // user event which signals when main thread is
-                             // done calculating reference values
-    volatile cl_event
-        doneBarrier; // user event which signals when worker threads are done
-    cl_uint count; // the number of elements in the array
-    Type outType; // the data type of the conversion result
-    Type inType; // the data type of the conversion input
-    volatile int barrierCount;
-    CalcReferenceValuesInfo calcInfo[kCallStyleCount];
-} WriteInputBufferInfo;
-
-cl_uint RoundUpToNextPowerOfTwo(cl_uint x);
-cl_uint RoundUpToNextPowerOfTwo(cl_uint x)
-{
-    if (0 == (x & (x - 1))) return x;
-
-    while (x & (x - 1)) x &= x - 1;
-
-    return x + x;
-}
-
-void WriteInputBufferComplete(void *);
-
-typedef struct DataInitInfo
-{
-    cl_ulong start;
-    cl_uint size;
-    Type outType;
-    Type inType;
-    SaturationMode sat;
-    RoundingMode round;
-    MTdata *d;
-} DataInitInfo;
-
-cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p);
-cl_int InitData(cl_uint job_id, cl_uint thread_id, void *p)
-{
-    DataInitInfo *info = (DataInitInfo *)p;
-
-    gInitFunctions[info->inType](
-        (char *)gIn + job_id * info->size * gTypeSizes[info->inType], info->sat,
-        info->round, info->outType, info->start + job_id * info->size,
-        info->size, info->d[thread_id]);
-    return CL_SUCCESS;
-}
-
-static void setAllowZ(uint8_t *allow, uint32_t *x, cl_uint count)
-{
-    cl_uint i;
-    for (i = 0; i < count; ++i)
-        allow[i] |= (uint8_t)((x[i] & 0x7f800000U) == 0);
-}
-
-cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p);
-cl_int PrepareReference(cl_uint job_id, cl_uint thread_id, void *p)
-{
-    DataInitInfo *info = (DataInitInfo *)p;
-    cl_uint count = info->size;
-    Type inType = info->inType;
-    Type outType = info->outType;
-    RoundingMode round = info->round;
-    size_t j;
-
-    Force64BitFPUPrecision();
-
-    void *s = (cl_uchar *)gIn + job_id * count * gTypeSizes[info->inType];
-    void *a = (cl_uchar *)gAllowZ + job_id * count;
-    void *d = (cl_uchar *)gRef + job_id * count * gTypeSizes[info->outType];
 
-    if (outType != inType)
-    {
-        // create the reference while we wait
-        Convert f = gConversions[outType][inType];
-        if (info->sat) f = gSaturatedConversions[outType][inType];
-
-#if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
-        /* ARM VFP doesn't have hardware instruction for converting from 64-bit
-         * integer to float types, hence GCC ARM uses the floating-point
-         * emulation code despite which -mfloat-abi setting it is. But the
-         * emulation code in libgcc.a has only one rounding mode (round to
-         * nearest even in this case) and ignores the user rounding mode setting
-         * in hardware. As a result setting rounding modes in hardware won't
-         * give correct rounding results for type covert from 64-bit integer to
-         * float using GCC for ARM compiler so for testing different rounding
-         * modes, we need to use alternative reference function. ARM64 does have
-         * an instruction, however we cannot guarantee the compiler will use it.
-         * On all ARM architechures use emulation to calculate reference.*/
-        switch (round)
-        {
-            /* conversions to floating-point type use the current rounding mode.
-             * The only default floating-point rounding mode supported is round
-             * to nearest even i.e the current rounding mode will be _rte for
-             * floating-point types. */
-            case kDefaultRoundingMode: qcom_rm = qcomRTE; break;
-            case kRoundToNearestEven: qcom_rm = qcomRTE; break;
-            case kRoundUp: qcom_rm = qcomRTP; break;
-            case kRoundDown: qcom_rm = qcomRTN; break;
-            case kRoundTowardZero: qcom_rm = qcomRTZ; break;
-            default:
-                vlog_error("ERROR: undefined rounding mode %d\n", round);
-                break;
-        }
-        qcom_sat = info->sat;
-#endif
-
-        RoundingMode oldRound = set_round(round, outType);
-        f(d, s, count);
-        set_round(oldRound, outType);
-
-        // Decide if we allow a zero result in addition to the correctly rounded
-        // one
-        memset(a, 0, count);
-        if (gForceFTZ)
-        {
-            if (inType == kfloat) setAllowZ((uint8_t *)a, (uint32_t *)s, count);
-            if (outType == kfloat)
-                setAllowZ((uint8_t *)a, (uint32_t *)d, count);
-        }
-    }
-    else
-    {
-        // Copy the input to the reference
-        memcpy(d, s, info->size * gTypeSizes[inType]);
-    }
-
-    // Patch up NaNs conversions to integer to zero -- these can be converted to
-    // any integer
-    if (info->outType != kfloat && info->outType != kdouble)
-    {
-        if (inType == kfloat)
-        {
-            float *inp = (float *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)d + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-        if (inType == kdouble)
-        {
-            double *inp = (double *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)d + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-    }
-    else if (inType == kfloat || inType == kdouble)
-    { // outtype and intype is float or double.  NaN conversions for float <->
-      // double can be any NaN
-        if (inType == kfloat && outType == kdouble)
-        {
-            float *inp = (float *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j])) ((double *)d)[j] = NAN;
-            }
-        }
-        if (inType == kdouble && outType == kfloat)
-        {
-            double *inp = (double *)s;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j])) ((float *)d)[j] = NAN;
-            }
-        }
-    }
-
-    return CL_SUCCESS;
-}
-
-static int DoTest(cl_device_id device, Type outType, Type inType,
-                  SaturationMode sat, RoundingMode round, MTdata d)
-{
-#ifdef __APPLE__
-    cl_ulong wall_start = mach_absolute_time();
-#endif
-
-    DataInitInfo init_info = { 0, 0, outType, inType, sat, round, NULL };
-    WriteInputBufferInfo writeInputBufferInfo;
-    int vectorSize;
-    int error = 0;
-    cl_uint threads = GetThreadCount();
-    uint64_t i;
-
-    gTestCount++;
-    size_t blockCount =
-        BUFFER_SIZE / std::max(gTypeSizes[inType], gTypeSizes[outType]);
-    size_t step = blockCount;
-    uint64_t lastCase = 1ULL << (8 * gTypeSizes[inType]);
-
-    memset(&writeInputBufferInfo, 0, sizeof(writeInputBufferInfo));
-    init_info.d = (MTdata *)malloc(threads * sizeof(MTdata));
-    if (NULL == init_info.d)
-    {
-        vlog_error(
-            "ERROR: Unable to allocate storage for random number generator!\n");
-        return -1;
-    }
-    for (i = 0; i < threads; i++)
-    {
-        init_info.d[i] = init_genrand(genrand_int32(d));
-        if (NULL == init_info.d[i])
-        {
-            vlog_error("ERROR: Unable to allocate storage for random number "
-                       "generator!\n");
-            return -1;
-        }
-    }
-
-    writeInputBufferInfo.outType = outType;
-    writeInputBufferInfo.inType = inType;
-
-    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
-    {
-        writeInputBufferInfo.calcInfo[vectorSize].program =
-            MakeProgram(outType, inType, sat, round, vectorSize,
-                        &writeInputBufferInfo.calcInfo[vectorSize].kernel);
-        if (NULL == writeInputBufferInfo.calcInfo[vectorSize].program)
-        {
-            gFailCount++;
-            return -1;
-        }
-        if (NULL == writeInputBufferInfo.calcInfo[vectorSize].kernel)
-        {
-            gFailCount++;
-            vlog_error("\t\tFAILED -- Failed to create kernel.\n");
-            return -2;
-        }
-
-        writeInputBufferInfo.calcInfo[vectorSize].parent =
-            &writeInputBufferInfo;
-        writeInputBufferInfo.calcInfo[vectorSize].vectorSize = vectorSize;
-        writeInputBufferInfo.calcInfo[vectorSize].result = -1;
-    }
-
-    if (gSkipTesting) goto exit;
-
-    // Patch up rounding mode if default is RTZ
-    // We leave the part above in default rounding mode so that the right kernel
-    // is compiled.
-    if (round == kDefaultRoundingMode && gIsRTZ && (outType == kfloat))
-        init_info.round = round = kRoundTowardZero;
-
-    // Figure out how many elements are in a work block
-
-    // we handle 64-bit types a bit differently.
-    if (8 * gTypeSizes[inType] > 32) lastCase = 0x100000000ULL;
-
-    if (!gWimpyMode && gIsEmbedded)
-        step = blockCount * EMBEDDED_REDUCTION_FACTOR;
-
-    if (gWimpyMode) step = (size_t)blockCount * (size_t)gWimpyReductionFactor;
-    vlog("Testing... ");
-    fflush(stdout);
-    for (i = 0; i < (uint64_t)lastCase; i += step)
-    {
-
-        if (0 == (i & ((lastCase >> 3) - 1)))
-        {
-            vlog(".");
-            fflush(stdout);
-        }
-
-        cl_uint count = (uint32_t)std::min((uint64_t)blockCount, lastCase - i);
-        writeInputBufferInfo.count = count;
-
-        // Crate a user event to represent the status of the reference value
-        // computation completion
-        writeInputBufferInfo.calcReferenceValues =
-            clCreateUserEvent(gContext, &error);
-        if (error || NULL == writeInputBufferInfo.calcReferenceValues)
-        {
-            vlog_error("ERROR: Unable to create user event. (%d)\n", error);
-            gFailCount++;
-            goto exit;
-        }
-
-        // retain for consumption by MapOutputBufferComplete
-        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
-             vectorSize++)
-        {
-            if ((error =
-                     clRetainEvent(writeInputBufferInfo.calcReferenceValues)))
-            {
-                vlog_error("ERROR: Unable to retain user event. (%d)\n", error);
-                gFailCount++;
-                goto exit;
-            }
-        }
-
-        // Crate a user event to represent when the callbacks are done verifying
-        // correctness
-        writeInputBufferInfo.doneBarrier = clCreateUserEvent(gContext, &error);
-        if (error || NULL == writeInputBufferInfo.calcReferenceValues)
-        {
-            vlog_error("ERROR: Unable to create user event for barrier. (%d)\n",
-                       error);
-            gFailCount++;
-            goto exit;
-        }
-
-        // retain for use by the callback that calls this
-        if ((error = clRetainEvent(writeInputBufferInfo.doneBarrier)))
-        {
-            vlog_error("ERROR: Unable to retain user event doneBarrier. (%d)\n",
-                       error);
-            gFailCount++;
-            goto exit;
-        }
-
-        //      Call this in a multithreaded manner
-        //      gInitFunctions[ inType ]( gIn, sat, round, outType, i, count, d
-        //      );
-        cl_uint chunks = RoundUpToNextPowerOfTwo(threads) * 2;
-        init_info.start = i;
-        init_info.size = count / chunks;
-        if (init_info.size < 16384)
-        {
-            chunks = RoundUpToNextPowerOfTwo(threads);
-            init_info.size = count / chunks;
-            if (init_info.size < 16384)
-            {
-                init_info.size = count;
-                chunks = 1;
-            }
-        }
-        ThreadPool_Do(InitData, chunks, &init_info);
-
-        // Copy the results to the device
-        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_TRUE, 0,
-                                          count * gTypeSizes[inType], gIn, 0,
-                                          NULL, NULL)))
-        {
-            vlog_error("ERROR: clEnqueueWriteBuffer failed. (%d)\n", error);
-            gFailCount++;
-            goto exit;
-        }
-
-        // Call completion callback for the write, which will enqueue the rest
-        // of the work.
-        WriteInputBufferComplete((void *)&writeInputBufferInfo);
-
-        // Make sure the work is actually running, so we don't deadlock
-        if ((error = clFlush(gQueue)))
-        {
-            vlog_error("clFlush failed with error %d\n", error);
-            gFailCount++;
-            goto exit;
-        }
-
-        ThreadPool_Do(PrepareReference, chunks, &init_info);
-
-        // signal we are done calculating the reference results
-        if ((error = clSetUserEventStatus(
-                 writeInputBufferInfo.calcReferenceValues, CL_COMPLETE)))
-        {
-            vlog_error(
-                "Error:  Failed to set user event status to CL_COMPLETE:  %d\n",
-                error);
-            gFailCount++;
-            goto exit;
-        }
-
-        // Wait for the event callbacks to finish verifying correctness.
-        if ((error = clWaitForEvents(
-                 1, (cl_event *)&writeInputBufferInfo.doneBarrier)))
-        {
-            vlog_error("Error:  Failed to wait for barrier:  %d\n", error);
-            gFailCount++;
-            goto exit;
-        }
-
-        if ((error = clReleaseEvent(writeInputBufferInfo.calcReferenceValues)))
-        {
-            vlog_error("Error:  Failed to release calcReferenceValues:  %d\n",
-                       error);
-            gFailCount++;
-            goto exit;
-        }
-
-        if ((error = clReleaseEvent(writeInputBufferInfo.doneBarrier)))
-        {
-            vlog_error("Error:  Failed to release done barrier:  %d\n", error);
-            gFailCount++;
-            goto exit;
-        }
-
-
-        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
-             vectorSize++)
-        {
-            if ((error = writeInputBufferInfo.calcInfo[vectorSize].result))
-            {
-                switch (inType)
-                {
-                    case kuchar:
-                    case kchar:
-                        vlog("Input value: 0x%2.2x ",
-                             ((unsigned char *)gIn)[error - 1]);
-                        break;
-                    case kushort:
-                    case kshort:
-                        vlog("Input value: 0x%4.4x ",
-                             ((unsigned short *)gIn)[error - 1]);
-                        break;
-                    case kuint:
-                    case kint:
-                        vlog("Input value: 0x%8.8x ",
-                             ((unsigned int *)gIn)[error - 1]);
-                        break;
-                    case kfloat:
-                        vlog("Input value: %a ", ((float *)gIn)[error - 1]);
-                        break;
-                        break;
-                    case kulong:
-                    case klong:
-                        vlog("Input value: 0x%16.16llx ",
-                             ((unsigned long long *)gIn)[error - 1]);
-                        break;
-                    case kdouble:
-                        vlog("Input value: %a ", ((double *)gIn)[error - 1]);
-                        break;
-                    default:
-                        vlog_error("Internal error at %s: %d\n", __FILE__,
-                                   __LINE__);
-                        abort();
-                        break;
-                }
-
-                // tell the user which conversion it was.
-                if (0 == vectorSize)
-                    vlog(" (implicit scalar conversion from %s to %s)\n",
-                         gTypeNames[inType], gTypeNames[outType]);
-                else
-                    vlog(" (convert_%s%s%s%s( %s%s ))\n", gTypeNames[outType],
-                         sizeNames[vectorSize], gSaturationNames[sat],
-                         gRoundingModeNames[round], gTypeNames[inType],
-                         sizeNames[vectorSize]);
-
-                gFailCount++;
-                goto exit;
-            }
-        }
-    }
-
-    log_info("done.\n");
-
-    if (gTimeResults)
-    {
-        // Kick off tests for the various vector lengths
-        for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize;
-             vectorSize++)
-        {
-            size_t workItemCount = blockCount / vectorSizes[vectorSize];
-            if (vectorSizes[vectorSize] * gTypeSizes[outType] < 4)
-                workItemCount /=
-                    4 / (vectorSizes[vectorSize] * gTypeSizes[outType]);
-
-            double sum = 0.0;
-            double bestTime = INFINITY;
-            cl_uint k;
-            for (k = 0; k < PERF_LOOP_COUNT; k++)
-            {
-                uint64_t startTime = GetTime();
-                if ((error = RunKernel(
-                         writeInputBufferInfo.calcInfo[vectorSize].kernel,
-                         gInBuffer, gOutBuffers[vectorSize], workItemCount)))
-                {
-                    gFailCount++;
-                    goto exit;
-                }
-
-                // Make sure OpenCL is done
-                if ((error = clFinish(gQueue)))
-                {
-                    vlog_error("Error %d at clFinish\n", error);
-                    goto exit;
-                }
-
-                uint64_t endTime = GetTime();
-                double time = SubtractTime(endTime, startTime);
-                sum += time;
-                if (time < bestTime) bestTime = time;
-            }
-
-            if (gReportAverageTimes) bestTime = sum / PERF_LOOP_COUNT;
-            double clocksPerOp = bestTime * (double)gDeviceFrequency
-                * gComputeDevices * gSimdSize * 1e6
-                / (workItemCount * vectorSizes[vectorSize]);
-            if (0 == vectorSize)
-                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
-                          "implicit convert %s -> %s", gTypeNames[inType],
-                          gTypeNames[outType]);
-            else
-                vlog_perf(clocksPerOp, LOWER_IS_BETTER, "clocks / element",
-                          "convert_%s%s%s%s( %s%s )", gTypeNames[outType],
-                          sizeNames[vectorSize], gSaturationNames[sat],
-                          gRoundingModeNames[round], gTypeNames[inType],
-                          sizeNames[vectorSize]);
-        }
-    }
-
-    if (gWimpyMode)
-        vlog("\tWimp pass");
-    else
-        vlog("\tpassed");
-
-#ifdef __APPLE__
-    // record the run time
-    vlog("\t(%f s)", 1e-9 * (mach_absolute_time() - wall_start));
-#endif
-    vlog("\n\n");
-    fflush(stdout);
-
-
-exit:
-    // clean up
-    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
-    {
-        clReleaseProgram(writeInputBufferInfo.calcInfo[vectorSize].program);
-        clReleaseKernel(writeInputBufferInfo.calcInfo[vectorSize].kernel);
-    }
-
-    if (init_info.d)
-    {
-        for (i = 0; i < threads; i++) free_mtdata(init_info.d[i]);
-        free(init_info.d);
-    }
-
-    return error;
-}
-
-void MapResultValuesComplete(void *data);
-
-// Note: not called reentrantly
-void WriteInputBufferComplete(void *data)
-{
-    cl_int status;
-    WriteInputBufferInfo *info = (WriteInputBufferInfo *)data;
-    cl_uint count = info->count;
-    int vectorSize;
-
-    info->barrierCount = gMaxVectorSize - gMinVectorSize;
-
-    // now that we know that the write buffer is complete, enqueue callbacks to
-    // wait for the main thread to finish calculating the reference results.
-    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
-    {
-        size_t workItemCount =
-            (count + vectorSizes[vectorSize] - 1) / (vectorSizes[vectorSize]);
-
-        if ((status = RunKernel(info->calcInfo[vectorSize].kernel, gInBuffer,
-                                gOutBuffers[vectorSize], workItemCount)))
-        {
-            gFailCount++;
-            return;
-        }
-
-        info->calcInfo[vectorSize].p = clEnqueueMapBuffer(
-            gQueue, gOutBuffers[vectorSize], CL_TRUE,
-            CL_MAP_READ | CL_MAP_WRITE, 0, count * gTypeSizes[info->outType], 0,
-            NULL, NULL, &status);
-        {
-            if (status)
-            {
-                vlog_error("ERROR: WriteInputBufferComplete calback failed "
-                           "with status: %d\n",
-                           status);
-                gFailCount++;
-                return;
-            }
-        }
-    }
-
-    for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
-    {
-        MapResultValuesComplete(info->calcInfo + vectorSize);
-    }
-
-    // Make sure the work starts moving -- otherwise we may deadlock
-    if ((status = clFlush(gQueue)))
-    {
-        vlog_error(
-            "ERROR: WriteInputBufferComplete calback failed with status: %d\n",
-            status);
-        gFailCount++;
-        return;
-    }
-
-    // e was already released by the main thread. It should be destroyed
-    // automatically soon after we exit.
-}
-
-void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
-                                             void *data);
-
-// Note: May be called reentrantly
-void MapResultValuesComplete(void *data)
-{
-    cl_int status;
-    CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data;
-    cl_event calcReferenceValues = info->parent->calcReferenceValues;
-
-    // we know that the map is done, wait for the main thread to finish
-    // calculating the reference values
-    if ((status = clSetEventCallback(calcReferenceValues, CL_COMPLETE,
-                                     CalcReferenceValuesComplete, data)))
-    {
-        vlog_error("ERROR: clSetEventCallback failed in "
-                   "MapResultValuesComplete with status: %d\n",
-                   status);
-        gFailCount++; // not thread safe -- being lazy here
-    }
-
-    // this thread no longer needs its reference to info->calcReferenceValues,
-    // so release it
-    if ((status = clReleaseEvent(calcReferenceValues)))
-    {
-        vlog_error("ERROR: clReleaseEvent(info->calcReferenceValues) failed "
-                   "with status: %d\n",
-                   status);
-        gFailCount++; // not thread safe -- being lazy here
-    }
-
-    // no need to flush since we didn't enqueue anything
-
-    // e was already released by WriteInputBufferComplete. It should be
-    // destroyed automatically soon after we exit.
-}
-
-
-void CL_CALLBACK CalcReferenceValuesComplete(cl_event e, cl_int status,
-                                             void *data)
-{
-    CalcReferenceValuesInfo *info = (CalcReferenceValuesInfo *)data;
-    cl_uint vectorSize = info->vectorSize;
-    cl_uint count = info->parent->count;
-    Type outType =
-        info->parent->outType; // the data type of the conversion result
-    Type inType = info->parent->inType; // the data type of the conversion input
-    size_t j;
-    cl_int error;
-    cl_event doneBarrier = info->parent->doneBarrier;
-
-    // report spurious error condition
-    if (CL_SUCCESS != status)
-    {
-        vlog_error("ERROR: CalcReferenceValuesComplete did not succeed! (%d)\n",
-                   status);
-        gFailCount++; // lazy about thread safety here
-        return;
-    }
-
-    // Now we know that both results have been mapped back from the device, and
-    // the main thread is done calculating the reference results. It is now time
-    // to check the results.
-
-    // verify results
-    void *mapped = info->p;
-
-    // Patch up NaNs conversions to integer to zero -- these can be converted to
-    // any integer
-    if (outType != kfloat && outType != kdouble)
-    {
-        if (inType == kfloat)
-        {
-            float *inp = (float *)gIn;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)mapped + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-        if (inType == kdouble)
-        {
-            double *inp = (double *)gIn;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]))
-                    memset((char *)mapped + j * gTypeSizes[outType], 0,
-                           gTypeSizes[outType]);
-            }
-        }
-    }
-    else if (inType == kfloat || inType == kdouble)
-    { // outtype and intype is float or double.  NaN conversions for float <->
-      // double can be any NaN
-        if (inType == kfloat && outType == kdouble)
-        {
-            float *inp = (float *)gIn;
-            double *outp = (double *)mapped;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
-            }
-        }
-        if (inType == kdouble && outType == kfloat)
-        {
-            double *inp = (double *)gIn;
-            float *outp = (float *)mapped;
-            for (j = 0; j < count; j++)
-            {
-                if (isnan(inp[j]) && isnan(outp[j])) outp[j] = NAN;
-            }
-        }
-    }
-
-    if (memcmp(mapped, gRef, count * gTypeSizes[outType]))
-        info->result = gCheckResults[outType](mapped, gRef, gAllowZ, count,
-                                              vectorSizes[vectorSize]);
-    else
-        info->result = 0;
-
-    // Fill the output buffer with junk and release it
-    {
-        cl_uint pattern = 0xffffdead;
-        memset_pattern4(mapped, &pattern, count * gTypeSizes[outType]);
-        if ((error = clEnqueueUnmapMemObject(gQueue, gOutBuffers[vectorSize],
-                                             mapped, 0, NULL, NULL)))
-        {
-            vlog_error("ERROR: clEnqueueUnmapMemObject failed in "
-                       "CalcReferenceValuesComplete  (%d)\n",
-                       error);
-            gFailCount++;
-        }
-    }
-
-    if (1 == ThreadPool_AtomicAdd(&info->parent->barrierCount, -1))
-    {
-        if ((status = clSetUserEventStatus(doneBarrier, CL_COMPLETE)))
-        {
-            vlog_error("ERROR: clSetUserEventStatus failed in "
-                       "CalcReferenceValuesComplete (err: %d). We're probably "
-                       "going to deadlock.\n",
-                       status);
-            gFailCount++;
-            return;
-        }
-
-        if ((status = clReleaseEvent(doneBarrier)))
-        {
-            vlog_error("ERROR: clReleaseEvent failed in "
-                       "CalcReferenceValuesComplete (err: %d).\n",
-                       status);
-            gFailCount++;
-            return;
-        }
-    }
-    // e was already released by WriteInputBufferComplete. It should be
-    // destroyed automatically soon after all the calls to
-    // CalcReferenceValuesComplete exit.
-}
-
-static cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
-                              RoundingMode round, int vectorSize,
-                              cl_kernel *outKernel)
-{
-    cl_program program;
-    char testName[256];
-    int error = 0;
-
-    std::ostringstream source;
-    if (outType == kdouble || inType == kdouble)
-        source << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
-
-    // Create the program. This is a bit complicated because we are trying to
-    // avoid byte and short stores.
-    if (0 == vectorSize)
-    {
-        // Create the type names.
-        char inName[32];
-        char outName[32];
-        strncpy(inName, gTypeNames[inType], sizeof(inName));
-        strncpy(outName, gTypeNames[outType], sizeof(outName));
-        sprintf(testName, "test_implicit_%s_%s", outName, inName);
-
-        source << "__kernel void " << testName << "( __global " << inName
-               << " *src, __global " << outName << " *dest )\n";
-        source << "{\n";
-        source << "   size_t i = get_global_id(0);\n";
-        source << "   dest[i] =  src[i];\n";
-        source << "}\n";
-
-        vlog("Building implicit %s -> %s conversion test\n", gTypeNames[inType],
-             gTypeNames[outType]);
-        fflush(stdout);
-    }
-    else
-    {
-        int vectorSizetmp = vectorSizes[vectorSize];
-
-        // Create the type names.
-        char convertString[128];
-        char inName[32];
-        char outName[32];
-        switch (vectorSizetmp)
-        {
-            case 1:
-                strncpy(inName, gTypeNames[inType], sizeof(inName));
-                strncpy(outName, gTypeNames[outType], sizeof(outName));
-                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
-                         outName, gSaturationNames[sat],
-                         gRoundingModeNames[round]);
-                snprintf(testName, 256, "test_%s_%s", convertString, inName);
-                vlog("Building %s( %s ) test\n", convertString, inName);
-                break;
-            case 3:
-                strncpy(inName, gTypeNames[inType], sizeof(inName));
-                strncpy(outName, gTypeNames[outType], sizeof(outName));
-                snprintf(convertString, sizeof(convertString),
-                         "convert_%s3%s%s", outName, gSaturationNames[sat],
-                         gRoundingModeNames[round]);
-                snprintf(testName, 256, "test_%s_%s3", convertString, inName);
-                vlog("Building %s( %s3 ) test\n", convertString, inName);
-                break;
-            default:
-                snprintf(inName, sizeof(inName), "%s%d", gTypeNames[inType],
-                         vectorSizetmp);
-                snprintf(outName, sizeof(outName), "%s%d", gTypeNames[outType],
-                         vectorSizetmp);
-                snprintf(convertString, sizeof(convertString), "convert_%s%s%s",
-                         outName, gSaturationNames[sat],
-                         gRoundingModeNames[round]);
-                snprintf(testName, 256, "test_%s_%s", convertString, inName);
-                vlog("Building %s( %s ) test\n", convertString, inName);
-                break;
-        }
-        fflush(stdout);
-
-        if (vectorSizetmp == 3)
-        {
-            source << "__kernel void " << testName << "( __global " << inName
-                   << " *src, __global " << outName << " *dest )\n";
-            source << "{\n";
-            source << "   size_t i = get_global_id(0);\n";
-            source << "   if( i + 1 < get_global_size(0))\n";
-            source << "       vstore3( " << convertString
-                   << "( vload3( i, src)), i, dest );\n";
-            source << "   else\n";
-            source << "   {\n";
-            source << "       " << inName << "3 in;\n";
-            source << "       " << outName << "3 out;\n";
-            source << "       if( 0 == (i & 1) )\n";
-            source << "           in.y = src[3*i+1];\n";
-            source << "       in.x = src[3*i];\n";
-            source << "       out = " << convertString << "( in ); \n";
-            source << "       dest[3*i] = out.x;\n";
-            source << "       if( 0 == (i & 1) )\n";
-            source << "           dest[3*i+1] = out.y;\n";
-            source << "   }\n";
-            source << "}\n";
-        }
-        else
-        {
-            source << "__kernel void " << testName << "( __global " << inName
-                   << " *src, __global " << outName << " *dest )\n";
-            source << "{\n";
-            source << "   size_t i = get_global_id(0);\n";
-            source << "   dest[i] = " << convertString << "( src[i] );\n";
-            source << "}\n";
-        }
-    }
-    *outKernel = NULL;
-
-    const char *flags = NULL;
-    if (gForceFTZ) flags = "-cl-denorms-are-zero";
-
-    // build it
-    std::string sourceString = source.str();
-    const char *programSource = sourceString.c_str();
-    error = create_single_kernel_helper(gContext, &program, outKernel, 1,
-                                        &programSource, testName, flags);
-    if (error)
-    {
-        vlog_error("Failed to build kernel/program (err = %d).\n", error);
-        clReleaseProgram(program);
-        return NULL;
-    }
-
-    return program;
-}

From 4dbfba8865674906c852d590c96ec427c91b1a2a Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Wed, 24 May 2023 09:27:08 +0200
Subject: [PATCH 2/8] Added missing virtual destructor

---
 test_conformance/conversions/basic_test_conversions.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/test_conformance/conversions/basic_test_conversions.h b/test_conformance/conversions/basic_test_conversions.h
index c1d284ec2..9064a7afb 100644
--- a/test_conformance/conversions/basic_test_conversions.h
+++ b/test_conformance/conversions/basic_test_conversions.h
@@ -1,6 +1,6 @@
 //
-// Copyright (c) 2017 The Khronos Group Inc.
-// 
+// Copyright (c) 2023 The Khronos Group Inc.
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@@ -133,6 +133,7 @@ void UnFlushToZero(void *);
 
 struct CalcRefValsBase
 {
+    virtual ~CalcRefValsBase() = default;
     virtual int check_result(void *, uint32_t, int) { return 0; }
 
     // pointer back to the parent WriteInputBufferInfo struct

From 76b9177c93b994f6c75cdf421f853d8c459a4636 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Fri, 9 Jun 2023 09:24:36 +0200
Subject: [PATCH 3/8] Added corrections due to code review

---
 .../conversions/basic_test_conversions.cpp    | 12 -----
 .../conversions/basic_test_conversions.h      | 50 ++-----------------
 .../conversions/conversions_data_info.h       |  5 +-
 .../conversions/test_conversions.cpp          |  7 +--
 4 files changed, 7 insertions(+), 67 deletions(-)

diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp
index a01f60015..1fff7cb49 100644
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -15,12 +15,7 @@
 //
 #include "harness/testHarness.h"
 #include "harness/compat.h"
-#include "harness/rounding_mode.h"
 #include "harness/ThreadPool.h"
-#include "harness/testHarness.h"
-#include "harness/kernelHelpers.h"
-#include "harness/mt19937.h"
-#include "harness/kernelHelpers.h"
 
 #if defined(__APPLE__)
 #include <sys/sysctl.h>
@@ -43,8 +38,6 @@
 
 #include <sstream>
 #include <stdarg.h>
-#include <stdio.h>
-#include <string.h>
 #if !defined(_WIN32)
 #include <libgen.h>
 #include <sys/mman.h>
@@ -58,11 +51,6 @@
 
 #include "basic_test_conversions.h"
 
-#if (defined(_WIN32) && defined(_MSC_VER))
-// need for _controlfp_s and rouinding modes in RoundingMode
-#include "harness/testHarness.h"
-#endif
-
 #if defined(_WIN32)
 #include <mmintrin.h>
 #include <emmintrin.h>
diff --git a/test_conformance/conversions/basic_test_conversions.h b/test_conformance/conversions/basic_test_conversions.h
index 9064a7afb..c0ae8817b 100644
--- a/test_conformance/conversions/basic_test_conversions.h
+++ b/test_conformance/conversions/basic_test_conversions.h
@@ -30,7 +30,6 @@
     #include <CL/opencl.h>
 #endif
 
-
 #include "harness/mt19937.h"
 #include "harness/testHarness.h"
 #include "harness/typeWrappers.h"
@@ -41,8 +40,6 @@
 
 #include "conversions_data_info.h"
 
-// typedef void (*Convert)( void *dest, void *src, size_t );
-
 #define kVectorSizeCount 6
 #define kMaxVectorSize 16
 #define kPageSize 4096
@@ -51,11 +48,6 @@
 #define EMBEDDED_REDUCTION_FACTOR 16
 #define PERF_LOOP_COUNT 100
 
-
-// extern Convert gConversions[kTypeCount][kTypeCount];                // [dest
-// format][source format] extern Convert
-// gSaturatedConversions[kTypeCount][kTypeCount];       // [dest format][source
-// format]
 extern const char *gTypeNames[ kTypeCount ];
 extern const char *gRoundingModeNames[ kRoundingModeCount ];        // { "", "_rte", "_rtp", "_rtn", "_rtz" }
 extern const char *gSaturationNames[ kSaturationModeCount ];        // { "", "_sat" }
@@ -109,7 +101,6 @@ extern int vectorSizes[];
 extern size_t gComputeDevices;
 extern uint32_t gDeviceFrequency;
 
-
 namespace conv_test {
 
 cl_program MakeProgram(Type outType, Type inType, SaturationMode sat,
@@ -130,7 +121,6 @@ void *FlushToZero(void);
 void UnFlushToZero(void *);
 }
 
-
 struct CalcRefValsBase
 {
     virtual ~CalcRefValsBase() = default;
@@ -145,14 +135,12 @@ struct CalcRefValsBase
     cl_int result;
 };
 
-
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 struct CalcRefValsPat : CalcRefValsBase
 {
     int check_result(void *, uint32_t, int) override;
 };
 
-
 struct WriteInputBufferInfo
 {
     WriteInputBufferInfo()
@@ -173,14 +161,12 @@ struct WriteInputBufferInfo
     std::vector<std::unique_ptr<CalcRefValsBase>> calcInfo;
 };
 
-
 // Must be aligned with Type enums!
 using TypeIter = std::tuple<cl_uchar, cl_char, cl_ushort, cl_short, cl_uint,
                             cl_int, cl_float, cl_double, cl_ulong, cl_long>;
 
 constexpr bool isTypeFp[] = { 0, 0, 0, 0, 0, 0, 1, 1, 0, 0 };
 
-
 // Helper test fixture for constructing OpenCL objects used in testing
 // a variety of simple command-buffer enqueue scenarios.
 struct ConversionsTest
@@ -188,10 +174,10 @@ struct ConversionsTest
     ConversionsTest(cl_device_id device, cl_context context,
                     cl_command_queue queue);
 
-    virtual cl_int SetUp(int elements);
+    cl_int SetUp(int elements);
 
     // Test body returning an OpenCL error code
-    virtual cl_int Run();
+    cl_int Run();
 
     template <typename InType, typename OutType, bool InFP, bool OutFP>
     int DoTest(Type outType, Type inType, SaturationMode sat,
@@ -210,7 +196,6 @@ struct ConversionsTest
     TypeIter typeIterator;
 };
 
-
 struct CustomConversionsTest : ConversionsTest
 {
     CustomConversionsTest(cl_device_id device, cl_context context,
@@ -218,10 +203,9 @@ struct CustomConversionsTest : ConversionsTest
         : ConversionsTest(device, context, queue)
     {}
 
-    cl_int Run() override;
+    cl_int Run();
 };
 
-
 template <class T>
 int MakeAndRunTest(cl_device_id device, cl_context context,
                    cl_command_queue queue, int num_elements)
@@ -234,7 +218,6 @@ int MakeAndRunTest(cl_device_id device, cl_context context,
     return test_fixture.Run();
 }
 
-
 struct TestType
 {
     template <typename T> bool testType(Type in)
@@ -256,7 +239,6 @@ struct TestType
     }
 };
 
-
 // Helper structures to iterate over all tuple attributes of different types
 struct IterOverTypes : public TestType
 {
@@ -268,7 +250,6 @@ struct IterOverTypes : public TestType
     void Run() { for_each_out_elem(typeIter); }
 
 protected:
-    ////////////////////////////////////////////////////////////////////////////////////////
 
     template <std::size_t Out = 0, typename OutType>
     void iterate_out_type(const OutType &t)
@@ -278,8 +259,6 @@ struct IterOverTypes : public TestType
         inType = (Type)0;
     }
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
     template <std::size_t In, std::size_t Out, typename OutType,
               typename InType>
     void iterate_in_type(const InType &t)
@@ -294,16 +273,12 @@ struct IterOverTypes : public TestType
         inType = (Type)(inType + 1);
     }
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
     template <std::size_t Out = 0, typename... Tp>
     inline typename std::enable_if<Out == sizeof...(Tp), void>::type
     for_each_out_elem(
         const std::tuple<Tp...> &) // Unused arguments are given no names.
     {}
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
     template <std::size_t Out = 0, typename... Tp>
         inline typename std::enable_if < Out<sizeof...(Tp), void>::type
         for_each_out_elem(const std::tuple<Tp...> &t)
@@ -312,8 +287,6 @@ struct IterOverTypes : public TestType
         for_each_out_elem<Out + 1, Tp...>(t);
     }
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
     template <std::size_t In = 0, std::size_t Out, typename OutType,
               typename... Tp>
     inline typename std::enable_if<In == sizeof...(Tp), void>::type
@@ -321,8 +294,6 @@ struct IterOverTypes : public TestType
         const std::tuple<Tp...> &) // Unused arguments are given no names.
     {}
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
     template <std::size_t In = 0, std::size_t Out, typename OutType,
               typename... Tp>
         inline typename std::enable_if < In<sizeof...(Tp), void>::type
@@ -332,8 +303,6 @@ struct IterOverTypes : public TestType
         for_each_in_elem<In + 1, Out, OutType, Tp...>(t);
     }
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
 protected:
     Type inType;
     Type outType;
@@ -355,7 +324,6 @@ struct IterOverSelectedTypes : public TestType
     void Run() { for_each_out_elem(typeIter); }
 
 protected:
-    ////////////////////////////////////////////////////////////////////////////////////////
 
     template <std::size_t Out = 0, typename OutType>
     void iterate_out_type(const OutType &t)
@@ -363,8 +331,6 @@ struct IterOverSelectedTypes : public TestType
         for_each_in_elem<0, Out, OutType>(typeIter);
     }
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
     template <std::size_t In, std::size_t Out, typename OutType,
               typename InType>
     void iterate_in_type(const InType &t)
@@ -378,15 +344,11 @@ struct IterOverSelectedTypes : public TestType
         }
     }
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
     template <std::size_t Out = 0, typename... Tp>
     inline typename std::enable_if<Out == sizeof...(Tp), void>::type
     for_each_out_elem(const std::tuple<Tp...> &)
     {}
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
     template <std::size_t Out = 0, typename... Tp>
         inline typename std::enable_if < Out<sizeof...(Tp), void>::type
         for_each_out_elem(const std::tuple<Tp...> &t)
@@ -395,16 +357,12 @@ struct IterOverSelectedTypes : public TestType
         for_each_out_elem<Out + 1, Tp...>(t);
     }
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
     template <std::size_t In = 0, std::size_t Out, typename OutType,
               typename... Tp>
     inline typename std::enable_if<In == sizeof...(Tp), void>::type
     for_each_in_elem(const std::tuple<Tp...> &)
     {}
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
     template <std::size_t In = 0, std::size_t Out, typename OutType,
               typename... Tp>
         inline typename std::enable_if < In<sizeof...(Tp), void>::type
@@ -414,8 +372,6 @@ struct IterOverSelectedTypes : public TestType
         for_each_in_elem<In + 1, Out, OutType, Tp...>(t);
     }
 
-    ////////////////////////////////////////////////////////////////////////////////////////
-
 protected:
     Type inType;
     Type outType;
diff --git a/test_conformance/conversions/conversions_data_info.h b/test_conformance/conversions/conversions_data_info.h
index a4e9c9689..81e01e46a 100644
--- a/test_conformance/conversions/conversions_data_info.h
+++ b/test_conformance/conversions/conversions_data_info.h
@@ -71,7 +71,7 @@ struct DataInitInfo
 
 struct DataInitBase : public DataInitInfo
 {
-    DataInitBase(const DataInitInfo &agg): DataInitInfo(agg) {}
+    explicit DataInitBase(const DataInitInfo &agg): DataInitInfo(agg) {}
     virtual void conv_array(void *out, void *in, size_t n) {}
     virtual void conv_array_sat(void *out, void *in, size_t n) {}
     virtual void init(const cl_uint &, const cl_uint &) {}
@@ -81,8 +81,7 @@ struct DataInitBase : public DataInitInfo
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 struct DataInfoSpec : public DataInitBase
 {
-
-    DataInfoSpec(const DataInitInfo &agg);
+    explicit DataInfoSpec(const DataInitInfo &agg);
 
     // helpers
     float round_to_int(float f);
diff --git a/test_conformance/conversions/test_conversions.cpp b/test_conformance/conversions/test_conversions.cpp
index 08fcdb473..a8be2098d 100644
--- a/test_conformance/conversions/test_conversions.cpp
+++ b/test_conformance/conversions/test_conversions.cpp
@@ -13,7 +13,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
-#include "harness/rounding_mode.h"
 #include "harness/ThreadPool.h"
 #include "harness/testHarness.h"
 #include "harness/parseParameters.h"
@@ -54,10 +53,8 @@
 #include "Sleep.h"
 
 #include "basic_test_conversions.h"
-#include <limits.h>
-#include <string.h>
-
-#include "harness/mt19937.h"
+#include <climits>
+#include <cstring>
 
 #if (defined(__arm__) || defined(__aarch64__)) && defined(__GNUC__)
 #include "fplib.h"

From 70838ce82bb70ae3ae8f2f0e889612493661a6d2 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Fri, 9 Jun 2023 09:39:18 +0200
Subject: [PATCH 4/8] More separators removed

---
 .../conversions/conversions_data_info.h       | 30 +------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/test_conformance/conversions/conversions_data_info.h b/test_conformance/conversions/conversions_data_info.h
index 81e01e46a..d1f42ce36 100644
--- a/test_conformance/conversions/conversions_data_info.h
+++ b/test_conformance/conversions/conversions_data_info.h
@@ -30,7 +30,6 @@ extern roundingMode qcom_rm;
 
 #include "harness/mt19937.h"
 #include "harness/rounding_mode.h"
-#include "harness/typeWrappers.h"
 
 #include <vector>
 
@@ -51,7 +50,6 @@ typedef enum
     kSaturationModeCount
 } SaturationMode;
 
-
 struct DataInitInfo
 {
     cl_ulong start;
@@ -68,7 +66,6 @@ struct DataInitInfo
     static std::vector<double> specialValuesDouble;
 };
 
-
 struct DataInitBase : public DataInitInfo
 {
     explicit DataInitBase(const DataInitInfo &agg): DataInitInfo(agg) {}
@@ -77,7 +74,6 @@ struct DataInitBase : public DataInitInfo
     virtual void init(const cl_uint &, const cl_uint &) {}
 };
 
-
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 struct DataInfoSpec : public DataInitBase
 {
@@ -101,30 +97,26 @@ struct DataInfoSpec : public DataInitBase
 
     std::vector<MTdataHolder> mdv;
 
-    ////////////////////////////////////////////////////////////////////////////
     void conv_array(void *out, void *in, size_t n) override
     {
         for (size_t i = 0; i < n; i++)
             conv(&((OutType *)out)[i], &((InType *)in)[i]);
     }
 
-    ////////////////////////////////////////////////////////////////////////////
     void conv_array_sat(void *out, void *in, size_t n) override
     {
         for (size_t i = 0; i < n; i++)
             conv_sat(&((OutType *)out)[i], &((InType *)in)[i]);
     }
 
-    ////////////////////////////////////////////////////////////////////////////
     void init(const cl_uint &, const cl_uint &) override;
     InType clamp(const InType &);
-    ////////////////////////////////////////////////////////////////////////////
     inline float fclamp(float lo, float v, float hi)
     {
         v = v < lo ? lo : v;
         return v < hi ? v : hi;
     }
-    ////////////////////////////////////////////////////////////////////////////
+
     inline double dclamp(double lo, double v, double hi)
     {
         v = v < lo ? lo : v;
@@ -132,8 +124,6 @@ struct DataInfoSpec : public DataInitBase
     }
 };
 
-////////////////////////////////////////////////////////////////////////////////////////
-
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 DataInfoSpec<InType, OutType, InFP, OutFP>::DataInfoSpec(
     const DataInitInfo &agg)
@@ -262,8 +252,6 @@ DataInfoSpec<InType, OutType, InFP, OutFP>::DataInfoSpec(
     // clang-format on
 }
 
-////////////////////////////////////////////////////////////////////////////////////////
-
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 float DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int(float f)
 {
@@ -293,8 +281,6 @@ float DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int(float f)
     return f;
 }
 
-////////////////////////////////////////////////////////////////////////////////////////
-
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 long long
 DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int_and_clamp(double f)
@@ -328,8 +314,6 @@ DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int_and_clamp(double f)
     return (long long)f;
 }
 
-////////////////////////////////////////////////////////////////////////////////////////
-
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 OutType DataInfoSpec<InType, OutType, InFP, OutFP>::absolute(const OutType &x)
 {
@@ -348,8 +332,6 @@ OutType DataInfoSpec<InType, OutType, InFP, OutFP>::absolute(const OutType &x)
     return u.f;
 }
 
-////////////////////////////////////////////////////////////////////////////////////////
-
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
 {
@@ -495,13 +477,9 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
     }
 }
 
-////////////////////////////////////////////////////////////////////////////////////////
-
 #define CLAMP(_lo, _x, _hi)                                                    \
     ((_x) < (_lo) ? (_lo) : ((_x) > (_hi) ? (_hi) : (_x)))
 
-////////////////////////////////////////////////////////////////////////////////////////
-
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 void DataInfoSpec<InType, OutType, InFP, OutFP>::conv_sat(OutType *out,
                                                           InType *in)
@@ -621,8 +599,6 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv_sat(OutType *out,
     }
 }
 
-////////////////////////////////////////////////////////////////////////////////////////
-
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 void DataInfoSpec<InType, OutType, InFP, OutFP>::init(const cl_uint &job_id,
                                                       const cl_uint &thread_id)
@@ -785,8 +761,6 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::init(const cl_uint &job_id,
     }
 }
 
-////////////////////////////////////////////////////////////////////////////////////////
-
 template <typename InType, typename OutType, bool InFP, bool OutFP>
 InType DataInfoSpec<InType, OutType, InFP, OutFP>::clamp(const InType &in)
 {
@@ -806,6 +780,4 @@ InType DataInfoSpec<InType, OutType, InFP, OutFP>::clamp(const InType &in)
     return in;
 }
 
-////////////////////////////////////////////////////////////////////////////////////////
-
 #endif /* CONVERSIONS_DATA_INFO_H */

From 122a08b26f92a5be06a60951239e83514abef173 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Fri, 9 Jun 2023 09:58:21 +0200
Subject: [PATCH 5/8] Fixed clang format

---
 test_conformance/conversions/basic_test_conversions.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test_conformance/conversions/basic_test_conversions.h b/test_conformance/conversions/basic_test_conversions.h
index c0ae8817b..23f959532 100644
--- a/test_conformance/conversions/basic_test_conversions.h
+++ b/test_conformance/conversions/basic_test_conversions.h
@@ -250,7 +250,6 @@ struct IterOverTypes : public TestType
     void Run() { for_each_out_elem(typeIter); }
 
 protected:
-
     template <std::size_t Out = 0, typename OutType>
     void iterate_out_type(const OutType &t)
     {
@@ -324,7 +323,6 @@ struct IterOverSelectedTypes : public TestType
     void Run() { for_each_out_elem(typeIter); }
 
 protected:
-
     template <std::size_t Out = 0, typename OutType>
     void iterate_out_type(const OutType &t)
     {

From 616ef769ad558a2679111d3f38c49edc51aec562 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Thu, 15 Jun 2023 10:13:04 +0200
Subject: [PATCH 6/8] Added multiple corrections related to code review

---
 .../conversions/basic_test_conversions.cpp    | 24 ++++-------
 .../conversions/basic_test_conversions.h      | 20 ++++-----
 .../conversions/conversions_data_info.h       | 42 +++++++++----------
 3 files changed, 37 insertions(+), 49 deletions(-)

diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp
index 1fff7cb49..4571be3c3 100644
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -283,10 +283,9 @@ static inline void Force64BitFPUPrecision(void)
 }
 
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
-int CalcRefValsPat<InType, OutType, InFP, OutFP>::check_result(void *test,
-                                                               uint32_t count,
-                                                               int vectorSize)
+template <typename InType, typename OutType>
+int CalcRefValsPat<InType, OutType>::check_result(void *test, uint32_t count,
+                                                  int vectorSize)
 {
     const cl_uchar *a = (const cl_uchar *)gAllowZ;
 
@@ -449,7 +448,7 @@ cl_int ConversionsTest::SetUp(int elements)
 }
 
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
+template <typename InType, typename OutType>
 void ConversionsTest::TestTypesConversion(const Type &inType,
                                           const Type &outType, int &testNumber)
 {
@@ -517,8 +516,7 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
                     gMinVectorSize = 0;
             }
 
-            if ((error = DoTest<InType, OutType, InFP, OutFP>(outType, inType,
-                                                              sat, round)))
+            if ((error = DoTest<InType, OutType>(outType, inType, sat, round)))
             {
                 vlog_error("\t *** %d) convert_%sn%s%s( %sn ) "
                            "FAILED ** \n",
@@ -531,7 +529,7 @@ void ConversionsTest::TestTypesConversion(const Type &inType,
 }
 
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
+template <typename InType, typename OutType>
 int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
                             RoundingMode round)
 {
@@ -539,15 +537,11 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
     cl_ulong wall_start = mach_absolute_time();
 #endif
 
-#if 0
     uint64_t lastCase = 1ULL << (8 * gTypeSizes[inType]);
-#else
     cl_uint threads = GetThreadCount();
-    uint64_t lastCase = 1000000ULL;
-#endif
 
     DataInitInfo info = { 0, 0, outType, inType, sat, round, threads };
-    DataInfoSpec<InType, OutType, InFP, OutFP> init_info(info);
+    DataInfoSpec<InType, OutType> init_info(info);
     WriteInputBufferInfo writeInputBufferInfo;
     int vectorSize;
     int error = 0;
@@ -570,7 +564,7 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
     for (vectorSize = gMinVectorSize; vectorSize < gMaxVectorSize; vectorSize++)
     {
         writeInputBufferInfo.calcInfo[vectorSize].reset(
-            new CalcRefValsPat<InType, OutType, InFP, OutFP>());
+            new CalcRefValsPat<InType, OutType>());
         writeInputBufferInfo.calcInfo[vectorSize]->program =
             conv_test::MakeProgram(
                 outType, inType, sat, round, vectorSize,
@@ -604,11 +598,9 @@ int ConversionsTest::DoTest(Type outType, Type inType, SaturationMode sat,
             init_info.round = round = kRoundTowardZero;
     }
 
-#if 0
     // Figure out how many elements are in a work block
     // we handle 64-bit types a bit differently.
     if (8 * gTypeSizes[inType] > 32) lastCase = 0x100000000ULL;
-#endif
 
     if (!gWimpyMode && gIsEmbedded)
         step = blockCount * EMBEDDED_REDUCTION_FACTOR;
diff --git a/test_conformance/conversions/basic_test_conversions.h b/test_conformance/conversions/basic_test_conversions.h
index 23f959532..cf0e2c66a 100644
--- a/test_conformance/conversions/basic_test_conversions.h
+++ b/test_conformance/conversions/basic_test_conversions.h
@@ -135,7 +135,7 @@ struct CalcRefValsBase
     cl_int result;
 };
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
+template <typename InType, typename OutType>
 struct CalcRefValsPat : CalcRefValsBase
 {
     int check_result(void *, uint32_t, int) override;
@@ -165,12 +165,12 @@ struct WriteInputBufferInfo
 using TypeIter = std::tuple<cl_uchar, cl_char, cl_ushort, cl_short, cl_uint,
                             cl_int, cl_float, cl_double, cl_ulong, cl_long>;
 
-constexpr bool isTypeFp[] = { 0, 0, 0, 0, 0, 0, 1, 1, 0, 0 };
-
 // Helper test fixture for constructing OpenCL objects used in testing
 // a variety of simple command-buffer enqueue scenarios.
 struct ConversionsTest
 {
+    virtual ~ConversionsTest() = default;
+
     ConversionsTest(cl_device_id device, cl_context context,
                     cl_command_queue queue);
 
@@ -179,11 +179,11 @@ struct ConversionsTest
     // Test body returning an OpenCL error code
     cl_int Run();
 
-    template <typename InType, typename OutType, bool InFP, bool OutFP>
+    template <typename InType, typename OutType>
     int DoTest(Type outType, Type inType, SaturationMode sat,
                RoundingMode round);
 
-    template <typename InType, typename OutType, bool InFP, bool OutFP>
+    template <typename InType, typename OutType>
     void TestTypesConversion(const Type &inType, const Type &outType, int &tn);
 
 protected:
@@ -210,7 +210,7 @@ template <class T>
 int MakeAndRunTest(cl_device_id device, cl_context context,
                    cl_command_queue queue, int num_elements)
 {
-    auto test_fixture = T(device, context, queue);
+    T test_fixture(device, context, queue);
 
     cl_int error = test_fixture.SetUp(num_elements);
     test_error_ret(error, "Error in test initialization", TEST_FAIL);
@@ -267,8 +267,7 @@ struct IterOverTypes : public TestType
         if (!testType<OutType>(outType)) vlog_error("Unexpected data type!\n");
 
         // run the conversions
-        test.TestTypesConversion<InType, OutType, isTypeFp[In], isTypeFp[Out]>(
-            inType, outType, testNumber);
+        test.TestTypesConversion<InType, OutType>(inType, outType, testNumber);
         inType = (Type)(inType + 1);
     }
 
@@ -336,9 +335,8 @@ struct IterOverSelectedTypes : public TestType
         if (testType<InType>(inType) && testType<OutType>(outType))
         {
             // run the conversions
-            test.TestTypesConversion<InType, OutType, isTypeFp[In],
-                                     isTypeFp[Out]>(inType, outType,
-                                                    testNumber);
+            test.TestTypesConversion<InType, OutType>(inType, outType,
+                                                      testNumber);
         }
     }
 
diff --git a/test_conformance/conversions/conversions_data_info.h b/test_conformance/conversions/conversions_data_info.h
index d1f42ce36..b02773b1f 100644
--- a/test_conformance/conversions/conversions_data_info.h
+++ b/test_conformance/conversions/conversions_data_info.h
@@ -60,7 +60,6 @@ struct DataInitInfo
     RoundingMode round;
     cl_uint threads;
 
-
     static std::vector<uint32_t> specialValuesUInt;
     static std::vector<float> specialValuesFloat;
     static std::vector<double> specialValuesDouble;
@@ -68,13 +67,15 @@ struct DataInitInfo
 
 struct DataInitBase : public DataInitInfo
 {
+    virtual ~DataInitBase() = default;
+
     explicit DataInitBase(const DataInitInfo &agg): DataInitInfo(agg) {}
     virtual void conv_array(void *out, void *in, size_t n) {}
     virtual void conv_array_sat(void *out, void *in, size_t n) {}
     virtual void init(const cl_uint &, const cl_uint &) {}
 };
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
+template <typename InType, typename OutType>
 struct DataInfoSpec : public DataInitBase
 {
     explicit DataInfoSpec(const DataInitInfo &agg);
@@ -124,9 +125,8 @@ struct DataInfoSpec : public DataInitBase
     }
 };
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
-DataInfoSpec<InType, OutType, InFP, OutFP>::DataInfoSpec(
-    const DataInitInfo &agg)
+template <typename InType, typename OutType>
+DataInfoSpec<InType, OutType>::DataInfoSpec(const DataInitInfo &agg)
     : DataInitBase(agg), mdv(0)
 {
     if (std::is_same<cl_float, OutType>::value)
@@ -252,8 +252,8 @@ DataInfoSpec<InType, OutType, InFP, OutFP>::DataInfoSpec(
     // clang-format on
 }
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
-float DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int(float f)
+template <typename InType, typename OutType>
+float DataInfoSpec<InType, OutType>::round_to_int(float f)
 {
     static const float magic[2] = { MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23),
                                     -MAKE_HEX_FLOAT(0x1.0p23f, 0x1, 23) };
@@ -281,9 +281,8 @@ float DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int(float f)
     return f;
 }
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
-long long
-DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int_and_clamp(double f)
+template <typename InType, typename OutType>
+long long DataInfoSpec<InType, OutType>::round_to_int_and_clamp(double f)
 {
     static const double magic[2] = { MAKE_HEX_DOUBLE(0x1.0p52, 0x1LL, 52),
                                      MAKE_HEX_DOUBLE(-0x1.0p52, -0x1LL, 52) };
@@ -314,8 +313,8 @@ DataInfoSpec<InType, OutType, InFP, OutFP>::round_to_int_and_clamp(double f)
     return (long long)f;
 }
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
-OutType DataInfoSpec<InType, OutType, InFP, OutFP>::absolute(const OutType &x)
+template <typename InType, typename OutType>
+OutType DataInfoSpec<InType, OutType>::absolute(const OutType &x)
 {
     union {
         cl_uint u;
@@ -332,8 +331,8 @@ OutType DataInfoSpec<InType, OutType, InFP, OutFP>::absolute(const OutType &x)
     return u.f;
 }
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
-void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
+template <typename InType, typename OutType>
+void DataInfoSpec<InType, OutType>::conv(OutType *out, InType *in)
 {
     if (std::is_same<cl_float, InType>::value)
     {
@@ -480,9 +479,8 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv(OutType *out, InType *in)
 #define CLAMP(_lo, _x, _hi)                                                    \
     ((_x) < (_lo) ? (_lo) : ((_x) > (_hi) ? (_hi) : (_x)))
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
-void DataInfoSpec<InType, OutType, InFP, OutFP>::conv_sat(OutType *out,
-                                                          InType *in)
+template <typename InType, typename OutType>
+void DataInfoSpec<InType, OutType>::conv_sat(OutType *out, InType *in)
 {
     if (std::is_floating_point<InType>::value)
     {
@@ -599,9 +597,9 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::conv_sat(OutType *out,
     }
 }
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
-void DataInfoSpec<InType, OutType, InFP, OutFP>::init(const cl_uint &job_id,
-                                                      const cl_uint &thread_id)
+template <typename InType, typename OutType>
+void DataInfoSpec<InType, OutType>::init(const cl_uint &job_id,
+                                         const cl_uint &thread_id)
 {
     uint64_t ulStart = start;
     void *pIn = (char *)gIn + job_id * size * gTypeSizes[inType];
@@ -761,8 +759,8 @@ void DataInfoSpec<InType, OutType, InFP, OutFP>::init(const cl_uint &job_id,
     }
 }
 
-template <typename InType, typename OutType, bool InFP, bool OutFP>
-InType DataInfoSpec<InType, OutType, InFP, OutFP>::clamp(const InType &in)
+template <typename InType, typename OutType>
+InType DataInfoSpec<InType, OutType>::clamp(const InType &in)
 {
     if (std::is_integral<OutType>::value)
     {

From ebea247d321193de2a7cf7aa63aa3fba97286773 Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Tue, 20 Jun 2023 11:46:13 +0200
Subject: [PATCH 7/8] Corrected missing implicit test lost after modernization
 corrections

---
 .../conversions/basic_test_conversions.cpp       |  4 ++--
 .../conversions/basic_test_conversions.h         | 16 ++++++++++------
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp
index 4571be3c3..ffdb948ac 100644
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -450,12 +450,12 @@ cl_int ConversionsTest::SetUp(int elements)
 
 template <typename InType, typename OutType>
 void ConversionsTest::TestTypesConversion(const Type &inType,
-                                          const Type &outType, int &testNumber)
+                                          const Type &outType, int &testNumber,
+                                          int startMinVectorSize)
 {
     SaturationMode sat;
     RoundingMode round;
     int error;
-    int startMinVectorSize = gMinVectorSize;
 
     // skip longs on embedded
     if (!gHasLong
diff --git a/test_conformance/conversions/basic_test_conversions.h b/test_conformance/conversions/basic_test_conversions.h
index cf0e2c66a..aed0601be 100644
--- a/test_conformance/conversions/basic_test_conversions.h
+++ b/test_conformance/conversions/basic_test_conversions.h
@@ -184,7 +184,8 @@ struct ConversionsTest
                RoundingMode round);
 
     template <typename InType, typename OutType>
-    void TestTypesConversion(const Type &inType, const Type &outType, int &tn);
+    void TestTypesConversion(const Type &inType, const Type &outType, int &tn,
+                             const int smvs);
 
 protected:
     cl_context context;
@@ -244,7 +245,7 @@ struct IterOverTypes : public TestType
 {
     IterOverTypes(const TypeIter &typeIter, ConversionsTest &test)
         : inType((Type)0), outType((Type)0), typeIter(typeIter), test(test),
-          testNumber(-1)
+          testNumber(-1), startMinVectorSize(gMinVectorSize)
     {}
 
     void Run() { for_each_out_elem(typeIter); }
@@ -267,7 +268,8 @@ struct IterOverTypes : public TestType
         if (!testType<OutType>(outType)) vlog_error("Unexpected data type!\n");
 
         // run the conversions
-        test.TestTypesConversion<InType, OutType>(inType, outType, testNumber);
+        test.TestTypesConversion<InType, OutType>(inType, outType, testNumber,
+                                                  startMinVectorSize);
         inType = (Type)(inType + 1);
     }
 
@@ -307,6 +309,7 @@ struct IterOverTypes : public TestType
     const TypeIter &typeIter;
     ConversionsTest &test;
     int testNumber;
+    int startMinVectorSize;
 };
 
 
@@ -316,7 +319,7 @@ struct IterOverSelectedTypes : public TestType
     IterOverSelectedTypes(const TypeIter &typeIter, ConversionsTest &test,
                           const Type &in, const Type &out)
         : inType(in), outType(out), typeIter(typeIter), test(test),
-          testNumber(-1)
+          testNumber(-1), startMinVectorSize(gMinVectorSize)
     {}
 
     void Run() { for_each_out_elem(typeIter); }
@@ -335,8 +338,8 @@ struct IterOverSelectedTypes : public TestType
         if (testType<InType>(inType) && testType<OutType>(outType))
         {
             // run the conversions
-            test.TestTypesConversion<InType, OutType>(inType, outType,
-                                                      testNumber);
+            test.TestTypesConversion<InType, OutType>(
+                inType, outType, testNumber, startMinVectorSize);
         }
     }
 
@@ -374,6 +377,7 @@ struct IterOverSelectedTypes : public TestType
     const TypeIter &typeIter;
     ConversionsTest &test;
     int testNumber;
+    int startMinVectorSize;
 };
 
 

From 3c3d5b23ea08418e3f8c68c6f6315a9748bceeba Mon Sep 17 00:00:00 2001
From: Marcin Hajder <marcin.hajder@gmail.com>
Date: Wed, 21 Jun 2023 12:38:45 +0200
Subject: [PATCH 8/8] Corrected single, selected test to limit number of
 unnecessary operations

---
 .../conversions/basic_test_conversions.cpp      |  3 ++-
 .../conversions/basic_test_conversions.h        | 17 +++++++++++------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/test_conformance/conversions/basic_test_conversions.cpp b/test_conformance/conversions/basic_test_conversions.cpp
index ffdb948ac..43fb449bc 100644
--- a/test_conformance/conversions/basic_test_conversions.cpp
+++ b/test_conformance/conversions/basic_test_conversions.cpp
@@ -406,7 +406,8 @@ cl_int CustomConversionsTest::Run()
                 gMinVectorSize = 0;
         }
 
-        IterOverSelectedTypes iter(typeIterator, *this, inType, outType);
+        IterOverSelectedTypes iter(typeIterator, *this, inType, outType, round,
+                                   sat);
 
         iter.Run();
 
diff --git a/test_conformance/conversions/basic_test_conversions.h b/test_conformance/conversions/basic_test_conversions.h
index aed0601be..2314ee748 100644
--- a/test_conformance/conversions/basic_test_conversions.h
+++ b/test_conformance/conversions/basic_test_conversions.h
@@ -317,9 +317,11 @@ struct IterOverTypes : public TestType
 struct IterOverSelectedTypes : public TestType
 {
     IterOverSelectedTypes(const TypeIter &typeIter, ConversionsTest &test,
-                          const Type &in, const Type &out)
-        : inType(in), outType(out), typeIter(typeIter), test(test),
-          testNumber(-1), startMinVectorSize(gMinVectorSize)
+                          const Type in, const Type out,
+                          const RoundingMode round, const SaturationMode sat)
+        : inType(in), outType(out), rounding(round), saturation(sat),
+          typeIter(typeIter), test(test), testNumber(-1),
+          startMinVectorSize(gMinVectorSize)
     {}
 
     void Run() { for_each_out_elem(typeIter); }
@@ -337,9 +339,9 @@ struct IterOverSelectedTypes : public TestType
     {
         if (testType<InType>(inType) && testType<OutType>(outType))
         {
-            // run the conversions
-            test.TestTypesConversion<InType, OutType>(
-                inType, outType, testNumber, startMinVectorSize);
+            // run selected conversion
+            // testing of the result will happen afterwards
+            test.DoTest<InType, OutType>(outType, inType, saturation, rounding);
         }
     }
 
@@ -374,6 +376,9 @@ struct IterOverSelectedTypes : public TestType
 protected:
     Type inType;
     Type outType;
+    RoundingMode rounding;
+    SaturationMode saturation;
+
     const TypeIter &typeIter;
     ConversionsTest &test;
     int testNumber;