From e2b8eaed2747af6856f31160b1383f6790334f43 Mon Sep 17 00:00:00 2001
From: mehmet yusufoglu <mehmetyusufoglu01@gmail.com>
Date: Thu, 7 Nov 2024 11:12:22 +0100
Subject: [PATCH] make kernels depend each other, use original variables and
 access order

---
 .../babelstream/src/babelStreamCommon.hpp     | 299 +++++++++--
 .../babelstream/src/babelStreamMainTest.cpp   | 482 +++++++++++-------
 2 files changed, 561 insertions(+), 220 deletions(-)
diff --git a/benchmarks/babelstream/src/babelStreamCommon.hpp b/benchmarks/babelstream/src/babelStreamCommon.hpp
index a22f7d032d3..0a8d0a3db7c 100644
--- a/benchmarks/babelstream/src/babelStreamCommon.hpp
+++ b/benchmarks/babelstream/src/babelStreamCommon.hpp
@@ -8,7 +8,9 @@
 #include <iostream>
 #include <limits>
 #include <map>
+#include <memory>
 #include <numeric>
+#include <ranges>
 #include <sstream>
 #include <stdexcept>
 #include <string>
@@ -28,55 +30,106 @@ namespace
     [[maybe_unused]] constexpr auto minArrSize = 1024 * 128;
 
     // Scalar value for Mul and Triad kernel parameters.
-    [[maybe_unused]] constexpr auto scalarVal = 2.0f;
+    [[maybe_unused]] constexpr double scalarVal = 0.4;
 
     // Block thread extent for DotKernel test work division parameters.
     [[maybe_unused]] constexpr auto blockThreadExtentMain = 1024;
+    [[maybe_unused]] constexpr auto dotGridBlockExtent = 256;
 
     // Number of runs for each kernel, can be changed by command line arguments.
     // At least 100 runs are recommended for good benchmarking.
     // To prevent timeouts in CI, a small value is used.
     [[maybe_unused]] auto numberOfRuns = 2;
 
-    // Data input value for babelstream.
-    [[maybe_unused]] constexpr auto valA = 1.0f;
+    // Data input values for babelstream.
+    [[maybe_unused]] constexpr double initA = 0.1;
+    [[maybe_unused]] constexpr double initB = 0.2;
+    // Change this if triad kernel is going to be run alone
+    [[maybe_unused]] constexpr double initC = 0.0;
+
+    //! Kernel groups that can be selected with the command line argument --run-kernels=all|triad|nstream
+    enum class KernelsToRun
+    {
+        All, // init, then copy, mul, add, triad and dot (the default)
+        Triad, // only init and triad
+        NStream // only init and nstream
+    };
+
+    // Define the variable showing the kernel(s) being run
+    [[maybe_unused]] KernelsToRun kernelsToBeExecuted{KernelsToRun::All};
 
     //! handleCustomArguments Gets custom cmd line arguments from the all arguments.
     //! Namely gets --array-size=1234 and --number-runs=1234 and keeps the others which are
     //! command line args for Catch2 session.
     [[maybe_unused]] static void handleCustomArguments(int& argc, char* argv[])
     {
-        std::vector<char*> newArgv;
-        newArgv.push_back(argv[0]); // Keep the program name
+        std::vector<char*> newArgv({argv[0]}); // keep program name
 
         for(int i = 1; i < argc; ++i)
         {
             std::string arg = argv[i];
             if(arg.rfind("--array-size=", 0) == 0)
             {
-                auto const arrSize = std::stoi(arg.substr(13)); // Convert to integer
-                if(arrSize > minArrSize)
+                try
                 {
-                    arraySizeMain = arrSize;
-                    std::cout << "Array size provided(items): " << arraySizeMain << std::endl;
+                    // Convert argument to integer
+                    auto arrSize = std::stoi(arg.substr(13));
+                    if(arrSize > minArrSize)
+                    {
+                        arraySizeMain = arrSize;
+                        std::cout << "Array size set to: " << arraySizeMain << std::endl;
+                    }
+                    else
+                    {
+                        std::cout << "Array size too small. Must be at least " << minArrSize
+                                  << ", using default: " << arraySizeMain << std::endl;
+                    }
                 }
-                else
+                catch(std::invalid_argument const&)
                 {
-                    std::cout << "Too small array size given. Must be at least " << minArrSize << std::endl;
-                    std::cout << "Using default array size(number of items): " << arraySizeMain << std::endl;
+                    std::cerr << "Invalid array size argument: " << arg << ". Default value used." << std::endl;
                 }
             }
             else if(arg.rfind("--number-runs=", 0) == 0)
             {
-                auto const numRuns = std::stoi(arg.substr(14)); // Convert to integer
-                if(numRuns > 0)
+                try
+                {
+                    // Convert argument to integer
+                    auto const numRuns = std::stoi(arg.substr(14));
+                    if(numRuns > 0)
+                    {
+                        numberOfRuns = numRuns;
+                        std::cout << "Number of runs provided: " << numberOfRuns << std::endl;
+                    }
+                    else
+                    {
+                        std::cout << "Using default number of runs: " << numberOfRuns << std::endl;
+                    }
+                }
+                catch(std::invalid_argument const&)
+                {
+                    std::cerr << "Invalid number of runs argument: " << arg << " . Default value used." << std::endl;
+                }
+            }
+            else if(arg.rfind("--run-kernels=", 0) == 0)
+            {
+                // Get argument to determine which kernels will be run
+                auto const kernelsString = arg.substr(14);
+                if(kernelsString == "nstream")
+                {
+                    std::cout << "Only nstream kernel will be executed." << std::endl;
+                    kernelsToBeExecuted = KernelsToRun::NStream;
+                }
+                else if(kernelsString == "triad")
                 {
-                    numberOfRuns = numRuns;
-                    std::cout << "Number of runs provided: " << numberOfRuns << std::endl;
+                    kernelsToBeExecuted = KernelsToRun::Triad;
+                    std::cout << "Only triad kernel will be executed." << std::endl;
                 }
-                else
+                else if(kernelsString == "all")
                 {
-                    std::cout << "Using default number of runs: " << numberOfRuns << std::endl;
+                    // KernelsToRun::All is also the default value of the variable kernelsToBeExecuted.
+                    kernelsToBeExecuted = KernelsToRun::All;
+                    std::cout << "All 5 babelstream kernels are going to be executed." << std::endl;
                 }
             }
             else
@@ -87,7 +140,11 @@ namespace
             if(arg.rfind("-?", 0) == 0 || arg.rfind("--help", 0) == 0 || arg.rfind("-h", 0) == 0)
             {
                 std::cout << "Usage of custom arguments (arguments which are not Catch2):  --array-size=33554432 and "
-                             "--number-runs=100"
+                             "--number-runs=100\n"
+                          << std::endl;
+                std::cout << "If you want to run only nstream kernel or triad kernel use --run-kernels=nstream or  "
+                             "--run-kernels=triad. Otherwise all 5 standard kernels will be executed. Copy, Mul, Add, "
+                             "Triad. (and Dot kernel, if multi-threaded acc is used.)"
                           << std::endl;
             }
         }
@@ -98,6 +155,12 @@ namespace
         {
             argv[i] = newArgv[static_cast<size_t>(i)];
         }
+
+        // Array size must be a multiple of the block thread extent (blockThreadExtentMain).
+        if(arraySizeMain % blockThreadExtentMain != 0)
+            throw std::runtime_error(
+                "Array size is " + std::to_string(arraySizeMain) + ". It must be a multiple of block-size "
+                + std::to_string(blockThreadExtentMain));
     }
 
     //! FuzzyEqual compares two floating-point or integral type values.
@@ -111,7 +174,7 @@ namespace
     {
         if constexpr(std::is_floating_point_v<T>)
         {
-            return std::fabs(a - b) < std::numeric_limits<T>::epsilon() * static_cast<T>(100.0);
+            return std::fabs(a - b) < (std::numeric_limits<T>::epsilon() * static_cast<T>(100.0));
         }
         else if constexpr(std::is_integral_v<T>)
         {
@@ -219,6 +282,7 @@ namespace
         WorkDivTriad,
         WorkDivMult,
         WorkDivDot,
+        WorkDivNStream,
         DeviceName,
         TimeUnit,
         KernelNames,
@@ -279,6 +343,8 @@ namespace
             return "WorkDivMult ";
         case BMInfoDataType::WorkDivDot:
             return "WorkDivDot  ";
+        case BMInfoDataType::WorkDivNStream:
+            return "WorkDivNStream";
         default:
             return "";
         }
@@ -314,11 +380,158 @@ namespace
         return bytesReadWriteGB / static_cast<double>(runTimeSeconds);
     }
 
+    //! \brief calculateBabelstreamExpectedResults Replays the selected kernels on the initial values, once per run.
+    //! \param expectedA In: initial item value of a. Out: expected item value (all items of an array are equal).
+    //! \param expectedB In: initial item value of b. Out: expected item value (all items of an array are equal).
+    //! \param expectedC In: initial item value of c. Out: expected item value (all items of an array are equal).
+    template<typename T>
+    [[maybe_unused]] static void calculateBabelstreamExpectedResults(T& expectedA, T& expectedB, T& expectedC)
+    {
+        // All items of an array are the same, therefore one expected value stands for the whole array.
+        auto const scalar = static_cast<T>(scalarVal);
+        for(auto run = 0; run < numberOfRuns; ++run)
+        {
+            if(kernelsToBeExecuted == KernelsToRun::All)
+            {
+                expectedC = expectedA; // copy
+                expectedB = scalar * expectedC; // mul
+                expectedC = expectedA + expectedB; // add
+                expectedA = expectedB + scalar * expectedC; // triad
+            }
+            else if(kernelsToBeExecuted == KernelsToRun::Triad)
+            {
+                expectedA = expectedB + scalar * expectedC;
+            }
+            else if(kernelsToBeExecuted == KernelsToRun::NStream)
+            {
+                // NStream accumulates into a, so each run changes the result.
+                expectedA += expectedB + scalar * expectedC;
+            }
+        }
+    }
+
+    /**
+     * \brief The RuntimeResults class bundles the runtime data of the kernels in a map.
+     * The keys of the map are the kernel names, the values are owning pointers to KernelRunData structs.
+     */
+    class RuntimeResults
+    {
+        struct KernelRunData
+        {
+            std::vector<double> timingsSuccessiveRuns; // Stores execution timings of successive runs
+            double byteReadWriteMB{0}; // Data volume of one kernel run, as returned by getDataThroughput
+            double bandwidthKernel{0}; // Bandwidth of kernel, derived from byteReadWriteMB and minExecTime
+            double minExecTime{0}; // Minimum of timingsSuccessiveRuns
+            double maxExecTime{0}; // Maximum of timingsSuccessiveRuns
+            double avgExecTime{0}; // Average of timingsSuccessiveRuns
+        };
+
+    public:
+        // Map from kernelName (string) to the run data of that kernel.
+        // unique_ptr is used for automatic memory management.
+        std::map<std::string, std::unique_ptr<KernelRunData>> kernelToRundataMap;
+
+        // Initializes the byteReadWriteMB field of every kernel that has already been added to the map
+        template<typename DataType>
+        void initializeByteReadWrite(size_t arraySize)
+        {
+            // Throughput per kernel for the given array size. NStream reads a, b, c and writes a: 4 accesses.
+            std::map<std::string, double> const throughputValues
+                = {{"CopyKernel", getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize))},
+                   {"MultKernel", getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize))},
+                   {"AddKernel", getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize))},
+                   {"TriadKernel", getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize))},
+                   {"DotKernel", getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize))},
+                   {"NStreamKernel", getDataThroughput<DataType>(4u, static_cast<unsigned>(arraySize))}};
+
+            // Populate each KernelRunData entry in kernelToRundataMap
+            for(auto const& [kernelName, throughput] : throughputValues)
+            {
+                // Kernels that were not added to the map are skipped; one lookup serves both check and access
+                if(auto const it = kernelToRundataMap.find(kernelName); it != kernelToRundataMap.end())
+                {
+                    // Set the byteReadWriteMB field in the corresponding KernelRunData
+                    it->second->byteReadWriteMB = throughput;
+                }
+            }
+        }
+
+        //! \brief calculateBandwidthsForKernels Calculates the bandwidth of each kernel and updates the execution
+        //! times. Fills the fields bandwidth, execution min-time, execution max-time and execution avg-time.
+        template<typename DataType>
+        void calculateBandwidthsForKernels()
+        {
+            for(auto const& [kernelName, kernelData] : kernelToRundataMap)
+            {
+                // Calculate min and max execution times from recorded vector of times for the kernel named kernelName
+                auto const minmaxPair = findMinMax(kernelData->timingsSuccessiveRuns);
+                kernelData->minExecTime = minmaxPair.first;
+                kernelData->maxExecTime = minmaxPair.second;
+
+                // Calculate average execution time
+                kernelData->avgExecTime = findAverage(kernelData->timingsSuccessiveRuns);
+
+                // Calculate bandwidth based on byteReadWriteMB and min execution time
+                kernelData->bandwidthKernel = calculateBandwidth(kernelData->byteReadWriteMB, minmaxPair.first);
+            }
+        }
+
+        //! \brief Gets one item from each struct in the map and collects the items in a vector (in map key order).
+        //! \tparam Func is the type of the accessor function used to read a specific field.
+        template<typename Func>
+        std::vector<double> getItemFromStructs(Func accessor) const noexcept
+        {
+            std::vector<double> results;
+            for(auto const& [key, dataStruct] : kernelToRundataMap)
+            {
+                results.push_back(accessor(dataStruct.get())); // Hand the raw, non-owning pointer to the accessor
+            }
+            return results;
+        }
+
+        // Functions to retrieve specific fields as vectors for all kernels
+        std::vector<double> getBandwidthKernelVec() const noexcept
+        {
+            return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->bandwidthKernel; });
+        }
+
+        std::vector<double> getThroughputKernelArray() const noexcept
+        {
+            return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->byteReadWriteMB; });
+        }
+
+        std::vector<double> getAvgExecTimeKernelArray() const noexcept
+        {
+            return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->avgExecTime; });
+        }
+
+        std::vector<double> getMinExecTimeKernelArray() const noexcept
+        {
+            return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->minExecTime; });
+        }
+
+        std::vector<double> getMaxExecTimeKernelArray() const noexcept
+        {
+            return getItemFromStructs([](KernelRunData const* dataStruct) { return dataStruct->maxExecTime; });
+        }
+
+        // Registers a kernel: stores a fresh, empty KernelRunData under kernelName (replacing any previous entry)
+        void addKernelTimingsVec(std::string const& kernelName) noexcept
+        {
+            // Use make_unique to create a new KernelRunData object and store it in the map
+            kernelToRundataMap[kernelName] = std::make_unique<KernelRunData>();
+        }
+    };
+
     //! MetaData class to store and serialize benchmark information.
     //! \details The MetaData class includes a single map to keep all benchmark information and provides serialization
     //! methods for generating output.
-    class MetaData
+    class BenchmarkMetaData
     {
+    private:
+        // Information type to string. String can be comma separated values.
+        std::map<BMInfoDataType, std::string> metaDataMap;
+
     public:
         //! setItem  Sets an item in the metadata map.
         //! \tparam T The type of the value to store.
@@ -353,29 +566,31 @@ namespace
         {
             std::stringstream ss;
             // define lambda to add values to a string stream created already
-            auto addItemValue = [&, this](BMInfoDataType item) {
-                ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item);
+            auto addItemValueToSS = [&, this](BMInfoDataType item)
+            {
+                if(metaDataMap.count(item) != 0)
+                    ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item);
             };
 
-            // Initially chose some data to serialize
+            // Initially choose some data to serialize from the meta-data map to add to string stream
             ss << "\n";
-            addItemValue(BMInfoDataType::AcceleratorType);
-            addItemValue(BMInfoDataType::NumRuns);
-            addItemValue(BMInfoDataType::DataType);
-            addItemValue(BMInfoDataType::DataSize);
-            addItemValue(BMInfoDataType::DeviceName);
-            addItemValue(BMInfoDataType::WorkDivInit);
-            addItemValue(BMInfoDataType::WorkDivCopy);
-            addItemValue(BMInfoDataType::WorkDivMult);
-            addItemValue(BMInfoDataType::WorkDivAdd);
-            addItemValue(BMInfoDataType::WorkDivTriad);
-            if(metaDataMap.count(BMInfoDataType::WorkDivDot) != 0)
-                addItemValue(BMInfoDataType::WorkDivDot);
-
+            addItemValueToSS(BMInfoDataType::AcceleratorType);
+            addItemValueToSS(BMInfoDataType::NumRuns);
+            addItemValueToSS(BMInfoDataType::DataType);
+            addItemValueToSS(BMInfoDataType::DataSize);
+            addItemValueToSS(BMInfoDataType::DeviceName);
+            addItemValueToSS(BMInfoDataType::WorkDivInit);
+            addItemValueToSS(BMInfoDataType::WorkDivCopy);
+            addItemValueToSS(BMInfoDataType::WorkDivMult);
+            addItemValueToSS(BMInfoDataType::WorkDivAdd);
+            addItemValueToSS(BMInfoDataType::WorkDivTriad);
+            addItemValueToSS(BMInfoDataType::WorkDivDot);
+            addItemValueToSS(BMInfoDataType::WorkDivNStream);
+
+            // If the item is a string of delimited values, return the value at the given index (starting at 1).
             auto getItemFromStrList = [this](BMInfoDataType item, int index) -> std::string
             {
                 std::string const str = metaDataMap.at(item);
-
                 if(index < 1)
                 {
                     throw std::invalid_argument("Index must be 1 or greater.");
@@ -405,7 +620,7 @@ namespace
                 throw std::out_of_range("Index out of range");
             };
 
-            // Prepare Table
+            // Prepare Table Display
             // Table column names
             ss << std::endl;
             ss << std::left << std::setw(15) << typeToTypeStr(BMInfoDataType::KernelNames) << " " << std::left
@@ -421,7 +636,7 @@ namespace
             // Table rows. Print test results for each kernel line by line
             for(auto i = 1; i <= numberOfKernels; i++)
             {
-                // Print the row for the kernel i
+                // Print the row for the kernel i.
                 ss << " " << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelNames, i) << " ";
                 ss << std::left << std::setw(15) << getItemFromStrList(BMInfoDataType::KernelBandwidths, i) << " ";
                 ss << std::left << std::setw(8) << getItemFromStrList(BMInfoDataType::KernelMinTimes, i) << " ";
@@ -433,8 +648,6 @@ namespace
 
             return ss.str();
         }
-
-    private:
-        std::map<BMInfoDataType, std::string> metaDataMap;
     };
+
 } // namespace
diff --git a/benchmarks/babelstream/src/babelStreamMainTest.cpp b/benchmarks/babelstream/src/babelStreamMainTest.cpp
index 79ec6216508..cd59b4969c4 100644
--- a/benchmarks/babelstream/src/babelStreamMainTest.cpp
+++ b/benchmarks/babelstream/src/babelStreamMainTest.cpp
@@ -9,24 +9,33 @@
 #include <catch2/catch_template_test_macros.hpp>
 #include <catch2/catch_test_macros.hpp>
 
+#include <algorithm>
+#include <iterator>
+#include <ranges>
 #include <string>
 
 /**
- * Babelstream benchmarking example. Babelstream has 5 kernels. Add, Multiply, Copy, Triad and Dot.
- * Babelstream is a memory-bound benchmark since the main operation in the kernels has high Code Balance (bytes/FLOP)
- * value. For example c[i] = a[i] + b[i]; has 2 reads 1 writes and has one FLOP operation. For double precision each
- * read-write is 8 bytes. Hence Code Balance (3*8 / 1) = 24 bytes/FLOP.
+ * Babelstream benchmarking example. Babelstream has 5 kernels: Add, Multiply, Copy, Triad and Dot. NStream is
+ * optional. The Init kernel is run before the sequence of the 5 standard kernels. Babelstream is a memory-bound
+ * benchmark since the main operation in the kernels has a high Code Balance (bytes/FLOP) value. For example
+ * c[i] = a[i] + b[i]; has 2 reads, 1 write and one FLOP. For double precision each read or write is 8 bytes. Hence
+ * the Code Balance is (3*8 / 1) = 24 bytes/FLOP.
  *
  * Some implementations and the documents are accessible through https://github.com/UoB-HPC
  *
  * Can be run with custom arguments as well as catch2 arguments
- * Run with Custom arguments:
+ * Run with custom arguments for the kernels init, copy, mul, add, triad (and the dot kernel if a multi-thread acc
+ * is available):
  * ./babelstream --array-size=33554432 --number-runs=100
- * Runt with default array size and num runs:
+ * Run with custom arguments and select one of the 3 kernel groups: all, triad, nstream
+ * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=triad (only triad kernel)
+ * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=nstream (only nstream kernel)
+ * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=all (default: Add, Multiply, Copy, Triad, Dot)
+ * Run with default array size and num runs:
  * ./babelstream
- * Run with Catch2 arguments and defaul arrary size and num runs:
+ * Run with Catch2 arguments and default array size and num runs:
  * ./babelstream --success
- * ./babelstream -r a.xml
+ * ./babelstream -r xml
  * Run with Custom and catch2 arguments together:
  * ./babelstream  --success --array-size=1280000 --number-runs=10
  * Help to list custom and catch2 arguments
@@ -57,13 +66,14 @@ struct InitKernel
     //! \tparam T The data type
     //! \param acc The accelerator to be executed on.
     //! \param a Pointer for vector a
-    //! \param initA the value to set all items in the vector
+    //! \param initialA the value to set all items in the vector a
+    //! \param initialB the value to set all items in the vector b
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initialA, T initialB) const
     {
         auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        a[i] = initA;
-        b[i] = static_cast<T>(0.0);
+        a[i] = initialA;
+        b[i] = initialB;
         c[i] = static_cast<T>(0.0);
     }
 };
@@ -76,12 +86,12 @@ struct CopyKernel
     //! \tparam T The data type
     //! \param acc The accelerator to be executed on.
     //! \param a Pointer for vector a
-    //! \param b Pointer for vector b
+    //! \param c Pointer for vector c
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* b) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* c) const
     {
         auto const [index] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        b[index] = a[index];
+        c[index] = a[index];
     }
 };
 
@@ -92,14 +102,14 @@ struct MultKernel
     //! \tparam TAcc The accelerator environment to be executed on.
     //! \tparam T The data type
     //! \param acc The accelerator to be executed on.
-    //! \param a Pointer for vector a
+    //! \param c Pointer for vector c
     //! \param b Pointer for result vector b
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const a, T* b) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const c, T* b) const
     {
         const T scalar = static_cast<T>(scalarVal);
         auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        b[i] = scalar * a[i];
+        b[i] = scalar * c[i];
     }
 };
 
@@ -132,11 +142,23 @@ struct TriadKernel
     //! \param b Pointer for vector b
     //! \param c Pointer for result vector c
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const
+    {
+        const T scalar = static_cast<T>(scalarVal);
+        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        a[i] = b[i] + scalar * c[i];
+    }
+};
+
+//! Optional kernel, not one of the 5 standard Babelstream kernels
+struct NstreamKernel
+{
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const
     {
         const T scalar = static_cast<T>(scalarVal);
         auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        c[i] = a[i] + scalar * b[i];
+        a[i] += b[i] + scalar * c[i];
     }
 };
 
@@ -150,7 +172,8 @@ struct DotKernel
     //! \param acc The accelerator to be executed on.
     //! \param a Pointer for vector a
     //! \param b Pointer for vector b
-    //! \param sum Pointer for result vector consisting sums for each block
+    //! \param sum Pointer for result vector consisting of the sums of the blocks
+    //! \param arraySize the size of the array
     template<typename TAcc, typename T>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, alpaka::Idx<TAcc> arraySize) const
     {
@@ -186,15 +209,20 @@ struct DotKernel
 template<typename TAcc, typename DataType>
 void testKernels()
 {
+    if(kernelsToBeExecuted == KernelsToRun::All)
+    {
+        std::cout << "Kernels: Init, Copy, Mul, Add, Triad Kernels (and Dot kernel, if acc is multi-thread per block.)"
+                  << std::endl;
+    }
     using Acc = TAcc;
-    // Define the index domain
     // Set the number of dimensions as an integral constant. Set to 1 for 1D.
     using Dim = alpaka::Dim<Acc>;
     using Idx = alpaka::Idx<Acc>;
+    // A BenchmarkMetaData instance to keep the benchmark info and results to print later. Does not include
+    // intermediate runtime data.
+    BenchmarkMetaData metaData;
 
-    // Meta data
-    // A MetaData class instance to keep the problem and results to print later
-    MetaData metaData;
+    // Convert data-type to string to display
     std::string dataTypeStr;
     if(std::is_same<DataType, float>::value)
     {
@@ -251,7 +279,8 @@ void testKernels()
         bufAccInputAPtr,
         bufAccInputBPtr,
         bufAccOutputCPtr,
-        static_cast<DataType>(valA));
+        static_cast<DataType>(initA),
+        static_cast<DataType>(initB));
     auto const workDivCopy
         = alpaka::getValidWorkDiv(kernelCfg, devAcc, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr);
     auto const workDivMult
@@ -267,194 +296,293 @@ void testKernels()
         bufAccInputBPtr,
         bufAccOutputCPtr);
 
-    // Vector of average run-times of babelstream kernels
-    std::vector<double> avgExecTimesOfKernels;
-    std::vector<double> minExecTimesOfKernels;
-    std::vector<double> maxExecTimesOfKernels;
-    std::vector<std::string> kernelLabels;
-    // Vector for collecting successive run-times of a single kernel in benchmark macro
-    std::vector<double> times;
+    auto const workDivNStream = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        NstreamKernel(),
+        bufAccInputAPtr,
+        bufAccInputBPtr,
+        bufAccOutputCPtr);
+
+    // To record runtime data generated while running the kernels
+    RuntimeResults runtimeResults;
 
     // Lambda for measuring run-time
     auto measureKernelExec = [&](auto&& kernelFunc, [[maybe_unused]] auto&& kernelLabel)
     {
-        for(auto i = 0; i < numberOfRuns; i++)
+        double runtime = 0.0;
+        auto start = std::chrono::high_resolution_clock::now();
+        kernelFunc();
+        alpaka::wait(queue);
+        auto end = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> duration = end - start;
+        runtime = duration.count();
+        runtimeResults.kernelToRundataMap[kernelLabel]->timingsSuccessiveRuns.push_back(runtime);
+    };
+
+    // Run kernels one by one.
+    // Init kernel
+    alpaka::exec<Acc>(
+        queue,
+        workDivInit,
+        InitKernel(),
+        bufAccInputAPtr,
+        bufAccInputBPtr,
+        bufAccOutputCPtr,
+        static_cast<DataType>(initA),
+        static_cast<DataType>(initB));
+
+
+    // Runtime result initialisation to be filled by each kernel
+    float resultDot{0.0f};
+    if(kernelsToBeExecuted == KernelsToRun::All)
+    {
+        runtimeResults.addKernelTimingsVec("CopyKernel");
+        runtimeResults.addKernelTimingsVec("AddKernel");
+        runtimeResults.addKernelTimingsVec("TriadKernel");
+        runtimeResults.addKernelTimingsVec("MultKernel");
+
+        if constexpr(alpaka::accMatchesTags<TAcc, alpaka::TagGpuCudaRt, alpaka::TagGpuHipRt, alpaka::TagGpuSyclIntel>)
         {
-            double runtime = 0.0;
-            auto start = std::chrono::high_resolution_clock::now();
-            kernelFunc();
-            alpaka::wait(queue);
-            auto end = std::chrono::high_resolution_clock::now();
-            std::chrono::duration<double> duration = end - start;
-            runtime = duration.count();
-            times.push_back(runtime);
+            runtimeResults.addKernelTimingsVec("DotKernel");
         }
+    }
+    else if(kernelsToBeExecuted == KernelsToRun::NStream)
+    {
+        runtimeResults.addKernelTimingsVec("NStreamKernel");
+    }
+    else if(kernelsToBeExecuted == KernelsToRun::Triad)
+    {
+        runtimeResults.addKernelTimingsVec("TriadKernel");
+    }
 
-        // find the minimum of the durations array.
-        // In benchmarking the first item of the runtimes array is not included in calculations.
-        const auto minmaxPair = findMinMax(times);
-        minExecTimesOfKernels.push_back(minmaxPair.first);
-        maxExecTimesOfKernels.push_back(minmaxPair.second);
-        avgExecTimesOfKernels.push_back(findAverage(times));
-        kernelLabels.push_back(kernelLabel);
-        times.clear();
-    };
 
-    // Run kernels one by one
-    // Test the init-kernel.
-    measureKernelExec(
-        [&]()
+    // Main for loop to run the kernel-sequence
+    for(auto i = 0; i < numberOfRuns; i++)
+    {
+        if(kernelsToBeExecuted == KernelsToRun::All)
         {
-            alpaka::exec<Acc>(
-                queue,
-                workDivInit,
-                InitKernel(),
-                bufAccInputAPtr,
-                bufAccInputBPtr,
-                bufAccOutputCPtr,
-                static_cast<DataType>(valA));
-        },
-        "InitKernel");
-
-    // Test the copy-kernel. Copy A one by one to B.
-    measureKernelExec(
-        [&]() { alpaka::exec<Acc>(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); },
-        "CopyKernel");
-
-    // Test the scaling-kernel. Calculate B=scalar*A.
-    measureKernelExec(
-        [&]() { alpaka::exec<Acc>(queue, workDivMult, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); },
-        "MultKernel");
-
-    // Test the addition-kernel. Calculate C=A+B. Where B=scalar*A.
-    measureKernelExec(
-        [&]()
-        { alpaka::exec<Acc>(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
-        "AddKernel");
-
-    // Test the Triad-kernel. Calculate C=A+scalar*B where B=scalar*A.
-    measureKernelExec(
-        [&]()
-        { alpaka::exec<Acc>(queue, workDivTriad, TriadKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
-        "TriadKernel");
-
-
-    // Copy arrays back to host
+            // Test the copy-kernel. Copy A one by one to C.
+            measureKernelExec(
+                [&]() { alpaka::exec<Acc>(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccOutputCPtr); },
+                "CopyKernel");
+
+            // Test the scaling-kernel. Calculate B=scalar*C. Where C = A.
+            measureKernelExec(
+                [&]() { alpaka::exec<Acc>(queue, workDivMult, MultKernel(), bufAccOutputCPtr, bufAccInputBPtr); },
+                "MultKernel");
+
+            // Test the addition-kernel. Calculate C=A+B. Where B=scalar*C or B=scalar*A.
+            measureKernelExec(
+                [&]() {
+                    alpaka::exec<Acc>(
+                        queue,
+                        workDivAdd,
+                        AddKernel(),
+                        bufAccInputAPtr,
+                        bufAccInputBPtr,
+                        bufAccOutputCPtr);
+                },
+                "AddKernel");
+        }
+
+        if(kernelsToBeExecuted == KernelsToRun::All || kernelsToBeExecuted == KernelsToRun::Triad)
+        {
+            // Test the Triad-kernel. Calculate A=B+scalar*C. Where C is A+scalar*A.
+            measureKernelExec(
+                [&]() {
+                    alpaka::exec<Acc>(
+                        queue,
+                        workDivTriad,
+                        TriadKernel(),
+                        bufAccInputAPtr,
+                        bufAccInputBPtr,
+                        bufAccOutputCPtr);
+                },
+                "TriadKernel");
+        }
+        if(kernelsToBeExecuted == KernelsToRun::All)
+        {
+            // Test Dot kernel with specific blocksize which is larger than 1
+            if constexpr(alpaka::
+                             accMatchesTags<TAcc, alpaka::TagGpuCudaRt, alpaka::TagGpuHipRt, alpaka::TagGpuSyclIntel>)
+            {
+                using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+                // Threads per block for Dot kernel
+                constexpr Idx blockThreadExtent = blockThreadExtentMain;
+                // Blocks per grid for Dot kernel
+                Idx const gridBlockExtent = static_cast<Idx>(dotGridBlockExtent);
+                // Vector of sums of each block
+                auto bufAccSumPerBlock = alpaka::allocBuf<DataType, Idx>(devAcc, gridBlockExtent);
+                auto bufHostSumPerBlock = alpaka::allocBuf<DataType, Idx>(devHost, gridBlockExtent);
+                // A specific work-division is used for dotKernel
+                auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)};
+
+                measureKernelExec(
+                    [&]()
+                    {
+                        alpaka::exec<Acc>(
+                            queue,
+                            workDivDot,
+                            DotKernel(), // Dot kernel
+                            bufAccInputAPtr,
+                            bufAccInputBPtr,
+                            alpaka::getPtrNative(bufAccSumPerBlock),
+                            static_cast<alpaka::Idx<Acc>>(arraySize));
+                        alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent);
+                        alpaka::wait(queue);
+                        DataType const* sumPtr = std::data(bufHostSumPerBlock);
+                        resultDot = std::reduce(sumPtr, sumPtr + gridBlockExtent, 0.0f);
+                    },
+                    "DotKernel");
+                // Add workdiv to the list of workdivs to print later
+                metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot);
+            }
+        }
+        else if(kernelsToBeExecuted == KernelsToRun::NStream)
+        {
+            // Test the NStream-kernel. Calculate A += B + scalar * C;
+            measureKernelExec(
+                [&]() {
+                    alpaka::exec<Acc>(
+                        queue,
+                        workDivNStream,
+                        NstreamKernel(),
+                        bufAccInputAPtr,
+                        bufAccInputBPtr,
+                        bufAccOutputCPtr);
+                },
+                "NStreamKernel");
+        }
+        alpaka::wait(queue);
+
+    } // End of MAIN LOOP which runs the kernels many times
+
+
+    // Copy arrays back to host after the main loop; the dot kernel result was already copied inside the loop
     alpaka::memcpy(queue, bufHostOutputC, bufAccOutputC, arraySize);
     alpaka::memcpy(queue, bufHostOutputB, bufAccInputB, arraySize);
     alpaka::memcpy(queue, bufHostOutputA, bufAccInputA, arraySize);
 
-    // Verify the results
     //
-    // Find sum of the errors as sum of the differences from expected values
-    DataType initVal{static_cast<DataType>(0.0)};
-    DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal};
+    // Result Verification and BW Calculation for 3 cases
+    //
+
+    // Generate expected values by doing the same chain of operations, to account for floating point error
+    DataType expectedA = static_cast<DataType>(initA);
+    DataType expectedB = static_cast<DataType>(initB);
+    DataType expectedC = static_cast<DataType>(initC);
 
-    auto const expectedC = static_cast<DataType>(valA + scalarVal * scalarVal * valA);
-    auto const expectedB = static_cast<DataType>(scalarVal * valA);
-    auto const expectedA = static_cast<DataType>(valA);
+    // Calculate expected results by applying the same operation sequence on the host
+    calculateBabelstreamExpectedResults(expectedA, expectedB, expectedC);
 
-    // sum of the errors for each array
-    for(Idx i = 0; i < arraySize; ++i)
+    // Verify the resulting data, if kernels are init, copy, mul, add, triad and -depending on acc- dot kernel
+    if(kernelsToBeExecuted == KernelsToRun::All)
     {
-        sumErrC += bufHostOutputC[static_cast<Idx>(i)] - expectedC;
-        sumErrB += bufHostOutputB[static_cast<Idx>(i)] - expectedB;
-        sumErrA += bufHostOutputA[static_cast<Idx>(i)] - expectedA;
-    }
+        // Find sum of the errors as sum of the differences from expected values
+        constexpr DataType initVal{static_cast<DataType>(0.0)};
+        DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal};
 
-    // Normalize and compare sum of the errors
-    REQUIRE(FuzzyEqual(sumErrC / static_cast<DataType>(arraySize) / expectedC, static_cast<DataType>(0.0)));
-    REQUIRE(FuzzyEqual(sumErrB / static_cast<DataType>(arraySize) / expectedB, static_cast<DataType>(0.0)));
-    REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
-    alpaka::wait(queue);
+        // sum of the errors for each array
+        for(Idx i = 0; i < arraySize; ++i)
+        {
+            sumErrC += std::fabs(bufHostOutputC[static_cast<Idx>(i)] - expectedC);
+            sumErrB += std::fabs(bufHostOutputB[static_cast<Idx>(i)] - expectedB);
+            sumErrA += std::fabs(bufHostOutputA[static_cast<Idx>(i)] - expectedA);
+        }
 
-    // Test Dot kernel with specific blocksize which is larger than 1
-    if constexpr(alpaka::accMatchesTags<TAcc, alpaka::TagGpuCudaRt, alpaka::TagGpuHipRt, alpaka::TagGpuSyclIntel>)
-    {
-        using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
-        // Threads per block for Dot kernel
-        constexpr Idx blockThreadExtent = blockThreadExtentMain;
-        // Blocks per grid for Dot kernel
-        constexpr Idx gridBlockExtent = static_cast<Idx>(256);
-        // Vector of sums of each block
-        auto bufAccSumPerBlock = alpaka::allocBuf<DataType, Idx>(devAcc, gridBlockExtent);
-        auto bufHostSumPerBlock = alpaka::allocBuf<DataType, Idx>(devHost, gridBlockExtent);
-        // A specific work-division is used for dotKernel
-        auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)};
-
-        measureKernelExec(
-            [&]()
-            {
-                alpaka::exec<Acc>(
-                    queue,
-                    workDivDot,
-                    DotKernel(), // Dot kernel
-                    alpaka::getPtrNative(bufAccInputA),
-                    alpaka::getPtrNative(bufAccInputB),
-                    alpaka::getPtrNative(bufAccSumPerBlock),
-                    static_cast<alpaka::Idx<Acc>>(arraySize));
-            },
-            "DotKernel");
-
-        alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent);
+        // Normalize and compare sum of the errors
+        // Use a different equality check if floating point errors exceed precision of FuzzyEqual function
+        REQUIRE(FuzzyEqual(sumErrC / static_cast<DataType>(arraySize) / expectedC, static_cast<DataType>(0.0)));
+        REQUIRE(FuzzyEqual(sumErrB / static_cast<DataType>(arraySize) / expectedB, static_cast<DataType>(0.0)));
+        REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
         alpaka::wait(queue);
 
-        DataType const* sumPtr = std::data(bufHostSumPerBlock);
-        auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0});
-        // Since vector values are 1, dot product should be identical to arraySize
-        REQUIRE(FuzzyEqual(static_cast<DataType>(result), static_cast<DataType>(arraySize * 2)));
-        // Add workdiv to the list of workdivs to print later
-        metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot);
-    }
+        // Verify Dot kernel with specific blocksize which is larger than 1
+        if constexpr(alpaka::accMatchesTags<TAcc, alpaka::TagGpuCudaRt, alpaka::TagGpuHipRt, alpaka::TagGpuSyclIntel>)
+        {
+            float const expectedSum = static_cast<float>(static_cast<DataType>(arraySize) * expectedA * expectedB);
 
+            //  Dot product should be identical to arraySize*expectedA*expectedB
+            //  Use a different equality check if floating point errors exceed precision of FuzzyEqual function
+            REQUIRE(FuzzyEqual((resultDot - expectedSum) / expectedSum, 0.0f));
+        }
 
-    //
-    // Calculate and Display Benchmark Results
-    //
-    std::vector<double> bytesReadWriteMB = {
-        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
-        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
-        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
-        getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize)),
-        getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize)),
-        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
-    };
+        // Set workdivs of benchmark metadata to be displayed at the end
+        metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit);
+        metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy);
+        metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd);
+        metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult);
+        metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad);
+
+    } // for kernels add, copy, mul, triad and -if acc is suitable- dot kernel
 
-    // calculate the bandwidth as throughput per seconds
-    std::vector<double> bandwidthsPerKernel;
-    if(minExecTimesOfKernels.size() == kernelLabels.size())
+    // Verify the Triad Kernel result if "--run-kernels=triad".
+    else if(kernelsToBeExecuted == KernelsToRun::Triad)
     {
-        for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i)
+        // Verify triad by summing the error
+        auto sumErrA = static_cast<DataType>(0.0);
+        // sum of the errors for each array
+        for(Idx i = 0; i < arraySize; ++i)
         {
-            bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i)));
+            sumErrA += std::fabs(bufHostOutputA[static_cast<Idx>(i)] - expectedA);
         }
+
+        REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
+        metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad);
     }
 
-    // Setting fields of Benchmark Info map. All information about benchmark and results are stored in a single map
+    // Verify the NStream Kernel result if "--run-kernels=nstream".
+    else if(kernelsToBeExecuted == KernelsToRun::NStream)
+    {
+        auto sumErrA = static_cast<DataType>(0.0);
+        // sum of the errors for each array
+        for(Idx i = 0; i < arraySize; ++i)
+        {
+            sumErrA += std::fabs(bufHostOutputA[static_cast<Idx>(i)] - expectedA);
+        }
+        REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
+
+        metaData.setItem(BMInfoDataType::WorkDivNStream, workDivNStream);
+    }
+
+    // Runtime results of the benchmark: Calculate throughput and bandwidth
+    // Set throughput values depending on the kernels
+    runtimeResults.initializeByteReadWrite<DataType>(arraySize);
+    runtimeResults.calculateBandwidthsForKernels<DataType>();
+
+    // Set metadata to display all benchmark related information.
+    //
+    // All information about benchmark and results are stored in a single map
     metaData.setItem(BMInfoDataType::TimeStamp, getCurrentTimestamp());
     metaData.setItem(BMInfoDataType::NumRuns, std::to_string(numberOfRuns));
     metaData.setItem(BMInfoDataType::DataSize, std::to_string(arraySizeMain));
     metaData.setItem(BMInfoDataType::DataType, dataTypeStr);
-
-    metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit);
-    metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy);
-    metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd);
-    metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult);
-    metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad);
-
     // Device and accelerator
     metaData.setItem(BMInfoDataType::DeviceName, alpaka::getName(devAcc));
     metaData.setItem(BMInfoDataType::AcceleratorType, alpaka::getAccName<Acc>());
     // XML reporter of catch2 always converts to Nano Seconds
     metaData.setItem(BMInfoDataType::TimeUnit, "Nano Seconds");
-    // Join elements and create a comma separated string
-    metaData.setItem(BMInfoDataType::KernelNames, joinElements(kernelLabels, ", "));
-    metaData.setItem(BMInfoDataType::KernelDataUsageValues, joinElements(bytesReadWriteMB, ", "));
-    metaData.setItem(BMInfoDataType::KernelBandwidths, joinElements(bandwidthsPerKernel, ", "));
-    metaData.setItem(BMInfoDataType::KernelMinTimes, joinElements(minExecTimesOfKernels, ", "));
-    metaData.setItem(BMInfoDataType::KernelMaxTimes, joinElements(maxExecTimesOfKernels, ", "));
-    metaData.setItem(BMInfoDataType::KernelAvgTimes, joinElements(avgExecTimesOfKernels, ", "));
 
+    // get labels from the map
+    std::vector<std::string> kernelLabels;
+    std::transform(
+        runtimeResults.kernelToRundataMap.begin(),
+        runtimeResults.kernelToRundataMap.end(),
+        std::back_inserter(kernelLabels),
+        [](auto const& pair) { return pair.first; });
+    // Join elements and create a comma separated string and set item
+    metaData.setItem(BMInfoDataType::KernelNames, joinElements(kernelLabels, ", "));
+    // Join elements and create a comma separated string and set item
+    std::vector<double> values(runtimeResults.getThroughputKernelArray());
+    metaData.setItem(BMInfoDataType::KernelDataUsageValues, joinElements(values, ", "));
+    // Join elements and create a comma separated string and set item
+    std::vector<double> valuesBW(runtimeResults.getBandwidthKernelVec());
+    metaData.setItem(BMInfoDataType::KernelBandwidths, joinElements(valuesBW, ", "));
+
+    metaData.setItem(BMInfoDataType::KernelMinTimes, joinElements(runtimeResults.getMinExecTimeKernelArray(), ", "));
+    metaData.setItem(BMInfoDataType::KernelMaxTimes, joinElements(runtimeResults.getMaxExecTimeKernelArray(), ", "));
+    metaData.setItem(BMInfoDataType::KernelAvgTimes, joinElements(runtimeResults.getAvgExecTimeKernelArray(), ", "));
     // Print the summary as a table, if a standard serialization is needed other functions of the class can be used
     std::cout << metaData.serializeAsTable() << std::endl;
 }
@@ -462,7 +590,7 @@ void testKernels()
 using TestAccs1D = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::uint32_t>;
 
 // Run for all Accs given by the argument
-TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels<Float>", "[benchmark-test]", TestAccs1D)
+TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Kernels<Float>", "[benchmark-test]", TestAccs1D)
 {
     using Acc = TestType;
     // Run tests for the float data type
@@ -470,7 +598,7 @@ TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels<Float>", "[benchmark-tes
 }
 
 // Run for all Accs given by the argument
-TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels<Double>", "[benchmark-test]", TestAccs1D)
+TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Kernels<Double>", "[benchmark-test]", TestAccs1D)
 {
     using Acc = TestType;
     // Run tests for the double data type