diff --git a/benchmarks/babelstream/src/babelStreamCommon.hpp b/benchmarks/babelstream/src/babelStreamCommon.hpp
index a22f7d032d3..a625fcedf6c 100644
--- a/benchmarks/babelstream/src/babelStreamCommon.hpp
+++ b/benchmarks/babelstream/src/babelStreamCommon.hpp
@@ -28,55 +28,105 @@ namespace
     [[maybe_unused]] constexpr auto minArrSize = 1024 * 128;
 
     // Scalar value for Mul and Triad kernel parameters.
-    [[maybe_unused]] constexpr auto scalarVal = 2.0f;
+    [[maybe_unused]] constexpr double scalarVal = 0.4;
 
     // Block thread extent for DotKernel test work division parameters.
     [[maybe_unused]] constexpr auto blockThreadExtentMain = 1024;
+    [[maybe_unused]] constexpr auto dotGridBlockExtent = 256;
 
     // Number of runs for each kernel, can be changed by command line arguments.
     // At least 100 runs are recommended for good benchmarking.
     // To prevent timeouts in CI, a small value is used.
     [[maybe_unused]] auto numberOfRuns = 2;
 
-    // Data input value for babelstream.
-    [[maybe_unused]] constexpr auto valA = 1.0f;
+    // Data input values for babelstream.
+    [[maybe_unused]] constexpr double valA = 0.1;
+    [[maybe_unused]] constexpr double valB = 0.2;
+    // Change this if triad kernel is going to be run alone
+    [[maybe_unused]] constexpr double valC = 0.0;
+
+    //! Values corresponding to the command line argument run-kernels
+    enum class KernelsToRun
+    {
+        All, // init, add, copy, mul, triad
+        Triad, // only init and triad
+        NStream // only init and nstream
+    };
+
+    [[maybe_unused]] KernelsToRun kernelsToBeExecuted{KernelsToRun::All};
 
     //! handleCustomArguments Gets custom cmd line arguments from the all arguments.
     //! Namely gets --array-size=1234 and --number-runs=1234 and keeps the others which are
     //! command line args for Catch2 session.
     [[maybe_unused]] static void handleCustomArguments(int& argc, char* argv[])
     {
-        std::vector<char*> newArgv;
-        newArgv.push_back(argv[0]); // Keep the program name
+        std::vector<char*> newArgv({argv[0]}); // keep program name
 
         for(int i = 1; i < argc; ++i)
         {
             std::string arg = argv[i];
             if(arg.rfind("--array-size=", 0) == 0)
             {
-                auto const arrSize = std::stoi(arg.substr(13)); // Convert to integer
-                if(arrSize > minArrSize)
+                try
                 {
-                    arraySizeMain = arrSize;
-                    std::cout << "Array size provided(items): " << arraySizeMain << std::endl;
+                    // Convert argument to integer
+                    auto arrSize = std::stoi(arg.substr(13));
+                    if(arrSize > minArrSize)
+                    {
+                        arraySizeMain = arrSize;
+                        std::cout << "Array size set to: " << arraySizeMain << std::endl;
+                    }
+                    else
+                    {
+                        std::cout << "Array size too small. Must be at least " << minArrSize
+                                  << ", using default: " << arraySizeMain << std::endl;
+                    }
                 }
-                else
+                catch(std::invalid_argument const&)
                 {
-                    std::cout << "Too small array size given. Must be at least " << minArrSize << std::endl;
-                    std::cout << "Using default array size(number of items): " << arraySizeMain << std::endl;
+                    std::cerr << "Invalid array size argument: " << arg << ". Default value used." << std::endl;
                 }
             }
             else if(arg.rfind("--number-runs=", 0) == 0)
             {
-                auto const numRuns = std::stoi(arg.substr(14)); // Convert to integer
-                if(numRuns > 0)
+                try
                 {
-                    numberOfRuns = numRuns;
-                    std::cout << "Number of runs provided: " << numberOfRuns << std::endl;
+                    // Convert argument to integer
+                    auto const numRuns = std::stoi(arg.substr(14));
+                    if(numRuns > 0)
+                    {
+                        numberOfRuns = numRuns;
+                        std::cout << "Number of runs provided: " << numberOfRuns << std::endl;
+                    }
+                    else
+                    {
+                        std::cout << "Using default number of runs: " << numberOfRuns << std::endl;
+                    }
                 }
-                else
+                catch(std::invalid_argument const&)
                 {
-                    std::cout << "Using default number of runs: " << numberOfRuns << std::endl;
+                    std::cerr << "Invalid number of runs argument: " << arg << " . Default value used." << std::endl;
+                }
+            }
+            else if(arg.rfind("--run-kernels=", 0) == 0)
+            {
+                // Get argument to determine which kernels will be run
+                auto const kernelsString = arg.substr(14);
+                if(kernelsString == "nstream")
+                {
+                    std::cout << "Only nstream kernel will be executed." << std::endl;
+                    kernelsToBeExecuted = KernelsToRun::NStream;
+                }
+                else if(kernelsString == "triad")
+                {
+                    kernelsToBeExecuted = KernelsToRun::Triad;
+                    std::cout << "Only triad kernel will be executed." << std::endl;
+                }
+                else if(kernelsString == "all")
+                {
+                    // KernelsToRun::All is also the default value of kernelsToBeExecuted.
+                    kernelsToBeExecuted = KernelsToRun::All;
+                    std::cout << "All 5 babelstream kernels are going to be executed." << std::endl;
                 }
             }
             else
@@ -87,8 +137,13 @@ namespace
             if(arg.rfind("-?", 0) == 0 || arg.rfind("--help", 0) == 0 || arg.rfind("-h", 0) == 0)
             {
                 std::cout << "Usage of custom arguments (arguments which are not Catch2):  --array-size=33554432 and "
-                             "--number-runs=100"
+                             "--number-runs=100\n"
                           << std::endl;
+                std::cout
+                    << "If you want to run only nstream kernel or triad kernel use --run-kernels=nstream or  "
+                       "--run-kernels=triad. Otherwise all 5 standard kernels will be executed. Init, Copy, Mul, Add, "
+                       "Triad. (and Dot kernel, if multi-threaded acc is used.)"
+                    << std::endl;
             }
         }
 
@@ -98,6 +153,12 @@ namespace
         {
             argv[i] = newArgv[static_cast<size_t>(i)];
         }
+
+        // Array size must be a multiple of the block thread extent (blockThreadExtentMain).
+        if(arraySizeMain % blockThreadExtentMain != 0)
+            throw std::runtime_error(
+                "Array size is " + std::to_string(arraySizeMain) + ". It must be a multiple of block-size "
+                + std::to_string(blockThreadExtentMain));
     }
 
     //! FuzzyEqual compares two floating-point or integral type values.
@@ -111,7 +172,7 @@ namespace
     {
         if constexpr(std::is_floating_point_v<T>)
         {
-            return std::fabs(a - b) < std::numeric_limits<T>::epsilon() * static_cast<T>(100.0);
+            return std::fabs(a - b) < (std::numeric_limits<T>::epsilon() * static_cast<T>(100.0));
         }
         else if constexpr(std::is_integral_v<T>)
         {
@@ -219,6 +280,7 @@ namespace
         WorkDivTriad,
         WorkDivMult,
         WorkDivDot,
+        WorkDivNStream,
         DeviceName,
         TimeUnit,
         KernelNames,
@@ -279,6 +341,8 @@ namespace
             return "WorkDivMult ";
         case BMInfoDataType::WorkDivDot:
             return "WorkDivDot  ";
+        case BMInfoDataType::WorkDivNStream:
+            return "WorkDivNStream";
         default:
             return "";
         }
@@ -353,11 +417,13 @@ namespace
         {
             std::stringstream ss;
             // define lambda to add values to a string stream created already
-            auto addItemValue = [&, this](BMInfoDataType item) {
-                ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item);
+            auto addItemValue = [&, this](BMInfoDataType item)
+            {
+                if(metaDataMap.count(item) != 0)
+                    ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item);
             };
 
-            // Initially chose some data to serialize
+            // Initially choose some data from the meta-data map to serialize and add to the string stream
             ss << "\n";
             addItemValue(BMInfoDataType::AcceleratorType);
             addItemValue(BMInfoDataType::NumRuns);
@@ -369,9 +435,8 @@ namespace
             addItemValue(BMInfoDataType::WorkDivMult);
             addItemValue(BMInfoDataType::WorkDivAdd);
             addItemValue(BMInfoDataType::WorkDivTriad);
-            if(metaDataMap.count(BMInfoDataType::WorkDivDot) != 0)
-                addItemValue(BMInfoDataType::WorkDivDot);
-
+            addItemValue(BMInfoDataType::WorkDivDot);
+            addItemValue(BMInfoDataType::WorkDivNStream);
             auto getItemFromStrList = [this](BMInfoDataType item, int index) -> std::string
             {
                 std::string const str = metaDataMap.at(item);
diff --git a/benchmarks/babelstream/src/babelStreamMainTest.cpp b/benchmarks/babelstream/src/babelStreamMainTest.cpp
index 79ec6216508..aeddba0ea61 100644
--- a/benchmarks/babelstream/src/babelStreamMainTest.cpp
+++ b/benchmarks/babelstream/src/babelStreamMainTest.cpp
@@ -12,21 +12,26 @@
 #include <string>
 
 /**
- * Babelstream benchmarking example. Babelstream has 5 kernels. Add, Multiply, Copy, Triad and Dot.
- * Babelstream is a memory-bound benchmark since the main operation in the kernels has high Code Balance (bytes/FLOP)
- * value. For example c[i] = a[i] + b[i]; has 2 reads 1 writes and has one FLOP operation. For double precision each
- * read-write is 8 bytes. Hence Code Balance (3*8 / 1) = 24 bytes/FLOP.
+ * Babelstream benchmarking example. Babelstream has 5 kernels. Add, Multiply, Copy, Triad and Dot. NStream is
+ * optional. Babelstream is a memory-bound benchmark since the main operation in the kernels has high Code Balance
+ * (bytes/FLOP) value. For example c[i] = a[i] + b[i]; has 2 reads 1 writes and has one FLOP operation. For double
+ * precision each read-write is 8 bytes. Hence Code Balance (3*8 / 1) = 24 bytes/FLOP.
  *
  * Some implementations and the documents are accessible through https://github.com/UoB-HPC
  *
  * Can be run with custom arguments as well as catch2 arguments
- * Run with Custom arguments:
+ * Run with custom arguments for the kernels init, copy, mul, add, triad (and the dot kernel if a multi-threaded
+ * acc is available):
  * ./babelstream --array-size=33554432 --number-runs=100
- * Runt with default array size and num runs:
+ * Run with Custom arguments and select from 3 kernel groups: all, triad, nstream
+ * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=triad (only triad kernel)
+ * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=nstream (only nstream kernel)
+ * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=all (default: Add, Multiply, Copy, Triad, Dot)
+ * Run with default array size and num runs:
  * ./babelstream
- * Run with Catch2 arguments and defaul arrary size and num runs:
+ * Run with Catch2 arguments and default array size and num runs:
  * ./babelstream --success
- * ./babelstream -r a.xml
+ * ./babelstream -r xml
  * Run with Custom and catch2 arguments together:
  * ./babelstream  --success --array-size=1280000 --number-runs=10
  * Help to list custom and catch2 arguments
@@ -59,11 +64,11 @@ struct InitKernel
     //! \param a Pointer for vector a
     //! \param initA the value to set all items in the vector
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA, T initB) const
     {
         auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
         a[i] = initA;
-        b[i] = static_cast<T>(0.0);
+        b[i] = initB;
         c[i] = static_cast<T>(0.0);
     }
 };
@@ -76,12 +81,12 @@ struct CopyKernel
     //! \tparam T The data type
     //! \param acc The accelerator to be executed on.
     //! \param a Pointer for vector a
-    //! \param b Pointer for vector b
+    //! \param c Pointer for vector c
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* b) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* c) const
     {
         auto const [index] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        b[index] = a[index];
+        c[index] = a[index];
     }
 };
 
@@ -92,14 +97,14 @@ struct MultKernel
     //! \tparam TAcc The accelerator environment to be executed on.
     //! \tparam T The data type
     //! \param acc The accelerator to be executed on.
-    //! \param a Pointer for vector a
+    //! \param c Pointer for vector c
     //! \param b Pointer for result vector b
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const a, T* b) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const c, T* b) const
     {
         const T scalar = static_cast<T>(scalarVal);
         auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        b[i] = scalar * a[i];
+        b[i] = scalar * c[i];
     }
 };
 
@@ -132,11 +137,23 @@ struct TriadKernel
     //! \param b Pointer for vector b
     //! \param c Pointer for result vector c
     template<typename TAcc, typename T>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const
     {
         const T scalar = static_cast<T>(scalarVal);
         auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
-        c[i] = a[i] + scalar * b[i];
+        a[i] = b[i] + scalar * c[i];
+    }
+};
+
+//! Optional kernel, not one of the 5 standard Babelstream kernels
+struct NstreamKernel
+{
+    template<typename TAcc, typename T>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const
+    {
+        const T scalar = static_cast<T>(scalarVal);
+        auto const [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        a[i] += b[i] + scalar * c[i];
     }
 };
 
@@ -150,7 +167,8 @@ struct DotKernel
     //! \param acc The accelerator to be executed on.
     //! \param a Pointer for vector a
     //! \param b Pointer for vector b
-    //! \param sum Pointer for result vector consisting sums for each block
+    //! \param sum Pointer for result vector consisting of the sums of the blocks
+    //! \param arraySize the size of the array
     template<typename TAcc, typename T>
     ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, alpaka::Idx<TAcc> arraySize) const
     {
@@ -186,6 +204,11 @@ struct DotKernel
 template<typename TAcc, typename DataType>
 void testKernels()
 {
+    if(kernelsToBeExecuted == KernelsToRun::All)
+    {
+        std::cout << "Kernels: Init, Copy, Mul, Add Kernels (and Dot kernel, if acc is multi-thread per block.)"
+                  << std::endl;
+    }
     using Acc = TAcc;
     // Define the index domain
     // Set the number of dimensions as an integral constant. Set to 1 for 1D.
@@ -251,7 +274,8 @@ void testKernels()
         bufAccInputAPtr,
         bufAccInputBPtr,
         bufAccOutputCPtr,
-        static_cast<DataType>(valA));
+        static_cast<DataType>(valA),
+        static_cast<DataType>(valB));
     auto const workDivCopy
         = alpaka::getValidWorkDiv(kernelCfg, devAcc, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr);
     auto const workDivMult
@@ -267,6 +291,14 @@ void testKernels()
         bufAccInputBPtr,
         bufAccOutputCPtr);
 
+    auto const workDivNStream = alpaka::getValidWorkDiv(
+        kernelCfg,
+        devAcc,
+        NstreamKernel(),
+        bufAccInputAPtr,
+        bufAccInputBPtr,
+        bufAccOutputCPtr);
+
     // Vector of average run-times of babelstream kernels
     std::vector<double> avgExecTimesOfKernels;
     std::vector<double> minExecTimesOfKernels;
@@ -312,136 +344,252 @@ void testKernels()
                 bufAccInputAPtr,
                 bufAccInputBPtr,
                 bufAccOutputCPtr,
-                static_cast<DataType>(valA));
+                static_cast<DataType>(valA),
+                static_cast<DataType>(valB));
         },
         "InitKernel");
 
-    // Test the copy-kernel. Copy A one by one to B.
-    measureKernelExec(
-        [&]() { alpaka::exec<Acc>(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); },
-        "CopyKernel");
+    if(kernelsToBeExecuted == KernelsToRun::All)
+    {
+        // Test the copy-kernel. Copy A one by one to C.
+        measureKernelExec(
+            [&]() { alpaka::exec<Acc>(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccOutputCPtr); },
+            "CopyKernel");
 
-    // Test the scaling-kernel. Calculate B=scalar*A.
-    measureKernelExec(
-        [&]() { alpaka::exec<Acc>(queue, workDivMult, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); },
-        "MultKernel");
+        // Test the scaling-kernel. Calculate B=scalar*C. Where C = A.
+        measureKernelExec(
+            [&]() { alpaka::exec<Acc>(queue, workDivMult, MultKernel(), bufAccOutputCPtr, bufAccInputBPtr); },
+            "MultKernel");
 
-    // Test the addition-kernel. Calculate C=A+B. Where B=scalar*A.
-    measureKernelExec(
-        [&]()
-        { alpaka::exec<Acc>(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
-        "AddKernel");
+        // Test the addition-kernel. Calculate C=A+B. Where B=scalar*C or B=scalar*A.
+        measureKernelExec(
+            [&]()
+            { alpaka::exec<Acc>(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
+            "AddKernel");
+    }
 
-    // Test the Triad-kernel. Calculate C=A+scalar*B where B=scalar*A.
-    measureKernelExec(
-        [&]()
-        { alpaka::exec<Acc>(queue, workDivTriad, TriadKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); },
-        "TriadKernel");
+    if(kernelsToBeExecuted == KernelsToRun::All || kernelsToBeExecuted == KernelsToRun::Triad)
+    {
+        // Test the Triad-kernel. Calculate A=B+scalar*C. Where C is A+scalar*A.
+        measureKernelExec(
+            [&]() {
+                alpaka::exec<Acc>(
+                    queue,
+                    workDivTriad,
+                    TriadKernel(),
+                    bufAccInputAPtr,
+                    bufAccInputBPtr,
+                    bufAccOutputCPtr);
+            },
+            "TriadKernel");
+    }
 
+    else if(kernelsToBeExecuted == KernelsToRun::NStream)
+    {
+        // Test the NStream-kernel. Calculate A += B + scalar * C;
+        measureKernelExec(
+            [&]() {
+                alpaka::exec<Acc>(
+                    queue,
+                    workDivNStream,
+                    NstreamKernel(),
+                    bufAccInputAPtr,
+                    bufAccInputBPtr,
+                    bufAccOutputCPtr);
+            },
+            "NstreamKernel");
+    }
 
-    // Copy arrays back to host
+    // Copy arrays back to host now that all kernels except the dot kernel have been executed
     alpaka::memcpy(queue, bufHostOutputC, bufAccOutputC, arraySize);
     alpaka::memcpy(queue, bufHostOutputB, bufAccInputB, arraySize);
     alpaka::memcpy(queue, bufHostOutputA, bufAccInputA, arraySize);
 
-    // Verify the results
     //
-    // Find sum of the errors as sum of the differences from expected values
-    DataType initVal{static_cast<DataType>(0.0)};
-    DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal};
-
-    auto const expectedC = static_cast<DataType>(valA + scalarVal * scalarVal * valA);
-    auto const expectedB = static_cast<DataType>(scalarVal * valA);
-    auto const expectedA = static_cast<DataType>(valA);
+    // Result Verification and BW Calculation
+    //
 
-    // sum of the errors for each array
-    for(Idx i = 0; i < arraySize; ++i)
+    std::vector<double> bandwidthsPerKernel;
+    std::vector<double> bytesReadWriteMB;
+    if(kernelsToBeExecuted == KernelsToRun::All)
     {
-        sumErrC += bufHostOutputC[static_cast<Idx>(i)] - expectedC;
-        sumErrB += bufHostOutputB[static_cast<Idx>(i)] - expectedB;
-        sumErrA += bufHostOutputA[static_cast<Idx>(i)] - expectedA;
-    }
+        // Verify the results for all 5 babelstream kernels
+        // Find sum of the errors as sum of the differences from expected values
+        DataType initVal{static_cast<DataType>(0.0)};
+        DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal};
 
-    // Normalize and compare sum of the errors
-    REQUIRE(FuzzyEqual(sumErrC / static_cast<DataType>(arraySize) / expectedC, static_cast<DataType>(0.0)));
-    REQUIRE(FuzzyEqual(sumErrB / static_cast<DataType>(arraySize) / expectedB, static_cast<DataType>(0.0)));
-    REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
-    alpaka::wait(queue);
 
-    // Test Dot kernel with specific blocksize which is larger than 1
-    if constexpr(alpaka::accMatchesTags<TAcc, alpaka::TagGpuCudaRt, alpaka::TagGpuHipRt, alpaka::TagGpuSyclIntel>)
-    {
-        using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
-        // Threads per block for Dot kernel
-        constexpr Idx blockThreadExtent = blockThreadExtentMain;
-        // Blocks per grid for Dot kernel
-        constexpr Idx gridBlockExtent = static_cast<Idx>(256);
-        // Vector of sums of each block
-        auto bufAccSumPerBlock = alpaka::allocBuf<DataType, Idx>(devAcc, gridBlockExtent);
-        auto bufHostSumPerBlock = alpaka::allocBuf<DataType, Idx>(devHost, gridBlockExtent);
-        // A specific work-division is used for dotKernel
-        auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)};
+        auto const expectedB = static_cast<DataType>(scalarVal * valA);
+        auto const expectedC = static_cast<DataType>(static_cast<DataType>(valA) + expectedB);
+        auto const expectedA = static_cast<DataType>(expectedB + static_cast<DataType>(scalarVal) * expectedC);
 
-        measureKernelExec(
-            [&]()
-            {
-                alpaka::exec<Acc>(
-                    queue,
-                    workDivDot,
-                    DotKernel(), // Dot kernel
-                    alpaka::getPtrNative(bufAccInputA),
-                    alpaka::getPtrNative(bufAccInputB),
-                    alpaka::getPtrNative(bufAccSumPerBlock),
-                    static_cast<alpaka::Idx<Acc>>(arraySize));
-            },
-            "DotKernel");
+        // sum of the errors for each array
+        for(Idx i = 0; i < arraySize; ++i)
+        {
+            sumErrC += std::fabs(bufHostOutputC[static_cast<Idx>(i)] - expectedC);
+            sumErrB += std::fabs(bufHostOutputB[static_cast<Idx>(i)] - expectedB);
+            sumErrA += std::fabs(bufHostOutputA[static_cast<Idx>(i)] - expectedA);
+        }
 
-        alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent);
+        // Normalize and compare sum of the errors
+        // Use a different equality check if floating point errors exceed precision of FuzzyEqual function
+        REQUIRE(FuzzyEqual(sumErrC / static_cast<DataType>(arraySize) / expectedC, static_cast<DataType>(0.0)));
+        REQUIRE(FuzzyEqual(sumErrB / static_cast<DataType>(arraySize) / expectedB, static_cast<DataType>(0.0)));
+        REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
         alpaka::wait(queue);
 
-        DataType const* sumPtr = std::data(bufHostSumPerBlock);
-        auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0});
-        // Since vector values are 1, dot product should be identical to arraySize
-        REQUIRE(FuzzyEqual(static_cast<DataType>(result), static_cast<DataType>(arraySize * 2)));
-        // Add workdiv to the list of workdivs to print later
-        metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot);
-    }
+        // Test Dot kernel with specific blocksize which is larger than 1
+        if constexpr(alpaka::accMatchesTags<TAcc, alpaka::TagGpuCudaRt, alpaka::TagGpuHipRt, alpaka::TagGpuSyclIntel>)
+        {
+            using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
+            // Threads per block for Dot kernel
+            constexpr Idx blockThreadExtent = blockThreadExtentMain;
+            // Blocks per grid for Dot kernel
+            const Idx gridBlockExtent = static_cast<Idx>(dotGridBlockExtent);
+            // Vector of sums of each block
+            auto bufAccSumPerBlock = alpaka::allocBuf<DataType, Idx>(devAcc, gridBlockExtent);
+            auto bufHostSumPerBlock = alpaka::allocBuf<DataType, Idx>(devHost, gridBlockExtent);
+            // A specific work-division is used for dotKernel
+            auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)};
+
+            measureKernelExec(
+                [&]()
+                {
+                    alpaka::exec<Acc>(
+                        queue,
+                        workDivDot,
+                        DotKernel(), // Dot kernel
+                        alpaka::getPtrNative(bufAccInputA),
+                        alpaka::getPtrNative(bufAccInputB),
+                        alpaka::getPtrNative(bufAccSumPerBlock),
+                        static_cast<alpaka::Idx<Acc>>(arraySize));
+                },
+                "DotKernel");
+
+            alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent);
+            alpaka::wait(queue);
 
+            DataType const* sumPtr = std::data(bufHostSumPerBlock);
+            auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0});
 
-    //
-    // Calculate and Display Benchmark Results
-    //
-    std::vector<double> bytesReadWriteMB = {
-        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
-        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
-        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
-        getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize)),
-        getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize)),
-        getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)),
-    };
+            auto const expectedSum = static_cast<DataType>(arraySize) * expectedA * expectedB;
+            //  Dot product should be identical to arraySize * expectedA * expectedB
+            //  Use a different equality check if floating point errors exceed precision of FuzzyEqual function
+            REQUIRE(
+                FuzzyEqual((static_cast<DataType>(result) - expectedSum) / expectedSum, static_cast<DataType>(0.0)));
 
-    // calculate the bandwidth as throughput per seconds
-    std::vector<double> bandwidthsPerKernel;
-    if(minExecTimesOfKernels.size() == kernelLabels.size())
+            // Add workdiv to the list of workdivs to print later
+            metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot);
+        }
+
+        //
+        // Calculate and Display Benchmark Results for Kernels
+        //
+        bytesReadWriteMB = {
+            getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)), // init
+            getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)), // copy
+            getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)), // mul
+            getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize)), // add
+            getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize)), // triad
+            getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)), // dot
+        };
+
+        // calculate the bandwidth as throughput per seconds
+
+        if(minExecTimesOfKernels.size() == kernelLabels.size())
+        {
+            for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i)
+            {
+                bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i)));
+            }
+        }
+
+        metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit);
+        metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy);
+        metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd);
+        metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult);
+        metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad);
+
+    } // for all 5 kernels and dot if accelerator is suitable
+
+    // Verify the Triad Kernel for "--run-kernels=triad". Namely, for only-triad run case and calculate the bandwidth
+    else if(kernelsToBeExecuted == KernelsToRun::Triad)
+    {
+        // Verify triad
+        DataType sumErrA{0.0};
+        auto const expectedA = static_cast<DataType>(valB + scalarVal * valC);
+
+        // sum of the errors for each array
+        for(Idx i = 0; i < arraySize; ++i)
+        {
+            sumErrA += std::fabs(bufHostOutputA[static_cast<Idx>(i)] - expectedA);
+        }
+
+        REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
+
+        // Calculate and record benchmark results
+        bytesReadWriteMB = {
+            getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)), // init
+            getDataThroughput<DataType>(3u, static_cast<unsigned>(arraySize)) // triad
+        };
+
+        // calculate the bandwidth as throughput per seconds
+        if(minExecTimesOfKernels.size() == kernelLabels.size())
+        {
+            for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i)
+            {
+                bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i)));
+            }
+        }
+        metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad);
+    }
+
+    // Verify the NStream Kernel for "--run-kernels=nstream". Namely for the only-nstream run case and calculate the
+    // bandwidth
+    else if(kernelsToBeExecuted == KernelsToRun::NStream)
     {
-        for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i)
+        DataType sumErrA{0.0};
+        // value of A after the init kernel, before any NStream run
+        DataType expectedA = static_cast<DataType>(valA);
+        // each run changes the result
+        for(int i = 0; i < numberOfRuns; i++)
+        {
+            expectedA += static_cast<DataType>(valB + scalarVal * valC);
+        }
+
+        // sum of the errors for each array
+        for(Idx i = 0; i < arraySize; ++i)
+        {
+            sumErrA += std::fabs(bufHostOutputA[static_cast<Idx>(i)] - expectedA);
+        }
+
+        REQUIRE(FuzzyEqual(sumErrA / static_cast<DataType>(arraySize) / expectedA, static_cast<DataType>(0.0)));
+
+
+        // Calculate and record benchmark results
+        bytesReadWriteMB = {
+            getDataThroughput<DataType>(2u, static_cast<unsigned>(arraySize)), // init
+            getDataThroughput<DataType>(4u, static_cast<unsigned>(arraySize)) // NStream
+        };
+
+        // Calculate the Bandwidth as Throughput per Seconds
+        if(minExecTimesOfKernels.size() == kernelLabels.size())
         {
-            bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i)));
+            for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i)
+            {
+                bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i)));
+            }
         }
+        metaData.setItem(BMInfoDataType::WorkDivNStream, workDivNStream);
     }
 
+
+    // For all options
     // Setting fields of Benchmark Info map. All information about benchmark and results are stored in a single map
     metaData.setItem(BMInfoDataType::TimeStamp, getCurrentTimestamp());
     metaData.setItem(BMInfoDataType::NumRuns, std::to_string(numberOfRuns));
     metaData.setItem(BMInfoDataType::DataSize, std::to_string(arraySizeMain));
     metaData.setItem(BMInfoDataType::DataType, dataTypeStr);
-
-    metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit);
-    metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy);
-    metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd);
-    metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult);
-    metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad);
-
     // Device and accelerator
     metaData.setItem(BMInfoDataType::DeviceName, alpaka::getName(devAcc));
     metaData.setItem(BMInfoDataType::AcceleratorType, alpaka::getAccName<Acc>());
@@ -462,15 +610,15 @@ void testKernels()
 using TestAccs1D = alpaka::test::EnabledAccs<alpaka::DimInt<1u>, std::uint32_t>;
 
 // Run for all Accs given by the argument
-TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels<Float>", "[benchmark-test]", TestAccs1D)
+TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Kernels<Float>", "[benchmark-test]", TestAccs1D)
 {
     using Acc = TestType;
     // Run tests for the float data type
     testKernels<Acc, float>();
 }
 
-// Run for all Accs given by the argument
-TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels<Double>", "[benchmark-test]", TestAccs1D)
+// Run for all Accs given by the argument
+TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Kernels<Double>", "[benchmark-test]", TestAccs1D)
 {
     using Acc = TestType;
     // Run tests for the double data type