diff --git a/benchmarks/babelstream/src/babelStreamCommon.hpp b/benchmarks/babelstream/src/babelStreamCommon.hpp index a22f7d032d3..a625fcedf6c 100644 --- a/benchmarks/babelstream/src/babelStreamCommon.hpp +++ b/benchmarks/babelstream/src/babelStreamCommon.hpp @@ -28,55 +28,105 @@ namespace [[maybe_unused]] constexpr auto minArrSize = 1024 * 128; // Scalar value for Mul and Triad kernel parameters. - [[maybe_unused]] constexpr auto scalarVal = 2.0f; + [[maybe_unused]] constexpr double scalarVal = 0.4; // Block thread extent for DotKernel test work division parameters. [[maybe_unused]] constexpr auto blockThreadExtentMain = 1024; + [[maybe_unused]] constexpr auto dotGridBlockExtent = 256; // Number of runs for each kernel, can be changed by command line arguments. // At least 100 runs are recommended for good benchmarking. // To prevent timeouts in CI, a small value is used. [[maybe_unused]] auto numberOfRuns = 2; - // Data input value for babelstream. - [[maybe_unused]] constexpr auto valA = 1.0f; + // Data input values for babelstream. + [[maybe_unused]] constexpr double valA = 0.1; + [[maybe_unused]] constexpr double valB = 0.2; + // Change this if triad kernel is going to be run alone + [[maybe_unused]] constexpr double valC = 0.0; + + //! Values corresponding to the command line argument run-kernels + enum class KernelsToRun + { + All, // init, add, copy, mul, triad + Triad, // only init and triad + NStream // only init and nstream + }; + + [[maybe_unused]] KernelsToRun kernelsToBeExecuted{KernelsToRun::All}; //! handleCustomArguments Gets custom cmd line arguments from the all arguments. //! Namely gets --array-size=1234 and --number-runs=1234 and keeps the others which are //! command line args for Catch2 session. [[maybe_unused]] static void handleCustomArguments(int& argc, char* argv[]) { - std::vector newArgv; - newArgv.push_back(argv[0]); // Keep the program name + std::vector newArgv({argv[0]}); // keep program name for(int i = 1; i < argc; ++i) { std::string arg = argv[i]; if(arg.rfind("--array-size=", 0) == 0) { - auto const arrSize = std::stoi(arg.substr(13)); // Convert to integer - if(arrSize > minArrSize) + try { - arraySizeMain = arrSize; - std::cout << "Array size provided(items): " << arraySizeMain << std::endl; + // Convert argument to integer + auto arrSize = std::stoi(arg.substr(13)); + if(arrSize > minArrSize) + { + arraySizeMain = arrSize; + std::cout << "Array size set to: " << arraySizeMain << std::endl; + } + else + { + std::cout << "Array size too small. Must be at least " << minArrSize + << ", using default: " << arraySizeMain << std::endl; + } } - else + catch(std::invalid_argument const&) { - std::cout << "Too small array size given. Must be at least " << minArrSize << std::endl; - std::cout << "Using default array size(number of items): " << arraySizeMain << std::endl; + std::cerr << "Invalid array size argument: " << arg << ". Default value used." << std::endl; } } else if(arg.rfind("--number-runs=", 0) == 0) { - auto const numRuns = std::stoi(arg.substr(14)); // Convert to integer - if(numRuns > 0) + try { - numberOfRuns = numRuns; - std::cout << "Number of runs provided: " << numberOfRuns << std::endl; + // Convert argument to integer + auto const numRuns = std::stoi(arg.substr(14)); + if(numRuns > 0) + { + numberOfRuns = numRuns; + std::cout << "Number of runs provided: " << numberOfRuns << std::endl; + } + else + { + std::cout << "Using default number of runs: " << numberOfRuns << std::endl; + } } - else + catch(std::invalid_argument const&) { - std::cout << "Using default number of runs: " << numberOfRuns << std::endl; + std::cerr << "Invalid number of runs argument: " << arg << " . Default value used." << std::endl; + } + } + else if(arg.rfind("--run-kernels=", 0) == 0) + { + // Get argument to determine which kernels will be run + auto const kernelsString = arg.substr(14); + if(kernelsString == "nstream") + { + std::cout << "Only nstream kernel will be executed." << std::endl; + kernelsToBeExecuted = KernelsToRun::NStream; + } + else if(kernelsString == "triad") + { + kernelsToBeExecuted = KernelsToRun::Triad; + std::cout << "Only triad kernel will be executed." << std::endl; + } + else if(kernelsString == "all") + { + // The variable kernelsToBeExecuted default value is "all"; + kernelsToBeExecuted = KernelsToRun::All; + std::cout << "All 5 babelstream kernels are going to be executed." << std::endl; } } else @@ -87,8 +137,13 @@ namespace if(arg.rfind("-?", 0) == 0 || arg.rfind("--help", 0) == 0 || arg.rfind("-h", 0) == 0) { std::cout << "Usage of custom arguments (arguments which are not Catch2): --array-size=33554432 and " - "--number-runs=100" + "--number-runs=100\n" << std::endl; + std::cout + << "If you want to run only nstream kernel or triad kernel use --run-kernels=nstream or " + "--run-kernels=triad. Otherwise all 5 standard kernels will be executed. Init, Copy, Mul, Add, " + "Triad. (and Dot kernel, if multi-threaded acc is used.)" + << std::endl; } } @@ -98,6 +153,12 @@ namespace { argv[i] = newArgv[static_cast(i)]; } + + // Array size must a multiple of + if(arraySizeMain % blockThreadExtentMain != 0) + throw std::runtime_error( + "Array size is " + std::to_string(arraySizeMain) + ". It must be a multiple of block-size " + + std::to_string(blockThreadExtentMain)); } //! FuzzyEqual compares two floating-point or integral type values. @@ -111,7 +172,7 @@ namespace { if constexpr(std::is_floating_point_v) { - return std::fabs(a - b) < std::numeric_limits::epsilon() * static_cast(100.0); + return std::fabs(a - b) < (std::numeric_limits::epsilon() * static_cast(100.0)); } else if constexpr(std::is_integral_v) { @@ -219,6 +280,7 @@ namespace WorkDivTriad, WorkDivMult, WorkDivDot, + WorkDivNStream, DeviceName, TimeUnit, KernelNames, @@ -279,6 +341,8 @@ namespace return "WorkDivMult "; case BMInfoDataType::WorkDivDot: return "WorkDivDot "; + case BMInfoDataType::WorkDivNStream: + return "WorkDivNStream"; default: return ""; } @@ -353,11 +417,13 @@ namespace { std::stringstream ss; // define lambda to add values to a string stream created already - auto addItemValue = [&, this](BMInfoDataType item) { - ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item); + auto addItemValue = [&, this](BMInfoDataType item) + { + if(metaDataMap.count(item) != 0) + ss << "\n" << typeToTypeStr(item) << ":" << metaDataMap.at(item); }; - // Initially chose some data to serialize + // Initially chose some data to serialize from the meta-data map to add to string stream ss << "\n"; addItemValue(BMInfoDataType::AcceleratorType); addItemValue(BMInfoDataType::NumRuns); @@ -369,9 +435,8 @@ namespace addItemValue(BMInfoDataType::WorkDivMult); addItemValue(BMInfoDataType::WorkDivAdd); addItemValue(BMInfoDataType::WorkDivTriad); - if(metaDataMap.count(BMInfoDataType::WorkDivDot) != 0) - addItemValue(BMInfoDataType::WorkDivDot); - + addItemValue(BMInfoDataType::WorkDivDot); + addItemValue(BMInfoDataType::WorkDivNStream); auto getItemFromStrList = [this](BMInfoDataType item, int index) -> std::string { std::string const str = metaDataMap.at(item); diff --git a/benchmarks/babelstream/src/babelStreamMainTest.cpp b/benchmarks/babelstream/src/babelStreamMainTest.cpp index 79ec6216508..aeddba0ea61 100644 --- a/benchmarks/babelstream/src/babelStreamMainTest.cpp +++ b/benchmarks/babelstream/src/babelStreamMainTest.cpp @@ -12,21 +12,26 @@ #include /** - * Babelstream benchmarking example. Babelstream has 5 kernels. Add, Multiply, Copy, Triad and Dot. - * Babelstream is a memory-bound benchmark since the main operation in the kernels has high Code Balance (bytes/FLOP) - * value. For example c[i] = a[i] + b[i]; has 2 reads 1 writes and has one FLOP operation. For double precision each - * read-write is 8 bytes. Hence Code Balance (3*8 / 1) = 24 bytes/FLOP. + * Babelstream benchmarking example. Babelstream has 5 kernels. Add, Multiply, Copy, Triad and Dot. NStream is + * optional. Babelstream is a memory-bound benchmark since the main operation in the kernels has high Code Balance + * (bytes/FLOP) value. For example c[i] = a[i] + b[i]; has 2 reads 1 writes and has one FLOP operation. For double + * precision each read-write is 8 bytes. Hence Code Balance (3*8 / 1) = 24 bytes/FLOP. * * Some implementations and the documents are accessible through https://github.com/UoB-HPC * * Can be run with custom arguments as well as catch2 arguments - * Run with Custom arguments: + * Run with Custom arguments and for kernels: init,copy, mul, add, triad (and dot kernel if a multi-thread acc + * available): * ./babelstream --array-size=33554432 --number-runs=100 - * Runt with default array size and num runs: + * Run with Custom arguments and select from 3 kernel groups: all, triad, nstream + * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=triad (only triad kernel) + * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=nstream (only nstream kernel) + * ./babelstream --array-size=33554432 --number-runs=100 --run-kernels=all (default case. Add, Multiply, Copy, Triad + * and Dot) Run with default array size and num runs: * ./babelstream - * Run with Catch2 arguments and defaul arrary size and num runs: + * Run with Catch2 arguments and default arrary size and num runs: * ./babelstream --success - * ./babelstream -r a.xml + * ./babelstream -r xml * Run with Custom and catch2 arguments together: * ./babelstream --success --array-size=1280000 --number-runs=10 * Help to list custom and catch2 arguments @@ -59,11 +64,11 @@ struct InitKernel //! \param a Pointer for vector a //! \param initA the value to set all items in the vector template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA) const + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T* b, T* c, T initA, T initB) const { auto const [i] = alpaka::getIdx(acc); a[i] = initA; - b[i] = static_cast(0.0); + b[i] = initB; c[i] = static_cast(0.0); } }; @@ -76,12 +81,12 @@ struct CopyKernel //! \tparam T The data type //! \param acc The accelerator to be executed on. //! \param a Pointer for vector a - //! \param b Pointer for vector b + //! \param c Pointer for vector c template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* b) const + ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T* c) const { auto const [index] = alpaka::getIdx(acc); - b[index] = a[index]; + c[index] = a[index]; } }; @@ -92,14 +97,14 @@ struct MultKernel //! \tparam TAcc The accelerator environment to be executed on. //! \tparam T The data type //! \param acc The accelerator to be executed on. - //! \param a Pointer for vector a + //! \param c Pointer for vector c //! \param b Pointer for result vector b template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const a, T* b) const + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* const c, T* b) const { const T scalar = static_cast(scalarVal); auto const [i] = alpaka::getIdx(acc); - b[i] = scalar * a[i]; + b[i] = scalar * c[i]; } }; @@ -132,11 +137,23 @@ struct TriadKernel //! \param b Pointer for vector b //! \param c Pointer for result vector c template - ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* c) const + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const { const T scalar = static_cast(scalarVal); auto const [i] = alpaka::getIdx(acc); - c[i] = a[i] + scalar * b[i]; + a[i] = b[i] + scalar * c[i]; + } +}; + +//! Optional kernel, not one of the 5 standard Babelstream kernels +struct NstreamKernel +{ + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, T* a, T const* b, T const* c) const + { + const T scalar = static_cast(scalarVal); + auto const [i] = alpaka::getIdx(acc); + a[i] += b[i] + scalar * c[i]; } }; @@ -150,7 +167,8 @@ struct DotKernel //! \param acc The accelerator to be executed on. //! \param a Pointer for vector a //! \param b Pointer for vector b - //! \param sum Pointer for result vector consisting sums for each block + //! \param sum Pointer for result vector consisting sums of blocks + //! \param arraySize the size of the array template ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* a, T const* b, T* sum, alpaka::Idx arraySize) const { @@ -186,6 +204,11 @@ struct DotKernel template void testKernels() { + if(kernelsToBeExecuted == KernelsToRun::All) + { + std::cout << "Kernels: Init, Copy, Mul, Add Kernels (and Dot kernel, if acc is multi-thread per block.)" + << std::endl; + } using Acc = TAcc; // Define the index domain // Set the number of dimensions as an integral constant. Set to 1 for 1D. @@ -251,7 +274,8 @@ void testKernels() bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr, - static_cast(valA)); + static_cast(valA), + static_cast(valB)); auto const workDivCopy = alpaka::getValidWorkDiv(kernelCfg, devAcc, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); auto const workDivMult @@ -267,6 +291,14 @@ void testKernels() bufAccInputBPtr, bufAccOutputCPtr); + auto const workDivNStream = alpaka::getValidWorkDiv( + kernelCfg, + devAcc, + NstreamKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr); + // Vector of average run-times of babelstream kernels std::vector avgExecTimesOfKernels; std::vector minExecTimesOfKernels; @@ -312,136 +344,252 @@ void testKernels() bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr, - static_cast(valA)); + static_cast(valA), + static_cast(valB)); }, "InitKernel"); - // Test the copy-kernel. Copy A one by one to B. - measureKernelExec( - [&]() { alpaka::exec(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccInputBPtr); }, - "CopyKernel"); + if(kernelsToBeExecuted == KernelsToRun::All) + { + // Test the copy-kernel. Copy A one by one to C. + measureKernelExec( + [&]() { alpaka::exec(queue, workDivCopy, CopyKernel(), bufAccInputAPtr, bufAccOutputCPtr); }, + "CopyKernel"); - // Test the scaling-kernel. Calculate B=scalar*A. - measureKernelExec( - [&]() { alpaka::exec(queue, workDivMult, MultKernel(), bufAccInputAPtr, bufAccInputBPtr); }, - "MultKernel"); + // Test the scaling-kernel. Calculate B=scalar*C. Where C = A. + measureKernelExec( + [&]() { alpaka::exec(queue, workDivMult, MultKernel(), bufAccOutputCPtr, bufAccInputBPtr); }, + "MultKernel"); - // Test the addition-kernel. Calculate C=A+B. Where B=scalar*A. - measureKernelExec( - [&]() - { alpaka::exec(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, - "AddKernel"); + // Test the addition-kernel. Calculate C=A+B. Where B=scalar*C or B=scalar*A. + measureKernelExec( + [&]() + { alpaka::exec(queue, workDivAdd, AddKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, + "AddKernel"); + } - // Test the Triad-kernel. Calculate C=A+scalar*B where B=scalar*A. - measureKernelExec( - [&]() - { alpaka::exec(queue, workDivTriad, TriadKernel(), bufAccInputAPtr, bufAccInputBPtr, bufAccOutputCPtr); }, - "TriadKernel"); + if(kernelsToBeExecuted == KernelsToRun::All || kernelsToBeExecuted == KernelsToRun::Triad) + { + // Test the Triad-kernel. Calculate A=B+scalar*C. Where C is A+scalar*A. + measureKernelExec( + [&]() { + alpaka::exec( + queue, + workDivTriad, + TriadKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr); + }, + "TriadKernel"); + } + else if(kernelsToBeExecuted == KernelsToRun::NStream) + { + // Test the NStream-kernel. Calculate A += B + scalar * C; + measureKernelExec( + [&]() { + alpaka::exec( + queue, + workDivNStream, + NstreamKernel(), + bufAccInputAPtr, + bufAccInputBPtr, + bufAccOutputCPtr); + }, + "NstreamKernel"); + } - // Copy arrays back to host + // Copy arrays back to host since the execution of kernels except dot kernel finished alpaka::memcpy(queue, bufHostOutputC, bufAccOutputC, arraySize); alpaka::memcpy(queue, bufHostOutputB, bufAccInputB, arraySize); alpaka::memcpy(queue, bufHostOutputA, bufAccInputA, arraySize); - // Verify the results // - // Find sum of the errors as sum of the differences from expected values - DataType initVal{static_cast(0.0)}; - DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal}; - - auto const expectedC = static_cast(valA + scalarVal * scalarVal * valA); - auto const expectedB = static_cast(scalarVal * valA); - auto const expectedA = static_cast(valA); + // Result Verification and BW Calculation + // - // sum of the errors for each array - for(Idx i = 0; i < arraySize; ++i) + std::vector bandwidthsPerKernel; + std::vector bytesReadWriteMB; + if(kernelsToBeExecuted == KernelsToRun::All) { - sumErrC += bufHostOutputC[static_cast(i)] - expectedC; - sumErrB += bufHostOutputB[static_cast(i)] - expectedB; - sumErrA += bufHostOutputA[static_cast(i)] - expectedA; - } + // Verify the results for all 5 babelstream kernels + // Find sum of the errors as sum of the differences from expected values + DataType initVal{static_cast(0.0)}; + DataType sumErrC{initVal}, sumErrB{initVal}, sumErrA{initVal}; - // Normalize and compare sum of the errors - REQUIRE(FuzzyEqual(sumErrC / static_cast(arraySize) / expectedC, static_cast(0.0))); - REQUIRE(FuzzyEqual(sumErrB / static_cast(arraySize) / expectedB, static_cast(0.0))); - REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); - alpaka::wait(queue); - // Test Dot kernel with specific blocksize which is larger than 1 - if constexpr(alpaka::accMatchesTags) - { - using WorkDiv = alpaka::WorkDivMembers; - // Threads per block for Dot kernel - constexpr Idx blockThreadExtent = blockThreadExtentMain; - // Blocks per grid for Dot kernel - constexpr Idx gridBlockExtent = static_cast(256); - // Vector of sums of each block - auto bufAccSumPerBlock = alpaka::allocBuf(devAcc, gridBlockExtent); - auto bufHostSumPerBlock = alpaka::allocBuf(devHost, gridBlockExtent); - // A specific work-division is used for dotKernel - auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)}; + auto const expectedB = static_cast(scalarVal * valA); + auto const expectedC = static_cast(static_cast(valA) + expectedB); + auto const expectedA = static_cast(expectedB + static_cast(scalarVal) * expectedC); - measureKernelExec( - [&]() - { - alpaka::exec( - queue, - workDivDot, - DotKernel(), // Dot kernel - alpaka::getPtrNative(bufAccInputA), - alpaka::getPtrNative(bufAccInputB), - alpaka::getPtrNative(bufAccSumPerBlock), - static_cast>(arraySize)); - }, - "DotKernel"); + // sum of the errors for each array + for(Idx i = 0; i < arraySize; ++i) + { + sumErrC += std::fabs(bufHostOutputC[static_cast(i)] - expectedC); + sumErrB += std::fabs(bufHostOutputB[static_cast(i)] - expectedB); + sumErrA += std::fabs(bufHostOutputA[static_cast(i)] - expectedA); + } - alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent); + // Normalize and compare sum of the errors + // Use a different equality check if floating point errors exceed precision of FuzzyEqual function + REQUIRE(FuzzyEqual(sumErrC / static_cast(arraySize) / expectedC, static_cast(0.0))); + REQUIRE(FuzzyEqual(sumErrB / static_cast(arraySize) / expectedB, static_cast(0.0))); + REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); alpaka::wait(queue); - DataType const* sumPtr = std::data(bufHostSumPerBlock); - auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0}); - // Since vector values are 1, dot product should be identical to arraySize - REQUIRE(FuzzyEqual(static_cast(result), static_cast(arraySize * 2))); - // Add workdiv to the list of workdivs to print later - metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot); - } + // Test Dot kernel with specific blocksize which is larger than 1 + if constexpr(alpaka::accMatchesTags) + { + using WorkDiv = alpaka::WorkDivMembers; + // Threads per block for Dot kernel + constexpr Idx blockThreadExtent = blockThreadExtentMain; + // Blocks per grid for Dot kernel + const Idx gridBlockExtent = static_cast(dotGridBlockExtent); + // Vector of sums of each block + auto bufAccSumPerBlock = alpaka::allocBuf(devAcc, gridBlockExtent); + auto bufHostSumPerBlock = alpaka::allocBuf(devHost, gridBlockExtent); + // A specific work-division is used for dotKernel + auto const workDivDot = WorkDiv{Vec{gridBlockExtent}, Vec{blockThreadExtent}, Vec::all(1)}; + + measureKernelExec( + [&]() + { + alpaka::exec( + queue, + workDivDot, + DotKernel(), // Dot kernel + alpaka::getPtrNative(bufAccInputA), + alpaka::getPtrNative(bufAccInputB), + alpaka::getPtrNative(bufAccSumPerBlock), + static_cast>(arraySize)); + }, + "DotKernel"); + + alpaka::memcpy(queue, bufHostSumPerBlock, bufAccSumPerBlock, gridBlockExtent); + alpaka::wait(queue); + DataType const* sumPtr = std::data(bufHostSumPerBlock); + auto const result = std::reduce(sumPtr, sumPtr + gridBlockExtent, DataType{0}); - // - // Calculate and Display Benchmark Results - // - std::vector bytesReadWriteMB = { - getDataThroughput(2u, static_cast(arraySize)), - getDataThroughput(2u, static_cast(arraySize)), - getDataThroughput(2u, static_cast(arraySize)), - getDataThroughput(3u, static_cast(arraySize)), - getDataThroughput(3u, static_cast(arraySize)), - getDataThroughput(2u, static_cast(arraySize)), - }; + auto const expectedSum = static_cast(arraySize) * expectedA * expectedB; + // Dot product should be identical to arraySize*valA*valB + // Use a different equality check if floating point errors exceed precision of FuzzyEqual function + REQUIRE( + FuzzyEqual((static_cast(result) - expectedSum) / expectedSum, static_cast(0.0))); - // calculate the bandwidth as throughput per seconds - std::vector bandwidthsPerKernel; - if(minExecTimesOfKernels.size() == kernelLabels.size()) + // Add workdiv to the list of workdivs to print later + metaData.setItem(BMInfoDataType::WorkDivDot, workDivDot); + } + + // + // Calculate and Display Benchmark Results for Kernels + // + bytesReadWriteMB = { + getDataThroughput(2u, static_cast(arraySize)), // init + getDataThroughput(2u, static_cast(arraySize)), // copy + getDataThroughput(2u, static_cast(arraySize)), // mul + getDataThroughput(3u, static_cast(arraySize)), // add + getDataThroughput(3u, static_cast(arraySize)), // triad + getDataThroughput(2u, static_cast(arraySize)), // dot + }; + + // calculate the bandwidth as throughput per seconds + + if(minExecTimesOfKernels.size() == kernelLabels.size()) + { + for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i) + { + bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i))); + } + } + + metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit); + metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy); + metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd); + metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult); + metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad); + + } // for all 5 kernels and dot if accelerator is suitable + + // Verify the Triad Kernel for "--run-kernels=triad". Namely, for only-triad run case and calculate the bandwidth + else if(kernelsToBeExecuted == KernelsToRun::Triad) + { + // Verify triad + DataType sumErrA{0.0}; + auto const expectedA = static_cast(valB + scalarVal * valC); + + // sum of the errors for each array + for(Idx i = 0; i < arraySize; ++i) + { + sumErrA += std::fabs(bufHostOutputA[static_cast(i)] - expectedA); + } + + REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); + + // Calculate and record benchmark results + bytesReadWriteMB = { + getDataThroughput(2u, static_cast(arraySize)), // init + getDataThroughput(3u, static_cast(arraySize)) // triad + }; + + // calculate the bandwidth as throughput per seconds + if(minExecTimesOfKernels.size() == kernelLabels.size()) + { + for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i) + { + bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i))); + } + } + metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad); + } + + // Verify the NStream Kernel for "--run-kernels=nstream". Namely for the only-nstream run case and calculate the + // bandwidth + else if(kernelsToBeExecuted == KernelsToRun::NStream) { - for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i) + DataType sumErrA{0.0}; + // initial run of NStream kernel + DataType expectedA = static_cast(valA); + // each run changes the result + for(int i = 0; i < numberOfRuns; i++) + { + expectedA += static_cast(valB + scalarVal * valC); + } + + // sum of the errors for each array + for(Idx i = 0; i < arraySize; ++i) + { + sumErrA += std::fabs(bufHostOutputA[static_cast(i)] - expectedA); + } + + REQUIRE(FuzzyEqual(sumErrA / static_cast(arraySize) / expectedA, static_cast(0.0))); + + + // Calculate and record benchmark results + bytesReadWriteMB = { + getDataThroughput(2u, static_cast(arraySize)), // init + getDataThroughput(4u, static_cast(arraySize)) // NStream + }; + + // Calculate the Bandwidth as Throughput per Seconds + if(minExecTimesOfKernels.size() == kernelLabels.size()) { - bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i))); + for(size_t i = 0; i < minExecTimesOfKernels.size(); ++i) + { + bandwidthsPerKernel.push_back(calculateBandwidth(bytesReadWriteMB.at(i), minExecTimesOfKernels.at(i))); + } } + metaData.setItem(BMInfoDataType::WorkDivNStream, workDivNStream); } + + // For all options // Setting fields of Benchmark Info map. All information about benchmark and results are stored in a single map metaData.setItem(BMInfoDataType::TimeStamp, getCurrentTimestamp()); metaData.setItem(BMInfoDataType::NumRuns, std::to_string(numberOfRuns)); metaData.setItem(BMInfoDataType::DataSize, std::to_string(arraySizeMain)); metaData.setItem(BMInfoDataType::DataType, dataTypeStr); - - metaData.setItem(BMInfoDataType::WorkDivInit, workDivInit); - metaData.setItem(BMInfoDataType::WorkDivCopy, workDivCopy); - metaData.setItem(BMInfoDataType::WorkDivAdd, workDivAdd); - metaData.setItem(BMInfoDataType::WorkDivMult, workDivMult); - metaData.setItem(BMInfoDataType::WorkDivTriad, workDivTriad); - // Device and accelerator metaData.setItem(BMInfoDataType::DeviceName, alpaka::getName(devAcc)); metaData.setItem(BMInfoDataType::AcceleratorType, alpaka::getAccName()); @@ -462,15 +610,15 @@ void testKernels() using TestAccs1D = alpaka::test::EnabledAccs, std::uint32_t>; // Run for all Accs given by the argument -TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels", "[benchmark-test]", TestAccs1D) +TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Kernels", "[benchmark-test]", TestAccs1D) { using Acc = TestType; // Run tests for the float data type testKernels(); } -// Run for all Accs given by the argument -TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Five Kernels", "[benchmark-test]", TestAccs1D) +// // Run for all Accs given by the argument +TEMPLATE_LIST_TEST_CASE("TEST: Babelstream Kernels", "[benchmark-test]", TestAccs1D) { using Acc = TestType; // Run tests for the double data type