From fc921ca3c94a2021e65db81e60d62bb370c4e988 Mon Sep 17 00:00:00 2001 From: Michal Babej Date: Fri, 5 May 2023 09:27:48 +0300 Subject: [PATCH] samples/hip-cuda: convert tests to report also Best iteration time original code reports Average iteration time --- .../BinomialOption/BinomialOption.cpp | 53 ++++++++--------- samples/hip-cuda/BitonicSort/BitonicSort.cpp | 32 ++++++---- samples/hip-cuda/DCT/DCT.cpp | 25 ++++++-- .../FastWalshTransform/FastWalshTransform.cpp | 47 ++++++++------- .../hip-cuda/FloydWarshall/FloydWarshall.cpp | 52 +++++++++------- samples/hip-cuda/dwtHaar1D/dwtHaar1D.cpp | 59 +++++++++---------- 6 files changed, 149 insertions(+), 119 deletions(-) diff --git a/samples/hip-cuda/BinomialOption/BinomialOption.cpp b/samples/hip-cuda/BinomialOption/BinomialOption.cpp index 0aa7b1303..9d32af82c 100644 --- a/samples/hip-cuda/BinomialOption/BinomialOption.cpp +++ b/samples/hip-cuda/BinomialOption/BinomialOption.cpp @@ -53,6 +53,7 @@ class BinomialOption { double setupTime; /**< Time taken to setup resources and building kernel */ double kernelTime; /**< Time taken to run kernel and read result back */ + float bestIterTimeMS; int numSamples; /**< No. of samples*/ unsigned int samplesPerVectorWidth; /**< No. of samples per vector width */ unsigned int numSteps; /**< No. 
of time steps*/ @@ -75,18 +76,14 @@ class BinomialOption * Initialize member variables */ BinomialOption() - : setupTime(0), - kernelTime(0), - randArray(NULL), - output(NULL), - refOutput(NULL), - iterations(1) - { - numSamples = 256; - numSteps = 254; - sampleArgs = new HIPCommandArgs() ; - sampleTimer = new SDKTimer(); - sampleArgs->sampleVerStr = SAMPLE_VERSION; + : setupTime(0), kernelTime(0), + bestIterTimeMS(std::numeric_limits<float>::max()), + randArray(NULL), output(NULL), refOutput(NULL), iterations(1) { + numSamples = 256; + numSteps = 254; + sampleArgs = new HIPCommandArgs(); + sampleTimer = new SDKTimer(); + sampleArgs->sampleVerStr = SAMPLE_VERSION; } inline long long get_time() @@ -347,7 +344,7 @@ BinomialOption::runKernels() hipEventCreate(&start); hipEventCreate(&stop); - float eventMs = 1.0f; + float eventMs = 1000000.0f; unsigned int localThreads = {numSteps + 1}; @@ -360,14 +357,15 @@ BinomialOption::runKernels() 0, 0, numSteps ,(float4*)randBuffer ,(float4*)outBuffer); + hipMemcpy(output, dout, samplesPerVectorWidth * sizeof(float4), + hipMemcpyDeviceToHost); + hipEventRecord(stop, NULL); hipEventSynchronize(stop); hipEventElapsedTime(&eventMs, start, stop); - -// printf ("kernel_time (hipEventElapsedTime) =%6.3fms\n", eventMs); - - hipMemcpy(output, dout, samplesPerVectorWidth * sizeof(float4), hipMemcpyDeviceToHost); + if (eventMs < bestIterTimeMS) + bestIterTimeMS = eventMs; return SDK_SUCCESS; } @@ -539,8 +537,8 @@ int BinomialOption::run() if(!sampleArgs->quiet) { - printArray("input", randArray, numSamples, 1); - printArray("Output", output, numSamples, 1); + printArray("input", randArray, numSamples, 1); + printArray("Output", output, numSamples, 1); } return SDK_SUCCESS; @@ -589,23 +587,20 @@ void BinomialOption::printStats() { if(sampleArgs->timing) { - std::string strArray[4] = - { - "Option Samples", - "Time(sec)", - "Transfer+kernel(sec)" , - "Options/sec" - }; + std::string strArray[5] = {"Option Samples", "Time(sec)", + 
"Transfer+kernel(sec)", + "Best Iter. time (msec)", "Options/sec"}; sampleTimer->totalTime = setupTime + kernelTime; - std::string stats[4]; + std::string stats[5]; stats[0] = toString(numSamples, std::dec); stats[1] = toString(sampleTimer->totalTime, std::dec); stats[2] = toString(kernelTime, std::dec); - stats[3] = toString(numSamples / sampleTimer->totalTime, std::dec); + stats[3] = toString(bestIterTimeMS, std::dec); + stats[4] = toString(numSamples / sampleTimer->totalTime, std::dec); - printStatistics(strArray, stats, 4); + printStatistics(strArray, stats, 5); } } diff --git a/samples/hip-cuda/BitonicSort/BitonicSort.cpp b/samples/hip-cuda/BitonicSort/BitonicSort.cpp index 814f60d8a..eadd86104 100644 --- a/samples/hip-cuda/BitonicSort/BitonicSort.cpp +++ b/samples/hip-cuda/BitonicSort/BitonicSort.cpp @@ -44,6 +44,7 @@ class BitonicSort double totalKernelTime; /**< Time for kernel execution */ double totalProgramTime; /**< Time for program execution */ double referenceKernelTime; /**< Time for reference implementation */ + float bestIterTimeMS; unsigned int sortFlag; /**< Flag to indicate sorting order */ std::string sortOrder; /**< Argument to indicate sorting order */ unsigned int *input; /**< Input array */ @@ -69,6 +70,7 @@ class BitonicSort length = 32768; setupTime = 0; totalKernelTime = 0; + bestIterTimeMS = std::numeric_limits<float>::max(); iterations = 1; sampleArgs = new HIPCommandArgs() ; sampleTimer = new SDKTimer(); @@ -270,7 +272,7 @@ BitonicSort::runKernels(void) hipEventCreate(&start); hipEventCreate(&stop); - float eventMs = 1.0f; + float eventMs = 1000000.0f; unsigned int numStages = 0; unsigned int temp; @@ -331,12 +333,12 @@ BitonicSort::runKernels(void) sortFlag = 0; } + hipEventRecord(start, NULL); for(stage = 0; stage < numStages; ++stage) { for(passOfStage = 0; passOfStage < stage + 1; ++passOfStage) { - hipEventRecord(start, NULL); hipLaunchKernelGGL(bitonicSort, dim3(globalThreads/localThreads), @@ -344,15 +346,18 @@ 
BitonicSort::runKernels(void) 0, 0, inputBuffer ,stage, passOfStage ,sortFlag); - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - - hipEventElapsedTime(&eventMs, start, stop); - // printf ("kernel_time (hipEventElapsedTime) =%6.3fms\n", eventMs); } } - hipMemcpy(input, din,length * sizeof(unsigned int), hipMemcpyDeviceToHost); + hipMemcpy(input, din, length * sizeof(unsigned int), hipMemcpyDeviceToHost); + + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + if (eventMs < bestIterTimeMS) + bestIterTimeMS = eventMs; + return SDK_SUCCESS; } @@ -586,17 +591,20 @@ void BitonicSort::printStats() { if(sampleArgs->timing) { - std::string strArray[4] = {"Elements", "Setup Time (sec)", "Avg. Kernel Time (sec)", "Elements/sec"}; - std::string stats[4]; + std::string strArray[5] = {"Elements", "Setup Time (sec)", + "Avg. Kernel Time (sec)", + "Best Iter. Time (msec)", "Elements/sec"}; + std::string stats[5]; sampleTimer->totalTime = ( totalKernelTime/ iterations ); stats[0] = toString(length, std::dec); stats[1] = toString(setupTime, std::dec); stats[2] = toString(sampleTimer->totalTime, std::dec); - stats[3] = toString(( length/sampleTimer->totalTime ), std::dec); + stats[3] = toString(bestIterTimeMS, std::dec); + stats[4] = toString((length / sampleTimer->totalTime), std::dec); - printStatistics(strArray, stats, 4); + printStatistics(strArray, stats, 5); } } int BitonicSort::cleanup() diff --git a/samples/hip-cuda/DCT/DCT.cpp b/samples/hip-cuda/DCT/DCT.cpp index 2d92f6879..4779bc351 100644 --- a/samples/hip-cuda/DCT/DCT.cpp +++ b/samples/hip-cuda/DCT/DCT.cpp @@ -75,6 +75,7 @@ class DCT double totalKernelTime; /**< Time for kernel execution */ double totalProgramTime; /**< Time for program execution */ double referenceKernelTime; /**< Time for reference implementation */ + float bestIterTimeMS; int width; /**< Width of the input array */ int height; /**< height of the input array */ float *input; /**< Input 
array */ @@ -112,6 +113,7 @@ class DCT inverse = 0; setupTime = 0; totalKernelTime = 0; + bestIterTimeMS = std::numeric_limits<float>::max(); iterations = 1; sampleArgs = new HIPCommandArgs() ; sampleTimer = new SDKTimer(); @@ -385,7 +387,7 @@ int DCT::runKernels(void) { hipEvent_t start, stop; - float eventMs = 1.0f; + float eventMs = 1000000.0f; hipEventCreate(&start); hipEventCreate(&stop); @@ -399,15 +401,18 @@ DCT::runKernels(void) 0, 0, outputBuffer ,inputBuffer ,dctBuffer, dct_transBuffer, width, blockWidth, inverse ); + hipMemcpy(output, dout, sizeof(float) * width * height, + hipMemcpyDeviceToHost); + hipEventRecord(stop, NULL); hipEventSynchronize(stop); - hipEventElapsedTime(&eventMs, start, stop); + if (eventMs < bestIterTimeMS) + bestIterTimeMS = eventMs; // printf ("kernel_time (hipEventElapsedTime) =%6.3fms\n", eventMs); - hipMemcpy(output, dout,sizeof(float) * width * height, hipMemcpyDeviceToHost); return SDK_SUCCESS; } @@ -656,8 +661,14 @@ void DCT::printStats() { if(sampleArgs->timing) { - std::string strArray[4] = {"Width", "Height", "Time(sec)", "[Transfer+Kernel]Time(sec)"}; - std::string stats[4]; + std::string strArray[5] = { + "Width", + "Height", + "Time(sec)", + "[Transfer+Kernel]Time(sec)", + "Best Iter. 
time (msec)", + }; + std::string stats[5]; sampleTimer->totalTime = setupTime + totalKernelTime; @@ -665,10 +676,12 @@ void DCT::printStats() stats[1] = toString(height , std::dec); stats[2] = toString(sampleTimer->totalTime, std::dec); stats[3] = toString(totalKernelTime, std::dec); + stats[4] = toString(bestIterTimeMS, std::dec); - printStatistics(strArray, stats, 4); + printStatistics(strArray, stats, 5); } } + int DCT::cleanup() { diff --git a/samples/hip-cuda/FastWalshTransform/FastWalshTransform.cpp b/samples/hip-cuda/FastWalshTransform/FastWalshTransform.cpp index 039ee9ec5..06e8034c3 100644 --- a/samples/hip-cuda/FastWalshTransform/FastWalshTransform.cpp +++ b/samples/hip-cuda/FastWalshTransform/FastWalshTransform.cpp @@ -44,6 +44,7 @@ class FastWalshTransform double setupTime; /**< Time for setting up OpenCL */ double totalKernelTime; /**< Time for kernel execution */ double totalProgramTime; /**< Time for program execution */ + float bestIterTimeMS; double referenceKernelTime; /**< Time for reference implementation */ int length; /**< Length of the input array */ float *input; /**< Input array */ @@ -70,6 +71,7 @@ class FastWalshTransform verificationInput = NULL; setupTime = 0; totalKernelTime = 0; + bestIterTimeMS = std::numeric_limits<float>::max(); iterations = 1; sampleArgs = new HIPCommandArgs() ; sampleTimer = new SDKTimer(); @@ -191,7 +193,10 @@ FastWalshTransform::runKernels(void) hipEventCreate(&start); hipEventCreate(&stop); - float eventMs = 1.0f; + float eventMs = 10000000.0f; + + // Record the start event + hipEventRecord(start, NULL); float *din; hipHostGetDevicePointer((void**)&din, inputBuffer,0); @@ -201,28 +206,24 @@ FastWalshTransform::runKernels(void) int globalThreads = length / 2; int localThreads = 256; - for(int step = 1; step < length; step <<= 1) - { - // Record the start event - hipEventRecord(start, NULL); + for(int step = 1; step < length; step <<= 1) { + hipLaunchKernelGGL(fastWalshTransform, + dim3(globalThreads/localThreads), + 
dim3(localThreads), + 0, 0, + inputBuffer ,step); + } - hipLaunchKernelGGL(fastWalshTransform, - dim3(globalThreads/localThreads), - dim3(localThreads), - 0, 0, - inputBuffer ,step); + hipMemcpy(output, din, length * sizeof(float), hipMemcpyDeviceToHost); hipEventRecord(stop, NULL); hipEventSynchronize(stop); hipEventElapsedTime(&eventMs, start, stop); + if (eventMs < bestIterTimeMS) + bestIterTimeMS = eventMs; -// printf ("kernel_time (hipEventElapsedTime) =%6.3fms\n", eventMs); - } - - hipMemcpy(output, din, length * sizeof(float), hipMemcpyDeviceToHost); - - return SDK_SUCCESS; + return SDK_SUCCESS; } /* @@ -408,16 +409,22 @@ FastWalshTransform::printStats() { if(sampleArgs->timing) { - std::string strArray[3] = {"Length", "Time(sec)", "[Transfer+Kernel]Time(sec)"}; - std::string stats[3]; + std::string strArray[4] = { + "Length", + "Time(sec)", + "[Transfer+Kernel]Time(sec)", + "Best Iter. time (msec)", + }; + std::string stats[4]; - sampleTimer->totalTime = setupTime + totalKernelTime ; + sampleTimer->totalTime = setupTime + totalKernelTime; stats[0] = toString(length, std::dec); stats[1] = toString(sampleTimer->totalTime, std::dec); stats[2] = toString(totalKernelTime, std::dec); + stats[3] = toString(bestIterTimeMS, std::dec); - printStatistics(strArray, stats, 3); + printStatistics(strArray, stats, 4); } } int diff --git a/samples/hip-cuda/FloydWarshall/FloydWarshall.cpp b/samples/hip-cuda/FloydWarshall/FloydWarshall.cpp index 3a0341abb..440219f7c 100644 --- a/samples/hip-cuda/FloydWarshall/FloydWarshall.cpp +++ b/samples/hip-cuda/FloydWarshall/FloydWarshall.cpp @@ -50,6 +50,7 @@ class FloydWarshall double setupTime; /**< Time for setting up Open*/ double totalKernelTime; /**< Time for kernel execution */ double totalProgramTime; /**< Time for program execution */ + float bestIterTimeMS; double referenceKernelTime; /**< Time for reference implementation */ unsigned int numNodes; /**< Number of nodes in the graph */ unsigned int *pathDistanceMatrix; /**< 
path distance array */ @@ -82,6 +83,7 @@ class FloydWarshall verificationPathMatrix = NULL; setupTime = 0; totalKernelTime = 0; + bestIterTimeMS = std::numeric_limits<float>::max(); iterations = 1; blockSize = 16; sampleArgs = new HIPCommandArgs() ; @@ -354,40 +356,40 @@ FloydWarshall::runKernels(void) float *din, *di; - hipHostGetDevicePointer((void**)&din, pathDistanceBuffer,0); - hipHostGetDevicePointer((void**)&di, pathBuffer,0); - - hipMemcpy(din, pathDistanceMatrix, sizeof(unsigned int) * numNodes * numNodes, hipMemcpyHostToDevice); - hipEvent_t start, stop; hipEventCreate(&start); hipEventCreate(&stop); - float eventMs = 1.0f; + float eventMs = 1000000.0f; - for(unsigned int i = 0; i < numPasses; i += 1) - { // Record the start event hipEventRecord(start, NULL); - hipLaunchKernelGGL(floydWarshallPass, - dim3(globalThreads[0]/localThreads[0],globalThreads[1]/localThreads[1]), - dim3(localThreads[0],localThreads[1]), - 0, 0, - pathDistanceBuffer,pathBuffer,numNodes ,i); + hipHostGetDevicePointer((void **)&din, pathDistanceBuffer, 0); + hipHostGetDevicePointer((void **)&di, pathBuffer, 0); - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - - hipEventElapsedTime(&eventMs, start, stop); + hipMemcpy(din, pathDistanceMatrix, + sizeof(unsigned int) * numNodes * numNodes, + hipMemcpyHostToDevice); - //printf ("kernel_time (hipEventElapsedTime) =%6.3fms\n", eventMs); + for (unsigned int i = 0; i < numPasses; i += 1) { + hipLaunchKernelGGL(floydWarshallPass, + dim3(globalThreads[0] / localThreads[0], + globalThreads[1] / localThreads[1]), + dim3(localThreads[0], localThreads[1]), 0, 0, + pathDistanceBuffer, pathBuffer, numNodes, i); } - hipMemcpy(pathDistanceMatrix, din,numNodes * numNodes * sizeof(unsigned int), hipMemcpyDeviceToHost); hipMemcpy(pathMatrix, di,numNodes * numNodes * sizeof(unsigned int), hipMemcpyDeviceToHost); + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + if (eventMs < bestIterTimeMS) + 
bestIterTimeMS = eventMs; + return SDK_SUCCESS; } @@ -602,16 +604,22 @@ void FloydWarshall::printStats() { if(sampleArgs->timing) { - std::string strArray[3] = {"Nodes", "Time(sec)", "[Transfer+Kernel]Time(sec)"}; - std::string stats[3]; + std::string strArray[4] = { + "Nodes", + "Time(sec)", + "[Transfer+Kernel]Time(sec)", + "Best Iter. time (msec)", + }; + std::string stats[4]; sampleTimer->totalTime = setupTime + totalKernelTime; stats[0] = toString(numNodes, std::dec); stats[1] = toString(sampleTimer->totalTime, std::dec); stats[2] = toString(totalKernelTime, std::dec); + stats[3] = toString(bestIterTimeMS, std::dec); - printStatistics(strArray, stats, 3); + printStatistics(strArray, stats, 4); } } diff --git a/samples/hip-cuda/dwtHaar1D/dwtHaar1D.cpp b/samples/hip-cuda/dwtHaar1D/dwtHaar1D.cpp index aa2eb7399..fd7d689f9 100644 --- a/samples/hip-cuda/dwtHaar1D/dwtHaar1D.cpp +++ b/samples/hip-cuda/dwtHaar1D/dwtHaar1D.cpp @@ -50,6 +50,7 @@ class DwtHaar1D float *hOutData; /**< output data calculated on host */ double setupTime; /**< time taken to setup resources and building kernel */ double kernelTime; /**< time taken to run kernel and read result back */ + float bestIterTimeMS; float* inDataBuf; /**< memory buffer for input data */ float* dOutDataBuf; /**< memory buffer for output data */ float* dPartialOutDataBuf; /**< memory buffer for paritial decomposed signal */ @@ -69,19 +70,13 @@ class DwtHaar1D * @param name name of sample (string) */ DwtHaar1D() - : - signalLength(SIGNAL_LENGTH), - setupTime(0), - kernelTime(0), - inData(NULL), - dOutData(NULL), - dPartialOutData(NULL), - hOutData(NULL), - iterations(1) - { - sampleArgs = new HIPCommandArgs() ; - sampleTimer = new SDKTimer(); - sampleArgs->sampleVerStr = SAMPLE_VERSION; + : signalLength(SIGNAL_LENGTH), setupTime(0), kernelTime(0), + bestIterTimeMS(std::numeric_limits<float>::max()), inData(NULL), + dOutData(NULL), dPartialOutData(NULL), hOutData(NULL), + iterations(1) { + sampleArgs = new HIPCommandArgs(); + 
sampleTimer = new SDKTimer(); + sampleArgs->sampleVerStr = SAMPLE_VERSION; } inline long long get_time() @@ -403,26 +398,12 @@ int DwtHaar1D::runDwtHaar1DKernel() hipMemcpy(din, inData, sizeof(float) * curSignalLength, hipMemcpyHostToDevice); - hipEvent_t start, stop; - - hipEventCreate(&start); - hipEventCreate(&stop); - float eventMs = 1.0f; - - // Record the start event - hipEventRecord(start, NULL); - hipLaunchKernelGGL(dwtHaar1D, dim3(globalThreads/localThreads), dim3(localThreads), 0, 0, inDataBuf ,dOutDataBuf ,dPartialOutDataBuf, totalLevels, curSignalLength,levelsDone, maxLevelsOnDevice); - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - - hipEventElapsedTime(&eventMs, start, stop); - // printf ("kernel_time (hipEventElapsedTime) =%6.3fms\n", eventMs); hipMemcpy(dOutData, dout, signalLength * sizeof(float), hipMemcpyDeviceToHost); @@ -453,6 +434,15 @@ DwtHaar1D::runKernels(void) float* temp = (float*)malloc(signalLength * sizeof(float)); memcpy(temp, inData, signalLength * sizeof(float)); + hipEvent_t start, stop; + + hipEventCreate(&start); + hipEventCreate(&stop); + float eventMs = 10000000.0f; + + // Record the start event + hipEventRecord(start, NULL); + levelsDone = 0; int one = 1; while((unsigned int)levelsDone < actualLevels) @@ -493,6 +483,12 @@ DwtHaar1D::runKernels(void) } + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + if (eventMs < bestIterTimeMS) + bestIterTimeMS = eventMs; memcpy(inData, temp, signalLength * sizeof(float)); free(temp); @@ -642,15 +638,18 @@ void DwtHaar1D::printStats() { if(sampleArgs->timing) { - std::string strArray[3] = {"SignalLength", "Time(sec)", "[Transfer+Kernel]Time(sec)"}; + std::string strArray[4] = {"SignalLength", "Time(sec)", + "[Transfer+Kernel]Time(sec)", + "Best Iter. 
time (msec)"}; sampleTimer->totalTime = setupTime + kernelTime; - std::string stats[3]; + std::string stats[4]; stats[0] = toString(signalLength, std::dec); stats[1] = toString(sampleTimer->totalTime, std::dec); stats[2] = toString(kernelTime, std::dec); + stats[3] = toString(bestIterTimeMS, std::dec); - printStatistics(strArray, stats, 3); + printStatistics(strArray, stats, 4); } }