Coherent Multithreaded Single Level Cache Example

#include<fstream>
#include<iostream>
#include<vector>
#include"NWaySetAssociativeMultiThreadCache.h"
// or #include"DirectMappedMultiThreadCache.h"
int main()
{
	  int imageWidth, imageHeight, maxN;
	  double minR, maxR, minI, maxI;
	  const int tileSize = 8;

	  maxN = 512;
	  minR = -1.5;
	  maxR = 0.7;
	  minI = -1.0;
	  maxI = 1.0;

	  size_t cacheScaling = 1;
	  for (int cacheScalingIter = 0; cacheScalingIter < 7; cacheScalingIter++) {
		imageWidth = tileSize*4*cacheScaling;
		imageHeight = tileSize*4*cacheScaling;
	    cacheScaling *= 2;

	    const int cacheSize = 1024 * 512;

	    // multithreaded read/write to vector is ok as long as each key is guarded by its own lock (which NWaySetAssociativeMultiThreadCache does)
	    std::vector < int > backingStore(imageWidth*imageHeight);

            // first parameter is number of sets
            // second parameter is number of tags per set
            // total size = tags * sets
            // You can also use DirectMappedMultiThreadCache <int,int> (totalSize,funcMissRead,funcMissWrite)
	    NWaySetAssociativeMultiThreadCache <int, int > cache(1024,cacheSize/(1024),[ & ](int key) {
	    	// no two threads will read same key simultaneously here
	    	// safe
	        return backingStore[key];
	      },
	      [ & ](int key, int value) {
		    // no two threads will write to same key simultaneously here
	    	// safe
	        backingStore[key] = value;
	      });

	    std::ofstream output("output_image_scaling" + std::to_string(cacheScaling) + ".ppm");
	    output << "P6" << std::endl;
	    output << imageWidth << " " << imageHeight << std::endl;
	    output << "255" << std::endl;

	    for (int i = 0; i < imageHeight; i++) {
	      for (int j = 0; j < imageWidth; j++) {
	        double cr = mapToReal(j, imageWidth, minR, maxR);
	        double ci = mapToImaginary(i, imageHeight, minI, maxI);

	        cache.setThreadSafe(i + j * imageWidth, findMandelbrot(cr, ci, maxN));


	      }
	    }

	    // benchmark
	    {

	      const int nGauss = 9;
	      const int GaussianBlurKernel[nGauss] = {
	        1,        2,        1,
	        2,        4,        2,
	        1,        2,        1
	      };
	      size_t nRepeats = 5;
	      size_t nReadsPerIter = nGauss;
	      size_t nWritesPerIter = 1;
	      size_t totalLookups = nRepeats * (imageHeight - 2) * (imageWidth - 2) * (nReadsPerIter + nWritesPerIter);
	      CpuBenchmarker bench(totalLookups * sizeof(int), "cacheSize = " + std::to_string(cacheSize) +"  image size="+std::to_string(imageWidth)+"x"+std::to_string(imageHeight) + "   performance", totalLookups);

	      // image softening (may not be accurate)
	      // column-major iteration better for the L1 but doesn't matter for L2
	      // "nRepeats" iterations better for benchmarking
	      // tiled-computing to make even better cache-hit ratio
	      for (size_t k = 0; k < nRepeats; k++) {
			#pragma omp parallel for
	        for (int tiley = 0; tiley < imageHeight/tileSize; tiley++) {
	          for (int tilex = 0; tilex < imageWidth/tileSize; tilex++) {
	            for (int jj = 0; jj < tileSize; jj++) {
	              for (int ii = 0; ii < tileSize; ii++) {
	                const int i = tilex * tileSize + ii;
	                const int j = tiley * tileSize + jj;
	                if (j >= 1 && j <= imageHeight - 2 && i >= 1 && i <= imageWidth - 2) {
	                  unsigned int pixelAccumulator1 = 0;
	                  unsigned int pixelAccumulator2 = 0;
	                  unsigned int pixelAccumulator3 = 0;

	                  pixelAccumulator1 += cache.getThreadSafe(i - 1 + (j - 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-1 + 1) * 3];
	                  pixelAccumulator2 += cache.getThreadSafe(i + 0 + (j - 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-1 + 1) * 3];
	                  pixelAccumulator3 += cache.getThreadSafe(i + 1 + (j - 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-1 + 1) * 3];

	                  pixelAccumulator1 += cache.getThreadSafe(i - 1 + (j - 0) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-0 + 1) * 3];
	                  pixelAccumulator2 += cache.getThreadSafe(i + 0 + (j - 0) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-0 + 1) * 3];
	                  pixelAccumulator3 += cache.getThreadSafe(i + 1 + (j - 0) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-0 + 1) * 3];

	                  pixelAccumulator1 += cache.getThreadSafe(i - 1 + (j + 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (+1 + 1) * 3];
	                  pixelAccumulator2 += cache.getThreadSafe(i + 0 + (j + 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (+1 + 1) * 3];
	                  pixelAccumulator3 += cache.getThreadSafe(i + 1 + (j + 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (+1 + 1) * 3];

	                  const int n = (pixelAccumulator1 + pixelAccumulator2 + pixelAccumulator3) >> 4;
	                  cache.setThreadSafe(i + j * imageWidth, n);
	                }
	              }
	            }
	          }
	        }
	      }


	    }
	    for (int i = 0; i < imageHeight; i++) {
	      for (int j = 0; j < imageWidth; j++) {

	        int n = cache.getThreadSafe(i + j * imageWidth);

	        int r = ((int) sqrt(n) % 256);
	        int gr = (2 * n % 256);
	        int b = (n % 256);

	        output << (char) r << (char) gr << (char) b;
	      }

	    }
	    std::cout << "Finished!" << std::endl;

	    cache.flush(); // every thread needs to call flush on its own CacheThreader object after work is complete
	    output.flush();

	  }
	return 0;
}

Results on FX8150 (3.6 GHz):

cacheSize = 524288  image size=32x32   performance: 3199512 nanoseconds     (bandwidth = 56.26 MB/s)      (throughput = 71.10 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=64x64   performance: 7966699 nanoseconds     (bandwidth = 96.50 MB/s)      (throughput = 41.45 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=128x128   performance: 24354866 nanoseconds     (bandwidth = 130.37 MB/s)      (throughput = 30.68 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=256x256   performance: 69921027 nanoseconds     (bandwidth = 184.54 MB/s)      (throughput = 21.68 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=512x512   performance: 183688499 nanoseconds     (bandwidth = 283.20 MB/s)      (throughput = 14.12 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=1024x1024   performance: 889042572 nanoseconds     (bandwidth = 234.97 MB/s)      (throughput = 17.02 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=2048x2048   performance: 5144069803 nanoseconds     (bandwidth = 162.76 MB/s)      (throughput = 24.58 nanoseconds per iteration) 
Finished!

14 nanoseconds per pixel access = ~70 million lookups per second, coherently with writes and reads (but order of reads/writes are not taken into consideration here, only cache coherency tested, not image quality)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Coherent Multithreaded Single Level Cache Example

Clone this wiki locally