-
Notifications
You must be signed in to change notification settings - Fork 5
Coherent Multithreaded Single Level Cache Example
Hüseyin Tuğrul BÜYÜKIŞIK edited this page Oct 13, 2021
·
2 revisions
#include<fstream>
#include<iostream>
#include<vector>
#include"NWaySetAssociativeMultiThreadCache.h"
// or #include"DirectMappedMultiThreadCache.h"
int main()
{
int imageWidth, imageHeight, maxN;
double minR, maxR, minI, maxI;
const int tileSize = 8;
maxN = 512;
minR = -1.5;
maxR = 0.7;
minI = -1.0;
maxI = 1.0;
size_t cacheScaling = 1;
for (int cacheScalingIter = 0; cacheScalingIter < 7; cacheScalingIter++) {
imageWidth = tileSize*4*cacheScaling;
imageHeight = tileSize*4*cacheScaling;
cacheScaling *= 2;
const int cacheSize = 1024 * 512;
// multithreaded read/write to vector is ok as long as each key is guarded by its own lock (which NWaySetAssociativeMultiThreadCache does)
std::vector < int > backingStore(imageWidth*imageHeight);
// first parameter is number of sets
// second parameter is number of tags per set
// total size = tags * sets
// You can also use DirectMappedMultiThreadCache <int,int> (totalSize,funcMissRead,funcMissWrite)
NWaySetAssociativeMultiThreadCache <int, int > cache(1024,cacheSize/(1024),[ & ](int key) {
// no two threads will read same key simultaneously here
// safe
return backingStore[key];
},
[ & ](int key, int value) {
// no two threads will write to same key simultaneously here
// safe
backingStore[key] = value;
});
std::ofstream output("output_image_scaling" + std::to_string(cacheScaling) + ".ppm");
output << "P6" << std::endl;
output << imageWidth << " " << imageHeight << std::endl;
output << "255" << std::endl;
for (int i = 0; i < imageHeight; i++) {
for (int j = 0; j < imageWidth; j++) {
double cr = mapToReal(j, imageWidth, minR, maxR);
double ci = mapToImaginary(i, imageHeight, minI, maxI);
cache.setThreadSafe(i + j * imageWidth, findMandelbrot(cr, ci, maxN));
}
}
// benchmark
{
const int nGauss = 9;
const int GaussianBlurKernel[nGauss] = {
1, 2, 1,
2, 4, 2,
1, 2, 1
};
size_t nRepeats = 5;
size_t nReadsPerIter = nGauss;
size_t nWritesPerIter = 1;
size_t totalLookups = nRepeats * (imageHeight - 2) * (imageWidth - 2) * (nReadsPerIter + nWritesPerIter);
CpuBenchmarker bench(totalLookups * sizeof(int), "cacheSize = " + std::to_string(cacheSize) +" image size="+std::to_string(imageWidth)+"x"+std::to_string(imageHeight) + " performance", totalLookups);
// image softening (may not be accurate)
// column-major iteration better for the L1 but doesn't matter for L2
// "nRepeats" iterations better for benchmarking
// tiled-computing to make even better cache-hit ratio
for (size_t k = 0; k < nRepeats; k++) {
#pragma omp parallel for
for (int tiley = 0; tiley < imageHeight/tileSize; tiley++) {
for (int tilex = 0; tilex < imageWidth/tileSize; tilex++) {
for (int jj = 0; jj < tileSize; jj++) {
for (int ii = 0; ii < tileSize; ii++) {
const int i = tilex * tileSize + ii;
const int j = tiley * tileSize + jj;
if (j >= 1 && j <= imageHeight - 2 && i >= 1 && i <= imageWidth - 2) {
unsigned int pixelAccumulator1 = 0;
unsigned int pixelAccumulator2 = 0;
unsigned int pixelAccumulator3 = 0;
pixelAccumulator1 += cache.getThreadSafe(i - 1 + (j - 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-1 + 1) * 3];
pixelAccumulator2 += cache.getThreadSafe(i + 0 + (j - 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-1 + 1) * 3];
pixelAccumulator3 += cache.getThreadSafe(i + 1 + (j - 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-1 + 1) * 3];
pixelAccumulator1 += cache.getThreadSafe(i - 1 + (j - 0) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-0 + 1) * 3];
pixelAccumulator2 += cache.getThreadSafe(i + 0 + (j - 0) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-0 + 1) * 3];
pixelAccumulator3 += cache.getThreadSafe(i + 1 + (j - 0) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-0 + 1) * 3];
pixelAccumulator1 += cache.getThreadSafe(i - 1 + (j + 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (+1 + 1) * 3];
pixelAccumulator2 += cache.getThreadSafe(i + 0 + (j + 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (+1 + 1) * 3];
pixelAccumulator3 += cache.getThreadSafe(i + 1 + (j + 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (+1 + 1) * 3];
const int n = (pixelAccumulator1 + pixelAccumulator2 + pixelAccumulator3) >> 4;
cache.setThreadSafe(i + j * imageWidth, n);
}
}
}
}
}
}
}
for (int i = 0; i < imageHeight; i++) {
for (int j = 0; j < imageWidth; j++) {
int n = cache.getThreadSafe(i + j * imageWidth);
int r = ((int) sqrt(n) % 256);
int gr = (2 * n % 256);
int b = (n % 256);
output << (char) r << (char) gr << (char) b;
}
}
std::cout << "Finished!" << std::endl;
cache.flush(); // every thread needs to call flush on its own CacheThreader object after work is complete
output.flush();
}
return 0;
}
Results on FX8150 (3.6 GHz):
cacheSize = 524288 image size=32x32 performance: 3199512 nanoseconds (bandwidth = 56.26 MB/s) (throughput = 71.10 nanoseconds per iteration)
Finished!
cacheSize = 524288 image size=64x64 performance: 7966699 nanoseconds (bandwidth = 96.50 MB/s) (throughput = 41.45 nanoseconds per iteration)
Finished!
cacheSize = 524288 image size=128x128 performance: 24354866 nanoseconds (bandwidth = 130.37 MB/s) (throughput = 30.68 nanoseconds per iteration)
Finished!
cacheSize = 524288 image size=256x256 performance: 69921027 nanoseconds (bandwidth = 184.54 MB/s) (throughput = 21.68 nanoseconds per iteration)
Finished!
cacheSize = 524288 image size=512x512 performance: 183688499 nanoseconds (bandwidth = 283.20 MB/s) (throughput = 14.12 nanoseconds per iteration)
Finished!
cacheSize = 524288 image size=1024x1024 performance: 889042572 nanoseconds (bandwidth = 234.97 MB/s) (throughput = 17.02 nanoseconds per iteration)
Finished!
cacheSize = 524288 image size=2048x2048 performance: 5144069803 nanoseconds (bandwidth = 162.76 MB/s) (throughput = 24.58 nanoseconds per iteration)
Finished!
14 nanoseconds per pixel access = ~70 million lookups per second, coherently with writes and reads (but order of reads/writes are not taken into consideration here, only cache coherency tested, not image quality)