Multi Level Cache Benchmark With Gaussian Blur Computation (Integer Key Only)
This benchmark covers the single-threaded read+write usage scenario. For multithreaded access, either use only the getThreadSafe/setThreadSafe methods on a lone cache instance (not multi-level), or use the multi-level cache with read-only access. Read-only access is inherently coherent, but the write path requires extra features (to be added later) to be safe across threads. A minimal sketch of the first option follows.
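The sketch below assumes getThreadSafe/setThreadSafe mirror the signatures of get/set, and uses the same constructor as the benchmark further down; the thread count and key range are arbitrary illustration values, not from the original page.

#include <thread>
#include <vector>
#include "LruClockCache.h"

int main() {
    std::vector<int> backingStore(1000000);

    // A lone (single-level) cache instance; misses and evictions
    // fall through to the backing store via the two lambdas.
    LruClockCache<int, int> cache(1024 * 64,
        [&](int key) { return backingStore[key]; },              // read-miss
        [&](int key, int value) { backingStore[key] = value; }); // write-miss

    // Multiple threads may read AND write this one instance, but only
    // through the thread-safe methods (assumed to mirror get/set).
    std::vector<std::thread> workers;
    for (int t = 0; t < 4; t++) {
        workers.emplace_back([&, t] {
            for (int i = 0; i < 100000; i++) {
                int v = cache.getThreadSafe(i + t);
                cache.setThreadSafe(i + t, v + 1);
            }
        });
    }
    for (auto& w : workers) w.join();
    return 0;
}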
AMD FX-8150 at 3.6 GHz: up to 400 million lookups per second
LLC cache hit ratio (read)=0.889111
LLC cache hit ratio (write)=0.0463966
L1 tags = 16384 L2 tags=65536 LLC tags=262144 performance: 3126881009 nanoseconds (bandwidth = 66.81 MB/s) (throughput = 59.87 nanoseconds per iteration)
Finished!
LLC cache hit ratio (read)=0.89
LLC cache hit ratio (write)=0.11
L1 tags = 32768 L2 tags=131072 LLC tags=524288 performance: 3109395649 nanoseconds (bandwidth = 67.18 MB/s) (throughput = 59.54 nanoseconds per iteration)
Finished!
LLC cache hit ratio (read)=1.00
LLC cache hit ratio (write)=1.00
L1 tags = 65536 L2 tags=262144 LLC tags=1048576 performance: 2882339512 nanoseconds (bandwidth = 72.47 MB/s) (throughput = 55.19 nanoseconds per iteration)
Finished!
LLC cache hit ratio (read)=1.00
LLC cache hit ratio (write)=1.00
L1 tags = 131072 L2 tags=524288 LLC tags=2097152 performance: 2852149115 nanoseconds (bandwidth = 73.24 MB/s) (throughput = 54.61 nanoseconds per iteration)
Finished!
LLC cache hit ratio (read)=1.00
LLC cache hit ratio (write)=1.00
L1 tags = 262144 L2 tags=1048576 LLC tags=4194304 performance: 872660999 nanoseconds (bandwidth = 239.38 MB/s) (throughput = 16.71 nanoseconds per iteration)
Finished!
LLC cache hit ratio (read)=1.00
LLC cache hit ratio (write)=1.00
L1 tags = 524288 L2 tags=2097152 LLC tags=8388608 performance: 739535392 nanoseconds (bandwidth = 282.47 MB/s) (throughput = 14.16 nanoseconds per iteration)
Finished!
LLC cache hit ratio (read)=1.00
LLC cache hit ratio (write)=1.00
L1 tags = 1048576 L2 tags=4194304 LLC tags=16777216 performance: 133906416 nanoseconds (bandwidth = 1560.02 MB/s) (throughput = 2.56 nanoseconds per iteration)
Finished!
LLC cache hit ratio (read)=1.00
LLC cache hit ratio (write)=1.00
L1 tags = 2097152 L2 tags=8388608 LLC tags=33554432 performance: 131743431 nanoseconds (bandwidth = 1585.63 MB/s) (throughput = 2.52 nanoseconds per iteration)
Finished!
Codechef.com server, with cacheScaling initialized to 16 instead of 1 (so the sweep starts at larger cache sizes): up to 750 million lookups per second
LLC cache hit ratio (read)=1
LLC cache hit ratio (write)=1
L1 tags = 262144 L2 tags=1048576 LLC tags=4194304 performance: 590046720 nanoseconds (bandwidth = 354.03 MB/s) (throughput = 11.30 nanoseconds per iteration)
Finished!
LLC cache hit ratio (read)=1.00
LLC cache hit ratio (write)=1.00
L1 tags = 524288 L2 tags=2097152 LLC tags=8388608 performance: 561319930 nanoseconds (bandwidth = 372.15 MB/s) (throughput = 10.75 nanoseconds per iteration)
Finished!
LLC cache hit ratio (read)=1.00
LLC cache hit ratio (write)=1.00
L1 tags = 1048576 L2 tags=4194304 LLC tags=16777216 performance: 69739823 nanoseconds (bandwidth = 2995.37 MB/s) (throughput = 1.34 nanoseconds per iteration)
Finished!
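For reference, the headline figures follow directly from the timings: one benchmark iteration is one cache lookup, so 2.52 nanoseconds per iteration on the FX-8150 works out to 1 / 2.52 ns ≈ 397 million lookups per second (the "up to 400 million" above), and 1.34 nanoseconds per iteration on Codechef.com works out to 1 / 1.34 ns ≈ 746 million lookups per second (the "up to 750 million").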
Source code:
#include <iostream>
#include <fstream>
#include <cmath>
#include "LruClockCache.h"
#include "DirectMappedCache.h"
#include "CacheThreader.h"
#include "CpuBenchmarker.h"

int findMandelbrot(double cr, double ci, int max_iterations) {
    int i = 0;
    double zr = 0.0, zi = 0.0;
    while (i < max_iterations && zr * zr + zi * zi < 4.0) {
        double temp = zr * zr - zi * zi + cr;
        zi = 2.0 * zr * zi + ci;
        zr = temp;
        ++i;
    }
    return i;
}

double mapToReal(int x, int imageWidth, double minR, double maxR) {
    double range = maxR - minR;
    return x * (range / imageWidth) + minR;
}

double mapToImaginary(int y, int imageHeight, double minI, double maxI) {
    double range = maxI - minI;
    return y * (range / imageHeight) + minI;
}

int main() {
    // mandelbrot generation + (gaussian blur X5) using a multi-level cache
    using namespace std;
    int imageWidth, imageHeight, maxN;
    double minR, maxR, minI, maxI;
    imageWidth = 1024;
    imageHeight = 1024;
    maxN = 512;
    minR = -1.5;
    maxR = 0.7;
    minI = -1.0;
    maxI = 1.0;

    size_t cacheScaling = 1;
    for (int cacheScalingIter = 0; cacheScalingIter < 8; cacheScalingIter++) {
        cacheScaling *= 2;

        size_t readmiss = 0;
        size_t writemiss = 0;
        size_t read = 0;
        size_t write = 0;

        const int LLCsize = 1024 * 128 * cacheScaling;
        std::vector<int> backingStore(5000000);

        // LLC: misses and evictions fall through to the backing store,
        // and the lambdas also count LLC read/write misses for the hit-ratio stats.
        auto LLC = std::make_shared<LruClockCache<int, int>>(LLCsize,
            [&](int key) {
                readmiss++;
                return backingStore[key];
            },
            [&](int key, int value) {
                writemiss++;
                backingStore[key] = value;
            });

        const int L2size = LLCsize / 4;
        const int L1size = L2size / 4; // L1 size has to be an integer power of 2 !!!
        CacheThreader<LruClockCache, int, int> cache(LLC, L1size, L2size);

        ofstream output("output_image_scaling" + std::to_string(cacheScaling) + ".ppm", ios::binary);
        output << "P6" << endl;
        output << imageWidth << " " << imageHeight << endl;
        output << "255" << endl;

        // fill the cached "image" with Mandelbrot iteration counts
        for (int i = 0; i < imageHeight; i++) {
            for (int j = 0; j < imageWidth; j++) {
                double cr = mapToReal(j, imageWidth, minR, maxR);
                double ci = mapToImaginary(i, imageHeight, minI, maxI);
                cache.set(i + j * imageWidth, findMandelbrot(cr, ci, maxN));
                write++;
            }
        }

        // benchmark
        {
            const int nGauss = 9;
            const int GaussianBlurKernel[nGauss] = {
                1, 2, 1,
                2, 4, 2,
                1, 2, 1
            };
            size_t nRepeats = 5;
            size_t nReadsPerIter = nGauss;
            size_t nWritesPerIter = 1;
            size_t totalLookups = nRepeats * (imageHeight - 2) * (imageWidth - 2) * (nReadsPerIter + nWritesPerIter);
            CpuBenchmarker bench(totalLookups * sizeof(int),
                "L1 tags = " + std::to_string(L1size) + " L2 tags=" + std::to_string(L2size) +
                " LLC tags=" + std::to_string(LLCsize) + " performance", totalLookups);

            // image softening (may not be accurate)
            // column-major iteration is better for the L1 but doesn't matter for the L2
            // "nRepeats" passes give a longer, steadier benchmark
            // tiled computing raises the cache-hit ratio further
            for (size_t k = 0; k < nRepeats; k++) {
                for (int tiley = 0; tiley < imageHeight / 32; tiley++) {
                    for (int tilex = 0; tilex < imageWidth / 32; tilex++) {
                        for (int jj = 0; jj < 32; jj++) {
                            for (int ii = 0; ii < 32; ii++) {
                                const int i = tilex * 32 + ii;
                                const int j = tiley * 32 + jj;
                                if (j >= 1 && j <= imageHeight - 2 && i >= 1 && i <= imageWidth - 2) {
                                    // 3x3 convolution: one cache.get per kernel tap
                                    unsigned int pixelAccumulator1 = 0;
                                    unsigned int pixelAccumulator2 = 0;
                                    unsigned int pixelAccumulator3 = 0;
                                    pixelAccumulator1 += cache.get(i - 1 + (j - 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-1 + 1) * 3];
                                    pixelAccumulator2 += cache.get(i + 0 + (j - 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-1 + 1) * 3];
                                    pixelAccumulator3 += cache.get(i + 1 + (j - 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-1 + 1) * 3];
                                    pixelAccumulator1 += cache.get(i - 1 + (j - 0) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-0 + 1) * 3];
                                    pixelAccumulator2 += cache.get(i + 0 + (j - 0) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-0 + 1) * 3];
                                    pixelAccumulator3 += cache.get(i + 1 + (j - 0) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-0 + 1) * 3];
                                    pixelAccumulator1 += cache.get(i - 1 + (j + 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (+1 + 1) * 3];
                                    pixelAccumulator2 += cache.get(i + 0 + (j + 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (+1 + 1) * 3];
                                    pixelAccumulator3 += cache.get(i + 1 + (j + 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (+1 + 1) * 3];
                                    // kernel weights sum to 16, hence the shift by 4
                                    const int n = (pixelAccumulator1 + pixelAccumulator2 + pixelAccumulator3) >> 4;
                                    cache.set(i + j * imageWidth, n);
                                    read += 9;
                                    write++;
                                }
                            }
                        }
                    }
                }
            }
            std::cout << "LLC cache hit ratio (read)=" << (read - readmiss) / (float) read << std::endl;
            std::cout << "LLC cache hit ratio (write)=" << (write - writemiss) / (float) write << std::endl;
        }

        // write the blurred image as a binary PPM
        for (int i = 0; i < imageHeight; i++) {
            for (int j = 0; j < imageWidth; j++) {
                int n = cache.get(i + j * imageWidth);
                int r = ((int) sqrt(n) % 256);
                int gr = (2 * n % 256);
                int b = (n % 256);
                write++;
                output << (char) r << (char) gr << (char) b;
            }
        }
        cout << "Finished!" << endl;

        cache.flush(); // every thread needs to call flush on its own CacheThreader object after work is complete
        output.flush();
        LLC->flush(); // only call this from the main thread that owns the LLC
    }
    return 0;
}
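To try it yourself (build details are an assumption, not from the original page): place LruClockCache.h, DirectMappedCache.h, CacheThreader.h and CpuBenchmarker.h from this repository next to the source file (here called main.cpp for illustration) and compile with optimizations enabled, for example:

g++ -O3 -std=c++14 main.cpp -o benchmark -lpthread

The -lpthread flag is only needed if CacheThreader spawns threads on your platform. Each cacheScaling step also writes an output_image_scalingN.ppm file, so the blurred Mandelbrot images can be inspected to sanity-check the results.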