diff --git a/mac/README.md b/mac/README.md
new file mode 100644
index 0000000..e9eedd2
--- /dev/null
+++ b/mac/README.md
@@ -0,0 +1,11 @@
+# MAC Simulation Benchmark
+
+## Introduction
+MAC (Marker-And-Cell) is a classic, efficient finite-difference method for
+simulating incompressible fluid flow on a staggered grid.
+
+## Implementation
+We implemented the velocity calculation in Taichi, Numba (CPU), Numba (CUDA),
+and CUDA.
+
+## Results
+We compared the performance of the different backends in GFLOPS.
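+
+## Notes
+The benchmarked kernels compute only the provisional velocity update of the
+MAC scheme (convection and diffusion, discretized with central differences on
+the staggered grid); pressure projection and boundary handling are not part of
+this benchmark. As a sketch of how we read the stencils, with `mu` acting as
+the viscosity and `dt`, `dx`, `dy` the time step and grid spacings, the update
+corresponds to
+
+$$u^{*} = u + \Delta t\left[-\frac{\partial (u^{2})}{\partial x} - \frac{\partial (uv)}{\partial y} + \mu\,\nabla^{2} u\right],$$
+
+and analogously for $v^{*}$ with the flux terms $\partial(uv)/\partial x$ and
+$\partial(v^{2})/\partial y$. Face values are averaged from the two adjacent
+samples, which is where the `0.25` factors in the kernels come from. To
+reproduce the timings, run `python benchmark.py` from `mac/src/`.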
diff --git a/mac/src/benchmark.py b/mac/src/benchmark.py
new file mode 100644
index 0000000..aa41b57
--- /dev/null
+++ b/mac/src/benchmark.py
@@ -0,0 +1,39 @@
+from numba_kernels import launch_numba, launch_numba_cuda, calc_velocity_numba
+from taichi_kernels import launch_taichi, init_taichi
+from utils import gen_data_numpy, gen_data_numba_cuda, gen_data_taichi, compute_ground_truth, nx, ny
+
+from multiprocessing import Process
+import time
+
+
+def run(gen_data_func, calc_func, name, ut_groundtruth=None, vt_groundtruth=None, ctx_init_func=None):
+    print("=========================================")
+    print(f"Running benchmark for target {name}...")
+    if ctx_init_func is not None:
+        ctx_init_func()
+    data_args = gen_data_func()
+    calc_func(*data_args, name=name,
+              ut_gt=ut_groundtruth, vt_gt=vt_groundtruth)
+
+
+if __name__ == "__main__":
+    print(f"Benchmark for {nx}x{ny} arrays")
+    print("Computing ground truth...")
+    ut, vt = compute_ground_truth(calc_velocity_numba)
+    print("Done")
+
+    # Benchmark Numba CPU. The ground truth comes from this same kernel, so
+    # no verification arrays are passed here.
+    run(gen_data_numpy, launch_numba, "Numba-CPU")
+
+    # Benchmark Numba CUDA.
+    # Numba CUDA's context conflicts with Taichi's, so each GPU backend runs
+    # in its own subprocess.
+    p = Process(target=run, args=(gen_data_numba_cuda,
+                                  launch_numba_cuda, "Numba-CUDA", ut, vt))
+    p.start()
+    p.join()
+    time.sleep(1)  # Short pause between the two GPU subprocesses.
+
+    # Benchmark Taichi CUDA.
+    p = Process(target=run, args=(gen_data_taichi, launch_taichi,
+                                  "Taichi-CUDA", ut, vt, init_taichi))
+    p.start()
+    p.join()
diff --git a/mac/src/numba_kernels.py b/mac/src/numba_kernels.py
new file mode 100644
index 0000000..36b5304
--- /dev/null
+++ b/mac/src/numba_kernels.py
@@ -0,0 +1,59 @@
+from numba import njit, prange, cuda
+from utils import benchmark, nx, ny, dx, dy, dt, mu
+
+
+@njit(parallel=True, fastmath=True)
+def calc_velocity_numba(u, v, ut, vt):
+    # Interior u-update: conservative convection (face averages give the 0.25
+    # factor) plus viscous diffusion, all with central differences.
+    # Only the outer loop is parallelized; the inner loop is a plain range.
+    for i in prange(1, nx):
+        for j in range(1, ny + 1):
+            ut[i, j] = u[i, j] + dt * (-0.25 *
+                (((u[i+1, j] + u[i, j])**2 - (u[i, j] + u[i-1, j])**2) / dx
+                 + ((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+                    - (u[i, j] + u[i, j-1]) * (v[i+1, j-1] + v[i, j-1])) / dy)
+                + mu * ((u[i+1, j] - 2 * u[i, j] + u[i-1, j]) / dx**2
+                        + (u[i, j+1] - 2 * u[i, j] + u[i, j-1]) / dy**2))
+    # Interior v-update, analogous to the u-update.
+    for i in prange(1, nx + 1):
+        for j in range(1, ny):
+            vt[i, j] = v[i, j] + dt * (-0.25 *
+                (((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+                  - (u[i-1, j+1] + u[i-1, j]) * (v[i, j] + v[i-1, j])) / dx
+                 + ((v[i, j+1] + v[i, j])**2 - (v[i, j] + v[i, j-1])**2) / dy)
+                + mu * ((v[i+1, j] - 2 * v[i, j] + v[i-1, j]) / dx**2
+                        + (v[i, j+1] - 2 * v[i, j] + v[i, j-1]) / dy**2))
+
+
+@benchmark
+def launch_numba(*args):
+    calc_velocity_numba(*args)
+
+
+@cuda.jit
+def calc_velocity_numba_cuda(u, v, ut, vt):
+    # One thread per grid point: the 1D thread index is mapped to (i, j) with
+    # i, j >= 1; the range checks below restrict each update to its interior.
+    tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
+    if tid >= (nx + 1) * (ny + 1):
+        return
+    i = tid // (ny + 1) + 1
+    j = tid % (ny + 1) + 1
+    if i < nx and j < ny + 1:
+        ut[i, j] = u[i, j] + dt * (-0.25 *
+            (((u[i+1, j] + u[i, j])**2 - (u[i, j] + u[i-1, j])**2) / dx
+             + ((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+                - (u[i, j] + u[i, j-1]) * (v[i+1, j-1] + v[i, j-1])) / dy)
+            + mu * ((u[i+1, j] - 2 * u[i, j] + u[i-1, j]) / dx**2
+                    + (u[i, j+1] - 2 * u[i, j] + u[i, j-1]) / dy**2))
+    if i < nx + 1 and j < ny:
+        vt[i, j] = v[i, j] + dt * (-0.25 *
+            (((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+              - (u[i-1, j+1] + u[i-1, j]) * (v[i, j] + v[i-1, j])) / dx
+             + ((v[i, j+1] + v[i, j])**2 - (v[i, j] + v[i, j-1])**2) / dy)
+            + mu * ((v[i+1, j] - 2 * v[i, j] + v[i-1, j]) / dx**2
+                    + (v[i, j+1] - 2 * v[i, j] + v[i, j-1]) / dy**2))
+
+
+@benchmark
+def launch_numba_cuda(*args):
+    # One-dimensional launch covering the whole (nx + 1) x (ny + 1) index set.
+    block_dim = 128
+    total_threads = (nx + 1) * (ny + 1)
+    grid_dim = (total_threads + block_dim - 1) // block_dim
+    calc_velocity_numba_cuda[grid_dim, block_dim](*args)
+    cuda.synchronize()
diff --git a/mac/src/taichi_kernels.py b/mac/src/taichi_kernels.py
new file mode 100644
index 0000000..73c154b
--- /dev/null
+++ b/mac/src/taichi_kernels.py
@@ -0,0 +1,30 @@
+import taichi as ti
+from utils import benchmark, nx, ny, dx, dy, dt, mu
+
+
+def init_taichi():
+    ti.init(arch=ti.cuda, device_memory_fraction=0.5)
+
+
+@ti.kernel
+def calc_velocity_taichi(u: ti.types.ndarray(), v: ti.types.ndarray(),
+                         ut: ti.types.ndarray(), vt: ti.types.ndarray()):
+    # Interior u-update, same stencil as the Numba kernels.
+    for i, j in ti.ndrange((1, nx), (1, ny + 1)):
+        ut[i, j] = u[i, j] + dt * (-0.25 *
+            (((u[i+1, j] + u[i, j])**2 - (u[i, j] + u[i-1, j])**2) / dx
+             + ((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+                - (u[i, j] + u[i, j-1]) * (v[i+1, j-1] + v[i, j-1])) / dy)
+            + mu * ((u[i+1, j] - 2 * u[i, j] + u[i-1, j]) / dx**2
+                    + (u[i, j+1] - 2 * u[i, j] + u[i, j-1]) / dy**2))
+    # Interior v-update.
+    for i, j in ti.ndrange((1, nx + 1), (1, ny)):
+        vt[i, j] = v[i, j] + dt * (-0.25 *
+            (((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+              - (u[i-1, j+1] + u[i-1, j]) * (v[i, j] + v[i-1, j])) / dx
+             + ((v[i, j+1] + v[i, j])**2 - (v[i, j] + v[i, j-1])**2) / dy)
+            + mu * ((v[i+1, j] - 2 * v[i, j] + v[i-1, j]) / dx**2
+                    + (v[i, j+1] - 2 * v[i, j] + v[i, j-1]) / dy**2))
+
+
+@benchmark
+def launch_taichi(*args):
+    calc_velocity_taichi(*args)
+    ti.sync()
diff --git a/mac/src/utils.py b/mac/src/utils.py
new file mode 100644
index 0000000..793ba66
--- /dev/null
+++ b/mac/src/utils.py
@@ -0,0 +1,71 @@
+import time
+import taichi as ti
+import numpy as np
+from numba import cuda
+
+# Domain size, grid resolution, derived grid spacings, time step, viscosity.
+lx, ly, nx, ny = 1.0, 1.0, 4096, 4096
+dx, dy, dt = lx / nx, ly / ny, 0.001
+mu = 0.001
+
+
+def gen_data_numpy():
+    # Staggered velocity fields (u and v live on different faces, hence the
+    # different shapes); ut/vt start as ones so that boundary entries, which
+    # the kernels never touch, agree across backends during verification.
+    np.random.seed(42)
+    u = np.random.rand(nx + 1, ny + 2).astype(np.float32)
+    v = np.random.rand(nx + 2, ny + 1).astype(np.float32)
+    ut = np.ones((nx + 1, ny + 2), dtype=np.float32)
+    vt = np.ones((nx + 2, ny + 1), dtype=np.float32)
+    return u, v, ut, vt
+
+
+def gen_data_numba_cuda():
+    # Copy the NumPy arrays to the GPU as Numba device arrays.
+    u, v, ut, vt = gen_data_numpy()
+    u_dev = cuda.to_device(u)
+    v_dev = cuda.to_device(v)
+    ut_dev = cuda.to_device(ut)
+    vt_dev = cuda.to_device(vt)
+    return u_dev, v_dev, ut_dev, vt_dev
+
+
+def gen_data_taichi():
+    # Mirror the NumPy arrays into Taichi ndarrays (requires ti.init first).
+    u, v, ut, vt = gen_data_numpy()
+    ti_u = ti.ndarray(shape=(nx + 1, ny + 2), dtype=ti.f32)
+    ti_v = ti.ndarray(shape=(nx + 2, ny + 1), dtype=ti.f32)
+    ti_ut = ti.ndarray(shape=(nx + 1, ny + 2), dtype=ti.f32)
+    ti_vt = ti.ndarray(shape=(nx + 2, ny + 1), dtype=ti.f32)
+    ti_u.from_numpy(u)
+    ti_v.from_numpy(v)
+    ti_ut.from_numpy(ut)
+    ti_vt.from_numpy(vt)
+    return ti_u, ti_v, ti_ut, ti_vt
+
+
+def benchmark(func):
+    def wrapper(u, v, ut, vt, name="Taichi", ut_gt=None, vt_gt=None, nIter=1000):
+        # Warm-up launch: triggers JIT compilation and produces the results
+        # used for the optional verification below.
+        func(u, v, ut, vt)
+        if ut_gt is not None and vt_gt is not None:
+            # Bring the results back to NumPy regardless of the backend.
+            if isinstance(ut, ti.lang._ndarray.ScalarNdarray):
+                ut_diff = ut.to_numpy()
+                vt_diff = vt.to_numpy()
+            elif isinstance(ut, cuda.cudadrv.devicearray.DeviceNDArray):
+                ut_diff = ut.copy_to_host()
+                vt_diff = vt.copy_to_host()
+            else:
+                ut_diff = ut
+                vt_diff = vt
+            print(f"Verify UT matrix for {name}: ", np.allclose(
+                ut_diff, ut_gt, atol=1, rtol=1, equal_nan=True))
+            print(f"Verify VT matrix for {name}: ", np.allclose(
+                vt_diff, vt_gt, atol=1, rtol=1, equal_nan=True))
+        # Time nIter repeated launches.
+        now = time.perf_counter()
+        for _ in range(nIter):
+            func(u, v, ut, vt)
+        end = time.perf_counter()
+        print(f"Time spent in {name} for {nIter}x: {end - now:4.2f} sec.")
+    return wrapper
+
+
+def compute_ground_truth(calc_velocity_func):
+    # A plain NumPy/Python reference is far too slow at this resolution, so
+    # the ground truth is computed with the kernel passed in by the caller
+    # (the Numba CPU implementation).
+    u, v, ut, vt = gen_data_numpy()
+    calc_velocity_func(u, v, ut, vt)
+    return ut, vt
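+
+
+# The README reports results in GFLOPS while the decorator above only prints
+# wall-clock time. The helper below is an illustrative sketch of the
+# conversion we assume was used; `flops_per_cell` is not defined anywhere in
+# this patch and has to be counted from the update stencil by hand.
+def to_gflops(elapsed_sec, n_iter, flops_per_cell, n_cells=nx * ny):
+    """Convert a measured wall-clock time for n_iter launches into GFLOPS."""
+    total_flops = n_iter * flops_per_cell * n_cells
+    return total_flops / elapsed_sec / 1e9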