Add MAC simulation benchmark case. #20

Open · wants to merge 1 commit into main
11 changes: 11 additions & 0 deletions mac/README.md
@@ -0,0 +1,11 @@
# MAC Simulation Benchmark

## Introduction
MAC (Marker-and-Cell) is an efficient staggered-grid solver for incompressible fluid simulation.

## Implementation
We implemented the velocity calculation in Taichi, Numba (CPU), Numba
(GPU), and CUDA; the per-cell update shared by all versions is shown below.
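
All implementations perform the same explicit step: convective terms in conservative form (the $-\tfrac{1}{4}$ factor) plus a centered 5-point diffusion stencil. Transcribed from the kernels, the `ut` update reads

$$
ut_{i,j} = u_{i,j} + \Delta t\left[-\frac{1}{4}\left(\frac{(u_{i+1,j}+u_{i,j})^2-(u_{i,j}+u_{i-1,j})^2}{\Delta x}+\frac{(u_{i,j+1}+u_{i,j})(v_{i+1,j}+v_{i,j})-(u_{i,j}+u_{i,j-1})(v_{i+1,j-1}+v_{i,j-1})}{\Delta y}\right)+\mu\left(\frac{u_{i+1,j}-2u_{i,j}+u_{i-1,j}}{\Delta x^2}+\frac{u_{i,j+1}-2u_{i,j}+u_{i,j-1}}{\Delta y^2}\right)\right]
$$

and the `vt` update is the symmetric expression with the roles of $u$/$v$ and $x$/$y$ exchanged.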

## Results
We compared the performance of each implementation in GFLOPS; a sketch of the conversion from measured wall-clock time is given below.
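
A GFLOPS figure can be derived offline from the wall-clock times the scripts print. A minimal sketch, assuming an estimated per-cell FLOP count (`FLOPS_PER_CELL` below is an illustrative assumption, not a measured or profiled value):

```python
# Illustrative post-processing of the printed timings.
FLOPS_PER_CELL = 30  # assumed cost of one stencil update; not measured
NX = NY = 4096       # grid resolution used by the benchmark
N_ITER = 1000        # kernel launches per timing loop

def gflops(elapsed_sec: float) -> float:
    # Each iteration updates roughly NX*NY cells for u and again for v.
    total_flops = N_ITER * 2 * NX * NY * FLOPS_PER_CELL
    return total_flops / elapsed_sec / 1e9
```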
39 changes: 39 additions & 0 deletions mac/src/benchmark.py
@@ -0,0 +1,39 @@
from numba_kernels import launch_numba, launch_numba_cuda, calc_velocity_numba
from taichi_kernels import launch_taichi, init_taichi
from utils import gen_data_numpy, gen_data_numba_cuda, gen_data_taichi, compute_ground_truth, nx, ny

from multiprocessing import Process
import time


def run(gen_data_func, calc_func, name, ut_groundtruth=None, vt_groundtruth=None, ctx_init_func=None):
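    # Launch one benchmark target: optional runtime-context setup, input
    # generation, then the timed kernel call with optional verification.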
    print("=========================================")
print(f"Running benchmark for target {name}...")
if ctx_init_func is not None:
ctx_init_func()
data_args = gen_data_func()
calc_func(*data_args, name=name,
ut_gt=ut_groundtruth, vt_gt=vt_groundtruth)


print(f"Benchmark for {nx}x{ny} arrays")
print("Computing ground truth...")
ut, vt = compute_ground_truth(calc_velocity_numba)
print("Done")

# Benchmark Numba CPU
run(gen_data_numpy, launch_numba, "Numba-CPU")

# Benchmark Numba CUDA
# Numba CUDA's context conflicts with Taichi's, so run it in a subprocess.
p = Process(target=run, args=(gen_data_numba_cuda,
launch_numba_cuda, "Numba-CUDA", ut, vt))
p.start()
p.join()
time.sleep(1)

# Benchmark Taichi CUDA
p = Process(target=run, args=(gen_data_taichi, launch_taichi,
"Taichi-CUDA", ut, vt, init_taichi))
p.start()
p.join()
59 changes: 59 additions & 0 deletions mac/src/numba_kernels.py
@@ -0,0 +1,59 @@
from numba import njit, prange, cuda
from utils import benchmark, nx, ny, dx, dy, dt, mu


@njit(parallel=True, fastmath=True)
def calc_velocity_numba(u, v, ut, vt):
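    # Explicit velocity update: convective terms in conservative form (the
    # -0.25 factor) plus a centered 5-point diffusion stencil, for u then v.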
for i in prange(1, nx):
for j in prange(1, ny+1):
ut[i, j] = u[i, j] + dt * ((-0.25) *
(((u[i+1, j] + u[i, j])**2 - (u[i, j] + u[i-1, j])**2) / dx
+ ((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
- (u[i, j] + u[i, j-1]) * (v[i+1, j-1] + v[i, j-1])) / dy)
+ (mu) * ((u[i+1, j] - 2 * u[i, j] + u[i-1, j]) / dx ** 2
+ (u[i, j+1] - 2 * u[i, j] + u[i, j-1]) / dy ** 2))
for i in prange(1, nx+1):
for j in prange(1, ny):
vt[i, j] = v[i, j] + dt * ((-0.25) *
(((u[i, j+1] + u[i, j]) * (v[i+1, j]+v[i, j])
- (u[i-1, j+1] + u[i-1, j]) * (v[i, j]+v[i-1, j])) / dx
+ ((v[i, j+1]+v[i, j]) ** 2-(v[i, j]+v[i, j-1]) ** 2) / dy)
+ (mu)*((v[i+1, j] - 2 * v[i, j] + v[i-1, j]) / dx ** 2
+ (v[i, j+1] - 2 * v[i, j] + v[i, j-1]) / dy ** 2))


@benchmark
def launch_numba(*args):
calc_velocity_numba(*args)


@cuda.jit
def calc_velocity_numba_cuda(u, v, ut, vt):
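    # One thread per grid cell: recover (i, j) from the flat thread index and
    # guard each update with its component's interior bounds.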
tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    if tid >= (nx + 1) * (ny + 1):
return
i = tid // (ny + 1) + 1
j = tid % (ny + 1) + 1
if (i < nx and j < ny + 1):
ut[i, j] = u[i, j] + dt * ((-0.25) *
(((u[i+1, j] + u[i, j])**2 - (u[i, j] + u[i-1, j])**2) / dx
+ ((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
- (u[i, j] + u[i, j-1]) * (v[i+1, j-1] + v[i, j-1])) / dy)
+ (mu) * ((u[i+1, j] - 2 * u[i, j] + u[i-1, j]) / dx ** 2
+ (u[i, j+1] - 2 * u[i, j] + u[i, j-1]) / dy ** 2))
if (i < nx + 1 and j < ny):
vt[i, j] = v[i, j] + dt * ((-0.25) *
(((u[i, j+1] + u[i, j]) * (v[i+1, j]+v[i, j])
- (u[i-1, j+1] + u[i-1, j]) * (v[i, j]+v[i-1, j])) / dx
+ ((v[i, j+1]+v[i, j]) ** 2-(v[i, j]+v[i, j-1]) ** 2) / dy)
+ (mu)*((v[i+1, j] - 2 * v[i, j] + v[i-1, j]) / dx ** 2
+ (v[i, j+1] - 2 * v[i, j] + v[i, j-1]) / dy ** 2))


@benchmark
def launch_numba_cuda(*args):
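    # Round the grid up so every cell gets a thread; excess threads exit early.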
block_dim = 128
total_threads = (nx + 1) * (ny + 1)
grid_dim = (total_threads + block_dim - 1) // block_dim
calc_velocity_numba_cuda[grid_dim, block_dim](*args)
cuda.synchronize()
30 changes: 30 additions & 0 deletions mac/src/taichi_kernels.py
@@ -0,0 +1,30 @@
import taichi as ti
from utils import benchmark, nx, ny, dx, dy, dt, mu


def init_taichi():
ti.init(arch=ti.cuda, device_memory_fraction=0.5)


@ti.kernel
def calc_velocity_taichi(u: ti.types.ndarray(), v: ti.types.ndarray(), ut: ti.types.ndarray(), vt: ti.types.ndarray()):
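    # Taichi auto-parallelizes these outermost ndrange loops on the CUDA backend.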
for i, j in ti.ndrange((1, nx), (1, ny+1)):
ut[i, j] = u[i, j] + dt * ((-0.25) *
(((u[i+1, j] + u[i, j])**2 - (u[i, j] + u[i-1, j])**2) / dx
+ ((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
- (u[i, j] + u[i, j-1]) * (v[i+1, j-1] + v[i, j-1])) / dy)
+ (mu) * ((u[i+1, j] - 2 * u[i, j] + u[i-1, j]) / dx ** 2
+ (u[i, j+1] - 2 * u[i, j] + u[i, j-1]) / dy ** 2))
for i, j in ti.ndrange((1, nx+1), (1, ny)):
vt[i, j] = v[i, j] + dt * ((-0.25) *
(((u[i, j+1] + u[i, j]) * (v[i+1, j]+v[i, j])
- (u[i-1, j+1] + u[i-1, j]) * (v[i, j]+v[i-1, j])) / dx
+ ((v[i, j+1]+v[i, j]) ** 2-(v[i, j]+v[i, j-1]) ** 2) / dy)
+ (mu)*((v[i+1, j] - 2 * v[i, j] + v[i-1, j]) / dx ** 2
+ (v[i, j+1] - 2 * v[i, j] + v[i, j-1]) / dy ** 2))


@benchmark
def launch_taichi(*args):
calc_velocity_taichi(*args)
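    # Kernel launches are asynchronous; block so the timing covers the work.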
ti.sync()
71 changes: 71 additions & 0 deletions mac/src/utils.py
@@ -0,0 +1,71 @@
import time
import taichi as ti
import numpy as np
from numba import cuda

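# Domain extents (lx, ly), grid resolution (nx, ny), derived spacings (dx, dy),
# time step (dt) and viscosity coefficient (mu) shared by all implementations.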
lx, ly, nx, ny = 1.0, 1.0, 4096, 4096
dx, dy, dt = lx/nx, ly/ny, 0.001
mu = 0.001


def gen_data_numpy():
np.random.seed(42)
u = np.random.rand(nx + 1, ny + 2).astype(np.float32)
v = np.random.rand(nx + 2, ny + 1).astype(np.float32)
ut = np.ones((nx + 1, ny + 2), dtype=np.float32)
vt = np.ones((nx + 2, ny + 1), dtype=np.float32)
return u, v, ut, vt


def gen_data_numba_cuda():
u, v, ut, vt = gen_data_numpy()
u_dev = cuda.to_device(u)
v_dev = cuda.to_device(v)
ut_dev = cuda.to_device(ut)
vt_dev = cuda.to_device(vt)
return u_dev, v_dev, ut_dev, vt_dev


def gen_data_taichi():
u, v, ut, vt = gen_data_numpy()
ti_u = ti.ndarray(shape=(nx + 1, ny + 2), dtype=ti.f32)
ti_v = ti.ndarray(shape=(nx + 2, ny + 1), dtype=ti.f32)
ti_ut = ti.ndarray(shape=(nx + 1, ny + 2), dtype=ti.f32)
ti_vt = ti.ndarray(shape=(nx + 2, ny + 1), dtype=ti.f32)
ti_u.from_numpy(u)
ti_v.from_numpy(v)
ti_ut.from_numpy(ut)
ti_vt.from_numpy(vt)
return ti_u, ti_v, ti_ut, ti_vt


def benchmark(func):
def wrapper(u, v, ut, vt, name="Taichi", ut_gt=None, vt_gt=None, nIter=1000):
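        # Warm-up launch: triggers JIT compilation and fills ut/vt for verification.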
func(u, v, ut, vt)
if ut_gt is not None and vt_gt is not None:
            if isinstance(ut, ti.lang._ndarray.ScalarNdarray):
ut_diff = ut.to_numpy()
vt_diff = vt.to_numpy()
            elif isinstance(ut, cuda.cudadrv.devicearray.DeviceNDArray):
ut_diff = ut.copy_to_host()
vt_diff = vt.copy_to_host()
else:
ut_diff = ut
vt_diff = vt
print(f"Verify UT matrix for {name}: ", np.allclose(
ut_diff, ut_gt, atol=1, rtol=1, equal_nan=True))
print(f"Verify VT matrix for {name}: ", np.allclose(
vt_diff, vt_gt, atol=1, rtol=1, equal_nan=True))
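        # Time nIter back-to-back launches; each launch synchronizes the device where needed.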
now = time.perf_counter()
for _ in range(nIter):
func(u, v, ut, vt)
end = time.perf_counter()
        print(f'Time spent in {name} for {nIter}x: {(end - now):4.2f} sec.')
return wrapper


def compute_ground_truth(calc_velocity_func):
u, v, ut, vt = gen_data_numpy()
    # Pure NumPy is too slow even for a one-off ground-truth run, so we reuse
    # the Numba CPU kernel.
calc_velocity_func(u, v, ut, vt)
return ut, vt