diff --git a/mac/README.md b/mac/README.md
new file mode 100644
index 0000000..e9eedd2
--- /dev/null
+++ b/mac/README.md
@@ -0,0 +1,11 @@
+# MAC Simulation Benchmark
+
+## Introduction
+MAC (Marker-And-Cell) is a classic, efficient finite-difference method for
+simulating incompressible fluid flow on a staggered grid.
+
+## Implementation
+We implemented the velocity calculation in Taichi, Numba (CPU), Numba (CUDA),
+and CUDA.
+
+## Results
+We compared the performance of the different backends in GFLOPS.
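+
+## Notes
+The benchmarked kernels compute only the provisional velocity update of the
+MAC scheme (convection and diffusion, discretized with central differences on
+the staggered grid); pressure projection and boundary handling are not part of
+this benchmark. As a sketch of how we read the stencils, with `mu` acting as
+the viscosity and `dt`, `dx`, `dy` the time step and grid spacings, the update
+corresponds to
+
+$$u^{*} = u + \Delta t\left[-\frac{\partial (u^{2})}{\partial x} - \frac{\partial (uv)}{\partial y} + \mu\,\nabla^{2} u\right],$$
+
+and analogously for $v^{*}$ with the flux terms $\partial(uv)/\partial x$ and
+$\partial(v^{2})/\partial y$. Face values are averaged from the two adjacent
+samples, which is where the `0.25` factors in the kernels come from. To
+reproduce the timings, run `python benchmark.py` from `mac/src/`.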
diff --git a/mac/src/benchmark.py b/mac/src/benchmark.py
new file mode 100644
index 0000000..aa41b57
--- /dev/null
+++ b/mac/src/benchmark.py
@@ -0,0 +1,39 @@
+from numba_kernels import launch_numba, launch_numba_cuda, calc_velocity_numba
+from taichi_kernels import launch_taichi, init_taichi
+from utils import gen_data_numpy, gen_data_numba_cuda, gen_data_taichi, compute_ground_truth, nx, ny
+
+from multiprocessing import Process
+import time
+
+
+def run(gen_data_func, calc_func, name, ut_groundtruth=None, vt_groundtruth=None, ctx_init_func=None):
+    print("=========================================")
+    print(f"Running benchmark for target {name}...")
+    if ctx_init_func is not None:
+        ctx_init_func()
+    data_args = gen_data_func()
+    calc_func(*data_args, name=name,
+              ut_gt=ut_groundtruth, vt_gt=vt_groundtruth)
+
+
+if __name__ == "__main__":
+    print(f"Benchmark for {nx}x{ny} arrays")
+    print("Computing ground truth...")
+    ut, vt = compute_ground_truth(calc_velocity_numba)
+    print("Done")
+
+    # Benchmark Numba CPU. The ground truth comes from this same kernel, so
+    # no verification arrays are passed here.
+    run(gen_data_numpy, launch_numba, "Numba-CPU")
+
+    # Benchmark Numba CUDA.
+    # Numba CUDA's context conflicts with Taichi's, so each GPU backend runs
+    # in its own subprocess.
+    p = Process(target=run, args=(gen_data_numba_cuda,
+                                  launch_numba_cuda, "Numba-CUDA", ut, vt))
+    p.start()
+    p.join()
+    time.sleep(1)  # Short pause between the two GPU subprocesses.
+
+    # Benchmark Taichi CUDA.
+    p = Process(target=run, args=(gen_data_taichi, launch_taichi,
+                                  "Taichi-CUDA", ut, vt, init_taichi))
+    p.start()
+    p.join()
diff --git a/mac/src/numba_kernels.py b/mac/src/numba_kernels.py
new file mode 100644
index 0000000..36b5304
--- /dev/null
+++ b/mac/src/numba_kernels.py
@@ -0,0 +1,59 @@
+from numba import njit, prange, cuda
+from utils import benchmark, nx, ny, dx, dy, dt, mu
+
+
+@njit(parallel=True, fastmath=True)
+def calc_velocity_numba(u, v, ut, vt):
+    # Interior u-update: conservative convection (face averages give the 0.25
+    # factor) plus viscous diffusion, all with central differences.
+    # Only the outer loop is parallelized; the inner loop is a plain range.
+    for i in prange(1, nx):
+        for j in range(1, ny + 1):
+            ut[i, j] = u[i, j] + dt * (-0.25 *
+                (((u[i+1, j] + u[i, j])**2 - (u[i, j] + u[i-1, j])**2) / dx
+                 + ((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+                    - (u[i, j] + u[i, j-1]) * (v[i+1, j-1] + v[i, j-1])) / dy)
+                + mu * ((u[i+1, j] - 2 * u[i, j] + u[i-1, j]) / dx**2
+                        + (u[i, j+1] - 2 * u[i, j] + u[i, j-1]) / dy**2))
+    # Interior v-update, analogous to the u-update.
+    for i in prange(1, nx + 1):
+        for j in range(1, ny):
+            vt[i, j] = v[i, j] + dt * (-0.25 *
+                (((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+                  - (u[i-1, j+1] + u[i-1, j]) * (v[i, j] + v[i-1, j])) / dx
+                 + ((v[i, j+1] + v[i, j])**2 - (v[i, j] + v[i, j-1])**2) / dy)
+                + mu * ((v[i+1, j] - 2 * v[i, j] + v[i-1, j]) / dx**2
+                        + (v[i, j+1] - 2 * v[i, j] + v[i, j-1]) / dy**2))
+
+
+@benchmark
+def launch_numba(*args):
+    calc_velocity_numba(*args)
+
+
+@cuda.jit
+def calc_velocity_numba_cuda(u, v, ut, vt):
+    # One thread per grid point: the 1D thread index is mapped to (i, j) with
+    # i, j >= 1; the range checks below restrict each update to its interior.
+    tid = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
+    if tid >= (nx + 1) * (ny + 1):
+        return
+    i = tid // (ny + 1) + 1
+    j = tid % (ny + 1) + 1
+    if i < nx and j < ny + 1:
+        ut[i, j] = u[i, j] + dt * (-0.25 *
+            (((u[i+1, j] + u[i, j])**2 - (u[i, j] + u[i-1, j])**2) / dx
+             + ((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+                - (u[i, j] + u[i, j-1]) * (v[i+1, j-1] + v[i, j-1])) / dy)
+            + mu * ((u[i+1, j] - 2 * u[i, j] + u[i-1, j]) / dx**2
+                    + (u[i, j+1] - 2 * u[i, j] + u[i, j-1]) / dy**2))
+    if i < nx + 1 and j < ny:
+        vt[i, j] = v[i, j] + dt * (-0.25 *
+            (((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+              - (u[i-1, j+1] + u[i-1, j]) * (v[i, j] + v[i-1, j])) / dx
+             + ((v[i, j+1] + v[i, j])**2 - (v[i, j] + v[i, j-1])**2) / dy)
+            + mu * ((v[i+1, j] - 2 * v[i, j] + v[i-1, j]) / dx**2
+                    + (v[i, j+1] - 2 * v[i, j] + v[i, j-1]) / dy**2))
+
+
+@benchmark
+def launch_numba_cuda(*args):
+    # One-dimensional launch covering the whole (nx + 1) x (ny + 1) index set.
+    block_dim = 128
+    total_threads = (nx + 1) * (ny + 1)
+    grid_dim = (total_threads + block_dim - 1) // block_dim
+    calc_velocity_numba_cuda[grid_dim, block_dim](*args)
+    cuda.synchronize()
diff --git a/mac/src/taichi_kernels.py b/mac/src/taichi_kernels.py
new file mode 100644
index 0000000..73c154b
--- /dev/null
+++ b/mac/src/taichi_kernels.py
@@ -0,0 +1,30 @@
+import taichi as ti
+from utils import benchmark, nx, ny, dx, dy, dt, mu
+
+
+def init_taichi():
+    ti.init(arch=ti.cuda, device_memory_fraction=0.5)
+
+
+@ti.kernel
+def calc_velocity_taichi(u: ti.types.ndarray(), v: ti.types.ndarray(),
+                         ut: ti.types.ndarray(), vt: ti.types.ndarray()):
+    # Interior u-update, same stencil as the Numba kernels.
+    for i, j in ti.ndrange((1, nx), (1, ny + 1)):
+        ut[i, j] = u[i, j] + dt * (-0.25 *
+            (((u[i+1, j] + u[i, j])**2 - (u[i, j] + u[i-1, j])**2) / dx
+             + ((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+                - (u[i, j] + u[i, j-1]) * (v[i+1, j-1] + v[i, j-1])) / dy)
+            + mu * ((u[i+1, j] - 2 * u[i, j] + u[i-1, j]) / dx**2
+                    + (u[i, j+1] - 2 * u[i, j] + u[i, j-1]) / dy**2))
+    # Interior v-update.
+    for i, j in ti.ndrange((1, nx + 1), (1, ny)):
+        vt[i, j] = v[i, j] + dt * (-0.25 *
+            (((u[i, j+1] + u[i, j]) * (v[i+1, j] + v[i, j])
+              - (u[i-1, j+1] + u[i-1, j]) * (v[i, j] + v[i-1, j])) / dx
+             + ((v[i, j+1] + v[i, j])**2 - (v[i, j] + v[i, j-1])**2) / dy)
+            + mu * ((v[i+1, j] - 2 * v[i, j] + v[i-1, j]) / dx**2
+                    + (v[i, j+1] - 2 * v[i, j] + v[i, j-1]) / dy**2))
+
+
+@benchmark
+def launch_taichi(*args):
+    calc_velocity_taichi(*args)
+    ti.sync()
diff --git a/mac/src/utils.py b/mac/src/utils.py
new file mode 100644
index 0000000..793ba66
--- /dev/null
+++ b/mac/src/utils.py
@@ -0,0 +1,71 @@
+import time
+import taichi as ti
+import numpy as np
+from numba import cuda
+
+# Domain size, grid resolution, derived grid spacings, time step, viscosity.
+lx, ly, nx, ny = 1.0, 1.0, 4096, 4096
+dx, dy, dt = lx / nx, ly / ny, 0.001
+mu = 0.001
+
+
+def gen_data_numpy():
+    # Staggered velocity fields (u and v live on different faces, hence the
+    # different shapes); ut/vt start as ones so that boundary entries, which
+    # the kernels never touch, agree across backends during verification.
+    np.random.seed(42)
+    u = np.random.rand(nx + 1, ny + 2).astype(np.float32)
+    v = np.random.rand(nx + 2, ny + 1).astype(np.float32)
+    ut = np.ones((nx + 1, ny + 2), dtype=np.float32)
+    vt = np.ones((nx + 2, ny + 1), dtype=np.float32)
+    return u, v, ut, vt
+
+
+def gen_data_numba_cuda():
+    # Copy the NumPy arrays to the GPU as Numba device arrays.
+    u, v, ut, vt = gen_data_numpy()
+    u_dev = cuda.to_device(u)
+    v_dev = cuda.to_device(v)
+    ut_dev = cuda.to_device(ut)
+    vt_dev = cuda.to_device(vt)
+    return u_dev, v_dev, ut_dev, vt_dev
+
+
+def gen_data_taichi():
+    # Mirror the NumPy arrays into Taichi ndarrays (requires ti.init first).
+    u, v, ut, vt = gen_data_numpy()
+    ti_u = ti.ndarray(shape=(nx + 1, ny + 2), dtype=ti.f32)
+    ti_v = ti.ndarray(shape=(nx + 2, ny + 1), dtype=ti.f32)
+    ti_ut = ti.ndarray(shape=(nx + 1, ny + 2), dtype=ti.f32)
+    ti_vt = ti.ndarray(shape=(nx + 2, ny + 1), dtype=ti.f32)
+    ti_u.from_numpy(u)
+    ti_v.from_numpy(v)
+    ti_ut.from_numpy(ut)
+    ti_vt.from_numpy(vt)
+    return ti_u, ti_v, ti_ut, ti_vt
+
+
+def benchmark(func):
+    def wrapper(u, v, ut, vt, name="Taichi", ut_gt=None, vt_gt=None, nIter=1000):
+        # Warm-up launch: triggers JIT compilation and produces the results
+        # used for the optional verification below.
+        func(u, v, ut, vt)
+        if ut_gt is not None and vt_gt is not None:
+            # Bring the results back to NumPy regardless of the backend.
+            if isinstance(ut, ti.lang._ndarray.ScalarNdarray):
+                ut_diff = ut.to_numpy()
+                vt_diff = vt.to_numpy()
+            elif isinstance(ut, cuda.cudadrv.devicearray.DeviceNDArray):
+                ut_diff = ut.copy_to_host()
+                vt_diff = vt.copy_to_host()
+            else:
+                ut_diff = ut
+                vt_diff = vt
+            print(f"Verify UT matrix for {name}: ", np.allclose(
+                ut_diff, ut_gt, atol=1, rtol=1, equal_nan=True))
+            print(f"Verify VT matrix for {name}: ", np.allclose(
+                vt_diff, vt_gt, atol=1, rtol=1, equal_nan=True))
+        # Time nIter repeated launches.
+        now = time.perf_counter()
+        for _ in range(nIter):
+            func(u, v, ut, vt)
+        end = time.perf_counter()
+        print(f"Time spent in {name} for {nIter}x: {end - now:4.2f} sec.")
+    return wrapper
+
+
+def compute_ground_truth(calc_velocity_func):
+    # A plain NumPy/Python reference is far too slow at this resolution, so
+    # the ground truth is computed with the kernel passed in by the caller
+    # (the Numba CPU implementation).
+    u, v, ut, vt = gen_data_numpy()
+    calc_velocity_func(u, v, ut, vt)
+    return ut, vt
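+
+
+# The README reports results in GFLOPS while the decorator above only prints
+# wall-clock time. The helper below is an illustrative sketch of the
+# conversion we assume was used; `flops_per_cell` is not defined anywhere in
+# this patch and has to be counted from the update stencil by hand.
+def to_gflops(elapsed_sec, n_iter, flops_per_cell, n_cells=nx * ny):
+    """Convert a measured wall-clock time for n_iter launches into GFLOPS."""
+    total_flops = n_iter * flops_per_cell * n_cells
+    return total_flops / elapsed_sec / 1e9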