diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml
new file mode 100644
index 000000000..17a36b09f
--- /dev/null
+++ b/.github/workflows/Benchmark.yml
@@ -0,0 +1,55 @@
+name: Benchmarks
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+concurrency:
+  # Skip intermediate builds: always.
+  # Cancel intermediate builds: only if it is a pull request build.
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+
+jobs:
+  benchmark:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: '1'
+          arch: x64
+      - uses: actions/cache@v4
+        env:
+          cache-name: cache-artifacts
+        with:
+          path: ~/.julia/artifacts
+          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
+          restore-keys: |
+            ${{ runner.os }}-test-${{ env.cache-name }}-
+            ${{ runner.os }}-test-
+            ${{ runner.os }}-
+      - name: Run benchmark
+        run: |
+          cd bench
+          julia --project --color=yes -e '
+            using Pkg;
+            Pkg.develop(PackageSpec(path=joinpath(pwd(), "..")));
+            Pkg.instantiate();
+            include("runbenchmarks.jl")'
+      - name: Parse & Upload Benchmark Results
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: Benchmark Results
+          tool: 'julia'
+          output-file-path: bench/benchmark_results.json
+          summary-always: true
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          comment-always: true
+          alert-threshold: "200%"
+          fail-on-alert: true
+          benchmark-data-dir-path: benchmarks
+          auto-push: ${{ github.event_name != 'pull_request' }}
diff --git a/.gitignore b/.gitignore
index 8590783ac..b249f6420 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,5 @@ docs/src/tutorials/beginner
 docs/src/tutorials/intermediate
 docs/src/tutorials/advanced
 *.log
+
+bench/benchmark_results.json
diff --git a/bench/.JuliaFormatter.toml b/bench/.JuliaFormatter.toml
new file mode 100644
index 000000000..3d6dde2cb
--- /dev/null
+++ b/bench/.JuliaFormatter.toml
@@ -0,0 +1,9 @@
+style = "sciml"
+whitespace_in_kwargs = false
+always_use_return = true
+margin = 92
+indent = 4
+format_docstrings = true
+separate_kwargs_with_semicolon = true
+always_for_in = true
+annotate_untyped_fields_with_any = false
diff --git a/bench/Project.toml b/bench/Project.toml
new file mode 100644
index 000000000..0c5972386
--- /dev/null
+++ b/bench/Project.toml
@@ -0,0 +1,8 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
+Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
+NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
diff --git a/bench/README.md b/bench/README.md
new file mode 100644
index 000000000..8f5380fc5
--- /dev/null
+++ b/bench/README.md
@@ -0,0 +1,12 @@
+# Lux.jl Continuous Benchmarking
+
+We currently use the BenchmarkTools.jl package to track the performance of Lux.jl over
+time.
+
+The setup is built on https://github.com/benchmark-action/github-action-benchmark/, which
+renders the benchmark results on GitHub Pages and posts a warning comment on a PR when a
+benchmark regresses.
+
+## Current Benchmarks
+
+1. Small VGG Net for CIFAR-10
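The `Run benchmark` CI step above can also be reproduced locally. A minimal sketch, assuming a checkout of the repository root; it mirrors the inline script in `Benchmark.yml` (`Pkg.develop` points the bench environment at the local Lux rather than a registered release):

```julia
# Local equivalent of the CI "Run benchmark" step in .github/workflows/Benchmark.yml.
using Pkg
cd("bench")                           # assumes pwd() is the Lux.jl repository root
Pkg.activate(".")                     # use bench/Project.toml as the active environment
Pkg.develop(PackageSpec(path=".."))   # benchmark the checked-out Lux, not a release
Pkg.instantiate()
include("runbenchmarks.jl")           # writes bench/benchmark_results.json
```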
diff --git a/bench/helpers.jl b/bench/helpers.jl
new file mode 100644
index 000000000..d60b9e8ae
--- /dev/null
+++ b/bench/helpers.jl
@@ -0,0 +1,21 @@
+# TODO: Special Handling for GPU Arrays with @sync
+
+# Register two forward-pass benchmarks for `model` under `tag`: one with the NamedTuple
+# parameters from `Lux.setup` and one with them converted to a flat `ComponentArray`.
+function benchmark_forward_pass(tag::String, model, x, ps, st)
+    SUITE[tag]["forward"]["default"] = @benchmarkable Lux.apply($model, $x, $ps, $st)
+
+    ps_ca = ComponentArray(ps)
+    SUITE[tag]["forward"]["ComponentArray"] = @benchmarkable Lux.apply(
+        $model, $x, $ps_ca, $st)
+
+    return
+end
+
+# Build a model's parameters and states with a fixed-seed RNG and a matching random input.
+function general_setup(model, x_dims)
+    rng = StableRNG(0)
+    ps, st = Lux.setup(rng, model)
+    x = randn(rng, Float32, x_dims)
+    return x, ps, st
+end
diff --git a/bench/runbenchmarks.jl b/bench/runbenchmarks.jl
new file mode 100644
index 000000000..5f76d244c
--- /dev/null
+++ b/bench/runbenchmarks.jl
@@ -0,0 +1,17 @@
+using BenchmarkTools: BenchmarkTools, BenchmarkGroup, @benchmarkable
+using ComponentArrays: ComponentArray
+using Lux: Lux, BatchNorm, Chain, Conv, Dense, Dropout, FlattenLayer, MaxPool
+using NNlib: relu
+using StableRNGs: StableRNG
+using Statistics: median
+
+const SUITE = BenchmarkGroup()
+
+include("helpers.jl")
+include("vgg.jl")
+
+BenchmarkTools.tune!(SUITE)
+results = BenchmarkTools.run(SUITE; verbose=true)
+
+# Save per-benchmark medians in the JSON format consumed by github-action-benchmark.
+BenchmarkTools.save(joinpath(@__DIR__, "benchmark_results.json"), median(results))
diff --git a/bench/vgg.jl b/bench/vgg.jl
new file mode 100644
index 000000000..e12f1f9d9
--- /dev/null
+++ b/bench/vgg.jl
@@ -0,0 +1,34 @@
+function add_vgg_benchmarks()
+    # VGG16-style network sized for 32x32 CIFAR-10 inputs: five convolutional blocks,
+    # each ending in 2x2 max-pooling, leave a 1x1x512 map before the dense classifier.
+    vgg16 = Chain(Conv((3, 3), 3 => 64, relu; pad=(1, 1), stride=(1, 1)), BatchNorm(64),
+        Conv((3, 3), 64 => 64, relu; pad=(1, 1), stride=(1, 1)), BatchNorm(64),
+        MaxPool((2, 2)), Conv((3, 3), 64 => 128, relu; pad=(1, 1), stride=(1, 1)),
+        BatchNorm(128), Conv((3, 3), 128 => 128, relu; pad=(1, 1), stride=(1, 1)),
+        BatchNorm(128), MaxPool((2, 2)),
+        Conv((3, 3), 128 => 256, relu; pad=(1, 1), stride=(1, 1)),
+        BatchNorm(256), Conv((3, 3), 256 => 256, relu; pad=(1, 1), stride=(1, 1)),
+        BatchNorm(256), Conv((3, 3), 256 => 256, relu; pad=(1, 1), stride=(1, 1)),
+        BatchNorm(256), MaxPool((2, 2)),
+        Conv((3, 3), 256 => 512, relu; pad=(1, 1), stride=(1, 1)), BatchNorm(512),
+        Conv((3, 3), 512 => 512, relu; pad=(1, 1), stride=(1, 1)), BatchNorm(512),
+        Conv((3, 3), 512 => 512, relu; pad=(1, 1), stride=(1, 1)), BatchNorm(512),
+        MaxPool((2, 2)), Conv((3, 3), 512 => 512, relu; pad=(1, 1), stride=(1, 1)),
+        BatchNorm(512), Conv((3, 3), 512 => 512, relu; pad=(1, 1), stride=(1, 1)),
+        BatchNorm(512), Conv((3, 3), 512 => 512, relu; pad=(1, 1), stride=(1, 1)),
+        BatchNorm(512), MaxPool((2, 2)), FlattenLayer(), Dense(512, 4096, relu),
+        Dropout(0.5), Dense(4096, 4096, relu), Dropout(0.5), Dense(4096, 10))
+
+    x, ps, st = general_setup(vgg16, (32, 32, 3, 1))
+    benchmark_forward_pass("vgg16 -- batchsize = 1", vgg16, x, ps, st)
+
+    x, ps, st = general_setup(vgg16, (32, 32, 3, 16))
+    benchmark_forward_pass("vgg16 -- batchsize = 16", vgg16, x, ps, st)
+
+    x, ps, st = general_setup(vgg16, (32, 32, 3, 64))
+    benchmark_forward_pass("vgg16 -- batchsize = 64", vgg16, x, ps, st)
+
+    return
+end
+
+add_vgg_benchmarks()
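New benchmarks would follow the same pattern as `vgg.jl`: construct a model, call `general_setup`, register it with `benchmark_forward_pass`, and `include` the file from `runbenchmarks.jl`. A sketch of a hypothetical `bench/mlp.jl` (the filename and model are illustrative only, not part of this diff):

```julia
# Hypothetical bench/mlp.jl -- illustrates the registration pattern from vgg.jl.
# general_setup and benchmark_forward_pass come from bench/helpers.jl.
function add_mlp_benchmarks()
    mlp = Chain(Dense(784, 256, relu), Dense(256, 10))

    x, ps, st = general_setup(mlp, (784, 32))  # 784 features x batchsize 32
    benchmark_forward_pass("mlp -- batchsize = 32", mlp, x, ps, st)

    return
end

add_mlp_benchmarks()
```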