diff --git a/bench/Project.toml b/bench/Project.toml
index 0c5972386..7c13eb117 100644
--- a/bench/Project.toml
+++ b/bench/Project.toml
@@ -1,6 +1,7 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
+InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
diff --git a/bench/helpers.jl b/bench/helpers.jl
index d60b9e8ae..0deae669a 100644
--- a/bench/helpers.jl
+++ b/bench/helpers.jl
@@ -1,9 +1,11 @@
 # TODO: Special Handling for GPU Arrays with @sync
-function benchmark_forward_pass(tag::String, model, x, ps, st)
-    SUITE[tag]["forward"]["default"] = @benchmarkable Lux.apply($model, $x, $ps, $st)
+function benchmark_forward_pass(tag::String, end_tag::String, model, x, ps_nt::NamedTuple,
+        st)
+    SUITE[tag]["cpu"]["forward"]["NamedTuple"][end_tag] = @benchmarkable Lux.apply(
+        $model, $x, $ps_nt, $st)
 
-    ps_ca = ComponentArray(ps)
-    SUITE[tag]["forward"]["ComponentArray"] = @benchmarkable Lux.apply(
+    ps_ca = ComponentArray(ps_nt)
+    SUITE[tag]["cpu"]["forward"]["ComponentArray"][end_tag] = @benchmarkable Lux.apply(
         $model, $x, $ps_ca, $st)
 
     return
diff --git a/bench/layers.jl b/bench/layers.jl
new file mode 100644
index 000000000..f9c51b1e3
--- /dev/null
+++ b/bench/layers.jl
@@ -0,0 +1,38 @@
+"""
+    add_dense_benchmarks!()
+
+Register forward-pass benchmarks for square `Dense(n => n)` layers, `n` in
+`(2, 20, 200, 2000)`, passing the size `(n, 128)` to `general_setup` for each layer.
+"""
+function add_dense_benchmarks!()
+    for n in (2, 20, 200, 2000)
+        layer = Dense(n => n)
+        x_dims = (n, 128)
+        x, ps, st = general_setup(layer, x_dims)
+        # "$x_dims" renders as e.g. "(2, 128)", keeping the tag in sync with the input.
+        benchmark_forward_pass("Dense($n => $n)", "$x_dims", layer, x, ps, st)
+    end
+
+    return
+end
+
+"""
+    add_conv_benchmarks!()
+
+Register forward-pass benchmarks for 3x3 `Conv` layers with `ch => ch` channels, `ch` in
+`(1, 3, 16, 64)`, passing the size `(64, 64, ch, 128)` to `general_setup` for each layer.
+"""
+function add_conv_benchmarks!()
+    for ch in (1, 3, 16, 64)
+        layer = Conv((3, 3), ch => ch)
+        x_dims = (64, 64, ch, 128)
+        x, ps, st = general_setup(layer, x_dims)
+        # "$x_dims" renders as e.g. "(64, 64, 1, 128)", so tag and input cannot drift.
+        benchmark_forward_pass("Conv((3, 3), $ch => $ch)", "$x_dims", layer, x, ps, st)
+    end
+
+    return
+end
+
+add_dense_benchmarks!()
+add_conv_benchmarks!()
diff --git a/bench/runbenchmarks.jl b/bench/runbenchmarks.jl
index 5f76d244c..256698338 100644
--- a/bench/runbenchmarks.jl
+++ b/bench/runbenchmarks.jl
@@ -1,14 +1,18 @@
 using BenchmarkTools: BenchmarkTools, BenchmarkGroup, @btime, @benchmarkable
 using ComponentArrays: ComponentArray
+using InteractiveUtils: versioninfo
 using Lux: Lux, BatchNorm, Chain, Conv, Dense, Dropout, FlattenLayer, MaxPool
 using NNlib: relu
 using StableRNGs: StableRNG
 using Statistics: median
 
+@info sprint(versioninfo)  # log the Julia version info so benchmark output records its environment
+
 const SUITE = BenchmarkGroup()
 
 include("helpers.jl")
 include("vgg.jl")
+include("layers.jl")  # adds the Dense and Conv layer benchmarks (runs add_*_benchmarks! on include)
 
 BenchmarkTools.tune!(SUITE)
 results = BenchmarkTools.run(SUITE; verbose=true)
diff --git a/bench/vgg.jl b/bench/vgg.jl
index e12f1f9d9..1464ffd8b 100644
--- a/bench/vgg.jl
+++ b/bench/vgg.jl
@@ -1,4 +1,4 @@
-function add_vgg_benchmarks()
+function add_vgg_benchmarks!()
     vgg16 = Chain(Conv((3, 3), 3 => 64, relu; pad=(1, 1), stride=(1, 1)), BatchNorm(64),
         Conv((3, 3), 64 => 64, relu; pad=(1, 1), stride=(1, 1)), BatchNorm(64),
         MaxPool((2, 2)), Conv((3, 3), 64 => 128, relu; pad=(1, 1), stride=(1, 1)),
@@ -17,16 +17,12 @@ function add_vgg_benchmarks()
         BatchNorm(512), MaxPool((2, 2)), FlattenLayer(), Dense(512, 4096, relu),
         Dropout(0.5), Dense(4096, 4096, relu), Dropout(0.5), Dense(4096, 10))
 
-    x, ps, st = general_setup(vgg16, (32, 32, 3, 1))
-    benchmark_forward_pass("vgg16 -- batchsize = 1", vgg16, x, ps, st)
-
-    x, ps, st = general_setup(vgg16, (32, 32, 3, 16))
-    benchmark_forward_pass("vgg16 -- batchsize = 16", vgg16, x, ps, st)
-
-    x, ps, st = general_setup(vgg16, (32, 32, 3, 64))
-    benchmark_forward_pass("vgg16 -- batchsize = 64", vgg16, x, ps, st)
+    for bsize in (1, 16, 64)
+        x, ps, st = general_setup(vgg16, (32, 32, 3, bsize))
+        benchmark_forward_pass("vgg16", "(32, 32, 3, $bsize)", vgg16, x, ps, st)
+    end
 
     return
 end
 
-add_vgg_benchmarks()
+add_vgg_benchmarks!()