Skip to content

Commit

Permalink
test: run type-stability tests first
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 4, 2024
1 parent 2dc3697 commit ea332be
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 24 deletions.
24 changes: 16 additions & 8 deletions test/autodiff/nested_autodiff_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ function test_nested_ad_input_gradient_jacobian(aType, dev, ongpu, loss_fn, X, m
ps, st = Lux.setup(rng, model) |> dev
X = aType(X)

l = loss_fn(model, X, ps, st)
l = allow_unstable() do
loss_fn(model, X, ps, st)
end
@test l isa Number
@test isfinite(l) && !isnan(l)

Expand All @@ -25,9 +27,11 @@ function test_nested_ad_input_gradient_jacobian(aType, dev, ongpu, loss_fn, X, m
!iszero(ComponentArray(∂ps |> cpu_device())) &&
all(x -> x === nothing || isfinite(x), ComponentArray(∂ps |> cpu_device()))

test_gradients((x, ps) -> loss_fn(model, x, ps, st), X, ps;
atol=1.0f-3, rtol=1.0f-1, soft_fail=[AutoForwardDiff()],
skip_backends=[AutoReverseDiff(), AutoTracker(), AutoEnzyme()])
allow_unstable() do
test_gradients((x, ps) -> loss_fn(model, x, ps, st), X, ps;
atol=1.0f-3, rtol=1.0f-1, soft_fail=[AutoForwardDiff()],
skip_backends=[AutoReverseDiff(), AutoTracker(), AutoEnzyme()])
end
end

const Xs = (randn(rng, Float32, 3, 3, 2, 4), randn(rng, Float32, 2, 4),
Expand Down Expand Up @@ -133,7 +137,9 @@ function test_nested_ad_parameter_gradient_jacobian(aType, dev, ongpu, loss_fn,
st = st |> dev
X = aType(X)

l = loss_fn(model, X, ps, st)
l = allow_unstable() do
loss_fn(model, X, ps, st)
end
@test l isa Number
@test isfinite(l) && !isnan(l)

Expand All @@ -146,9 +152,11 @@ function test_nested_ad_parameter_gradient_jacobian(aType, dev, ongpu, loss_fn,
!iszero(ComponentArray(∂ps |> cpu_device())) &&
all(x -> x === nothing || isfinite(x), ComponentArray(∂ps |> cpu_device()))

test_gradients((x, ps) -> loss_fn(model, x, ps, st), X, ps;
atol=1.0f-3, rtol=1.0f-1, soft_fail=[AutoForwardDiff()],
skip_backends=[AutoReverseDiff(), AutoTracker(), AutoEnzyme()])
allow_unstable() do
test_gradients((x, ps) -> loss_fn(model, x, ps, st), X, ps;
atol=1.0f-3, rtol=1.0f-1, soft_fail=[AutoForwardDiff()],
skip_backends=[AutoReverseDiff(), AutoTracker(), AutoEnzyme()])
end
end

const Xs = (randn(rng, Float32, 3, 3, 2, 4), randn(rng, Float32, 2, 4),
Expand Down
28 changes: 13 additions & 15 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@ end

using Lux

# Type Stability tests fail if run with DispatchDoctor enabled
if "all" in LUX_TEST_GROUP || "core_layers" in LUX_TEST_GROUP
try
# Run in a separate process to load the updated preferences
run(`$(Base.julia_cmd()) --color=yes --project=$(dirname(Pkg.project().path))
--startup-file=no --code-coverage=user $(@__DIR__)/zygote_type_stability.jl`)
@test true
catch
@test false
end
end

Lux.set_dispatch_doctor_preferences!(; luxcore="error", luxlib="error")

@testset "Load Tests" begin
Expand Down Expand Up @@ -85,7 +97,7 @@ const RETESTITEMS_NWORKERS = parse(
@info "Running tests for group: [$(i)/$(length(LUX_TEST_GROUP))] $tag"

ReTestItems.runtests(Lux; tags=(tag == "all" ? nothing : [Symbol(tag)]),
nworkers=RETESTITEMS_NWORKERS, testitem_timeout=1800, retries=2)
nworkers=RETESTITEMS_NWORKERS, testitem_timeout=1800, retries=1)
end
end

Expand Down Expand Up @@ -170,17 +182,3 @@ if ("all" in LUX_TEST_GROUP || "others" in LUX_TEST_GROUP)
end
end
end

# Type Stability tests fail if run with DispatchDoctor enabled
Lux.set_dispatch_doctor_preferences!(; luxcore="disable", luxlib="disable")

if "all" in LUX_TEST_GROUP || "core_layers" in LUX_TEST_GROUP
try
# Run in a separate process to load the updated preferences
run(`$(Base.julia_cmd()) --color=yes --project=$(dirname(Pkg.project().path))
--startup-file=no --code-coverage=user $(@__DIR__)/zygote_type_stability.jl`)
@test true
catch
@test false
end
end
1 change: 0 additions & 1 deletion test/zygote_type_stability.jl
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ include("setup_modes.jl")
model in model_list,
input in inputs

model = maybe_rewrite_to_crosscor(mode, model)
ps, st = Lux.setup(rng, model) |> dev
x = input |> dev

Expand Down

1 comment on commit ea332be

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: ea332be Previous: 0b7571a Ratio
Dense(512 => 512, identity)(512 x 128)/forward/CPU/2 thread(s) 412458 ns 407750 ns 1.01
Dense(512 => 512, identity)(512 x 128)/forward/CPU/4 thread(s) 321709 ns 243584 ns 1.32
Dense(512 => 512, identity)(512 x 128)/forward/CPU/8 thread(s) 323896 ns 322333.5 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/CPU/1 thread(s) 741958 ns 740729.5 ns 1.00
Dense(512 => 512, identity)(512 x 128)/forward/GPU/CUDA 43204 ns 43833 ns 0.99
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/2 thread(s) 1317396 ns 1312437 ns 1.00
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/4 thread(s) 2464375 ns 1253687.5 ns 1.97
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/8 thread(s) 14642958 ns 14025833.5 ns 1.04
Dense(512 => 512, identity)(512 x 128)/zygote/CPU/1 thread(s) 2196000 ns 2194708 ns 1.00
Dense(512 => 512, identity)(512 x 128)/zygote/GPU/CUDA 208726.5 ns 204686 ns 1.02
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/2 thread(s) 1450917 ns 1421833 ns 1.02
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/4 thread(s) 934625 ns 917666 ns 1.02
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/8 thread(s) 1697209 ns 1521458.5 ns 1.12
Dense(512 => 512, identity)(512 x 128)/enzyme/CPU/1 thread(s) 2207583 ns 2206041.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1781333 ns 1636291 ns 1.09
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1098208 ns 1093166 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1507125 ns 1540041 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2908542 ns 2957250 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/GPU/CUDA 209607.5 ns 208244 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12131458 ns 12064562 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 8814770.5 ns 8829500 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9252270.5 ns 9263417 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18589083.5 ns 18568125 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1490165 ns 1510701 ns 0.99
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17289959 ns 17268291.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 13989667 ns 13964333 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14502875 ns 14534875 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21849666 ns 21840708.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 249663875 ns 249883854.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 148521541 ns 148685042 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 115933291.5 ns 116235625 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 447579458 ns 448238208 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5477215 ns 5476091.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1139835959 ns 1190941209 ns 0.96
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 978934750 ns 979930625 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 853295792 ns 855870500 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1789749000 ns 1816350500 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 31155061 ns 31819565 ns 0.98
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1134051542 ns 1035607791 ns 1.10
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 999963750 ns 995404083.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1308847250.5 ns 1363876062.5 ns 0.96
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1730047208 ns 1750692104.5 ns 0.99
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 1099083.5 ns 1047645.5 ns 1.05
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 1611187.5 ns 1617437.5 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 3499667 ns 3577375 ns 0.98
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 783708.5 ns 786333 ns 1.00
lenet(28, 28, 1, 32)/forward/GPU/CUDA 269562.5 ns 273031.5 ns 0.99
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 3018791.5 ns 3004708.5 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 4156979 ns 4200542 ns 0.99
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 10275229.5 ns 11777875 ns 0.87
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3215083 ns 3164334 ns 1.02
lenet(28, 28, 1, 32)/zygote/GPU/CUDA 1193854 ns 1198487 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 2332208 ns 2248937 ns 1.04
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1382583 ns 1322917 ns 1.05
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1687750 ns 1665229 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 4215375.5 ns 4201687.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 209594 ns 209763 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 19388000 ns 19436166 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 16069000 ns 16129625 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 17326292 ns 17429750 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 25910416.5 ns 25880667 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1588868.5 ns 1610222 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 34088083 ns 34173417 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 30937833 ns 30876125 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 31230458.5 ns 31248229 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 36701916.5 ns 36410104.5 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 4541833.5 ns 4482791.5 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2768083 ns 2543375 ns 1.09
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2900875.5 ns 2920604 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 8397125 ns 8391854 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 420375 ns 424436.5 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 38905562.5 ns 38997125 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 32031146 ns 32110145.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 32218187 ns 32392750 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 52007937.5 ns 51935562 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2626371 ns 2641424 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 89177917 ns 88445625 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 114075750 ns 115286208.5 ns 0.99
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 229081458 ns 222634750.5 ns 1.03
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 74341583.5 ns 74263979 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 268097375 ns 267592834 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 159334604 ns 156611875 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 126815875 ns 127057604.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 486160833 ns 485576209 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/GPU/CUDA 6980043.5 ns 6966689 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1472796583.5 ns 1476015166.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 1170472000 ns 1172283875 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 1064999083 ns 1075732708 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 2007098958.5 ns 2016646687.5 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 34638860 ns 34531886 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1689048208 ns 1716968542 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1523433000 ns 1543811791.5 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1884758500 ns 1807404167 ns 1.04
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 2205616333 ns 2233706500 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 2082500 ns 2003312.5 ns 1.04
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 2988979.5 ns 3069125 ns 0.97
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 8127625 ns 7973333 ns 1.02
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2508541.5 ns 2392166 ns 1.05
lenet(28, 28, 1, 128)/forward/GPU/CUDA 272761 ns 279062 ns 0.98
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 9735375 ns 9610687.5 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 12139000 ns 12028187.5 ns 1.01
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 25821041 ns 25151062.5 ns 1.03
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 11698458 ns 11817896 ns 0.99
lenet(28, 28, 1, 128)/zygote/GPU/CUDA 1272798 ns 1290603 ns 0.99
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 381449917 ns 380792958 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 286724875 ns 308976625 ns 0.93
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 242181541 ns 241644896 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 453170250 ns 452303562.5 ns 1.00
vgg16(32, 32, 3, 32)/forward/GPU/CUDA 4833185 ns 4828914.5 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 1173336083 ns 1171201291 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 924216958 ns 942099041 ns 0.98
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 971657125 ns 953747750 ns 1.02
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 1430179375 ns 1428813708 ns 1.00
vgg16(32, 32, 3, 32)/zygote/GPU/CUDA 17840776 ns 17841586 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1403750.5 ns 1410500 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 2081958 ns 1664542 ns 1.25
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 5722875 ns 5488792 ns 1.04
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1408458 ns 1374687.5 ns 1.02
lenet(28, 28, 1, 64)/forward/GPU/CUDA 275280 ns 280156 ns 0.98
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 6770687 ns 6573750 ns 1.03
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 12458250 ns 13287229 ns 0.94
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 21274521 ns 18608542 ns 1.14
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 6134979 ns 6090708 ns 1.01
lenet(28, 28, 1, 64)/zygote/GPU/CUDA 1311627 ns 1340355 ns 0.98
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 70478833 ns 70020646 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43532916 ns 43782708 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 39489833 ns 39491500 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 132771729 ns 132617625.5 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1936601.5 ns 1956022 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 382368791 ns 383334125 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 295591666.5 ns 296279354 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 282483000 ns 282808416 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 535030479 ns 539325500 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 12289555.5 ns 12276639 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 407420458 ns 409376958 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 408775479 ns 366908458 ns 1.11
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 705784395.5 ns 675051667 ns 1.05
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 712922750 ns 711583625 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 1190190416 ns 1188246125 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 691356562.5 ns 831292458.5 ns 0.83
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 632381292 ns 632114354 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 1864383042 ns 1865387666 ns 1.00
vgg16(32, 32, 3, 128)/forward/GPU/CUDA 12548744.5 ns 12542044 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 3527214854.5 ns 3538721354 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 2750816917 ns 2772375167 ns 0.99
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 2723456375 ns 2713351709 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 4906995375 ns 4951157583 ns 0.99
vgg16(32, 32, 3, 128)/zygote/GPU/CUDA 49787100 ns 49614963 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3430374.5 ns 3375250 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2075896 ns 2081166.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2513604 ns 2536500 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 6036208 ns 6037687.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/GPU/CUDA 290675.5 ns 295351 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 25509791 ns 25516333 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18477979.5 ns 18518313 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18929687 ns 18846583 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 38972812 ns 38898354 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2459960 ns 2476302.5 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 54137604 ns 53964667 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 79016146 ns 80576917 ns 0.98
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 172864042 ns 171957042 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 45747729 ns 45586062.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1785000 ns 1747812.5 ns 1.02
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1098833 ns 1105729 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1575271 ns 1562500 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 3041083 ns 3031521 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/GPU/CUDA 213255 ns 212548 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12530062 ns 12517833 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9179500 ns 9220687.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9666624.5 ns 9561583.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18982583.5 ns 18978542 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/GPU/CUDA 1539758 ns 1527181.5 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17615791.5 ns 17640583 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14315750.5 ns 14342500 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14612125 ns 14447312 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 22193458 ns 22205541 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 70464750 ns 70074708 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 43492875 ns 43766958 ns 0.99
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 39563729.5 ns 39559750 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 132725062.5 ns 132760291.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 1890878 ns 1957872 ns 0.97
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 360162958 ns 359826333 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 290966979 ns 287628083 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 287495583.5 ns 287402500 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 623603729 ns 620985562.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 13401076 ns 13387014.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 420131604 ns 418344583 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 425616125 ns 420938875 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 719362771 ns 708918708 ns 1.01
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 718603750 ns 718150125 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 1566208 ns 1473791.5 ns 1.06
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 1239083.5 ns 1037208 ns 1.19
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 1245979.5 ns 1169396 ns 1.07
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 2362041 ns 2343958 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/GPU/CUDA 589439 ns 576538.5 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 8832333 ns 8746083.5 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 12769958 ns 13704583 ns 0.93
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 30689750 ns 31580708 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 9829292 ns 9800812.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/zygote/GPU/CUDA 1434002 ns 1453307 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 18037958 ns 17895167 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 16982896 ns 17277875 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 30462270.5 ns 30555625 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 14482959 ns 14342520.5 ns 1.01
Dense(512 => 512, relu)(512 x 128)/forward/CPU/2 thread(s) 789958.5 ns 766563 ns 1.03
Dense(512 => 512, relu)(512 x 128)/forward/CPU/4 thread(s) 633083.5 ns 521854.5 ns 1.21
Dense(512 => 512, relu)(512 x 128)/forward/CPU/8 thread(s) 1036791.5 ns 1038167 ns 1.00
Dense(512 => 512, relu)(512 x 128)/forward/CPU/1 thread(s) 725125 ns 737500 ns 0.98
Dense(512 => 512, relu)(512 x 128)/forward/GPU/CUDA 48429 ns 48260 ns 1.00
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/2 thread(s) 1542250 ns 1513333 ns 1.02
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/4 thread(s) 1032458.5 ns 1063333 ns 0.97
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/8 thread(s) 1380125 ns 1432583 ns 0.96
Dense(512 => 512, relu)(512 x 128)/zygote/CPU/1 thread(s) 2295562.5 ns 2270542 ns 1.01
Dense(512 => 512, relu)(512 x 128)/zygote/GPU/CUDA 240743.5 ns 237514 ns 1.01
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/2 thread(s) 1747854 ns 1713000 ns 1.02
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/4 thread(s) 1235354 ns 1297458 ns 0.95
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/8 thread(s) 1736479 ns 1983417 ns 0.88
Dense(512 => 512, relu)(512 x 128)/enzyme/CPU/1 thread(s) 2412208 ns 2314667 ns 1.04
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3414875 ns 3345354.5 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2061771 ns 2072187.5 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2477833 ns 2520104 ns 0.98
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 6017000 ns 6022334 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/GPU/CUDA 284081.5 ns 285768 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24039917 ns 24075208 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 17178499.5 ns 17290417 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17190666.5 ns 17127666.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 37578542 ns 37508125 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/GPU/CUDA 2405176.5 ns 2401522.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 52521875 ns 52370708 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 78741917 ns 85319667 ns 0.92
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 170683583 ns 170588104.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 44627042 ns 44585812.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 250060541.5 ns 249675792 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 148207250 ns 148489083 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 115967792 ns 115482854 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 448320521 ns 447673229 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/GPU/CUDA 5438535.5 ns 5439180 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 1129762334 ns 1127687125 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 881232895.5 ns 883344500.5 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 807642666 ns 805915916 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1746898708 ns 1756922958 ns 0.99
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/GPU/CUDA 28881644.5 ns 29345011 ns 0.98
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1020749770.5 ns 1057020520.5 ns 0.97
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 971889209 ns 963663000 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1306078959 ns 1305738167 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1723825958.5 ns 1740054666.5 ns 0.99
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1295917 ns 1286708 ns 1.01
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 904250 ns 773958 ns 1.17
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 957041.5 ns 910250 ns 1.05
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2119271 ns 2050125 ns 1.03
mlp7layer_bn(relu)(32 x 256)/forward/GPU/CUDA 573283 ns 558247.5 ns 1.03
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 5873750.5 ns 5806667 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 6045250 ns 8935000 ns 0.68
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 24731625 ns 24443500 ns 1.01
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 7076916 ns 7052833 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/GPU/CUDA 1333770 ns 1343916 ns 0.99
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 11387000 ns 10065667 ns 1.13
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 10073875 ns 10525833 ns 0.96
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 17896812 ns 17854875 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 8967708 ns 8785625 ns 1.02
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/2 thread(s) 479500 ns 456166.5 ns 1.05
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/4 thread(s) 475500 ns 379750 ns 1.25
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/8 thread(s) 2159875 ns 1895687.5 ns 1.14
Dense(128 => 128, gelu)(128 x 128)/forward/CPU/1 thread(s) 89083 ns 90208 ns 0.99
Dense(128 => 128, gelu)(128 x 128)/forward/GPU/CUDA 28042 ns 27683 ns 1.01
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/2 thread(s) 383020.5 ns 380792 ns 1.01
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/4 thread(s) 428916 ns 445520.5 ns 0.96
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/8 thread(s) 4731438 ns 4682292 ns 1.01
Dense(128 => 128, gelu)(128 x 128)/zygote/CPU/1 thread(s) 266541 ns 261167 ns 1.02
Dense(128 => 128, gelu)(128 x 128)/zygote/GPU/CUDA 220790.5 ns 219550.5 ns 1.01
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/2 thread(s) 709084 ns 704084 ns 1.01
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/4 thread(s) 701625 ns 722292 ns 0.97
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/8 thread(s) 787375.5 ns 997916.5 ns 0.79
Dense(128 => 128, gelu)(128 x 128)/enzyme/CPU/1 thread(s) 445771 ns 451917 ns 0.99
Dense(128 => 128, relu)(128 x 128)/forward/CPU/2 thread(s) 427875 ns 405708 ns 1.05
Dense(128 => 128, relu)(128 x 128)/forward/CPU/4 thread(s) 416708.5 ns 324500 ns 1.28
Dense(128 => 128, relu)(128 x 128)/forward/CPU/8 thread(s) 744000 ns 744834 ns 1.00
Dense(128 => 128, relu)(128 x 128)/forward/CPU/1 thread(s) 52854 ns 54917 ns 0.96
Dense(128 => 128, relu)(128 x 128)/forward/GPU/CUDA 27664 ns 28053 ns 0.99
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/2 thread(s) 340833 ns 335750 ns 1.02
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/4 thread(s) 317584 ns 339729 ns 0.93
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/8 thread(s) 868791.5 ns 762333 ns 1.14
Dense(128 => 128, relu)(128 x 128)/zygote/CPU/1 thread(s) 153625 ns 172500 ns 0.89
Dense(128 => 128, relu)(128 x 128)/zygote/GPU/CUDA 207528 ns 205595.5 ns 1.01
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/2 thread(s) 404416 ns 404000 ns 1.00
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/4 thread(s) 385334 ns 406875 ns 0.95
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/8 thread(s) 1054292 ns 805584 ns 1.31
Dense(128 => 128, relu)(128 x 128)/enzyme/CPU/1 thread(s) 174000 ns 174375 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 603618375 ns 601761750 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 428696083 ns 429688896 ns 1.00
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 377266063 ns 380315375 ns 0.99
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 876199292 ns 872892666.5 ns 1.00
vgg16(32, 32, 3, 64)/forward/GPU/CUDA 7024377 ns 7026376 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1985844104.5 ns 1987308375 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 1661758208.5 ns 1621059583.5 ns 1.03
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 1608456437.5 ns 1611181167 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 2755931875 ns 2764493083 ns 1.00
vgg16(32, 32, 3, 64)/zygote/GPU/CUDA 25990323.5 ns 25927495 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/2 thread(s) 522084 ns 513625 ns 1.02
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/4 thread(s) 433375 ns 406334 ns 1.07
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/8 thread(s) 2244333.5 ns 1661167 ns 1.35
Dense(512 => 512, gelu)(512 x 128)/forward/CPU/1 thread(s) 870959 ns 865667 ns 1.01
Dense(512 => 512, gelu)(512 x 128)/forward/GPU/CUDA 47163.5 ns 47567 ns 0.99
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/2 thread(s) 1868271 ns 1872250 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/4 thread(s) 2327875 ns 1794875 ns 1.30
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/8 thread(s) 14854667 ns 14572917 ns 1.02
Dense(512 => 512, gelu)(512 x 128)/zygote/CPU/1 thread(s) 2780687.5 ns 2776666 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/zygote/GPU/CUDA 248248 ns 247130.5 ns 1.00
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/2 thread(s) 2717000 ns 2742584 ns 0.99
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/4 thread(s) 2282917 ns 2329041.5 ns 0.98
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/8 thread(s) 3907250 ns 4441625 ns 0.88
Dense(512 => 512, gelu)(512 x 128)/enzyme/CPU/1 thread(s) 3418708 ns 3342437.5 ns 1.02
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1568167 ns 1568250 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1231291.5 ns 1047208.5 ns 1.18
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 1182312.5 ns 1211750 ns 0.98
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2381916 ns 2310625 ns 1.03
mlp7layer_bn(tanh)(32 x 256)/forward/GPU/CUDA 584934.5 ns 587603 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 5788854 ns 5769875 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 6745833 ns 8267854 ns 0.82
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 24802729 ns 24141062 ns 1.03
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 7285084 ns 7272833 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/GPU/CUDA 1358738.5 ns 1402858 ns 0.97
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 13061333 ns 12362292 ns 1.06
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 12025375 ns 11920979.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 21056084 ns 21342959 ns 0.99
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 10835604.5 ns 10695063 ns 1.01
Dense(16 => 16, relu)(16 x 128)/forward/CPU/2 thread(s) 2666 ns 2375 ns 1.12
Dense(16 => 16, relu)(16 x 128)/forward/CPU/4 thread(s) 2459 ns 2834 ns 0.87
Dense(16 => 16, relu)(16 x 128)/forward/CPU/8 thread(s) 3541.5 ns 3270.5 ns 1.08
Dense(16 => 16, relu)(16 x 128)/forward/CPU/1 thread(s) 2750 ns 2750 ns 1
Dense(16 => 16, relu)(16 x 128)/forward/GPU/CUDA 24643 ns 23858 ns 1.03
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/2 thread(s) 8500 ns 8750 ns 0.97
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/4 thread(s) 8709 ns 8583 ns 1.01
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/8 thread(s) 8770.5 ns 8625 ns 1.02
Dense(16 => 16, relu)(16 x 128)/zygote/CPU/1 thread(s) 8458 ns 8792 ns 0.96
Dense(16 => 16, relu)(16 x 128)/zygote/GPU/CUDA 211404.5 ns 217435 ns 0.97
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/2 thread(s) 16791 ns 16667 ns 1.01
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/4 thread(s) 16708 ns 16500 ns 1.01
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/8 thread(s) 16708 ns 16833 ns 0.99
Dense(16 => 16, relu)(16 x 128)/enzyme/CPU/1 thread(s) 10750 ns 10916 ns 0.98
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/2 thread(s) 11729 ns 14291 ns 0.82
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/4 thread(s) 14500 ns 15708.5 ns 0.92
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/8 thread(s) 11709 ns 12750 ns 0.92
Dense(16 => 16, gelu)(16 x 128)/forward/CPU/1 thread(s) 7833 ns 7562.5 ns 1.04
Dense(16 => 16, gelu)(16 x 128)/forward/GPU/CUDA 24689 ns 25549 ns 0.97
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/2 thread(s) 22667 ns 22500 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/4 thread(s) 22250 ns 22208 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/8 thread(s) 22459 ns 22542 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/zygote/CPU/1 thread(s) 22500 ns 22459 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/zygote/GPU/CUDA 232898 ns 236055 ns 0.99
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/2 thread(s) 52291.5 ns 52312.5 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/4 thread(s) 52500 ns 52125 ns 1.01
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/8 thread(s) 52521 ns 52500 ns 1.00
Dense(16 => 16, gelu)(16 x 128)/enzyme/CPU/1 thread(s) 44000 ns 43917 ns 1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/2 thread(s) 28750 ns 29000 ns 0.99
Dense(128 => 128, identity)(128 x 128)/forward/CPU/4 thread(s) 29334 ns 29041 ns 1.01
Dense(128 => 128, identity)(128 x 128)/forward/CPU/8 thread(s) 29208 ns 29250 ns 1.00
Dense(128 => 128, identity)(128 x 128)/forward/CPU/1 thread(s) 46916 ns 46458.5 ns 1.01
Dense(128 => 128, identity)(128 x 128)/forward/GPU/CUDA 25952 ns 26404 ns 0.98
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/2 thread(s) 211958.5 ns 209708 ns 1.01
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/4 thread(s) 261208 ns 267167 ns 0.98
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/8 thread(s) 4169541.5 ns 4263083.5 ns 0.98
Dense(128 => 128, identity)(128 x 128)/zygote/CPU/1 thread(s) 153125 ns 148000 ns 1.03
Dense(128 => 128, identity)(128 x 128)/zygote/GPU/CUDA 217493 ns 216477.5 ns 1.00
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/2 thread(s) 317500 ns 314042 ns 1.01
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/4 thread(s) 290167 ns 301625 ns 0.96
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/8 thread(s) 796854.5 ns 772375 ns 1.03
Dense(128 => 128, identity)(128 x 128)/enzyme/CPU/1 thread(s) 161500 ns 160834 ns 1.00
Dense(16 => 16, identity)(16 x 128)/forward/CPU/2 thread(s) 1792 ns 1833 ns 0.98
Dense(16 => 16, identity)(16 x 128)/forward/CPU/4 thread(s) 1875 ns 2000 ns 0.94
Dense(16 => 16, identity)(16 x 128)/forward/CPU/8 thread(s) 2625 ns 2417 ns 1.09
Dense(16 => 16, identity)(16 x 128)/forward/CPU/1 thread(s) 1917 ns 2479 ns 0.77
Dense(16 => 16, identity)(16 x 128)/forward/GPU/CUDA 22908 ns 23762 ns 0.96
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/2 thread(s) 7416 ns 7417 ns 1.00
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/4 thread(s) 7208 ns 7125 ns 1.01
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/8 thread(s) 7625 ns 7584 ns 1.01
Dense(16 => 16, identity)(16 x 128)/zygote/CPU/1 thread(s) 7625 ns 7187.5 ns 1.06
Dense(16 => 16, identity)(16 x 128)/zygote/GPU/CUDA 268483 ns 255200 ns 1.05
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/2 thread(s) 11250 ns 11250 ns 1
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/4 thread(s) 11625 ns 11708 ns 0.99
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/8 thread(s) 11542 ns 11708 ns 0.99
Dense(16 => 16, identity)(16 x 128)/enzyme/CPU/1 thread(s) 6833 ns 6958 ns 0.98
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 79894667 ns 79879875 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 49133813 ns 47906812.5 ns 1.03
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 44971167 ns 44940625 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 151617667 ns 152149250 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/GPU/CUDA 2714974.5 ns 2721915 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 472351667 ns 662482292 ns 0.71
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 408027541 ns 413355625 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 398391084 ns 398984125 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 687897666 ns 733919021 ns 0.94
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/GPU/CUDA 14607484.5 ns 14581195 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 686060271 ns 709711916.5 ns 0.97
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 657056541 ns 666381083 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 1003771958 ns 1014202458 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 999509292 ns 999737375 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.