
Commit

Consolidate the Backend parameter caching
avik-pal committed Jun 22, 2024
1 parent e0d262a commit ebe293b
Showing 6 changed files with 104 additions and 64 deletions.
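The point of the consolidation: each AD extension previously carried its own cache struct (e.g. `CachedEnzymeExtras` in the Enzyme extension below); they are now replaced by a single `Lux.Experimental.TrainingBackendCache{backend, first_try}` shared by every backend. The standalone sketch below mirrors the new type and how the extensions construct it — illustration only; the real definition uses `ConcreteStructs.@concrete` and lives in `src/contrib/training.jl`, and the field values here are made up.

```julia
# Plain mirror of the consolidated cache (illustrative, not the actual Lux definition).
struct TrainingBackendCache{backend, first_try, D, O, E}
    dparameters::D          # preallocated gradient buffer, reused across calls
    objective_function::O   # cached objective function (or `nothing` until known)
    extras::E               # backend-specific leftovers, e.g. (; st_wrap, stats_wrap)
end

function TrainingBackendCache{backend, first_try}(dps, obj, extras) where {backend, first_try}
    return TrainingBackendCache{backend, first_try, typeof(dps), typeof(obj), typeof(extras)}(
        dps, obj, extras)
end

# First call: dparameters are freshly allocated zeros, so first_try = true.
c1 = TrainingBackendCache{:Enzyme, true}(zeros(Float32, 4), nothing,
    (; st_wrap=nothing, stats_wrap=nothing))

# Later calls keep the same buffer with first_try = false; the extensions then zero
# it in place (Lux.recursive_make_zero!!) before every backward pass.
c2 = TrainingBackendCache{:Enzyme, false}(c1.dparameters, nothing,
    (; st_wrap=nothing, stats_wrap=nothing))
```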
29 changes: 13 additions & 16 deletions ext/LuxEnzymeExt.jl
@@ -1,31 +1,27 @@
module LuxEnzymeExt

using ADTypes: AutoEnzyme
using ConcreteStructs: @concrete
using Enzyme: Enzyme, Active, Const, Duplicated
using Lux: Lux

@concrete struct CachedEnzymeExtras{FT}
dparameters
objective_function
st_wrap
stats_wrap
end
using Lux.Experimental: TrainingBackendCache

# Case I: We have CachedEnzymeExtras and objective_function is unchanged.
function Lux.Experimental.compute_gradients(::AutoEnzyme, objective_function::F, data,
ts::Lux.Experimental.TrainState{<:CachedEnzymeExtras{FT}, F}) where {F, FT}
function Lux.Experimental.compute_gradients(::AutoEnzyme,
objective_function::F,
data,
ts::Lux.Experimental.TrainState{<:TrainingBackendCache{:Enzyme, FT}, F}) where {
F, FT}
dps = Lux.recursive_make_zero!!(ts.cache.dparameters)

_, loss = Enzyme.autodiff(
Enzyme.ReverseWithPrimal, ts.cache.objective_function, Active, Const(ts.model),
Duplicated(ts.parameters, dps), Const(ts.states), Const(data))

ts_new = __construct_new_trainstate(
ts.cache.st_wrap[], ts.states, ts, objective_function, dps,
ts.cache.objective_function, ts.cache.st_wrap, ts.cache.stats_wrap)
ts.cache.extras.st_wrap[], ts.states, ts, objective_function, dps,
ts.cache.objective_function, ts.cache.extras.st_wrap, ts.cache.extras.stats_wrap)

return dps, loss, ts.cache.stats_wrap[], ts_new
return dps, loss, ts.cache.extras.stats_wrap[], ts_new

end

# Case II: We have CachedEnzymeExtras and objective_function is changed.
@@ -49,7 +45,8 @@ end
function Lux.Experimental.compute_gradients(ad::AutoEnzyme, objective_function::F, data,
ts::Lux.Experimental.TrainState) where {F}
dps = Lux.recursive_make_zero(ts.parameters)
cache = CachedEnzymeExtras{true}(dps, nothing, nothing, nothing)
cache = TrainingBackendCache{:Enzyme, true}(
dps, nothing, (; st_wrap=nothing, stats_wrap=nothing))
ts_new = Lux.Experimental.TrainState(
cache, nothing, ts.model, ts.parameters, ts.states, ts.optimizer_state, ts.step)
return Lux.Experimental.compute_gradients(ad, objective_function, data, ts_new)
@@ -60,15 +57,15 @@ end
function __construct_new_trainstate(
st_new::S, ::S, ts::Lux.Experimental.TrainState, objective_fn::O,
dps, obj_fn::O2, st_wrap, stats_wrap) where {S, O, O2}
cache = CachedEnzymeExtras{false}(dps, obj_fn, st_wrap, stats_wrap)
cache = TrainingBackendCache{:Enzyme, false}(dps, obj_fn, (; st_wrap, stats_wrap))

return Lux.Experimental.TrainState(
cache, objective_fn, ts.model, ts.parameters, st_new, ts.optimizer_state, ts.step)
end

function __construct_new_trainstate(
st_new, _, ts::Lux.Experimental.TrainState, objective_fn::O,
dps, obj_fn::O2, st_wrap, stats_wrap) where {O, O2}
cache = CachedEnzymeExtras{false}(dps, nothing, nothing, nothing)
cache = TrainingBackendCache{:Enzyme, false}(dps, nothing, (; st_wrap, stats_wrap))

return Lux.Experimental.TrainState(
cache, nothing, ts.model, ts.parameters, st_new, ts.optimizer_state, ts.step)
end
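For context, a minimal training-step sketch of how the Enzyme methods above interact: the first `compute_gradients` call hits the generic method, allocates a `TrainingBackendCache{:Enzyme, true}` and re-dispatches; every later call lands on the cached-state method and reuses (re-zeroes) `dparameters`. The model, loss function, and the `TrainState(rng, model, opt)` constructor form are assumptions for illustration, not part of this diff.

```julia
using ADTypes: AutoEnzyme
using Enzyme            # loads the LuxEnzymeExt extension
using Lux, Optimisers, Random

model = Dense(2 => 2)
ts = Lux.Experimental.TrainState(Random.default_rng(), model, Adam(0.01f0))

# Objective signature used throughout Lux.Experimental: (model, ps, st, data) -> (loss, st, stats)
function mse(model, ps, st, (x, y))
    ŷ, st_ = model(x, ps, st)
    return sum(abs2, ŷ .- y), st_, (;)
end

x, y = rand(Float32, 2, 8), rand(Float32, 2, 8)

# 1st call: generic method -> builds TrainingBackendCache{:Enzyme, true}, then re-dispatches.
grads, loss, stats, ts = Lux.Experimental.compute_gradients(AutoEnzyme(), mse, (x, y), ts)

# 2nd call: TrainState{<:TrainingBackendCache{:Enzyme}} method -> cached dparameters reused.
grads, loss, stats, ts = Lux.Experimental.compute_gradients(AutoEnzyme(), mse, (x, y), ts)
```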
3 changes: 1 addition & 2 deletions ext/LuxReverseDiffExt/LuxReverseDiffExt.jl
@@ -2,10 +2,9 @@ module LuxReverseDiffExt

using ADTypes: ADTypes, AutoReverseDiff
using ArrayInterface: ArrayInterface
using Functors: fmap
using Lux: Lux, LuxCPUDevice
using Lux.Experimental: TrainingBackendCache
using ReverseDiff: ReverseDiff, TrackedArray, @grad_from_chainrules
using Setfield: @set!

# AoS to SoA conversion
function Lux.apply(

44 changes: 29 additions & 15 deletions ext/LuxReverseDiffExt/training.jl
@@ -16,29 +16,43 @@ else
end
end

@inline function __uncompiled_reverse_diff(
objective_function::F, data, ts::Lux.Experimental.TrainState) where {F}
# Uncompiled ReverseDiff
@inline function __uncompiled_reverse_diff(objective_function::F, data,
ts::Lux.Experimental.TrainState{<:TrainingBackendCache{:ReverseDiff}}) where {F}
tape = ReverseDiff.InstructionTape()
grads = Lux.recursive_make_zero(ts.parameters)
ps_tracked = Lux.recursive_map(
Lux.__Fix3(ReverseDiff.TrackedArray, tape), ts.parameters, grads)
Lux.__Fix3(ReverseDiff.TrackedArray, tape), ts.parameters, ts.cache.dparameters)

loss, st, stats = objective_function(ts.model, ps_tracked, ts.states, data)
loss.deriv = true
ReverseDiff.reverse_pass!(tape)

@set! ts.states = st
return grads, ReverseDiff.value(loss), stats, ts

ts_new = Lux.Experimental.TrainState(
TrainingBackendCache{:ReverseDiff, false}(
ts.cache.dparameters, objective_function, nothing),
objective_function,
ts.model,
ts.parameters,
st,
ts.optimizer_state,
ts.step)

return ts.cache.dparameters, ReverseDiff.value(loss), stats, ts_new

end

# First call, nothing is cached
@inline function __uncompiled_reverse_diff(
objective_function::F, data, ts::Lux.Experimental.TrainState) where {F}
grads = Lux.recursive_make_zero(ts.parameters)
ts_new = Lux.Experimental.TrainState(
TrainingBackendCache{:ReverseDiff, true}(grads, objective_function, nothing),
objective_function, ts.model, ts.parameters,
ts.states, ts.optimizer_state, ts.step)
return __uncompiled_reverse_diff(objective_function, data, ts_new)

end

# Compiled ReverseDiff
@inline function __compiled_reverse_diff(
objective_function::F, data, ts::Lux.Experimental.TrainState) where {F}
# tape = ReverseDiff.InstructionTape()
# grads = Lux.recursive_make_zero(ts.parameters)
# ps_tracked = Lux.recursive_map(
# Lux.__Fix3(ReverseDiff.TrackedArray, tape), ts.parameters, grads)
# loss, st, stats = objective_function(ts.model, ps_tracked, ts.states, data)
# loss.deriv = true
# ReverseDiff.reverse_pass!(tape)
# @set! ts.states = st
# return grads, ReverseDiff.value(loss), stats, ts
error("Not implemented yet")

end
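The key trick in the uncompiled path above is that the cached `dparameters` buffer is passed straight in as the derivative storage of each `ReverseDiff.TrackedArray`, so the reverse pass writes gradients into the cache with no separate extraction step. A standalone sketch of that pattern, using plain ReverseDiff without any of the Lux helpers:

```julia
using ReverseDiff

p    = rand(Float32, 3)
dp   = zeros(Float32, 3)                      # stands in for ts.cache.dparameters
tape = ReverseDiff.InstructionTape()
tp   = ReverseDiff.TrackedArray(p, dp, tape)  # derivative storage aliased to dp

loss = sum(tp .^ 2)     # recorded on `tape`
loss.deriv = true       # seed the output, as in the extension above
ReverseDiff.reverse_pass!(tape)

dp                      # ≈ 2 .* p, written in place by the reverse pass
```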
72 changes: 43 additions & 29 deletions ext/LuxTrackerExt.jl
@@ -3,49 +3,61 @@ module LuxTrackerExt
using ADTypes: AutoTracker
using ArrayInterface: ArrayInterface
using ChainRulesCore: ChainRulesCore
using FastClosures: @closure
using Functors: fmap
using Lux: Lux, LuxCPUDevice
using Lux.Experimental: TrainingBackendCache
using LuxCore: LuxCore
using Setfield: @set!
using Tracker: Tracker, TrackedArray, @grad_from_chainrules

const CRC = ChainRulesCore

# Type Piracy: Need to upstream
Tracker.param(nt::NamedTuple{F}) where {F} = NamedTuple{F}(Tracker.param.(values(nt)))
Tracker.param(t::Tuple) = map(Tracker.param, t)
Tracker.param(l::LuxCore.AbstractExplicitLayer) = l

Tracker.zero_grad!(nt::NamedTuple) = Tracker.zero_grad!.(values(nt))
Tracker.zero_grad!(::LuxCore.AbstractExplicitLayer) = nothing

function Tracker.extract_grad!(nt::NamedTuple{F}) where {F}
return NamedTuple{F}(Tracker.extract_grad!.(values(nt)))
end
Tracker.extract_grad!(t::Tuple) = map(Tracker.extract_grad!, t)
Tracker.extract_grad!(::LuxCore.AbstractExplicitLayer) = nothing

Tracker.data(nt::NamedTuple) = fmap(Tracker.data, nt)
Tracker.data(t::Tuple) = map(Tracker.data, t)
Tracker.data(l::LuxCore.AbstractExplicitLayer) = l

# Weight Norm Patch
@inline Lux._norm(x::TrackedArray; dims=Colon()) = sqrt.(sum(abs2.(x); dims))

# multigate chain rules
@inline Lux._gate(x::Tracker.TrackedVector, h::Int, n::Int) = x[Lux._gate(h, n)]
@inline Lux._gate(x::Tracker.TrackedMatrix, h::Int, n::Int) = x[Lux._gate(h, n), :]

function __construct_tracked_params(ps, dps)
map_fn = (p, dp) -> Tracker.TrackedArray(Tracker.Call(), p, dp)
return Lux.recursive_map(map_fn, ps, dps)

end

# Lux.Training
function Lux.Experimental.compute_gradients(::AutoTracker, objective_function::F, data,
ts::Lux.Experimental.TrainState) where {F}
ps_tracked = fmap(Tracker.param, ts.parameters)
## Use the cached gradient parameters
function Lux.Experimental.compute_gradients(::AutoTracker,
objective_function::F,
data,
ts::Lux.Experimental.TrainState{<:TrainingBackendCache{:Tracker, FT}}) where {F, FT}
dparams = FT ? ts.cache.dparameters : Lux.recursive_make_zero!!(ts.cache.dparameters)
ps_tracked = __construct_tracked_params(ts.parameters, dparams)

loss, st, stats = objective_function(ts.model, ps_tracked, ts.states, data)
Tracker.back!(loss)
@set! ts.states = st
grads = fmap(Tracker.grad, ps_tracked)
return grads, Tracker.value(loss), stats, ts

ts_new = Lux.Experimental.TrainState(
TrainingBackendCache{:Tracker, false}(
ts.cache.dparameters, objective_function, nothing),
objective_function,
ts.model,
ts.parameters,
st,
ts.optimizer_state,
ts.step)

return dparams, Tracker.value(loss), stats, ts_new

end

## First call, nothing is cached
function Lux.Experimental.compute_gradients(::AutoTracker, objective_function::F, data,
ts::Lux.Experimental.TrainState) where {F}
grads = Lux.recursive_make_zero(ts.parameters)
ts_new = Lux.Experimental.TrainState(
TrainingBackendCache{:Tracker, true}(grads, objective_function, nothing),
objective_function, ts.model, ts.parameters,
ts.states, ts.optimizer_state, ts.step)
return Lux.Experimental.compute_gradients(
AutoTracker(), objective_function, data, ts_new)
end

# AoS to SoA conversion
@@ -77,9 +89,11 @@ Tracker.@grad function Lux.__apply_simple_chain(layer, x, ps, ::LuxCPUDevice)
As such please test your model with FiniteDifferences or Zygote before using \
`Tracker.jl` for your model." maxlog=1
y, pb_f = CRC.rrule(layer, Tracker.data(x), Tracker.data(ps))
__∇apply_simple_chain = @closure Δ -> begin
_, ∂x, ∂ps = pb_f(convert(Array, Tracker.data(Δ)))
return Tracker.nobacksies(:__apply_simple_chain, (nothing, ∂x, ∂ps, nothing))
__∇apply_simple_chain = let pb_f = pb_f
Δ -> begin
_, ∂x, ∂ps = pb_f(convert(Array, Tracker.data(Δ)))
return Tracker.nobacksies(:__apply_simple_chain, (nothing, ∂x, ∂ps, nothing))

end
end
# Tracker is not great at handling arbitrary types, so we convert to Array
return Array(y), __∇apply_simple_chain
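`__construct_tracked_params` above plays the same role for Tracker: `Tracker.TrackedArray(Tracker.Call(), p, dp)` makes the cached `dp` the gradient slot of the tracked parameter, so `Tracker.back!` accumulates into `ts.cache.dparameters` directly and the old `fmap(Tracker.grad, ps_tracked)` extraction pass disappears. A minimal standalone sketch of the aliasing:

```julia
using Tracker

p  = rand(Float32, 3)
dp = zeros(Float32, 3)                            # stands in for ts.cache.dparameters
tp = Tracker.TrackedArray(Tracker.Call(), p, dp)  # gradient storage aliased to dp

Tracker.back!(sum(tp .^ 2))

dp  # ≈ 2 .* p, accumulated in place by the backward pass
```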
16 changes: 15 additions & 1 deletion src/contrib/training.jl
@@ -29,14 +29,28 @@ Internal fields:
step::Int
end

@concrete struct TrainingBackendCache{backend, first_try}
dparameters
objective_function
extras
end

@inline __backend(::TrainingBackendCache{backend}) where {backend} = backend

function Base.show(io::IO, ts::TrainState)
println(io, "TrainState")
println(io, " model: ", ts.model)
println(io, " # of parameters: ", Lux.parameterlength(ts.parameters))
println(io, " # of states: ", Lux.statelength(ts.states))
println(io, " optimizer_state: ", ts.optimizer_state)
print(io, " step: ", ts.step)
ts.cache !== nothing && print(io, "\n cache: ", nameof(typeof(ts.cache)))
if ts.cache !== nothing
if ts.cache isa TrainingBackendCache
print(io, "\n cache: $(nameof(typeof(ts.cache))){$(__backend(ts.cache))}")
else
print(io, "\n cache: $(nameof(typeof(ts.cache)))")
end
end
ts.objective_function !== nothing &&
print(io, "\n objective_function: ", nameof(typeof(ts.objective_function)))
end
4 changes: 3 additions & 1 deletion src/utils.jl
@@ -331,7 +331,9 @@ end
end
@inline function __fused_agg(::typeof(sum), lfn::LossFunctions.Traits.Loss, x, y)
fast_scalar_indexing(x) && fast_scalar_indexing(y) && return sum(lfn, x, y)
return mapreduce(Broadcast.BroadcastFunction(lfn), +, x, y)
# mapreduce(Broadcast.BroadcastFunction(lfn), +, x, y) leads to slowdowns, better to
# allocate a new array
return sum(lfn.(x, y))
end

@inline __fused_agg(::Nothing, op::OP, args...) where {OP} = op.(args...)
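On the `__fused_agg` change just above: both formulations reduce the elementwise loss; the new one simply materializes the broadcast before summing, which sidesteps the slow `mapreduce` over a `BroadcastFunction` on arrays without fast scalar indexing. A quick CPU sanity sketch — the `L2DistLoss` choice is illustrative, not taken from this diff:

```julia
using LossFunctions

lfn  = L2DistLoss()
x, y = rand(Float32, 1024), rand(Float32, 1024)

old = mapreduce(Broadcast.BroadcastFunction(lfn), +, x, y)  # previous formulation
new = sum(lfn.(x, y))                                       # allocate the broadcast, then sum

old ≈ new  # true
```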
