Compiled ReverseDiff for training on CPU #722

Merged · 6 commits · Jun 23, 2024
Changes from 4 commits
1 change: 1 addition & 0 deletions docs/src/api/Lux/utilities.md
@@ -59,6 +59,7 @@ Lux.xlogx
## Recursive Operations

```@docs
Lux.recursive_map
Lux.recursive_add!!
Lux.recursive_eltype
Lux.recursive_make_zero
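The newly documented `Lux.recursive_map` is what the ReverseDiff and Tracker backends below use to walk nested parameter trees. A minimal sketch of the intended semantics, assuming only the recursive utilities listed above (the NamedTuple layout is illustrative, not taken from this PR):

```julia
using Lux

# Nested parameters in the usual Lux NamedTuple layout (illustrative).
ps = (; dense = (; weight = rand(Float32, 2, 3), bias = zeros(Float32, 2)))

dps = Lux.recursive_make_zero(ps)      # same structure, zeroed leaves
Lux.recursive_eltype(ps)               # Float32

# Apply a function leaf-wise across matching trees, as the training backends do
# when they pair each parameter array with its gradient buffer.
summed = Lux.recursive_map((p, dp) -> p .+ dp, ps, dps)
```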
71 changes: 32 additions & 39 deletions ext/LuxEnzymeExt.jl
@@ -1,75 +1,68 @@
module LuxEnzymeExt

using ADTypes: AutoEnzyme
using ConcreteStructs: @concrete
using Enzyme: Enzyme, Active, Const, Duplicated
using Lux: Lux
using Lux.Experimental: TrainingBackendCache, TrainState

@concrete struct CachedEnzymeExtras{FT}
dparameters
objective_function
st_wrap
stats_wrap
end

# Case I: We have CachedEnzymeExtras and objective_function is unchanged.
function Lux.Experimental.compute_gradients(::AutoEnzyme, objective_function::F, data,
ts::Lux.Experimental.TrainState{<:CachedEnzymeExtras{FT}, F}) where {F, FT}
dps = Lux.recursive_make_zero!!(ts.cache.dparameters)
# Case I: We have TrainingBackendCache{:Enzyme} and obj_fn is unchanged.
function Lux.Experimental.compute_gradients(::AutoEnzyme, obj_fn::F, data,
ts::TrainState{<:TrainingBackendCache{:Enzyme, FT}, F}) where {F, FT}
dps = FT ? ts.cache.dparameters : Lux.recursive_make_zero!!(ts.cache.dparameters)

_, loss = Enzyme.autodiff(
Enzyme.ReverseWithPrimal, ts.cache.objective_function, Active, Const(ts.model),
Enzyme.ReverseWithPrimal, ts.cache.extras.obj_fn, Active, Const(ts.model),
Duplicated(ts.parameters, dps), Const(ts.states), Const(data))

ts_new = __construct_new_trainstate(
ts.cache.st_wrap[], ts.states, ts, objective_function, dps,
ts.cache.objective_function, ts.cache.st_wrap, ts.cache.stats_wrap)
ts.cache.extras.st_wrap[], ts.states, ts, obj_fn, dps,
ts.cache.extras.obj_fn, ts.cache.extras.st_wrap, ts.cache.extras.stats_wrap)

return dps, loss, ts.cache.stats_wrap[], ts_new
return dps, loss, ts.cache.extras.stats_wrap[], ts_new
end

# Case II: We have CachedEnzymeExtras and objective_function is changed.
function Lux.Experimental.compute_gradients(::AutoEnzyme, objective_function::F, data,
ts::Lux.Experimental.TrainState{<:CachedEnzymeExtras{FT}}) where {F, FT}
dps = Lux.recursive_make_zero!!(ts.cache.dparameters)
# Case II: We have TrainingBackendCache{:Enzyme} and obj_fn is changed.
function Lux.Experimental.compute_gradients(::AutoEnzyme, obj_fn::F, data,
ts::TrainState{<:TrainingBackendCache{:Enzyme, FT}}) where {F, FT}
dps = FT ? ts.cache.dparameters : Lux.recursive_make_zero!!(ts.cache.dparameters)

obj_fn, st_wrap, stats_wrap = Lux.Experimental.__wrap_objective_function(
objective_function, ts.model, ts.parameters, ts.states, data, Val(FT))
obj_fn_wrap, st_wrap, stats_wrap = Lux.Experimental.__wrap_objective_function(
obj_fn, ts.model, ts.parameters, ts.states, data, Val(FT))

_, loss = Enzyme.autodiff(Enzyme.ReverseWithPrimal, obj_fn, Active, Const(ts.model),
_, loss = Enzyme.autodiff(
Enzyme.ReverseWithPrimal, obj_fn_wrap, Active, Const(ts.model),
Duplicated(ts.parameters, dps), Const(ts.states), Const(data))

ts_new = __construct_new_trainstate(
st_wrap[], ts.states, ts, objective_function, dps, obj_fn, st_wrap, stats_wrap)
st_wrap[], ts.states, ts, obj_fn, dps, obj_fn_wrap, st_wrap, stats_wrap)

return dps, loss, stats_wrap[], ts_new
end

# Case III: Nothing is cached. First call to `compute_gradients`
function Lux.Experimental.compute_gradients(ad::AutoEnzyme, objective_function::F, data,
ts::Lux.Experimental.TrainState) where {F}
function Lux.Experimental.compute_gradients(
ad::AutoEnzyme, obj_fn::F, data, ts::TrainState) where {F}
dps = Lux.recursive_make_zero(ts.parameters)
cache = CachedEnzymeExtras{true}(dps, nothing, nothing, nothing)
ts_new = Lux.Experimental.TrainState(
cache = TrainingBackendCache{:Enzyme, true}(
dps, (; obj_fn=nothing, st_wrap=nothing, stats_wrap=nothing))
ts_new = TrainState(
cache, nothing, ts.model, ts.parameters, ts.states, ts.optimizer_state, ts.step)
return Lux.Experimental.compute_gradients(ad, objective_function, data, ts_new)
return Lux.Experimental.compute_gradients(ad, obj_fn, data, ts_new)
end

# If `st_new` is of a new type, we will have to recompute the cache anyway. Force it by not
# storing the objective function.
function __construct_new_trainstate(
st_new::S, ::S, ts::Lux.Experimental.TrainState, objective_fn::O,
dps, obj_fn::O2, st_wrap, stats_wrap) where {S, O, O2}
cache = CachedEnzymeExtras{false}(dps, obj_fn, st_wrap, stats_wrap)
return Lux.Experimental.TrainState(
function __construct_new_trainstate(st_new::S, ::S, ts::TrainState, objective_fn::O, dps,
obj_fn::O2, st_wrap, stats_wrap) where {S, O, O2}
cache = TrainingBackendCache{:Enzyme, false}(dps, (; obj_fn, st_wrap, stats_wrap))
return TrainState(
cache, objective_fn, ts.model, ts.parameters, st_new, ts.optimizer_state, ts.step)
end

function __construct_new_trainstate(
st_new, _, ts::Lux.Experimental.TrainState, objective_fn::O,
dps, obj_fn::O2, st_wrap, stats_wrap) where {O, O2}
cache = CachedEnzymeExtras{false}(dps, nothing, nothing, nothing)
return Lux.Experimental.TrainState(
function __construct_new_trainstate(st_new, _, ts::TrainState, objective_fn::O, dps,
obj_fn::O2, st_wrap, stats_wrap) where {O, O2}
cache = TrainingBackendCache{:Enzyme, false}(dps, (; obj_fn, st_wrap, stats_wrap))
return TrainState(
cache, nothing, ts.model, ts.parameters, st_new, ts.optimizer_state, ts.step)
end

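The three dispatches above implement a simple protocol: the first `compute_gradients` call builds a `TrainingBackendCache{:Enzyme}` with zeroed shadow parameters (Case III), and every later call with the same objective function reuses that cache (Case I), only re-zeroing `dparameters` when needed. A hedged sketch of the calling pattern; the `TrainState` constructor, `Dense`, and `apply_gradients` usage are assumed from the Lux.Experimental training API of this period rather than taken from this diff:

```julia
using ADTypes: AutoEnzyme
using Lux, Optimisers, Random

model = Dense(4 => 2)
ts = Lux.Experimental.TrainState(Random.default_rng(), model, Adam(0.01f0))

function loss_fn(model, ps, st, (x, y))
    ŷ, st_ = model(x, ps, st)
    return sum(abs2, ŷ .- y), st_, (;)   # (loss, states, stats), as the backends expect
end

x, y = rand(Float32, 4, 8), rand(Float32, 2, 8)

# First call: Case III, the Enzyme cache is created.
grads, loss, _, ts = Lux.Experimental.compute_gradients(AutoEnzyme(), loss_fn, (x, y), ts)
ts = Lux.Experimental.apply_gradients(ts, grads)

# Same objective again: Case I, the cached wrapper and shadow parameters are reused.
grads, loss, _, ts = Lux.Experimental.compute_gradients(AutoEnzyme(), loss_fn, (x, y), ts)
```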
53 changes: 0 additions & 53 deletions ext/LuxReverseDiffExt.jl

This file was deleted.

28 changes: 28 additions & 0 deletions ext/LuxReverseDiffExt/LuxReverseDiffExt.jl
@@ -0,0 +1,28 @@
module LuxReverseDiffExt

using ADTypes: ADTypes, AutoReverseDiff
using ArrayInterface: ArrayInterface
using Lux: Lux, LuxCPUDevice
using Lux.Experimental: TrainingBackendCache, TrainState
using ReverseDiff: ReverseDiff, TrackedArray, @grad_from_chainrules

# AoS to SoA conversion
function Lux.apply(
m::Lux.AbstractExplicitLayer, x::AbstractArray{<:ReverseDiff.TrackedReal}, ps, st)
@warn "Lux.apply(m::Lux.AbstractExplicitLayer, \
x::AbstractArray{<:ReverseDiff.TrackedReal}, ps, st) input was corrected to \
Lux.apply(m::Lux.AbstractExplicitLayer, x::ReverseDiff.TrackedArray, ps, \
st).\n\n\
1. If this was not the desired behavior overload the dispatch on `m`.\n\n\
2. This might have performance implications. Check which layer was causing this \
problem using `Lux.Experimental.@debug_mode`." maxlog=1
return Lux.apply(m, reshape(ArrayInterface.aos_to_soa(x), size(x)), ps, st)
end

## Prevent an infinite loop
Lux.apply(m::Lux.AbstractExplicitLayer, x::TrackedArray, ps, st) = m(x, ps, st)

include("rules.jl")
include("training.jl")

end
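The `Lux.apply` overload above exists because indexing or scalar-broadcasting a ReverseDiff `TrackedArray` produces an `Array` of `TrackedReal`s (array-of-structs), which most layers handle slowly; `aos_to_soa` repacks it into a single tracked array, and the fallback reshapes it back to the original size. A hedged illustration (the exact return type depends on ArrayInterface's ReverseDiff extension):

```julia
using ArrayInterface, ReverseDiff

x = ReverseDiff.track(rand(Float32, 3, 2))                 # TrackedArray
x_aos = [x[i, j] for i in 1:3, j in 1:2]                   # Array{<:TrackedReal}: the slow "AoS" form
x_soa = reshape(ArrayInterface.aos_to_soa(x_aos), size(x_aos))  # repacked "SoA" form, as in the fallback above
```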
14 changes: 14 additions & 0 deletions ext/LuxReverseDiffExt/rules.jl
@@ -0,0 +1,14 @@
# SimpleChains.jl
@grad_from_chainrules Lux.__apply_simple_chain(layer, x::TrackedArray, ps, ::LuxCPUDevice)
@grad_from_chainrules Lux.__apply_simple_chain(layer, x, ps::TrackedArray, ::LuxCPUDevice)
@grad_from_chainrules Lux.__apply_simple_chain(
layer, x::TrackedArray, ps::TrackedArray, ::LuxCPUDevice)

# DynamicExpressions.jl
@grad_from_chainrules Lux.__apply_dynamic_expression(de::Lux.DynamicExpressionsLayer, expr,
operator_enum, x::TrackedArray, ps, ::LuxCPUDevice)
@grad_from_chainrules Lux.__apply_dynamic_expression(de::Lux.DynamicExpressionsLayer, expr,
operator_enum, x, ps::TrackedArray, ::LuxCPUDevice)
@grad_from_chainrules Lux.__apply_dynamic_expression(
de::Lux.DynamicExpressionsLayer, expr, operator_enum,
x::TrackedArray, ps::TrackedArray, ::LuxCPUDevice)
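These `@grad_from_chainrules` declarations tell ReverseDiff not to trace through `__apply_simple_chain` / `__apply_dynamic_expression`, but to reuse the ChainRules `rrule`s Lux already defines for them. A minimal sketch of the same pattern with a hypothetical function `my_scale` (not part of Lux), to show what the macro does:

```julia
using ChainRulesCore, ReverseDiff
using ReverseDiff: TrackedArray, @grad_from_chainrules

my_scale(a, x) = a .* x

function ChainRulesCore.rrule(::typeof(my_scale), a::Real, x::AbstractArray)
    y = my_scale(a, x)
    pullback(ȳ) = (NoTangent(), sum(ȳ .* x), a .* ȳ)
    return y, pullback
end

# Route the tracked-argument method through the rrule instead of tracing `.*`.
@grad_from_chainrules my_scale(a::Real, x::TrackedArray)

ReverseDiff.gradient(x -> sum(my_scale(2.0, x)), rand(3))   # == fill(2.0, 3)
```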
55 changes: 55 additions & 0 deletions ext/LuxReverseDiffExt/training.jl
@@ -0,0 +1,55 @@
@static if pkgversion(ADTypes) < v"1.5"
# older versions did not have `compile` type parameter. Use slower type-unstable code
function Lux.Experimental.compute_gradients(
ad::AutoReverseDiff, obj_fn::F, data, ts::TrainState) where {F}
ad.compile && return __compiled_reverse_diff(obj_fn, data, ts)
return __uncompiled_reverse_diff(obj_fn, data, ts)
end
else
for compiled in (false, true)
fname = compiled ? :__compiled_reverse_diff : :__uncompiled_reverse_diff
@eval function Lux.Experimental.compute_gradients(
::AutoReverseDiff{$(compiled)}, obj_fn::F, data, ts::TrainState) where {F}
return $(fname)(obj_fn, data, ts)
end
end
end

# Uncompiled ReverseDiff
@inline function __uncompiled_reverse_diff(obj_fn::F, data, ts::TrainState) where {F}
grads = Lux.recursive_make_zero(ts.parameters)
ts_new = TrainState(TrainingBackendCache{:ReverseDiff, true}(grads, nothing),
obj_fn, ts.model, ts.parameters, ts.states, ts.optimizer_state, ts.step)
return __uncompiled_reverse_diff(obj_fn, data, ts_new)
end

@inline function __uncompiled_reverse_diff(obj_fn::F, data,
ts::TrainState{<:TrainingBackendCache{:ReverseDiff, FT}}) where {F, FT}
tape = ReverseDiff.InstructionTape()
dparams = FT ? ts.cache.dparameters : Lux.recursive_make_zero!!(ts.cache.dparameters)
ps_tracked = Lux.recursive_map(
Lux.__Fix3(ReverseDiff.TrackedArray, tape), ts.parameters, dparams)

loss, st, stats = obj_fn(ts.model, ps_tracked, ts.states, data)
loss.deriv = true
ReverseDiff.reverse_pass!(tape)

ts_new = TrainState(
TrainingBackendCache{:ReverseDiff, false}(ts.cache.dparameters, nothing),
obj_fn, ts.model, ts.parameters, st, ts.optimizer_state, ts.step)

return ts.cache.dparameters, ReverseDiff.value(loss), stats, ts_new
end

# Compiled ReverseDiff
@inline function __compiled_reverse_diff(obj_fn::F, data, ts::TrainState) where {F}
grads = Lux.recursive_make_zero(ts.parameters)
ts_new = TrainState(TrainingBackendCache{:ReverseDiff, true}(grads, nothing),
obj_fn, ts.model, ts.parameters, ts.states, ts.optimizer_state, ts.step)
return __compiled_reverse_diff(obj_fn, data, ts_new)
end

@inline function __compiled_reverse_diff(obj_fn::F, data,
ts::TrainState{<:TrainingBackendCache{:ReverseDiff, FT}}) where {F, FT}
error(1)
end
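With ADTypes ≥ 1.5 the `compile` flag is a type parameter of `AutoReverseDiff`, so the generated methods above pick the compiled or uncompiled path purely by dispatch; on older ADTypes the runtime branch on `ad.compile` is kept. Note that at this point in the PR (changes from 4 commits) `__compiled_reverse_diff` with a populated cache is still a stub (`error(1)`). A hedged usage sketch, with the `TrainState` setup assumed as in the Enzyme example earlier:

```julia
using ADTypes: AutoReverseDiff
using Lux, Optimisers, Random

model = Dense(4 => 2)
ts = Lux.Experimental.TrainState(Random.default_rng(), model, Adam(0.01f0))

function loss_fn(model, ps, st, (x, y))
    ŷ, st_ = model(x, ps, st)
    return sum(abs2, ŷ .- y), st_, (;)
end

data = (rand(Float32, 4, 8), rand(Float32, 2, 8))

# Uncompiled tape: builds the :ReverseDiff cache on the first call and reuses it afterwards.
grads, loss, _, ts = Lux.Experimental.compute_gradients(AutoReverseDiff(), loss_fn, data, ts)

# Compiled tape, selected by the type parameter; left commented out because the
# compiled branch is not implemented yet at this commit.
# Lux.Experimental.compute_gradients(AutoReverseDiff(; compile=true), loss_fn, data, ts)
```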
62 changes: 31 additions & 31 deletions ext/LuxTrackerExt.jl
@@ -3,49 +3,47 @@
using ADTypes: AutoTracker
using ArrayInterface: ArrayInterface
using ChainRulesCore: ChainRulesCore
using FastClosures: @closure
using Functors: fmap
using Lux: Lux, LuxCPUDevice
using Lux.Experimental: TrainingBackendCache, TrainState
using LuxCore: LuxCore
using Setfield: @set!
using Tracker: Tracker, TrackedArray, @grad_from_chainrules

const CRC = ChainRulesCore

# Type Piracy: Need to upstream
Tracker.param(nt::NamedTuple{F}) where {F} = NamedTuple{F}(Tracker.param.(values(nt)))
Tracker.param(t::Tuple) = map(Tracker.param, t)
Tracker.param(l::LuxCore.AbstractExplicitLayer) = l

Tracker.zero_grad!(nt::NamedTuple) = Tracker.zero_grad!.(values(nt))
Tracker.zero_grad!(::LuxCore.AbstractExplicitLayer) = nothing

function Tracker.extract_grad!(nt::NamedTuple{F}) where {F}
return NamedTuple{F}(Tracker.extract_grad!.(values(nt)))
end
Tracker.extract_grad!(t::Tuple) = map(Tracker.extract_grad!, t)
Tracker.extract_grad!(::LuxCore.AbstractExplicitLayer) = nothing

Tracker.data(nt::NamedTuple) = fmap(Tracker.data, nt)
Tracker.data(t::Tuple) = map(Tracker.data, t)
Tracker.data(l::LuxCore.AbstractExplicitLayer) = l

# Weight Norm Patch
@inline Lux._norm(x::TrackedArray; dims=Colon()) = sqrt.(sum(abs2.(x); dims))

# multigate chain rules
@inline Lux._gate(x::Tracker.TrackedVector, h::Int, n::Int) = x[Lux._gate(h, n)]
@inline Lux._gate(x::Tracker.TrackedMatrix, h::Int, n::Int) = x[Lux._gate(h, n), :]

function __construct_tracked_params(ps, dps)
map_fn = (p, dp) -> Tracker.TrackedArray(Tracker.Call(), p, dp)
return Lux.recursive_map(map_fn, ps, dps)
end

# Lux.Training
function Lux.Experimental.compute_gradients(::AutoTracker, objective_function::F, data,
ts::Lux.Experimental.TrainState) where {F}
ps_tracked = fmap(Tracker.param, ts.parameters)
loss, st, stats = objective_function(ts.model, ps_tracked, ts.states, data)
function Lux.Experimental.compute_gradients(::AutoTracker, obj_fn::F, data,
ts::TrainState{<:TrainingBackendCache{:Tracker, FT}}) where {F, FT}
dparams = FT ? ts.cache.dparameters : Lux.recursive_make_zero!!(ts.cache.dparameters)
ps_tracked = __construct_tracked_params(ts.parameters, dparams)

loss, st, stats = obj_fn(ts.model, ps_tracked, ts.states, data)
Tracker.back!(loss)
@set! ts.states = st
grads = fmap(Tracker.grad, ps_tracked)
return grads, Tracker.value(loss), stats, ts

ts_new = TrainState(
TrainingBackendCache{:Tracker, false}(ts.cache.dparameters, nothing),
obj_fn, ts.model, ts.parameters, st, ts.optimizer_state, ts.step)

return dparams, Tracker.value(loss), stats, ts_new
end

function Lux.Experimental.compute_gradients(
::AutoTracker, obj_fn::F, data, ts::TrainState) where {F}
grads = Lux.recursive_make_zero(ts.parameters)
ts_new = TrainState(TrainingBackendCache{:Tracker, true}(grads, nothing), obj_fn,
ts.model, ts.parameters, ts.states, ts.optimizer_state, ts.step)
return Lux.Experimental.compute_gradients(AutoTracker(), obj_fn, data, ts_new)
end

# AoS to SoA conversion
@@ -77,9 +75,11 @@
As such please test your model with FiniteDifferences or Zygote before using \
`Tracker.jl` for your model." maxlog=1
y, pb_f = CRC.rrule(layer, Tracker.data(x), Tracker.data(ps))
__∇apply_simple_chain = @closure Δ -> begin
_, ∂x, ∂ps = pb_f(convert(Array, Tracker.data(Δ)))
return Tracker.nobacksies(:__apply_simple_chain, (nothing, ∂x, ∂ps, nothing))
__∇apply_simple_chain = let pb_f = pb_f
Δ -> begin
_, ∂x, ∂ps = pb_f(convert(Array, Tracker.data(Δ)))
return Tracker.nobacksies(:__apply_simple_chain, (nothing, ∂x, ∂ps, nothing))
end
end
# Tracker is not great at handling arbitrary types, so we convert to Array
return Array(y), __∇apply_simple_chain
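The `__construct_tracked_params` change earlier in this file is the core of the Tracker rework: each parameter array is wrapped as a `TrackedArray` around a preallocated gradient buffer, so `Tracker.back!` accumulates directly into the cached `dparameters` and the old `fmap(Tracker.grad, ps_tracked)` extraction step disappears. A hedged, standalone illustration of that mechanism (values are illustrative; the constructor is the same one the diff uses):

```julia
using Tracker

p  = rand(Float32, 3)
dp = zeros(Float32, 3)                       # preallocated gradient buffer (the cached dparameters)
pt = Tracker.TrackedArray(Tracker.Call(), p, dp)

loss = sum(abs2.(pt))
Tracker.back!(loss)

dp                                           # ≈ 2 .* p, written in place by back!
```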