refactor: make LossFunctions an optional dep #976
Merged · 3 commits · Oct 9, 2024
5 changes: 3 additions & 2 deletions Project.toml
@@ -1,7 +1,7 @@
name = "Lux"
uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
-version = "1.1.0"
+version = "1.2.0-DEV"

[deps]
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -18,7 +18,6 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
@@ -43,6 +42,7 @@ ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
FunctionWrappers = "069b7b12-0de2-55c6-9aab-29f3d0a68a2e"
+LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
NCCL = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
@@ -55,6 +55,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
LuxComponentArraysExt = "ComponentArrays"
LuxEnzymeExt = "Enzyme"
LuxFluxExt = "Flux"
+LuxLossFunctionsExt = "LossFunctions"
LuxMLUtilsExt = "MLUtils"
LuxMPIExt = "MPI"
LuxMPINCCLExt = ["CUDA", "MPI", "NCCL"]
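Note: moving LossFunctions from `[deps]` to `[weakdeps]`, with the matching `[extensions]` entry, means Lux no longer loads LossFunctions eagerly; on Julia ≥ 1.9 the extension module is loaded automatically once both packages are in the session. A minimal sketch of the observable behavior (the `Base.get_extension` query is just for illustration):

```julia
using Lux                       # no longer pulls in LossFunctions
Base.get_extension(Lux, :LuxLossFunctionsExt) === nothing   # true: ext not loaded yet

using LossFunctions             # triggers loading of LuxLossFunctionsExt
Base.get_extension(Lux, :LuxLossFunctionsExt) isa Module    # true
```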
71 changes: 71 additions & 0 deletions ext/LuxLossFunctionsExt.jl
@@ -0,0 +1,71 @@
module LuxLossFunctionsExt

using ArrayInterface: fast_scalar_indexing
using ChainRulesCore: ChainRulesCore, NoTangent, @thunk
using EnzymeCore: EnzymeCore, EnzymeRules
using FastClosures: @closure
using LossFunctions: LossFunctions
using Statistics: mean

using Lux: Lux, LossFunctionImpl

const CRC = ChainRulesCore

function LossFunctionImpl.fused_agg(
        ::typeof(mean), lfn::LossFunctions.Traits.Loss, x::AbstractArray, y::AbstractArray)
    return LossFunctionImpl.fused_agg(sum, lfn, x, y) / length(x)
end

function LossFunctionImpl.fused_agg(
        ::typeof(sum), lfn::LossFunctions.Traits.Loss, x::Number, y::Number)
    return lfn(x, y)
end
function LossFunctionImpl.fused_agg(
        ::typeof(sum), lfn::LossFunctions.Traits.Loss, x::AbstractArray, y::AbstractArray)
    fast_scalar_indexing(x) && fast_scalar_indexing(y) && return sum(lfn, x, y)
    return sum(lfn.(x, y))
end

function CRC.rrule(
        ::CRC.RuleConfig{>:CRC.HasReverseMode},
        ::typeof(LossFunctionImpl.fused_agg), ::typeof(sum),
        lfn::LossFunctions.Traits.Loss, x, y)
    ∇fused_agg = @closure Δ -> begin
        ∂x = @thunk LossFunctions.deriv.(Ref(lfn), x, y) .* Δ
        return NoTangent(), NoTangent(), NoTangent(), ∂x, NoTangent()
    end
    return LossFunctionImpl.fused_agg(sum, lfn, x, y), ∇fused_agg
end

function EnzymeRules.augmented_primal(
        cfg::EnzymeRules.RevConfigWidth{1},
        func::EnzymeCore.Const{typeof(LossFunctionImpl.fused_agg)},
        ::Type{<:EnzymeCore.Active}, agg_f::EnzymeCore.Const{typeof(sum)},
        lfn::EnzymeCore.Const{<:LossFunctions.Traits.Loss},
        x::EnzymeCore.Annotation{<:AbstractArray}, y::EnzymeCore.Const)
    primal = EnzymeRules.needs_primal(cfg) ? func.val(agg_f.val, lfn.val, x.val, y.val) :
             nothing

    cache_x = EnzymeRules.overwritten(cfg)[4] ? copy(x.val) : nothing
    cache_y = EnzymeRules.overwritten(cfg)[5] ? copy(y.val) : nothing

    return EnzymeRules.AugmentedReturn(primal, nothing, (cache_x, cache_y))
end

function EnzymeRules.reverse(
        cfg::EnzymeRules.RevConfigWidth{1},
        ::EnzymeCore.Const{typeof(LossFunctionImpl.fused_agg)},
        dret::EnzymeCore.Active, (cache_x, cache_y), agg_f::EnzymeCore.Const{typeof(sum)},
        lfn::EnzymeCore.Const{<:LossFunctions.Traits.Loss},
        x::EnzymeCore.Annotation{<:AbstractArray}, y::EnzymeCore.Const)
    EnzymeRules.overwritten(cfg)[4] || (cache_x = x.val)
    EnzymeRules.overwritten(cfg)[5] || (cache_y = y.val)

    if !(typeof(x) <: EnzymeCore.Const)
        @. x.dval = LossFunctions.deriv(lfn.val, cache_x, cache_y) * dret.val
    end

    return ntuple(Returns(nothing), 4)
end

end
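This extension restores the LossFunctions.jl fast paths: `fused_agg` dispatches on `LossFunctions.Traits.Loss`, with a matching ChainRules `rrule` and Enzyme forward/reverse rules that backpropagate through `LossFunctions.deriv`. A rough usage sketch (`LossFunctionImpl` is internal API, shown only for illustration):

```julia
using Lux, LossFunctions
ŷ, y = rand(Float32, 4), rand(Float32, 4)

# CPU arrays take the fused sum(lfn, x, y) fast path:
Lux.LossFunctionImpl.fused_agg(sum, L2DistLoss(), ŷ, y) ≈ sum(abs2, ŷ .- y)   # true
```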
1 change: 0 additions & 1 deletion src/Lux.jl
@@ -11,7 +11,6 @@ using ConcreteStructs: @concrete
using FastClosures: @closure
using Functors: Functors, fmap
using GPUArraysCore: @allowscalar
-using LossFunctions: LossFunctions
using Markdown: @doc_str
using NNlib: NNlib
using Optimisers: Optimisers
150 changes: 103 additions & 47 deletions src/helpers/losses.jl
@@ -6,11 +6,11 @@ module LossFunctionImpl

using ArrayInterface: fast_scalar_indexing
using ChainRulesCore: ChainRulesCore, NoTangent, @non_differentiable, @thunk
-using EnzymeCore: EnzymeCore, EnzymeRules
using FastClosures: @closure
-using LossFunctions: LossFunctions
+using ForwardDiff: ForwardDiff, Dual, Partials
using Statistics: mean

+using ..Utils: Utils
using ..LuxOps: xlogy

const CRC = ChainRulesCore
@@ -30,59 +30,66 @@ check_sizes(_, __) = nothing

# Aggregation. We are able to define custom aggregation fast paths
fused_agg(::typeof(mean), op::OP, x) where {OP} = fused_agg(sum, op, x) / length(x)
-function fused_agg(::typeof(mean), lfn::LossFunctions.Traits.Loss, x, y)
-    return fused_agg(sum, lfn, x, y) / length(x)
-end

fused_agg(::typeof(sum), op::OP, x::Number) where {OP} = op(x)
fused_agg(::typeof(sum), op::OP, x) where {OP} = sum(op, x)

-fused_agg(::typeof(sum), lfn::LossFunctions.Traits.Loss, x::Number, y::Number) = lfn(x, y)
-function fused_agg(::typeof(sum), lfn::LossFunctions.Traits.Loss, x, y)
-    fast_scalar_indexing(x) && fast_scalar_indexing(y) && return sum(lfn, x, y)
-    # mapreduce(Broadcast.BroadcastFunction(lfn), +, x, y) leads to slowdowns, better to
-    # allocate a new array
-    return sum(lfn.(x, y))
-end
-
-fused_agg(::Nothing, op::OP, args...) where {OP} = op.(args...)
-fused_agg(f::F, op::OP, args...) where {F, OP} = f(op.(args...))
-
-function CRC.rrule(::typeof(fused_agg), ::typeof(sum), lfn::LossFunctions.Traits.Loss, x, y)
-    ∇fused_agg = @closure Δ -> begin
-        ∂x = @thunk LossFunctions.deriv.(Ref(lfn), x, y) .* Δ
-        return NoTangent(), NoTangent(), NoTangent(), ∂x, NoTangent()
-    end
-    return fused_agg(sum, lfn, x, y), ∇fused_agg
-end
-
-function EnzymeRules.augmented_primal(
-        cfg::EnzymeRules.RevConfigWidth{1}, func::EnzymeCore.Const{typeof(fused_agg)},
-        ::Type{<:EnzymeCore.Active}, agg_f::EnzymeCore.Const{typeof(sum)},
-        lfn::EnzymeCore.Const{<:LossFunctions.Traits.Loss},
-        x::EnzymeCore.Annotation{<:AbstractArray}, y::EnzymeCore.Const)
-    primal = EnzymeRules.needs_primal(cfg) ? func.val(agg_f.val, lfn.val, x.val, y.val) :
-             nothing
-
-    cache_x = EnzymeRules.overwritten(cfg)[4] ? copy(x.val) : nothing
-    cache_y = EnzymeRules.overwritten(cfg)[5] ? copy(y.val) : nothing
-
-    return EnzymeRules.AugmentedReturn(primal, nothing, (cache_x, cache_y))
-end
-
-function EnzymeRules.reverse(
-        cfg::EnzymeRules.RevConfigWidth{1}, ::EnzymeCore.Const{typeof(fused_agg)},
-        dret::EnzymeCore.Active, (cache_x, cache_y), agg_f::EnzymeCore.Const{typeof(sum)},
-        lfn::EnzymeCore.Const{<:LossFunctions.Traits.Loss},
-        x::EnzymeCore.Annotation{<:AbstractArray}, y::EnzymeCore.Const)
-    EnzymeRules.overwritten(cfg)[4] || (cache_x = x.val)
-    EnzymeRules.overwritten(cfg)[5] || (cache_y = y.val)
-
-    if !(typeof(x) <: EnzymeCore.Const)
-        @. x.dval = LossFunctions.deriv(lfn.val, cache_x, cache_y) * dret.val
-    end
-
-    return ntuple(Returns(nothing), 4)
-end
+fused_agg(::typeof(mean), op::OP, x::Number, y::Number) where {OP} = op(x, y)
+function fused_agg(::typeof(mean), op::OP, x::AbstractArray, y::AbstractArray) where {OP}
+    return fused_agg(sum, op, x, y) / length(x)
+end
+
+fused_agg(::typeof(sum), op::OP, x::Number, y::Number) where {OP} = op(x, y)
+function fused_agg(::typeof(sum), op::OP, x::AbstractArray, y::AbstractArray) where {OP}
+    if fast_scalar_indexing(x) && fast_scalar_indexing(y)
+        res = Core.Compiler._return_type(op, Tuple{eltype(x), eltype(y)})(0)
+        @simd ivdep for i in eachindex(x, y)
+            @inbounds res += op(x[i], y[i])
+        end
+        return res
+    end
+    return fallback_fused_agg(sum, op, x, y)
+end
+
+fused_agg(::Nothing, op::OP, args...) where {OP} = op.(args...)
+fused_agg(f::F, op::OP, args...) where {F, OP} = fallback_fused_agg(f, op, args...)
+
+@inline fallback_fused_agg(f::F, op::OP, args...) where {F, OP} = f(op.(args...))
+
+function CRC.rrule(cfg::CRC.RuleConfig{>:CRC.HasReverseMode},
+        ::typeof(fused_agg), ::typeof(sum), op::OP, x, y) where {OP}
+    if has_custom_derivative(op)
+        res = fused_agg(sum, op, x, y)
+        ∇fused_agg_custom_derivative = Δ -> begin
+            ∂x = @thunk derivative.(Ref(op), x, y) .* Δ
+            return NoTangent(), NoTangent(), NoTangent(), ∂x, NoTangent()
+        end
+        return res, ∇fused_agg_custom_derivative
+    end
+
+    # Without custom derivatives use ForwardDiff for the looped implementation
+    if fast_scalar_indexing(x) && fast_scalar_indexing(y)
+        x_partials = similar(x)
+        T = eltype(x)
+        res = Core.Compiler._return_type(op, Tuple{T, eltype(y)})(0)
+        @inbounds @simd for i in eachindex(x_partials, x, y)
+            x_dual = Dual{Nothing, T, 1}(x[i], Partials{1, T}((one(T),)))
+            tmp = op(x_dual, y[i])
+            x_partials[i] = ForwardDiff.partials(tmp, 1)
+            res += ForwardDiff.value(tmp)
+        end
+        ∇fused_agg_loop = Δ -> begin
+            @simd ivdep for i in eachindex(x_partials)
+                @inbounds x_partials[i] *= Δ
+            end
+            return NoTangent(), NoTangent(), NoTangent(), x_partials, NoTangent()
+        end
+        return res, ∇fused_agg_loop
+    end
+
+    return CRC.rrule_via_ad(cfg, fallback_fused_agg, sum, op, x, y)
+end

get_ϵ(::Type{T}, ϵ::Real) where {T} = T(ϵ)
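The hunk above replaces the LossFunctions-specific methods with generic ones: the `sum` fast path accumulates `op(x[i], y[i])` in a manual `@simd` loop (seeding the accumulator with the inferred return type of `op`), and the new `rrule` either broadcasts a hand-written derivative when `has_custom_derivative(op)` holds (see the next hunk), or computes the value and `∂/∂x` together in a single pass using one-partial ForwardDiff duals, falling back to `rrule_via_ad` on arrays without fast scalar indexing (e.g. GPU arrays). A standalone sketch of the dual-number trick (illustrative only, not Lux API):

```julia
using ForwardDiff: Dual, Partials, value, partials

f(x, y) = abs2(x - y)              # any scalar loss op
x, y = 1.5, 2.0
xd = Dual{Nothing, Float64, 1}(x, Partials{1, Float64}((1.0,)))
tmp = f(xd, y)
value(tmp)                         # 0.25 == f(x, y)
partials(tmp, 1)                   # -1.0 == ∂f/∂x = 2(x - y)
```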
@@ -91,9 +98,57 @@ get_ϵ(::Type{T}, ::Nothing) where {T} = eps(float(T))
get_loss_dims(::AbstractVector) = Colon()
get_loss_dims(::AbstractArray{T, N}) where {T, N} = 1:(N - 1)

+has_custom_derivative(::F) where {F} = false
+
+has_custom_derivative(f::Utils.Fix3) = has_custom_derivative(f.f)
+derivative(f::Utils.Fix3, x, y) = derivative(f.f, x, y, f.x)
+
+# Functional forms of losses
+l1_distance_loss(x::T1, y::T2) where {T1, T2} = abs(x - y)
+has_custom_derivative(::typeof(l1_distance_loss)) = true
+function derivative(::typeof(l1_distance_loss), x::T1, y::T2) where {T1, T2}
+    return convert(T1, sign(x - y))
+end
+
+l2_distance_loss(x::T1, y::T2) where {T1, T2} = abs2(x - y)
+has_custom_derivative(::typeof(l2_distance_loss)) = true
+function derivative(::typeof(l2_distance_loss), x::T1, y::T2) where {T1, T2}
+    return convert(T1, 2 * (x - y))
+end
+
+function huber_loss(x::T1, y::T2, δ::T3) where {T1, T2, T3}
+    T = promote_type(T1, T2, T3)
+    diff = x - y
+    abs_diff = abs(diff)
+    return ifelse(abs_diff ≤ δ, T(0.5) * abs2(diff), δ * (abs_diff - T(0.5) * δ))
+end
+has_custom_derivative(::typeof(huber_loss)) = true
+function derivative(::typeof(huber_loss), x::T, y::T2, δ::T3) where {T, T2, T3}
+    diff = x - y
+    return ifelse(abs(diff) ≤ δ, T(diff), T(δ) * convert(T, sign(diff)))
+end
+
+function l1_hinge_loss(x::T1, y::T2) where {T1, T2}
+    agreement = x * y
+    return max(oftype(agreement, false), true - agreement)
+end
+has_custom_derivative(::typeof(l1_hinge_loss)) = true
+function derivative(::typeof(l1_hinge_loss), x::T1, y::T2) where {T1, T2}
+    return T1(ifelse(x * y ≥ 1, false, true))
+end
+
+function l2_hinge_loss(x::T1, y::T2) where {T1, T2}
+    agreement = x * y
+    return ifelse(agreement ≥ 1, oftype(agreement, false), abs2(true - agreement))
+end
+has_custom_derivative(::typeof(l2_hinge_loss)) = true
+function derivative(::typeof(l2_hinge_loss), x::T1, y::T2) where {T1, T2}
+    agreement = x * y
+    return T1(ifelse(agreement ≥ 1, false, 2 * (agreement - true)))
+end

function siamese_contrastive_loss(x::T1, y::T2, margin=true) where {T1, T2}
-    return (1 - y) * x^2 + y * max(promote_type(T1, T2)(0), margin - x)^2
+    return (true - y) * x^2 + y * max(promote_type(T1, T2)(false), margin - x)^2
end

poisson_loss(x::T1, y::T2, ϵ) where {T1, T2} = x - xlogy(y, x + get_ϵ(T1, ϵ))
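The block above gives each functional loss an optional closed-form derivative: `has_custom_derivative(op)` routes the `rrule` to `derivative(op, x, y)` instead of ForwardDiff, and `Utils.Fix3` threads a third parameter (e.g. the Huber δ) through both the loss and its derivative. A quick illustrative check that a closed-form derivative matches ForwardDiff (local re-definitions for the sketch, not the package's functions):

```julia
using ForwardDiff

huber(x, y, δ)  = abs(x - y) ≤ δ ? 0.5 * abs2(x - y) : δ * (abs(x - y) - 0.5 * δ)
dhuber(x, y, δ) = abs(x - y) ≤ δ ? x - y : δ * sign(x - y)

x, y, δ = 3.0, 1.0, 0.5
ForwardDiff.derivative(x -> huber(x, y, δ), x) ≈ dhuber(x, y, δ)   # true
```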
@@ -462,7 +517,7 @@ julia> loss(y_pred, y_true) ≈ 0.55
true
```
"""
-HingeLoss(; agg=mean) = GenericLossFunction(LossFunctions.L1HingeLoss(); agg)
+HingeLoss(; agg=mean) = GenericLossFunction(LossFunctionImpl.l1_hinge_loss; agg)
HingeLoss(; agg=mean) = GenericLossFunction(LossFunctionImpl.l1_hinge_loss; agg)

@doc doc"""
HuberLoss(; delta = 1, agg = mean)
Expand Down Expand Up @@ -490,7 +545,8 @@ true
"""
function HuberLoss(; delta::Union{Nothing, AbstractFloat}=nothing, agg=mean)
return GenericLossFunction(
LossFunctions.HuberLoss(ifelse(delta === nothing, Float16(1), delta)); agg)
Utils.Fix3(LossFunctionImpl.huber_loss, ifelse(delta === nothing, true, delta));
agg)
end

@doc doc"""
Expand Down Expand Up @@ -566,7 +622,7 @@ julia> loss(y_model, 1:3) ≈ 0.1
true
```
"""
MAELoss(; agg=mean) = GenericLossFunction(LossFunctions.L1DistLoss(); agg)
MAELoss(; agg=mean) = GenericLossFunction(LossFunctionImpl.l1_distance_loss; agg)

const L1Loss = MAELoss

Expand All @@ -588,7 +644,7 @@ julia> loss(y_model, 1:3) ≈ 0.01
true
```
"""
MSELoss(; agg=mean) = GenericLossFunction(LossFunctions.L2DistLoss(); agg)
MSELoss(; agg=mean) = GenericLossFunction(LossFunctionImpl.l2_distance_loss; agg)

const L2Loss = MSELoss

Expand Down Expand Up @@ -696,7 +752,7 @@ julia> loss(y_pred, y_true) ≈ 0.625
true
```
"""
SquaredHingeLoss(; agg=mean) = GenericLossFunction(LossFunctions.L2HingeLoss(); agg)
SquaredHingeLoss(; agg=mean) = GenericLossFunction(LossFunctionImpl.l2_hinge_loss; agg)

@doc doc"""
GenericLossFunction(loss_fn; agg = mean)
Expand Down
10 changes: 3 additions & 7 deletions test/helpers/loss_tests.jl
@@ -91,11 +91,7 @@ end

@jet MSLELoss()(ŷ, y)

-if VERSION ≥ v"1.11-"
-    @test @inferred(Zygote.gradient(MSLELoss(), ŷ, y)) isa Any
-else
-    @test_broken @inferred(Zygote.gradient(MSLELoss(), ŷ, y)) isa Any
-end
+@test @inferred(Zygote.gradient(MSLELoss(), ŷ, y)) isa Any broken=ongpu

__f = Base.Fix2(MSLELoss(), y)
@test_gradients(__f, ŷ; atol=1.0f-3, rtol=1.0f-3)
@@ -343,7 +339,7 @@ end
@test Lux.PoissonLoss()(y, y) ≈ 0.5044459776946685

@jet Lux.PoissonLoss()(ŷ, y)
-@test_broken @inferred Zygote.gradient(Lux.PoissonLoss(), ŷ, y)
+@test @inferred Zygote.gradient(Lux.PoissonLoss(), ŷ, y) isa Any

__f = Base.Fix2(Lux.PoissonLoss(), y)
@test_gradients(__f, ŷ; atol=1.0f-3, rtol=1.0f-3)
@@ -357,7 +353,7 @@ end
@test DiceCoeffLoss()(y, y) ≈ 0.0

@jet DiceCoeffLoss()(ŷ, y)
-@test_broken @inferred Zygote.gradient(DiceCoeffLoss(), ŷ, y)
+@test @inferred(Zygote.gradient(DiceCoeffLoss(), ŷ, y)) isa Any broken=true

__f = Base.Fix2(DiceCoeffLoss(), y)
@test_gradients(__f, ŷ; atol=1.0f-3, rtol=1.0f-3,
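The test updates use the `broken=` keyword of `Test.@test` (available since Julia 1.7), which conditionally marks a single expression as expected to fail and replaces the old `VERSION` branching; the PoissonLoss and MSLELoss inference tests flip from `@test_broken` to `@test`, presumably because the new ForwardDiff-based rrule is type-stable. For illustration:

```julia
using Test

ongpu = false
@test sqrt(4) == 2 broken=ongpu   # ordinary @test when ongpu == false,
                                  # expected-to-fail when ongpu == true
```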