From 784142c81a75f2b0717818149cdf631605839a24 Mon Sep 17 00:00:00 2001 From: RainerHeintzmann Date: Wed, 24 Jul 2024 16:24:37 +0200 Subject: [PATCH] about to remove sca in the example. --- performance_tests/gauss_fit.jl | 56 ++++++--- src/SeparableFunctions.jl | 4 +- src/general.jl | 204 ++++++++++++++++++++------------- src/specific.jl | 31 ++--- src/utilities.jl | 8 ++ 5 files changed, 194 insertions(+), 109 deletions(-) diff --git a/performance_tests/gauss_fit.jl b/performance_tests/gauss_fit.jl index 79464f0..56cc33e 100644 --- a/performance_tests/gauss_fit.jl +++ b/performance_tests/gauss_fit.jl @@ -1,5 +1,4 @@ using SeparableFunctions -using Zygote using IndexFunArrays using BenchmarkTools using ComponentArrays @@ -24,15 +23,16 @@ w = separable_create(fct, sz, sigma) @time my_gaussian = gaussian_col(sz; sigma = sigma); # @btime my_gaussian = gaussian_col($sz; sigma = $sigma); # 6 µs +############## sz = (64,64) mid = sz.÷2 .+1 myxx = Float32.(xx((sz[1],1))) myyy = Float32.(yy((1, sz[2]))) -my_full_gaussian(vec) = vec.bg .+ vec.intensity.*exp.(.-abs2.(vec.sca[1].*(myxx .- vec.off[1])./(sqrt(2f0)*vec.sigma[1])) .- abs2.(vec.sca[2].*(myyy .- vec.off[2])./(sqrt(2f0)*vec.sigma[2]))) +my_full_gaussian(vec) = vec.bg .+ vec.intensity.*exp.(.-abs2.(vec.sca[1].*(myxx .- vec.off[1])./(sqrt(2f0)*vec.args[1])) .- abs2.(vec.sca[2].*(myyy .- vec.off[2])./(sqrt(2f0)*vec.args[2]))) # my_full_gaussian(vec) = vec.bg .+ vec.intensity.*exp.(.-abs2.(vec.sca[1].*(myxx .- vec.off[1])./(sqrt(2f0)*vec.sigma[1]))).*exp.(.- abs2.(vec.sca[2].*(myyy .- vec.off[2])./(sqrt(2f0)*vec.sigma[2]))) -vec_true = ComponentVector(;bg=0.2f0, intensity=1.0f0, sca= (1f0, 1f0), off = (2.2f0, 3.3f0), sigma = (2.4f0, 1.5f0)) +vec_true = ComponentVector(;bg=0.2f0, intensity=1.0f0, sca= (1f0, 1f0), off = (2.2f0, 3.3f0), args = (2.4f0, 1.5f0)) # @btime my_full_gaussian($vec_true); # 42.800 μs (18 allocations: 16.70 KiB) Random.seed!(42) @@ -41,8 +41,8 @@ dat_copy = copy(dat) loss = (vec) -> sum(abs2.(my_full_gaussian(vec) .- dat)) -bc_mem = gaussian_nokw_sep(sz, vec_true.off.+mid, vec_true.sca, vec_true.sigma) -my_sep_gaussian(vec) = vec.bg .+ vec.intensity .* gaussian_nokw_sep(sz, vec.off .+mid, vec.sca, vec.sigma; all_axes=bc_mem) +bc_mem = gaussian_nokw_sep(sz, vec_true.off.+mid, vec_true.sca, vec_true.args) +my_sep_gaussian(vec) = vec.bg .+ vec.intensity .* gaussian_nokw_sep(sz, vec.off .+mid, vec.sca, vec.args; all_axes=bc_mem) loss_sep = (vec) -> sum(abs2.(my_sep_gaussian(vec) .- dat)) # off_start = (2.0f0, 3.0f0) @@ -52,9 +52,9 @@ off_start = [2.0f0, 3.0f0] # off_start = [-32f0, -32f0] sigma_start = [3.0f0, 2.0f0] sca_start = [1.2f0, 1.5f0] -startvals = ComponentVector(;bg = 0.5f0, intensity=1.0f0, sca=sca_start, off=off_start, sigma=sigma_start) +startvals = ComponentVector(;bg = 0.5f0, intensity=1.0f0, sca=sca_start, off=off_start, args=sigma_start) -@vt dat my_full_gaussian(startvals) my_sep_gaussian(startvals) +# @vt dat my_full_gaussian(startvals) my_sep_gaussian(startvals) loss(startvals) loss_sep(startvals) @@ -104,7 +104,8 @@ dat .= dat_copy # DifferentiationInterface.withgradient -opt = Optim.Options(x_reltol=0.001) +# opt = Optim.Options(x_reltol=0.001) +opt = Optim.Options(iterations = 9) # why ? 
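# A minimal sketch of the `fg!(F, G, x)` convention that `Optim.NLSolversBase.only_fg!` expects,
# as used by the `fg!` and `get_fg!` calls further below: Optim passes `nothing` for `F` and/or `G`
# depending on what it needs, the gradient is written in place into `G`, and the objective value is
# returned only when `F` is requested. The toy quadratic and the names `fg_sketch!`/`x0_sketch` are
# illustrative only and not part of SeparableFunctions.
function fg_sketch!(F, G, x)
    r = x .- 1                      # residual of the toy objective sum(abs2, x .- 1)
    if !isnothing(G)
        G .= 2 .* r                 # gradient is mutated in place
    end
    if !isnothing(F)
        return sum(abs2, r)         # objective value is only returned when requested
    end
    return nothing
end
# e.g.: Optim.optimize(Optim.NLSolversBase.only_fg!(fg_sketch!), zeros(Float32, 2), Optim.LBFGS(), opt)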
@time res = Optim.optimize(loss, startvals, Optim.LBFGS(), opt; autodiff = :forward); # res.f_calls @@ -115,10 +116,12 @@ res.f_calls @btime res = Optim.optimize($loss, $startvals, Optim.LBFGS(), opt; autodiff = :forward); # 1.06 ms, 11.13 ms # Hand-Separated: 14 ms +# 10 ms @btime res = Optim.optimize($loss, $startvals, Optim.LBFGS(), opt; autodiff = :finite); # 4.065 ms (1989 allocations: 2.52 MiB), 40.440 ms (19519 allocations: 25.53 MiB) # Hand-separated: 60.908 ms (20067 allocations: 24.28 MiB) +# 27 ms function fg!(F, G, vec) val_pb = Zygote.pullback(loss, vec); @@ -132,19 +135,46 @@ function fg!(F, G, vec) return val_pb[1] end end -od = OnceDifferentiable(Optim.NLSolversBase.only_fg!(fg!), startvals) + +myfg! = get_fg!(dat, gaussian_raw) + +startvals_s = copy(startvals) +startvals_s.off .+= sz.÷2 .+1 +G = copy(startvals) +myfg!(1, G, startvals_s) + +od = OnceDifferentiable(Optim.NLSolversBase.only_fg!(fg!), startvals); @time res = Optim.optimize(od, startvals, Optim.LBFGS(), opt) -res.f_calls # 61 +res.f_calls # 33 +res.minimum # 13.9137 +# res.f_calls = 0 -@btime res = Optim.optimize($od, $startvals, Optim.LBFGS(), opt); +loss(startvals) +fg!(1.0, G, startvals) +G +od = OnceDifferentiable(Optim.NLSolversBase.only_fg!(fg!), startvals); +@btime res = Optim.optimize($od, $startvals, Optim.LBFGS(), $opt); # Full: 4.804 ms (14435 allocations: 12.78 MiB) # Sep: 4.099 ms (26269 allocations: 11.20 MiB) # Hand-Separated: 2.397 ms (15472 allocations: 9.19 MiB) +# 2.8 ms + +myfg!(1.0, G, startvals_s) +G +# opt = Optim.Options(iterations = 9); # why ? +odo = OnceDifferentiable(Optim.NLSolversBase.only_fg!(myfg!), startvals_s); +@time reso = Optim.optimize(odo, startvals_s, Optim.LBFGS(), opt); +reso.f_calls # 33 +reso.minimum # 13.9138 + +odo = OnceDifferentiable(Optim.NLSolversBase.only_fg!(myfg!), startvals_s); +@btime reso = Optim.optimize($odo, $startvals_s, Optim.LBFGS(), $opt); +# Zygote-free: 1.5 ms using InverseModeling -gstartvals = ComponentVector(;offset = startvals.bg, i0=startvals.intensity, µ=startvals.off, σ=startvals.sigma) -@time res1, res2, res3 = gauss_fit(dat, gstartvals; x_reltol=0.001); +gstartvals = ComponentVector(;offset = startvals.bg, i0=startvals.intensity, µ=startvals.off, σ=startvals.args) +@time res1, res2, res3 = gauss_fit(dat, gstartvals; iterations = 9); res3.f_calls vec_true res1 diff --git a/src/SeparableFunctions.jl b/src/SeparableFunctions.jl index 0132817..6b7f527 100644 --- a/src/SeparableFunctions.jl +++ b/src/SeparableFunctions.jl @@ -38,8 +38,8 @@ export calculate_separables_nokw, calculate_separables, separable_view, separabl export calculate_broadcasted export copy_corners! export propagator_col, propagator_col!, phase_kz_col, phase_kz_col! -export get_sep_mem, get_bc_mem -export fg! +export get_sep_mem, get_bc_mem, get_operator +export get_fg! export calc_radial_symm!, calc_radial_symm, get_corner_ranges export radial_speedup, radial_speedup_ifa diff --git a/src/general.jl b/src/general.jl index b90b77e..58d19a9 100644 --- a/src/general.jl +++ b/src/general.jl @@ -2,7 +2,7 @@ calculate_separables_nokw([::Type{AT},] fct, sz::NTuple{N, Int}, offset = sz.÷2 .+1, scale = one(real(eltype(AT))), args...;all_axes = get_sep_mem(AT, sz), kwargs...) where {AT, N} -creates a list of one-dimensional vectors, which can be combined to yield a separable array. In a way this can be seen as a half-way Lazy operation. +creates a list of one-dimensional vectors, which can be combined to yield a separable array. In a way this can be seen as a half-way Lazy operator. 
The (potentially heavy) work of calculating the one-dimensional functions is done now but the memory-heavy calculation of the array is done later.
This function is used in `separable_view` and `separable_create`.
@@ -80,12 +80,13 @@ end
"""
    get_bc_mem(::Type{AT}, sz::NTuple{N, Int}) where {AT, N}

-allocates a contingous memory for the separable functions and wraps it into an instantiate broadcast structure.
+allocates a contiguous memory block for the separable functions and wraps it into an instantiated broadcast (bc) structure including the bc-`operator`.
This structure is also returned by functions like `gaussian_sep` and can be reused by supplying it via the
-keyword argument `all_axes`.
+keyword argument `all_axes`. To obtain the bc-operator for predefined functions, use `get_operator(fct)` with `fct`
+being the `raw_` version of the function, e.g. `get_operator(gaussian_raw)`.
"""
-function get_bc_mem(::Type{AT}, sz::NTuple{N, Int}, operation) where {AT, N}
-    return Broadcast.instantiate(Broadcast.broadcasted(operation, get_sep_mem(AT, sz)...))
+function get_bc_mem(::Type{AT}, sz::NTuple{N, Int}, operator) where {AT, N}
+    return Broadcast.instantiate(Broadcast.broadcasted(operator, get_sep_mem(AT, sz)...))
end

# a special in-place assignment, which gets its own differentiation rule for the reverse mode
@@ -146,7 +147,7 @@ end
        scale = one(real(eltype(AT))),
        kwargs...) where {AT, N}

-creates a list of one-dimensional vectors, which can be combined to yield a separable array. In a way this can be seen as a half-way Lazy operation.
+creates a list of one-dimensional vectors, which can be combined to yield a separable array. In a way this can be seen as a half-way Lazy operator.
The (potentially heavy) work of calculating the one-dimensional functions is done now but the memory-heavy calculation of the array is done later.
This function is used in `separable_view` and `separable_create`.
For automatic differentiation support (e.g. for optimizing) you should use the `calculate_separables_nokw` function, since it supports AD, whereas the keyword-based version does not.
@@ -201,10 +202,10 @@ end

"""
-    calculate_broadcasted([::Type{TA},] fct, sz::NTuple{N, Int}, args...; offset=sz.÷2 .+1, scale=one(real(eltype(DefaultArrType))), operation = *, kwargs...) where {TA, N}
+    calculate_broadcasted([::Type{TA},] fct, sz::NTuple{N, Int}, args...; offset=sz.÷2 .+1, scale=one(real(eltype(DefaultArrType))), operator = *, kwargs...) where {TA, N}

returns an instantiated broadcasted separable array, which essentially behaves almost like an array yet uses broadcasting. Tests revealed maximal speed improvements for this version.
-Yet, a problem is that reduce operations with specified dimensions cause an error. However this can be avoided by calling `collect`.
+Yet, a problem is that reduce operators with specified dimensions cause an error. However this can be avoided by calling `collect`.

# Arguments:
+ `fct`: The separable function, with a number of arguments corresponding to `length(args)`, each corresponding to a vector of separable inputs of length N.
@@ -214,7 +215,7 @@ Yet, a problem is that reduce operations with specified dimensions cause an erro
+ `all_axes`: if provided, this memory is used instead of allocating a new one. This can be useful if you want to use the same memory for multiple calculations.
+ `offset`: position of the center from which the position is measured
+ `scale`: defines the pixel size as vector or scalar. Default: 1.0. 
-+ `operation`: the separable operation connecting the separable dimensions ++ `operator`: the separable operator connecting the separable dimensions ```jldoctest julia> fct = (r, sz, pos, sigma)-> exp(-(r-pos)^2/(2*sigma^2)) @@ -233,42 +234,42 @@ julia> collect(my_gaussian) ``` """ function calculate_broadcasted(::Type{AT}, fct, sz::NTuple{N, Int}, args...; - operation = *, all_axes = get_bc_mem(AT, sz, operation), + operator = *, all_axes = get_bc_mem(AT, sz, operator), kwargs...) where {AT, N} # replace the sep memory inside the broadcast structure with new values: calculate_separables(AT, fct, sz, args...; all_axes=all_axes.args, kwargs...) return all_axes - # Broadcast.instantiate(Broadcast.broadcasted(operation, calculate_separables(AT, fct, sz, args...; all_axes=all_axes, kwargs...)...)) + # Broadcast.instantiate(Broadcast.broadcasted(operator, calculate_separables(AT, fct, sz, args...; all_axes=all_axes, kwargs...)...)) end function calculate_broadcasted(fct, sz::NTuple{N, Int}, args...; - operation = *, all_axes = get_bc_mem(AT, sz, operation), + operator = *, all_axes = get_bc_mem(AT, sz, operator), kwargs...) where {N} calculate_separables(DefaultArrType, fct, sz, args...; all_axes=all_axes.args, kwargs...) return all_axes - # Broadcast.instantiate(Broadcast.broadcasted(operation, calculate_separables(DefaultArrType, fct, sz, args...; all_axes=all_axes, kwargs...)...)) + # Broadcast.instantiate(Broadcast.broadcasted(operator, calculate_separables(DefaultArrType, fct, sz, args...; all_axes=all_axes, kwargs...)...)) end # function calculate_sep_nokw(::Type{AT}, fct, sz::NTuple{N, Int}, args...; # all_axes = (similar_arr_type(AT, eltype(AT), Val(1)))(undef, sum(sz)), -# operation = *, defaults = nothing, kwargs...) where {AT, N} +# operator = *, defaults = nothing, kwargs...) where {AT, N} # # defaults should be evaluated here and filled into args... # return calculate_separables_nokw(AT, fct, sz, args...; all_axes=all_axes, kwargs...) # end # function calculate_sep_nokw(fct, sz::NTuple{N, Int}, args...; # all_axes = (similar_arr_type(DefaultArrType, eltype(DefaultArrType), Val(1)))(undef, sum(sz)), -# operation = *, defaults = nothing, kwargs...) where {N} +# operator = *, defaults = nothing, kwargs...) where {N} # return calculate_separables_nokw(AT, fct, sz, args...; all_axes=all_axes, kwargs...) # end ### Versions where offst and scale are without keyword arguments function calculate_broadcasted_nokw(::Type{AT}, fct, sz::NTuple{N, Int}, args...; - operation = *, all_axes = get_bc_mem(AT, sz, operation), + operator = *, all_axes = get_bc_mem(AT, sz, operator), defaults = nothing, kwargs...) where {AT, N} # defaults should be evaluated here and filled into args... calculate_separables_nokw_hook(DefaultArrType, fct, sz, args...; all_axes=all_axes.args, kwargs...) - # res = Broadcast.instantiate(Broadcast.broadcasted(operation, calculate_separables_nokw_hook(AT, fct, sz, args...; all_axes=all_axes, kwargs...)...)) + # res = Broadcast.instantiate(Broadcast.broadcasted(operator, calculate_separables_nokw_hook(AT, fct, sz, args...; all_axes=all_axes, kwargs...)...)) # @show eltype(collect(res)) return all_axes end @@ -284,14 +285,14 @@ end # this code only works for the multiplicative version and actually only saves very few allocations. 
# it's not worth the specialization: function ChainRulesCore.rrule(config::RuleConfig{>:HasReverseMode}, ::typeof(calculate_broadcasted_nokw), ::Type{AT}, fct, sz::NTuple{N, Int}, args...; - operation = *, all_axes = get_bc_mem(AT, sz, operation), + operator = *, all_axes = get_bc_mem(AT, sz, operator), defaults = nothing, kwargs...) where {AT, N} # println("in rrule broadcast") # y = calculate_broadcasted_nokw(AT, fct, sz, args..., kwargs...) y_sep, calculate_sep_nokw_pullback = rrule_via_ad(config, calculate_separables_nokw_hook, AT, fct, sz, args...; all_axes=all_axes.args, kwargs...) # println("y done") - # y = Broadcast.instantiate(Broadcast.broadcasted(operation, y_sep...)) - y = operation.(y_sep...) # is faster than the broadcast above. + # y = Broadcast.instantiate(Broadcast.broadcasted(operator, y_sep...)) + y = operator.(y_sep...) # is faster than the broadcast above. # @show size(y_sep[1]) # @show size(y_sep[2]) # @show size(y) @@ -301,7 +302,7 @@ function ChainRulesCore.rrule(config::RuleConfig{>:HasReverseMode}, ::typeof(cal # @show size(dy) projected = ntuple((d) -> begin dims=((1:d-1)...,(d+1:N)...) - reduce(+, operation.(y_sep[[dims...]]...) .* dy, dims=dims) + reduce(+, operator.(y_sep[[dims...]]...) .* dy, dims=dims) end, Val(N)) # 7 Mb # @show typeof(projected[1]) # @show size(projected[1]) @@ -358,61 +359,104 @@ function ChainRulesCore.rrule(config::RuleConfig{>:HasReverseMode}, ::typeof(cal return y, calculate_separables_nokw_hook_pullback end -function fg!(data::AbstractArray{T,N}, fct, bg, intensity, off, sca, args...; operation = *, all_axes = get_bc_mem(typeof(data), size(data), operation)) where {T,N} - sz = size(data) - # mid = sz .÷ 2 .+ 1 - # off = off .+ mid - ids = ntuple((d) -> get_1d_ids(d, sz, off, sca), Val(N)) # offset==args[1] and scale==args[2] - ids_offset_only = get_1d_ids.(1:N, Ref(sz), Ref(off), one(eltype(data))) +function get_fg!(data::AbstractArray{T,N}, fct) where {T,N} + RT = real(eltype(data)) + operator = get_operator(fct) + by = get_bc_mem(typeof(data), size(data), operator) + y = by.args + yv = ntuple((d) -> (@view y[d][:]), Val(N)) - by = calculate_broadcasted_nokw(typeof(data), fct, sz, off, sca, args...; operation=operation, all_axes=all_axes) + resid = similar(data) + dy = get_sep_mem(typeof(data), size(data)) + dyv = ntuple((d) -> (@view dy[d][:]), Val(N)) - resid = bg .+ intensity .* by .- data - dfdbg = 2*sum(resid) + # this function returns the forward value and mutates the gradient G + function fg!(F, G, vec) + bg = hasfield(vec, :bg) ? vec.bg : zero(RT); + intensity = hasfield(vec, :intensity) ? vec.intensity : one(RT) + off = hasfield(vec, :off) ? vec.off : RT.(sz.÷2 .+1) + sca = hasfield(vec, :sca) ? vec.sca : one(RT); + args = hasfield(vec, :args) ? 
(vec.args,) : one(RT) + sz = size(data) + # mid = sz .÷ 2 .+ 1 + # off = off .+ mid + ids = ntuple((d) -> get_1d_ids(d, sz, off, sca), Val(N)) # offset==args[1] and scale==args[2] + ids_offset_only = get_1d_ids.(1:N, Ref(sz), Ref(off), one(eltype(data))) + + # 5kB, result is in by + calculate_broadcasted_nokw(typeof(data), fct, sz, off, sca, args...; operator=operator, all_axes=by) + + if !isnothing(F) || !isnothing(G) + resid .= bg .+ intensity .* by .- data + end + if !isnothing(G) + G.bg = 2*sum(resid) + + other_dims = ntuple((d)-> (ntuple((n)->n, d-1)..., ntuple((n)->d+n, N-d)...), Val(N)) + # @show other_dims + other_ys = ntuple((d)-> (ntuple((n)->y[n], d-1)..., ntuple((n)->y[d+n], N-d)...), Val(N)) + # moved 2*intensity to the condensed terms, but logically it should be in dy! + # this is fairly expensive in memory: + # dy = ntuple((d) -> sum(resid.* (.*(other_ys[d]...)), dims=other_dims[d]), Val(N)) + + for d in 1:N + # This costs 33 kB allocation, sometimes it can be faster? + # dy[d] .= sum(resid.* (.*(other_ys[d]...)), dims=other_dims[d]) + # this costs 2 kB allocations but is slower + dy[d] .= broadcast_reduce(*, +, resid, other_ys[d]..., dims=other_dims[d], init=zero(T)) + end + # less memory, but a little slower: + # dy = ntuple((d) -> broadcast_reduce(*, +, resid, other_ys[d]..., dims=other_dims[d], init=zero(T)), Val(N)) + # @time dy = ntuple((d) -> mapreduce(*, +, resid, other_ys[d]...), dims=other_dims[d]), Val(N)) + + args_1d = ntuple((d)-> pick_n.(d, args), Val(N)) + + if hasfield(vec, :off) + G.off = optional_convert(off, ntuple((d) -> (-2*intensity*pick_n(d, sca)) .* + get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset + end + + if hasfield(vec, :args) + dargs = ntuple((argno) -> optional_convert(args[argno], + (2*intensity).*ntuple((d) -> get_arg_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))), length(args)) # ids @ offset the -1 is since the argument of fct is idx-offset + G.args = dargs[1] + end + # dargs = (0f0,0f0) + + if hasfield(vec, :sca) + # missuse the dy memory + for d = 1:N + dyv[d] .= dyv[d].* ids_offset_only[d] + end + G.sca = optional_convert(sca, ntuple((d) -> + (2*intensity).*get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset + end + # 1.5 kB: + # (2*intensity).*get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d].* ids_offset_only[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset + # dscale = (0f0,0f0) + end - # @time loss = sum(resid .* resid) - # loss = sum(abs2.(resid)) - loss = mapreduce(abs2, +, resid; init=zero(T)) - # loss = sum(sum(abs2.(resid), dims=1)) - y = by.args - other_dims = ntuple((d)-> (ntuple((n)->n, d-1)..., ntuple((n)->d+n, N-d)...), Val(N)) - # @show other_dims - other_ys = ntuple((d)-> (ntuple((n)->y[n], d-1)..., ntuple((n)->y[d+n], N-d)...), Val(N)) - # moved 2*intensity to the condensed terms, but logically it should be in dy! - # this is fairly expensive in memory: - dy = ntuple((d) -> sum(resid.* (.*(other_ys[d]...)), dims=other_dims[d]), Val(N)) - # less memory, but a little slower: - # dy = ntuple((d) -> broadcast_reduce(*, +, resid, other_ys[d]..., dims=other_dims[d], init=zero(T)), Val(N)) - # @time dy = ntuple((d) -> mapreduce(*, +, resid, other_ys[d]...), dims=other_dims[d]), Val(N)) + # return loss, dfdbg, dfdI, doffset, dscale, dargs... + loss = (!isnothing(F)) ? 
mapreduce(abs2, +, resid; init=zero(T)) : T(0); - yv = ntuple((d) -> (@view y[d][:]), Val(N)) - dyv = ntuple((d) -> (@view dy[d][:]), Val(N)) - args_1d = ntuple((d)-> pick_n.(d, args), Val(N)) - - doffset = optional_convert(off, ntuple((d) -> (-2*intensity*pick_n(d, sca)) .* - get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset - # doffset = (0f0,0f0) - - # SCALE needs checking. Probably wrong! - dscale = optional_convert(sca, ntuple((d) -> - (2*intensity).*get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d].* ids_offset_only[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset - # dscale = (0f0,0f0) - - dargs = ntuple((argno) -> optional_convert(args[argno], - (2*intensity).*ntuple((d) -> get_arg_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))), length(args)) # ids @ offset the -1 is since the argument of fct is idx-offset - # dargs = (0f0,0f0) - - # resid .*= by # is slower! - dfdI = 2*sum(resid.*by) - return loss, dfdbg, dfdI, doffset, dscale, dargs... + if !isnothing(G) # forward needs to be calculated + # resid .*= by # is slower! + resid .= resid .* by + G.intensity = 2*sum(resid) + # G.intensity = 2*sum(resid.*by) + end + + return loss + end + return fg! end function calculate_broadcasted_nokw(fct, sz::NTuple{N, Int}, args...; - operation = *, all_axes = get_bc_mem(AT, sz, operation), + operator = *, all_axes = get_bc_mem(AT, sz, operator), defaults = nothing, kwargs...) where {N} # defaults should be evaluated here and filled into args... calculate_separables_nokw(DefaultArrType, fct, sz, args...; all_axes=all_axes.args, kwargs...) - # res = Broadcast.instantiate(Broadcast.broadcasted(operation, calculate_separables_nokw(DefaultArrType, fct, sz, args...; all_axes=all_axes, kwargs...)...)) + # res = Broadcast.instantiate(Broadcast.broadcasted(operator, calculate_separables_nokw(DefaultArrType, fct, sz, args...; all_axes=all_axes, kwargs...)...)) # @show eltype(collect(res)) return all_axes end @@ -426,7 +470,7 @@ end """ - separable_view{N}(fct, sz, args...; offset = sz.÷2 .+1, scale = one(real(eltype(AT))), operation = .*) + separable_view{N}(fct, sz, args...; offset = sz.÷2 .+1, scale = one(real(eltype(AT))), operator = .*) creates an `LazyArray` view of an N-dimensional separable function. Note that this view consumes much less memory than a full allocation of the collected result. @@ -441,7 +485,7 @@ See the example below. + `all_axes`: if provided, this memory is used instead of allocating a new one. This can be useful if you want to use the same memory for multiple calculations. + `offset`: position of the center from which the position is measured + `scale`: defines the pixel size as vector or scalar. Default: 1.0. -+ `operation`: the separable operation connecting the separable dimensions ++ `operator`: the separable operator connecting the separable dimensions # Example: ```jldoctest @@ -458,19 +502,19 @@ julia> my_gaussian = separable_view(fct, (6,5), (0.1,0.2), (0.5,1.0)) """ function separable_view(::Type{AT}, fct, sz::NTuple{N, Int}, args...; all_axes = get_sep_mem(AT, sz), - operation = *, kwargs...) where {AT, N} + operator = *, kwargs...) where {AT, N} res = calculate_separables(AT, fct, sz, args...; all_axes = all_axes, kwargs...) 
- return LazyArray(@~ operation.(res...)) # to prevent premature evaluation + return LazyArray(@~ operator.(res...)) # to prevent premature evaluation end function separable_view(fct, sz::NTuple{N, Int}, args...; all_axes = get_sep_mem(DefaultArrType, sz), - operation = *, kwargs...) where {N} - separable_view(DefaultArrType, fct, sz::NTuple{N, Int}, args...; all_axes = all_axes, operation=operation, kwargs...) + operator = *, kwargs...) where {N} + separable_view(DefaultArrType, fct, sz::NTuple{N, Int}, args...; all_axes = all_axes, operator=operator, kwargs...) end """ - separable_create([::Type{TA},] fct, sz::NTuple{N, Int}, args...; offset = sz.÷2 .+1, scale = one(real(eltype(AT))), operation = *, kwargs...) where {TA, N} + separable_create([::Type{TA},] fct, sz::NTuple{N, Int}, args...; offset = sz.÷2 .+1, scale = one(real(eltype(AT))), operator = *, kwargs...) where {TA, N} creates an array view of an N-dimensional separable function including memory allocation and collection. See the example below. @@ -483,7 +527,7 @@ See the example below. + `all_axes`: if provided, this memory is used instead of allocating a new one. This can be useful if you want to use the same memory for multiple calculations. + `offset`: position of the center from which the position is measured + `scale`: defines the pixel size as vector or scalar. Default: 1.0. -+ `operation`: the separable operation connecting the separable dimensions ++ `operator`: the separable operator connecting the separable dimensions # Example: ```julia @@ -498,19 +542,19 @@ julia> my_gaussian = separable_create(fct, (6,5), (0.5,1.0); ) 6.50731f-5 0.000356206 0.000717312 0.000531398 0.000144823 ``` """ -function separable_create(::Type{TA}, fct, sz::NTuple{N, Int}, args...; operation::Function = *, kwargs...)::similar_arr_type(TA, T, Val(N)) where {T, N, TA <: AbstractArray{T}} +function separable_create(::Type{TA}, fct, sz::NTuple{N, Int}, args...; operator::Function = *, kwargs...)::similar_arr_type(TA, T, Val(N)) where {T, N, TA <: AbstractArray{T}} # res = calculate_separables(TA, fct, sz, args...; kwargs...) - # operation.(res...) + # operator.(res...) res = similar(TA, sz) - res .= calculate_broadcasted(TA, fct, sz, args...; operation=operation, kwargs...) + res .= calculate_broadcasted(TA, fct, sz, args...; operator=operator, kwargs...) return res end ## the code below seems not type-stable but the code above is. Why? -function separable_create(fct, sz::NTuple{N, Int}, args...; operation::Function = *, kwargs...)::similar_arr_type(DefaultArrType, eltype(DefaultArrType), Val(N)) where {N} +function separable_create(fct, sz::NTuple{N, Int}, args...; operator::Function = *, kwargs...)::similar_arr_type(DefaultArrType, eltype(DefaultArrType), Val(N)) where {N} # TT = similar_arr_type(DefaultArrType, Float32, Val(N)) # res = calculate_separables(similar_arr_type(DefaultArrType, eltype(DefaultArrType), Val(N)), fct, sz, args...; kwargs...) - # return operation.(res...) + # return operator.(res...) # resT = - separable_create(similar_arr_type(DefaultArrType, Float32, Val(N)), fct, sz, args...; operation=operation, kwargs...) + separable_create(similar_arr_type(DefaultArrType, Float32, Val(N)), fct, sz, args...; operator=operator, kwargs...) end diff --git a/src/specific.jl b/src/specific.jl index cb7e8f7..e60cb5d 100644 --- a/src/specific.jl +++ b/src/specific.jl @@ -62,10 +62,10 @@ end # would be nice to have a macro which defines all those function extensions. # But its not quite that simple. 
First try: -# macro define_separable(basename, fct, basetype, operation) +# macro define_separable(basename, fct, basetype, operator) # @eval function $(Symbol(basename, :_col))(::Type{TA}, sz::NTuple{N, Int}, args...; kwargs...) where {TA, N} # fct = $(fct) # to assign the function to a symbol -# separable_create(TA, fct, sz, args...; operation=$(operation), kwargs...) +# separable_create(TA, fct, sz, args...; operator=$(operator), kwargs...) # end # end @@ -132,25 +132,28 @@ for F in generate_functions_expr() end # @show "added rrule for $(Symbol(F[1], :_raw))" end + @eval function get_operator(::typeof($(Symbol(F[1], :_raw)))) + return $(F[5]) + end @eval function $(Symbol(F[1], :_col))(::Type{TA}, sz::NTuple{N, Int}, args...; kwargs...) where {TA, N} fct = $(F[3]) # to assign the function to a symbol - separable_create(TA, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + separable_create(TA, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) end @eval function $(Symbol(F[1], :_col))(sz::NTuple{N, Int}, args...; kwargs...) where {N} fct = $(F[3]) # to assign the function to a symbol - separable_create(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + separable_create(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) end @eval function $(Symbol(F[1], :_sep))(::Type{TA}, sz::NTuple{N, Int}, args...; kwargs...) where {TA, N} fct = $(F[3]) # to assign the function to a symbol - calculate_broadcasted(TA, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + calculate_broadcasted(TA, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) end @eval function $(Symbol(F[1], :_sep))(sz::NTuple{N, Int}, args...; kwargs...) where {N} fct = $(F[3]) # to assign the function to a symbol - calculate_broadcasted(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + calculate_broadcasted(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) 
end @eval function $(Symbol(F[1], :_nokw_sep))(::Type{TA}, sz::NTuple{N, Int}, args...; @@ -159,9 +162,9 @@ for F in generate_functions_expr() # fct = $(F[3]) # to assign the function to a symbol # @show "call" - return calculate_broadcasted_nokw(TA, $(Symbol(F[1], :_raw)), sz, args...; defaults=$(F[2]), operation=$(F[5]), all_axes=all_axes) - # operation=$(F[5]) - # return calculate_separables_nokw(TA, fct, sz, args...; all_axes=all_axes), operation + return calculate_broadcasted_nokw(TA, $(Symbol(F[1], :_raw)), sz, args...; defaults=$(F[2]), operator=$(F[5]), all_axes=all_axes) + # operator=$(F[5]) + # return calculate_separables_nokw(TA, fct, sz, args...; all_axes=all_axes), operator end @eval function $(Symbol(F[1], :_nokw_sep))(sz::NTuple{N, Int}, args...; @@ -169,19 +172,19 @@ for F in generate_functions_expr() ) where {N} # fct = $(F[3]) # to assign the function to a symbol # @show "call2" - return calculate_broadcasted_nokw(Array{$(F[4])}, $(Symbol(F[1], :_raw)), sz, args...; defaults=$(F[2]), operation=$(F[5]), all_axes=all_axes) - # operation=$(F[5]) - # return calculate_separables_nokw(Array{$(F[4])}, fct, sz, args...; all_axes=all_axes), operation + return calculate_broadcasted_nokw(Array{$(F[4])}, $(Symbol(F[1], :_raw)), sz, args...; defaults=$(F[2]), operator=$(F[5]), all_axes=all_axes) + # operator=$(F[5]) + # return calculate_separables_nokw(Array{$(F[4])}, fct, sz, args...; all_axes=all_axes), operator end @eval function $(Symbol(F[1], :_lz))(::Type{TA}, sz::NTuple{N, Int}, args...; kwargs...) where {TA, N} fct = $(F[3]) # to assign the function to a symbol - separable_view(TA, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + separable_view(TA, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) end @eval function $(Symbol(F[1], :_lz))(sz::NTuple{N, Int}, args...; kwargs...) where {N} fct = $(F[3]) # to assign the function to a symbol - separable_view(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + separable_view(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) end # collected: fast separable calculation but resulting in an ND array diff --git a/src/utilities.jl b/src/utilities.jl index c869d51..1b2b183 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -85,6 +85,14 @@ function kwargs_to_args(defaults, kwargs) end +""" + broadcast_reduce(f, op, A...; dims, init) + +broadcasts the function `f` over the arrays `A` and reduces the result with the operation `op` but requiring less memory. + +modified from code by @torrence: +https://discourse.julialang.org/t/mapreduce-with-broadcasting/77053/7?u=rainerheintzmann +""" function broadcast_reduce(f, op, A...; dims, init) bc = Broadcast.instantiate(Broadcast.broadcasted(f, A...)) return reduce(op, bc; dims, init)
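# A usage sketch for `broadcast_reduce` (the arrays `a` and `b` are made-up example inputs; this
# assumes a Julia version where `reduce` over an instantiated broadcast accepts `dims` and `init`,
# as relied upon in `get_fg!` above):
#   a = rand(Float32, 64, 64); b = rand(Float32, 64, 64)
#   s_lazy = broadcast_reduce((x, y) -> abs2(x - y), +, a, b; dims=1, init=0f0)
#   s_full = sum(abs2.(a .- b), dims=1)   # same result, but materializes the full 64×64 intermediate
#   s_lazy ≈ s_full                       # true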