From 784142c81a75f2b0717818149cdf631605839a24 Mon Sep 17 00:00:00 2001 From: RainerHeintzmann Date: Wed, 24 Jul 2024 16:24:37 +0200 Subject: [PATCH] about to remove sca in the example. --- performance_tests/gauss_fit.jl | 56 ++++++--- src/SeparableFunctions.jl | 4 +- src/general.jl | 204 ++++++++++++++++++++------------- src/specific.jl | 31 ++--- src/utilities.jl | 8 ++ 5 files changed, 194 insertions(+), 109 deletions(-) diff --git a/performance_tests/gauss_fit.jl b/performance_tests/gauss_fit.jl index 79464f0..56cc33e 100644 --- a/performance_tests/gauss_fit.jl +++ b/performance_tests/gauss_fit.jl @@ -1,5 +1,4 @@ using SeparableFunctions -using Zygote using IndexFunArrays using BenchmarkTools using ComponentArrays @@ -24,15 +23,16 @@ w = separable_create(fct, sz, sigma) @time my_gaussian = gaussian_col(sz; sigma = sigma); # @btime my_gaussian = gaussian_col($sz; sigma = $sigma); # 6 µs +############## sz = (64,64) mid = sz.÷2 .+1 myxx = Float32.(xx((sz[1],1))) myyy = Float32.(yy((1, sz[2]))) -my_full_gaussian(vec) = vec.bg .+ vec.intensity.*exp.(.-abs2.(vec.sca[1].*(myxx .- vec.off[1])./(sqrt(2f0)*vec.sigma[1])) .- abs2.(vec.sca[2].*(myyy .- vec.off[2])./(sqrt(2f0)*vec.sigma[2]))) +my_full_gaussian(vec) = vec.bg .+ vec.intensity.*exp.(.-abs2.(vec.sca[1].*(myxx .- vec.off[1])./(sqrt(2f0)*vec.args[1])) .- abs2.(vec.sca[2].*(myyy .- vec.off[2])./(sqrt(2f0)*vec.args[2]))) # my_full_gaussian(vec) = vec.bg .+ vec.intensity.*exp.(.-abs2.(vec.sca[1].*(myxx .- vec.off[1])./(sqrt(2f0)*vec.sigma[1]))).*exp.(.- abs2.(vec.sca[2].*(myyy .- vec.off[2])./(sqrt(2f0)*vec.sigma[2]))) -vec_true = ComponentVector(;bg=0.2f0, intensity=1.0f0, sca= (1f0, 1f0), off = (2.2f0, 3.3f0), sigma = (2.4f0, 1.5f0)) +vec_true = ComponentVector(;bg=0.2f0, intensity=1.0f0, sca= (1f0, 1f0), off = (2.2f0, 3.3f0), args = (2.4f0, 1.5f0)) # @btime my_full_gaussian($vec_true); # 42.800 μs (18 allocations: 16.70 KiB) Random.seed!(42) @@ -41,8 +41,8 @@ dat_copy = copy(dat) loss = (vec) -> sum(abs2.(my_full_gaussian(vec) .- dat)) -bc_mem = gaussian_nokw_sep(sz, vec_true.off.+mid, vec_true.sca, vec_true.sigma) -my_sep_gaussian(vec) = vec.bg .+ vec.intensity .* gaussian_nokw_sep(sz, vec.off .+mid, vec.sca, vec.sigma; all_axes=bc_mem) +bc_mem = gaussian_nokw_sep(sz, vec_true.off.+mid, vec_true.sca, vec_true.args) +my_sep_gaussian(vec) = vec.bg .+ vec.intensity .* gaussian_nokw_sep(sz, vec.off .+mid, vec.sca, vec.args; all_axes=bc_mem) loss_sep = (vec) -> sum(abs2.(my_sep_gaussian(vec) .- dat)) # off_start = (2.0f0, 3.0f0) @@ -52,9 +52,9 @@ off_start = [2.0f0, 3.0f0] # off_start = [-32f0, -32f0] sigma_start = [3.0f0, 2.0f0] sca_start = [1.2f0, 1.5f0] -startvals = ComponentVector(;bg = 0.5f0, intensity=1.0f0, sca=sca_start, off=off_start, sigma=sigma_start) +startvals = ComponentVector(;bg = 0.5f0, intensity=1.0f0, sca=sca_start, off=off_start, args=sigma_start) -@vt dat my_full_gaussian(startvals) my_sep_gaussian(startvals) +# @vt dat my_full_gaussian(startvals) my_sep_gaussian(startvals) loss(startvals) loss_sep(startvals) @@ -104,7 +104,8 @@ dat .= dat_copy # DifferentiationInterface.withgradient -opt = Optim.Options(x_reltol=0.001) +# opt = Optim.Options(x_reltol=0.001) +opt = Optim.Options(iterations = 9) # why ? 
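# A minimal sketch of the `fg!(F, G, x)` convention that `Optim.NLSolversBase.only_fg!` expects,
# as used by the `fg!` and `get_fg!` calls further below: Optim passes `nothing` for `F` and/or `G`
# depending on what it needs, the gradient is written in place into `G`, and the objective value is
# returned only when `F` is requested. The toy quadratic and the names `fg_sketch!`/`x0_sketch` are
# illustrative only and not part of SeparableFunctions.
function fg_sketch!(F, G, x)
    r = x .- 1                      # residual of the toy objective sum(abs2, x .- 1)
    if !isnothing(G)
        G .= 2 .* r                 # gradient is mutated in place
    end
    if !isnothing(F)
        return sum(abs2, r)         # objective value is only returned when requested
    end
    return nothing
end
# e.g.: Optim.optimize(Optim.NLSolversBase.only_fg!(fg_sketch!), zeros(Float32, 2), Optim.LBFGS(), opt)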
@time res = Optim.optimize(loss, startvals, Optim.LBFGS(), opt; autodiff = :forward); # res.f_calls @@ -115,10 +116,12 @@ res.f_calls @btime res = Optim.optimize($loss, $startvals, Optim.LBFGS(), opt; autodiff = :forward); # 1.06 ms, 11.13 ms # Hand-Separated: 14 ms +# 10 ms @btime res = Optim.optimize($loss, $startvals, Optim.LBFGS(), opt; autodiff = :finite); # 4.065 ms (1989 allocations: 2.52 MiB), 40.440 ms (19519 allocations: 25.53 MiB) # Hand-separated: 60.908 ms (20067 allocations: 24.28 MiB) +# 27 ms function fg!(F, G, vec) val_pb = Zygote.pullback(loss, vec); @@ -132,19 +135,46 @@ function fg!(F, G, vec) return val_pb[1] end end -od = OnceDifferentiable(Optim.NLSolversBase.only_fg!(fg!), startvals) + +myfg! = get_fg!(dat, gaussian_raw) + +startvals_s = copy(startvals) +startvals_s.off .+= sz.÷2 .+1 +G = copy(startvals) +myfg!(1, G, startvals_s) + +od = OnceDifferentiable(Optim.NLSolversBase.only_fg!(fg!), startvals); @time res = Optim.optimize(od, startvals, Optim.LBFGS(), opt) -res.f_calls # 61 +res.f_calls # 33 +res.minimum # 13.9137 +# res.f_calls = 0 -@btime res = Optim.optimize($od, $startvals, Optim.LBFGS(), opt); +loss(startvals) +fg!(1.0, G, startvals) +G +od = OnceDifferentiable(Optim.NLSolversBase.only_fg!(fg!), startvals); +@btime res = Optim.optimize($od, $startvals, Optim.LBFGS(), $opt); # Full: 4.804 ms (14435 allocations: 12.78 MiB) # Sep: 4.099 ms (26269 allocations: 11.20 MiB) # Hand-Separated: 2.397 ms (15472 allocations: 9.19 MiB) +# 2.8 ms + +myfg!(1.0, G, startvals_s) +G +# opt = Optim.Options(iterations = 9); # why ? +odo = OnceDifferentiable(Optim.NLSolversBase.only_fg!(myfg!), startvals_s); +@time reso = Optim.optimize(odo, startvals_s, Optim.LBFGS(), opt); +reso.f_calls # 33 +reso.minimum # 13.9138 + +odo = OnceDifferentiable(Optim.NLSolversBase.only_fg!(myfg!), startvals_s); +@btime reso = Optim.optimize($odo, $startvals_s, Optim.LBFGS(), $opt); +# Zygote-free: 1.5 ms using InverseModeling -gstartvals = ComponentVector(;offset = startvals.bg, i0=startvals.intensity, µ=startvals.off, σ=startvals.sigma) -@time res1, res2, res3 = gauss_fit(dat, gstartvals; x_reltol=0.001); +gstartvals = ComponentVector(;offset = startvals.bg, i0=startvals.intensity, µ=startvals.off, σ=startvals.args) +@time res1, res2, res3 = gauss_fit(dat, gstartvals; iterations = 9); res3.f_calls vec_true res1 diff --git a/src/SeparableFunctions.jl b/src/SeparableFunctions.jl index 0132817..6b7f527 100644 --- a/src/SeparableFunctions.jl +++ b/src/SeparableFunctions.jl @@ -38,8 +38,8 @@ export calculate_separables_nokw, calculate_separables, separable_view, separabl export calculate_broadcasted export copy_corners! export propagator_col, propagator_col!, phase_kz_col, phase_kz_col! -export get_sep_mem, get_bc_mem -export fg! +export get_sep_mem, get_bc_mem, get_operator +export get_fg! export calc_radial_symm!, calc_radial_symm, get_corner_ranges export radial_speedup, radial_speedup_ifa diff --git a/src/general.jl b/src/general.jl index b90b77e..58d19a9 100644 --- a/src/general.jl +++ b/src/general.jl @@ -2,7 +2,7 @@ calculate_separables_nokw([::Type{AT},] fct, sz::NTuple{N, Int}, offset = sz.÷2 .+1, scale = one(real(eltype(AT))), args...;all_axes = get_sep_mem(AT, sz), kwargs...) where {AT, N} -creates a list of one-dimensional vectors, which can be combined to yield a separable array. In a way this can be seen as a half-way Lazy operation. +creates a list of one-dimensional vectors, which can be combined to yield a separable array. In a way this can be seen as a half-way Lazy operator. 
The (potentially heavy) work of calculating the one-dimensional functions is done now but the memory-heavy calculation of the array is done later.
This function is used in `separable_view` and `separable_create`.
@@ -80,12 +80,13 @@ end
"""
    get_bc_mem(::Type{AT}, sz::NTuple{N, Int}) where {AT, N}

-allocates a contingous memory for the separable functions and wraps it into an instantiate broadcast structure.
+allocates a contiguous memory block for the separable functions and wraps it into an instantiated broadcast (bc) structure including the bc-`operator`.
This structure is also returned by functions like `gaussian_sep` and can be reused by supplying it via the
-keyword argument `all_axes`.
+keyword argument `all_axes`. To obtain the bc-operator for predefined functions, use `get_operator(fct)` with `fct`
+being the `raw_` version of the function, e.g. `get_operator(gaussian_raw)`.
"""
-function get_bc_mem(::Type{AT}, sz::NTuple{N, Int}, operation) where {AT, N}
-    return Broadcast.instantiate(Broadcast.broadcasted(operation, get_sep_mem(AT, sz)...))
+function get_bc_mem(::Type{AT}, sz::NTuple{N, Int}, operator) where {AT, N}
+    return Broadcast.instantiate(Broadcast.broadcasted(operator, get_sep_mem(AT, sz)...))
end

# a special in-place assignment, which gets its own differentiation rule for the reverse mode
@@ -146,7 +147,7 @@ end
        scale = one(real(eltype(AT))),
        kwargs...) where {AT, N}

-creates a list of one-dimensional vectors, which can be combined to yield a separable array. In a way this can be seen as a half-way Lazy operation.
+creates a list of one-dimensional vectors, which can be combined to yield a separable array. In a way this can be seen as a half-way Lazy operator.
The (potentially heavy) work of calculating the one-dimensional functions is done now but the memory-heavy calculation of the array is done later.
This function is used in `separable_view` and `separable_create`.
For automatic differentiation support (e.g. for optimizing) you should use the `calculate_separables_nokw` function, since it supports AD, whereas the keyword-based version does not.
@@ -201,10 +202,10 @@ end

"""
-    calculate_broadcasted([::Type{TA},] fct, sz::NTuple{N, Int}, args...; offset=sz.÷2 .+1, scale=one(real(eltype(DefaultArrType))), operation = *, kwargs...) where {TA, N}
+    calculate_broadcasted([::Type{TA},] fct, sz::NTuple{N, Int}, args...; offset=sz.÷2 .+1, scale=one(real(eltype(DefaultArrType))), operator = *, kwargs...) where {TA, N}

returns an instantiated broadcasted separable array, which essentially behaves almost like an array yet uses broadcasting. Tests revealed maximal speed improvements for this version.
-Yet, a problem is that reduce operations with specified dimensions cause an error. However this can be avoided by calling `collect`.
+Yet, a problem is that reduce operators with specified dimensions cause an error. However this can be avoided by calling `collect`.

# Arguments:
+ `fct`: The separable function, with a number of arguments corresponding to `length(args)`, each corresponding to a vector of separable inputs of length N.
@@ -214,7 +215,7 @@ Yet, a problem is that reduce operations with specified dimensions cause an erro
+ `all_axes`: if provided, this memory is used instead of allocating a new one. This can be useful if you want to use the same memory for multiple calculations.
+ `offset`: position of the center from which the position is measured
+ `scale`: defines the pixel size as vector or scalar. Default: 1.0. 
-+ `operation`: the separable operation connecting the separable dimensions ++ `operator`: the separable operator connecting the separable dimensions ```jldoctest julia> fct = (r, sz, pos, sigma)-> exp(-(r-pos)^2/(2*sigma^2)) @@ -233,42 +234,42 @@ julia> collect(my_gaussian) ``` """ function calculate_broadcasted(::Type{AT}, fct, sz::NTuple{N, Int}, args...; - operation = *, all_axes = get_bc_mem(AT, sz, operation), + operator = *, all_axes = get_bc_mem(AT, sz, operator), kwargs...) where {AT, N} # replace the sep memory inside the broadcast structure with new values: calculate_separables(AT, fct, sz, args...; all_axes=all_axes.args, kwargs...) return all_axes - # Broadcast.instantiate(Broadcast.broadcasted(operation, calculate_separables(AT, fct, sz, args...; all_axes=all_axes, kwargs...)...)) + # Broadcast.instantiate(Broadcast.broadcasted(operator, calculate_separables(AT, fct, sz, args...; all_axes=all_axes, kwargs...)...)) end function calculate_broadcasted(fct, sz::NTuple{N, Int}, args...; - operation = *, all_axes = get_bc_mem(AT, sz, operation), + operator = *, all_axes = get_bc_mem(AT, sz, operator), kwargs...) where {N} calculate_separables(DefaultArrType, fct, sz, args...; all_axes=all_axes.args, kwargs...) return all_axes - # Broadcast.instantiate(Broadcast.broadcasted(operation, calculate_separables(DefaultArrType, fct, sz, args...; all_axes=all_axes, kwargs...)...)) + # Broadcast.instantiate(Broadcast.broadcasted(operator, calculate_separables(DefaultArrType, fct, sz, args...; all_axes=all_axes, kwargs...)...)) end # function calculate_sep_nokw(::Type{AT}, fct, sz::NTuple{N, Int}, args...; # all_axes = (similar_arr_type(AT, eltype(AT), Val(1)))(undef, sum(sz)), -# operation = *, defaults = nothing, kwargs...) where {AT, N} +# operator = *, defaults = nothing, kwargs...) where {AT, N} # # defaults should be evaluated here and filled into args... # return calculate_separables_nokw(AT, fct, sz, args...; all_axes=all_axes, kwargs...) # end # function calculate_sep_nokw(fct, sz::NTuple{N, Int}, args...; # all_axes = (similar_arr_type(DefaultArrType, eltype(DefaultArrType), Val(1)))(undef, sum(sz)), -# operation = *, defaults = nothing, kwargs...) where {N} +# operator = *, defaults = nothing, kwargs...) where {N} # return calculate_separables_nokw(AT, fct, sz, args...; all_axes=all_axes, kwargs...) # end ### Versions where offst and scale are without keyword arguments function calculate_broadcasted_nokw(::Type{AT}, fct, sz::NTuple{N, Int}, args...; - operation = *, all_axes = get_bc_mem(AT, sz, operation), + operator = *, all_axes = get_bc_mem(AT, sz, operator), defaults = nothing, kwargs...) where {AT, N} # defaults should be evaluated here and filled into args... calculate_separables_nokw_hook(DefaultArrType, fct, sz, args...; all_axes=all_axes.args, kwargs...) - # res = Broadcast.instantiate(Broadcast.broadcasted(operation, calculate_separables_nokw_hook(AT, fct, sz, args...; all_axes=all_axes, kwargs...)...)) + # res = Broadcast.instantiate(Broadcast.broadcasted(operator, calculate_separables_nokw_hook(AT, fct, sz, args...; all_axes=all_axes, kwargs...)...)) # @show eltype(collect(res)) return all_axes end @@ -284,14 +285,14 @@ end # this code only works for the multiplicative version and actually only saves very few allocations. 
# it's not worth the specialization: function ChainRulesCore.rrule(config::RuleConfig{>:HasReverseMode}, ::typeof(calculate_broadcasted_nokw), ::Type{AT}, fct, sz::NTuple{N, Int}, args...; - operation = *, all_axes = get_bc_mem(AT, sz, operation), + operator = *, all_axes = get_bc_mem(AT, sz, operator), defaults = nothing, kwargs...) where {AT, N} # println("in rrule broadcast") # y = calculate_broadcasted_nokw(AT, fct, sz, args..., kwargs...) y_sep, calculate_sep_nokw_pullback = rrule_via_ad(config, calculate_separables_nokw_hook, AT, fct, sz, args...; all_axes=all_axes.args, kwargs...) # println("y done") - # y = Broadcast.instantiate(Broadcast.broadcasted(operation, y_sep...)) - y = operation.(y_sep...) # is faster than the broadcast above. + # y = Broadcast.instantiate(Broadcast.broadcasted(operator, y_sep...)) + y = operator.(y_sep...) # is faster than the broadcast above. # @show size(y_sep[1]) # @show size(y_sep[2]) # @show size(y) @@ -301,7 +302,7 @@ function ChainRulesCore.rrule(config::RuleConfig{>:HasReverseMode}, ::typeof(cal # @show size(dy) projected = ntuple((d) -> begin dims=((1:d-1)...,(d+1:N)...) - reduce(+, operation.(y_sep[[dims...]]...) .* dy, dims=dims) + reduce(+, operator.(y_sep[[dims...]]...) .* dy, dims=dims) end, Val(N)) # 7 Mb # @show typeof(projected[1]) # @show size(projected[1]) @@ -358,61 +359,104 @@ function ChainRulesCore.rrule(config::RuleConfig{>:HasReverseMode}, ::typeof(cal return y, calculate_separables_nokw_hook_pullback end -function fg!(data::AbstractArray{T,N}, fct, bg, intensity, off, sca, args...; operation = *, all_axes = get_bc_mem(typeof(data), size(data), operation)) where {T,N} - sz = size(data) - # mid = sz .÷ 2 .+ 1 - # off = off .+ mid - ids = ntuple((d) -> get_1d_ids(d, sz, off, sca), Val(N)) # offset==args[1] and scale==args[2] - ids_offset_only = get_1d_ids.(1:N, Ref(sz), Ref(off), one(eltype(data))) +function get_fg!(data::AbstractArray{T,N}, fct) where {T,N} + RT = real(eltype(data)) + operator = get_operator(fct) + by = get_bc_mem(typeof(data), size(data), operator) + y = by.args + yv = ntuple((d) -> (@view y[d][:]), Val(N)) - by = calculate_broadcasted_nokw(typeof(data), fct, sz, off, sca, args...; operation=operation, all_axes=all_axes) + resid = similar(data) + dy = get_sep_mem(typeof(data), size(data)) + dyv = ntuple((d) -> (@view dy[d][:]), Val(N)) - resid = bg .+ intensity .* by .- data - dfdbg = 2*sum(resid) + # this function returns the forward value and mutates the gradient G + function fg!(F, G, vec) + bg = hasfield(vec, :bg) ? vec.bg : zero(RT); + intensity = hasfield(vec, :intensity) ? vec.intensity : one(RT) + off = hasfield(vec, :off) ? vec.off : RT.(sz.÷2 .+1) + sca = hasfield(vec, :sca) ? vec.sca : one(RT); + args = hasfield(vec, :args) ? 
(vec.args,) : one(RT) + sz = size(data) + # mid = sz .÷ 2 .+ 1 + # off = off .+ mid + ids = ntuple((d) -> get_1d_ids(d, sz, off, sca), Val(N)) # offset==args[1] and scale==args[2] + ids_offset_only = get_1d_ids.(1:N, Ref(sz), Ref(off), one(eltype(data))) + + # 5kB, result is in by + calculate_broadcasted_nokw(typeof(data), fct, sz, off, sca, args...; operator=operator, all_axes=by) + + if !isnothing(F) || !isnothing(G) + resid .= bg .+ intensity .* by .- data + end + if !isnothing(G) + G.bg = 2*sum(resid) + + other_dims = ntuple((d)-> (ntuple((n)->n, d-1)..., ntuple((n)->d+n, N-d)...), Val(N)) + # @show other_dims + other_ys = ntuple((d)-> (ntuple((n)->y[n], d-1)..., ntuple((n)->y[d+n], N-d)...), Val(N)) + # moved 2*intensity to the condensed terms, but logically it should be in dy! + # this is fairly expensive in memory: + # dy = ntuple((d) -> sum(resid.* (.*(other_ys[d]...)), dims=other_dims[d]), Val(N)) + + for d in 1:N + # This costs 33 kB allocation, sometimes it can be faster? + # dy[d] .= sum(resid.* (.*(other_ys[d]...)), dims=other_dims[d]) + # this costs 2 kB allocations but is slower + dy[d] .= broadcast_reduce(*, +, resid, other_ys[d]..., dims=other_dims[d], init=zero(T)) + end + # less memory, but a little slower: + # dy = ntuple((d) -> broadcast_reduce(*, +, resid, other_ys[d]..., dims=other_dims[d], init=zero(T)), Val(N)) + # @time dy = ntuple((d) -> mapreduce(*, +, resid, other_ys[d]...), dims=other_dims[d]), Val(N)) + + args_1d = ntuple((d)-> pick_n.(d, args), Val(N)) + + if hasfield(vec, :off) + G.off = optional_convert(off, ntuple((d) -> (-2*intensity*pick_n(d, sca)) .* + get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset + end + + if hasfield(vec, :args) + dargs = ntuple((argno) -> optional_convert(args[argno], + (2*intensity).*ntuple((d) -> get_arg_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))), length(args)) # ids @ offset the -1 is since the argument of fct is idx-offset + G.args = dargs[1] + end + # dargs = (0f0,0f0) + + if hasfield(vec, :sca) + # missuse the dy memory + for d = 1:N + dyv[d] .= dyv[d].* ids_offset_only[d] + end + G.sca = optional_convert(sca, ntuple((d) -> + (2*intensity).*get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset + end + # 1.5 kB: + # (2*intensity).*get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d].* ids_offset_only[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset + # dscale = (0f0,0f0) + end - # @time loss = sum(resid .* resid) - # loss = sum(abs2.(resid)) - loss = mapreduce(abs2, +, resid; init=zero(T)) - # loss = sum(sum(abs2.(resid), dims=1)) - y = by.args - other_dims = ntuple((d)-> (ntuple((n)->n, d-1)..., ntuple((n)->d+n, N-d)...), Val(N)) - # @show other_dims - other_ys = ntuple((d)-> (ntuple((n)->y[n], d-1)..., ntuple((n)->y[d+n], N-d)...), Val(N)) - # moved 2*intensity to the condensed terms, but logically it should be in dy! - # this is fairly expensive in memory: - dy = ntuple((d) -> sum(resid.* (.*(other_ys[d]...)), dims=other_dims[d]), Val(N)) - # less memory, but a little slower: - # dy = ntuple((d) -> broadcast_reduce(*, +, resid, other_ys[d]..., dims=other_dims[d], init=zero(T)), Val(N)) - # @time dy = ntuple((d) -> mapreduce(*, +, resid, other_ys[d]...), dims=other_dims[d]), Val(N)) + # return loss, dfdbg, dfdI, doffset, dscale, dargs... + loss = (!isnothing(F)) ? 
mapreduce(abs2, +, resid; init=zero(T)) : T(0); - yv = ntuple((d) -> (@view y[d][:]), Val(N)) - dyv = ntuple((d) -> (@view dy[d][:]), Val(N)) - args_1d = ntuple((d)-> pick_n.(d, args), Val(N)) - - doffset = optional_convert(off, ntuple((d) -> (-2*intensity*pick_n(d, sca)) .* - get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset - # doffset = (0f0,0f0) - - # SCALE needs checking. Probably wrong! - dscale = optional_convert(sca, ntuple((d) -> - (2*intensity).*get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d].* ids_offset_only[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset - # dscale = (0f0,0f0) - - dargs = ntuple((argno) -> optional_convert(args[argno], - (2*intensity).*ntuple((d) -> get_arg_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))), length(args)) # ids @ offset the -1 is since the argument of fct is idx-offset - # dargs = (0f0,0f0) - - # resid .*= by # is slower! - dfdI = 2*sum(resid.*by) - return loss, dfdbg, dfdI, doffset, dscale, dargs... + if !isnothing(G) # forward needs to be calculated + # resid .*= by # is slower! + resid .= resid .* by + G.intensity = 2*sum(resid) + # G.intensity = 2*sum(resid.*by) + end + + return loss + end + return fg! end function calculate_broadcasted_nokw(fct, sz::NTuple{N, Int}, args...; - operation = *, all_axes = get_bc_mem(AT, sz, operation), + operator = *, all_axes = get_bc_mem(AT, sz, operator), defaults = nothing, kwargs...) where {N} # defaults should be evaluated here and filled into args... calculate_separables_nokw(DefaultArrType, fct, sz, args...; all_axes=all_axes.args, kwargs...) - # res = Broadcast.instantiate(Broadcast.broadcasted(operation, calculate_separables_nokw(DefaultArrType, fct, sz, args...; all_axes=all_axes, kwargs...)...)) + # res = Broadcast.instantiate(Broadcast.broadcasted(operator, calculate_separables_nokw(DefaultArrType, fct, sz, args...; all_axes=all_axes, kwargs...)...)) # @show eltype(collect(res)) return all_axes end @@ -426,7 +470,7 @@ end """ - separable_view{N}(fct, sz, args...; offset = sz.÷2 .+1, scale = one(real(eltype(AT))), operation = .*) + separable_view{N}(fct, sz, args...; offset = sz.÷2 .+1, scale = one(real(eltype(AT))), operator = .*) creates an `LazyArray` view of an N-dimensional separable function. Note that this view consumes much less memory than a full allocation of the collected result. @@ -441,7 +485,7 @@ See the example below. + `all_axes`: if provided, this memory is used instead of allocating a new one. This can be useful if you want to use the same memory for multiple calculations. + `offset`: position of the center from which the position is measured + `scale`: defines the pixel size as vector or scalar. Default: 1.0. -+ `operation`: the separable operation connecting the separable dimensions ++ `operator`: the separable operator connecting the separable dimensions # Example: ```jldoctest @@ -458,19 +502,19 @@ julia> my_gaussian = separable_view(fct, (6,5), (0.1,0.2), (0.5,1.0)) """ function separable_view(::Type{AT}, fct, sz::NTuple{N, Int}, args...; all_axes = get_sep_mem(AT, sz), - operation = *, kwargs...) where {AT, N} + operator = *, kwargs...) where {AT, N} res = calculate_separables(AT, fct, sz, args...; all_axes = all_axes, kwargs...) 
- return LazyArray(@~ operation.(res...)) # to prevent premature evaluation + return LazyArray(@~ operator.(res...)) # to prevent premature evaluation end function separable_view(fct, sz::NTuple{N, Int}, args...; all_axes = get_sep_mem(DefaultArrType, sz), - operation = *, kwargs...) where {N} - separable_view(DefaultArrType, fct, sz::NTuple{N, Int}, args...; all_axes = all_axes, operation=operation, kwargs...) + operator = *, kwargs...) where {N} + separable_view(DefaultArrType, fct, sz::NTuple{N, Int}, args...; all_axes = all_axes, operator=operator, kwargs...) end """ - separable_create([::Type{TA},] fct, sz::NTuple{N, Int}, args...; offset = sz.÷2 .+1, scale = one(real(eltype(AT))), operation = *, kwargs...) where {TA, N} + separable_create([::Type{TA},] fct, sz::NTuple{N, Int}, args...; offset = sz.÷2 .+1, scale = one(real(eltype(AT))), operator = *, kwargs...) where {TA, N} creates an array view of an N-dimensional separable function including memory allocation and collection. See the example below. @@ -483,7 +527,7 @@ See the example below. + `all_axes`: if provided, this memory is used instead of allocating a new one. This can be useful if you want to use the same memory for multiple calculations. + `offset`: position of the center from which the position is measured + `scale`: defines the pixel size as vector or scalar. Default: 1.0. -+ `operation`: the separable operation connecting the separable dimensions ++ `operator`: the separable operator connecting the separable dimensions # Example: ```julia @@ -498,19 +542,19 @@ julia> my_gaussian = separable_create(fct, (6,5), (0.5,1.0); ) 6.50731f-5 0.000356206 0.000717312 0.000531398 0.000144823 ``` """ -function separable_create(::Type{TA}, fct, sz::NTuple{N, Int}, args...; operation::Function = *, kwargs...)::similar_arr_type(TA, T, Val(N)) where {T, N, TA <: AbstractArray{T}} +function separable_create(::Type{TA}, fct, sz::NTuple{N, Int}, args...; operator::Function = *, kwargs...)::similar_arr_type(TA, T, Val(N)) where {T, N, TA <: AbstractArray{T}} # res = calculate_separables(TA, fct, sz, args...; kwargs...) - # operation.(res...) + # operator.(res...) res = similar(TA, sz) - res .= calculate_broadcasted(TA, fct, sz, args...; operation=operation, kwargs...) + res .= calculate_broadcasted(TA, fct, sz, args...; operator=operator, kwargs...) return res end ## the code below seems not type-stable but the code above is. Why? -function separable_create(fct, sz::NTuple{N, Int}, args...; operation::Function = *, kwargs...)::similar_arr_type(DefaultArrType, eltype(DefaultArrType), Val(N)) where {N} +function separable_create(fct, sz::NTuple{N, Int}, args...; operator::Function = *, kwargs...)::similar_arr_type(DefaultArrType, eltype(DefaultArrType), Val(N)) where {N} # TT = similar_arr_type(DefaultArrType, Float32, Val(N)) # res = calculate_separables(similar_arr_type(DefaultArrType, eltype(DefaultArrType), Val(N)), fct, sz, args...; kwargs...) - # return operation.(res...) + # return operator.(res...) # resT = - separable_create(similar_arr_type(DefaultArrType, Float32, Val(N)), fct, sz, args...; operation=operation, kwargs...) + separable_create(similar_arr_type(DefaultArrType, Float32, Val(N)), fct, sz, args...; operator=operator, kwargs...) end diff --git a/src/specific.jl b/src/specific.jl index cb7e8f7..e60cb5d 100644 --- a/src/specific.jl +++ b/src/specific.jl @@ -62,10 +62,10 @@ end # would be nice to have a macro which defines all those function extensions. # But its not quite that simple. 
First try: -# macro define_separable(basename, fct, basetype, operation) +# macro define_separable(basename, fct, basetype, operator) # @eval function $(Symbol(basename, :_col))(::Type{TA}, sz::NTuple{N, Int}, args...; kwargs...) where {TA, N} # fct = $(fct) # to assign the function to a symbol -# separable_create(TA, fct, sz, args...; operation=$(operation), kwargs...) +# separable_create(TA, fct, sz, args...; operator=$(operator), kwargs...) # end # end @@ -132,25 +132,28 @@ for F in generate_functions_expr() end # @show "added rrule for $(Symbol(F[1], :_raw))" end + @eval function get_operator(::typeof($(Symbol(F[1], :_raw)))) + return $(F[5]) + end @eval function $(Symbol(F[1], :_col))(::Type{TA}, sz::NTuple{N, Int}, args...; kwargs...) where {TA, N} fct = $(F[3]) # to assign the function to a symbol - separable_create(TA, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + separable_create(TA, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) end @eval function $(Symbol(F[1], :_col))(sz::NTuple{N, Int}, args...; kwargs...) where {N} fct = $(F[3]) # to assign the function to a symbol - separable_create(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + separable_create(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) end @eval function $(Symbol(F[1], :_sep))(::Type{TA}, sz::NTuple{N, Int}, args...; kwargs...) where {TA, N} fct = $(F[3]) # to assign the function to a symbol - calculate_broadcasted(TA, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + calculate_broadcasted(TA, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) end @eval function $(Symbol(F[1], :_sep))(sz::NTuple{N, Int}, args...; kwargs...) where {N} fct = $(F[3]) # to assign the function to a symbol - calculate_broadcasted(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + calculate_broadcasted(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) 
end @eval function $(Symbol(F[1], :_nokw_sep))(::Type{TA}, sz::NTuple{N, Int}, args...; @@ -159,9 +162,9 @@ for F in generate_functions_expr() # fct = $(F[3]) # to assign the function to a symbol # @show "call" - return calculate_broadcasted_nokw(TA, $(Symbol(F[1], :_raw)), sz, args...; defaults=$(F[2]), operation=$(F[5]), all_axes=all_axes) - # operation=$(F[5]) - # return calculate_separables_nokw(TA, fct, sz, args...; all_axes=all_axes), operation + return calculate_broadcasted_nokw(TA, $(Symbol(F[1], :_raw)), sz, args...; defaults=$(F[2]), operator=$(F[5]), all_axes=all_axes) + # operator=$(F[5]) + # return calculate_separables_nokw(TA, fct, sz, args...; all_axes=all_axes), operator end @eval function $(Symbol(F[1], :_nokw_sep))(sz::NTuple{N, Int}, args...; @@ -169,19 +172,19 @@ for F in generate_functions_expr() ) where {N} # fct = $(F[3]) # to assign the function to a symbol # @show "call2" - return calculate_broadcasted_nokw(Array{$(F[4])}, $(Symbol(F[1], :_raw)), sz, args...; defaults=$(F[2]), operation=$(F[5]), all_axes=all_axes) - # operation=$(F[5]) - # return calculate_separables_nokw(Array{$(F[4])}, fct, sz, args...; all_axes=all_axes), operation + return calculate_broadcasted_nokw(Array{$(F[4])}, $(Symbol(F[1], :_raw)), sz, args...; defaults=$(F[2]), operator=$(F[5]), all_axes=all_axes) + # operator=$(F[5]) + # return calculate_separables_nokw(Array{$(F[4])}, fct, sz, args...; all_axes=all_axes), operator end @eval function $(Symbol(F[1], :_lz))(::Type{TA}, sz::NTuple{N, Int}, args...; kwargs...) where {TA, N} fct = $(F[3]) # to assign the function to a symbol - separable_view(TA, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + separable_view(TA, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) end @eval function $(Symbol(F[1], :_lz))(sz::NTuple{N, Int}, args...; kwargs...) where {N} fct = $(F[3]) # to assign the function to a symbol - separable_view(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operation=$(F[5]), kwargs...) + separable_view(Array{$(F[4])}, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...) end # collected: fast separable calculation but resulting in an ND array diff --git a/src/utilities.jl b/src/utilities.jl index c869d51..1b2b183 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -85,6 +85,14 @@ function kwargs_to_args(defaults, kwargs) end +""" + broadcast_reduce(f, op, A...; dims, init) + +broadcasts the function `f` over the arrays `A` and reduces the result with the operation `op` but requiring less memory. + +modified from code by @torrence: +https://discourse.julialang.org/t/mapreduce-with-broadcasting/77053/7?u=rainerheintzmann +""" function broadcast_reduce(f, op, A...; dims, init) bc = Broadcast.instantiate(Broadcast.broadcasted(f, A...)) return reduce(op, bc; dims, init)
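# A usage sketch for `broadcast_reduce` (the arrays `a` and `b` are made-up example inputs; this
# assumes a Julia version where `reduce` over an instantiated broadcast accepts `dims` and `init`,
# as relied upon in `get_fg!` above):
#   a = rand(Float32, 64, 64); b = rand(Float32, 64, 64)
#   s_lazy = broadcast_reduce((x, y) -> abs2(x - y), +, a, b; dims=1, init=0f0)
#   s_full = sum(abs2.(a .- b), dims=1)   # same result, but materializes the full 64×64 intermediate
#   s_lazy ≈ s_full                       # true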