From d1bbe2daaa38155970af7cae6bcfb0b9cc939e8a Mon Sep 17 00:00:00 2001
From: RainerHeintzmann <heintzmann@gmail.com>
Date: Wed, 24 Jul 2024 20:26:25 +0200
Subject: [PATCH] gaussfit with CUDA support

---
 examples/Project.toml          |   7 +++
 examples/gauss_fit.jl          |  55 ++++++++++++++++++
 performance_tests/gauss_fit.jl |  33 +++++------
 src/SeparableFunctions.jl      |   2 +
 src/general.jl                 | 101 ++++++++++++++++++++++++++-------
 src/specific.jl                |  20 ++++++-
 6 files changed, 182 insertions(+), 36 deletions(-)
 create mode 100644 examples/Project.toml
 create mode 100644 examples/gauss_fit.jl

diff --git a/examples/Project.toml b/examples/Project.toml
new file mode 100644
index 0000000..211a4bc
--- /dev/null
+++ b/examples/Project.toml
@@ -0,0 +1,7 @@
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
+InverseModeling = "ce844058-9528-415d-a63d-06f3dd08b29f"
+Noise = "81d43f40-5267-43b7-ae1c-8b967f377efa"
+Optim = "429524aa-4258-5aef-a3af-852621145aeb"
+SeparableFunctions = "c8c7ead4-852c-491e-a42d-3d43bc74259e"
diff --git a/examples/gauss_fit.jl b/examples/gauss_fit.jl
new file mode 100644
index 0000000..0bd7325
--- /dev/null
+++ b/examples/gauss_fit.jl
@@ -0,0 +1,55 @@
+using SeparableFunctions
+using ComponentArrays
+using Optim
+using BenchmarkTools
+using Noise
+using CUDA
+
+# simulate a gaussian blob with Poisson noise and fit it with a Gaussian function
+sz = (1600, 1600)
+vec_true = ComponentVector(;bg=10.0f0, intensity=50f0, off = [8.2f0, 6.5f0], args = [2.4f0, 1.5f0])
+
+dat = Float32.(poisson(Float64.(gaussian_vec(sz, vec_true))))
+
+# dat = CuArray(dat)
+# now prepare the fitting:
+myfg! = get_fg!(dat, gaussian_raw, loss=loss_anscombe_pos, bg=7f0);
+startvals = ComponentVector(;bg=0.5f0, intensity=45f0, off = [9f0, 7f0], args = [3.0f0, 2.0f0])
+
+opt = Optim.Options(iterations = 19); #
+odo = OnceDifferentiable(Optim.NLSolversBase.only_fg!(myfg!), startvals);
+
+# and perform the fit
+@time reso = Optim.optimize(odo, startvals, Optim.LBFGS(), opt);
+reso.f_calls # 61
+reso.minimum 
+@vt dat gaussian_vec(sz, startvals) gaussian_vec(sz, reso.minimizer)
+
+odo = OnceDifferentiable(Optim.NLSolversBase.only_fg!(myfg!), startvals);
+if isa(dat, CuArray)
+    @btime CUDA.@sync reso = Optim.optimize($odo, $startvals, Optim.LBFGS(), $opt);
+else
+    @btime reso = Optim.optimize($odo, $startvals, Optim.LBFGS(), $opt);
+end
+# Zygote-free CPU: 800 µs,  for 1600x1600: 2.7 sec
+# Zygote-free GPU: 52 ms, for 1600x1600: 0.213 sec
+
+using InverseModeling
+
+gstartvals = ComponentVector(;offset = startvals.bg, i0=startvals.intensity, µ=startvals.off.-sz.÷2 .+1, σ=startvals.args)
+@time res1, res2, res3 = gauss_fit(dat, gstartvals; iterations = 99);
+res3.f_calls
+res3.minimum
+vec_true
+res1
+
+@btime res1, res2, res3 = gauss_fit($dat, $gstartvals; x_reltol=0.001);
+# 4.37 ms (27575 allocations: 8.12 MiB)
+@vt dat res2 gaussian_vec(sz, reso.minimizer) (res2 .- dat)  (gaussian_vec(sz, reso.minimizer) .- dat)
+
+@time res1, res2, res3 = gauss_fit(dat);
+
+@btime res1, res2, res3 = gauss_fit($dat);
+# 5 ms (39192 allocations: 4.35 MiB)
+
+# @btime Optim.optimize($loss, $off_start, $sigma_start, LBFGS(); autodiff = :forward); # 1.000 ms (10001 allocations: 1.53 MiB)
diff --git a/performance_tests/gauss_fit.jl b/performance_tests/gauss_fit.jl
index 56cc33e..8c5f1b6 100644
--- a/performance_tests/gauss_fit.jl
+++ b/performance_tests/gauss_fit.jl
@@ -29,10 +29,10 @@ mid = sz.÷2 .+1
 myxx = Float32.(xx((sz[1],1)))
 myyy = Float32.(yy((1, sz[2])))
 
-my_full_gaussian(vec) = vec.bg .+ vec.intensity.*exp.(.-abs2.(vec.sca[1].*(myxx .- vec.off[1])./(sqrt(2f0)*vec.args[1])) .- abs2.(vec.sca[2].*(myyy .- vec.off[2])./(sqrt(2f0)*vec.args[2])))
+my_full_gaussian(vec) = vec.bg .+ vec.intensity.*exp.(.-abs2.((myxx .- vec.off[1])./(sqrt(2f0)*vec.args[1])) .- abs2.((myyy .- vec.off[2])./(sqrt(2f0)*vec.args[2])))
 # my_full_gaussian(vec) = vec.bg .+ vec.intensity.*exp.(.-abs2.(vec.sca[1].*(myxx .- vec.off[1])./(sqrt(2f0)*vec.sigma[1]))).*exp.(.- abs2.(vec.sca[2].*(myyy .- vec.off[2])./(sqrt(2f0)*vec.sigma[2])))
 
-vec_true = ComponentVector(;bg=0.2f0, intensity=1.0f0, sca= (1f0, 1f0), off = (2.2f0, 3.3f0), args = (2.4f0, 1.5f0))
+vec_true = ComponentVector(;bg=0.2f0, intensity=1.0f0, off = (2.2f0, 3.3f0), args = (2.4f0, 1.5f0))
 # @btime my_full_gaussian($vec_true); # 42.800 μs (18 allocations: 16.70 KiB)
 
 Random.seed!(42)
@@ -41,8 +41,8 @@ dat_copy = copy(dat)
 
 loss = (vec) -> sum(abs2.(my_full_gaussian(vec) .- dat))
 
-bc_mem = gaussian_nokw_sep(sz, vec_true.off.+mid, vec_true.sca, vec_true.args)
-my_sep_gaussian(vec) = vec.bg .+ vec.intensity .* gaussian_nokw_sep(sz, vec.off .+mid, vec.sca, vec.args; all_axes=bc_mem)
+bc_mem = gaussian_nokw_sep(sz, vec_true.off.+mid, 1f0, vec_true.args)
+my_sep_gaussian(vec) = vec.bg .+ vec.intensity .* gaussian_nokw_sep(sz, vec.off .+mid, 1f0, vec.args; all_axes=bc_mem)
 loss_sep = (vec) -> sum(abs2.(my_sep_gaussian(vec) .- dat))
 
 # off_start = (2.0f0, 3.0f0)
@@ -51,8 +51,8 @@ loss_sep = (vec) -> sum(abs2.(my_sep_gaussian(vec) .- dat))
 off_start = [2.0f0, 3.0f0]
 # off_start = [-32f0, -32f0]
 sigma_start = [3.0f0, 2.0f0]
-sca_start = [1.2f0, 1.5f0]
-startvals = ComponentVector(;bg = 0.5f0, intensity=1.0f0, sca=sca_start, off=off_start, args=sigma_start)
+# sca_start = [1.2f0, 1.5f0]
+startvals = ComponentVector(;bg = 0.5f0, intensity=1.0f0, off=off_start, args=sigma_start)
 
 # @vt dat my_full_gaussian(startvals) my_sep_gaussian(startvals)
 
@@ -82,9 +82,9 @@ v, g = value_and_gradient(loss_sep, AutoZygote(),  startvals) #     (54.162834f0
 # v, g = value_and_gradient(loss_sep, AutoReverseDiff(),  startvals) #(54.162834f0, (off = Float32[-0.3461006, -1.4118625], sigma = Float32[-0.15418892, 0.4756395]))
 
 mymem = get_bc_mem(typeof(dat), size(dat), *)
-@time fg!(dat, gaussian_raw, startvals.bg, startvals.intensity, startvals.off .+ sz.÷2 .+1, startvals.sca, startvals.sigma; all_axes=mymem)
+@time fg!(dat, gaussian_raw, startvals.bg, startvals.intensity, startvals.off .+ sz.÷2 .+1, startvals.sigma; all_axes=mymem)
 
-@btime fg!($dat, $gaussian_raw, $startvals.bg, $startvals.intensity, $startvals.off .+ $sz.÷2 .+1, startvals.sca, $startvals.sigma; all_axes=$mymem)
+@btime fg!($dat, $gaussian_raw, $startvals.bg, $startvals.intensity, $startvals.off .+ $sz.÷2 .+1, $startvals.sigma; all_axes=$mymem)
 # 35.3 µs, 165 allocs, 126 kB
 
 # broken!
@@ -136,17 +136,17 @@ function fg!(F, G, vec)
     end
 end
 
-myfg! = get_fg!(dat, gaussian_raw)
+myfg! = get_fg!(dat, gaussian_raw, loss=loss_anscombe_pos)
 
 startvals_s = copy(startvals)
 startvals_s.off .+= sz.÷2 .+1
 G = copy(startvals)
 myfg!(1, G, startvals_s)
-
+G
 od = OnceDifferentiable(Optim.NLSolversBase.only_fg!(fg!), startvals);
 @time res = Optim.optimize(od, startvals,  Optim.LBFGS(), opt)
-res.f_calls # 33
-res.minimum # 13.9137
+res.f_calls # 31
+res.minimum # 13.927
 # res.f_calls = 0
 
 loss(startvals)
@@ -157,15 +157,15 @@ od = OnceDifferentiable(Optim.NLSolversBase.only_fg!(fg!), startvals);
 # Full: 4.804 ms (14435 allocations: 12.78 MiB)
 # Sep: 4.099 ms (26269 allocations: 11.20 MiB)
 # Hand-Separated: 2.397 ms (15472 allocations: 9.19 MiB)
-# 2.8 ms
+# 2.5 ms
 
 myfg!(1.0, G, startvals_s)
 G
 # opt = Optim.Options(iterations = 9); # why ?
 odo = OnceDifferentiable(Optim.NLSolversBase.only_fg!(myfg!), startvals_s);
 @time reso = Optim.optimize(odo, startvals_s, Optim.LBFGS(), opt);
-reso.f_calls # 33
-reso.minimum # 13.9138
+reso.f_calls # 31
+reso.minimum # 13.927
 
 odo = OnceDifferentiable(Optim.NLSolversBase.only_fg!(myfg!), startvals_s);
 @btime reso = Optim.optimize($odo, $startvals_s, Optim.LBFGS(), $opt);
@@ -176,11 +176,12 @@ using InverseModeling
 gstartvals = ComponentVector(;offset = startvals.bg, i0=startvals.intensity, µ=startvals.off, σ=startvals.args)
 @time res1, res2, res3 = gauss_fit(dat, gstartvals; iterations = 9);
 res3.f_calls
+res3.minimum
 vec_true
 res1
 
 @btime res1, res2, res3 = gauss_fit($dat, $gstartvals; x_reltol=0.001);
-# 4.214 ms (27575 allocations: 8.12 MiB)
+# 4.37 ms (27575 allocations: 8.12 MiB)
 @vt dat res2 (res2 .- dat)
 
 # @btime Optim.optimize($loss, $off_start, $sigma_start, LBFGS(); autodiff = :forward); # 1.000 ms (10001 allocations: 1.53 MiB)
diff --git a/src/SeparableFunctions.jl b/src/SeparableFunctions.jl
index 6b7f527..f5f5447 100644
--- a/src/SeparableFunctions.jl
+++ b/src/SeparableFunctions.jl
@@ -45,6 +45,8 @@ export calc_radial_symm!, calc_radial_symm, get_corner_ranges
 export radial_speedup, radial_speedup_ifa
 export kwargs_to_args
 
+export loss_anscombe, loss_anscombe_pos, loss_gaussian, loss_poisson, loss_poisson_pos 
+
 DefaultResElType = Float32
 DefaultArrType = Array{DefaultResElType}
 DefaultComplexArrType = Array{complex(DefaultResElType)}
diff --git a/src/general.jl b/src/general.jl
index 58d19a9..6fbc8a2 100644
--- a/src/general.jl
+++ b/src/general.jl
@@ -359,24 +359,86 @@ function ChainRulesCore.rrule(config::RuleConfig{>:HasReverseMode}, ::typeof(cal
     return y, calculate_separables_nokw_hook_pullback
 end
 
-function get_fg!(data::AbstractArray{T,N}, fct) where {T,N}
+# from InverseModeling.jl
+loss_gaussian(data::AbstractArray{T}, fwd, bg=zero(T)) where {T} = mapreduce(abs2, +, fwd .- data ; init=zero(T))
+function get_fgC!(data::AbstractArray{T,N}, ::typeof(loss_gaussian), bg=zero(T)) where {T,N}
+    function fg!(F, G, fwd)
+        if !isnothing(G)
+            G .= (fwd .- data)
+            return (!isnothing(F)) ? mapreduce(abs2, +, G; init=zero(T)) :  T(0);
+        end
+        return (!isnothing(F)) ? mapreduce(abs2, +, (fwd .- data); init=zero(T)) : T(0);
+    end
+    return fg!, 2
+end
+
+loss_poisson(data::AbstractArray{T}, fwd, bg=zero(T)) where {T} = sum((fwd.+bg) .- (data.+bg).*log.(fwd.+bg))
+function get_fgC!(data::AbstractArray{T,N}, ::typeof(loss_poisson), bg=zero(T)) where {T,N}
+    function fg!(F, G, fwd)
+        if !isnothing(G)
+            G .= one(T) .- (data .+ bg)./max.(fwd .+ bg, max(b, T(1f-10)))
+        end
+        return (!isnothing(F)) ? reduce(+, (fwd.+bg) .- (data.+bg).*log.(fwd.+bg); init=zero(T)) : T(0);
+    end
+    return fg!, 1
+end
+
+loss_poisson_pos(data::AbstractArray{T}, fwd, bg=zero(T)) where {T} = sum(max.(eltype(fwd)(0),fwd.+bg) .- max.(eltype(fwd)(0),data.+bg).*log.(max.(eltype(fwd)(0),fwd.+bg)))
+function get_fgC!(data::AbstractArray{T,N}, ::typeof(loss_poisson_pos), bg=zero(T)) where {T,N}
+    function fg!(F, G, fwd)
+        if !isnothing(G)
+            G .= one(T) .- (data .+ bg)./max.(fwd .+ bg, T(1f-10))
+        end
+        return (!isnothing(F)) ? reduce(+, max.(eltype(fwd)(0),(fwd.+bg)) .- max.(eltype(fwd)(0),(data.+bg)).*log.(max.(eltype(fwd)(0),fwd.+bg)); init=zero(T)) : T(0);
+    end
+    return fg!, 1
+end
+
+loss_anscombe(data::AbstractArray{T}, fwd, bg=zero(T)) where {T} = sum(abs2.(sqrt.(data.+bg) .- sqrt.(fwd.+bg)))
+function get_fgC!(data::AbstractArray{T,N}, ::typeof(loss_anscombe), bg=zero(T)) where {T,N}
+    function fg!(F, G, fwd)
+        if !isnothing(G)
+            G .= one(T) .- sqrt.(data.+bg)./sqrt.(max.(fwd .+ bg, max(bg, T(1f-10))))
+        end
+        return (!isnothing(F)) ? mapreduce(abs2, +, sqrt.(fwd.+bg) .- sqrt.(data.+bg); init=zero(T)) : T(0);
+    end
+    return fg!, 1
+end
+
+loss_anscombe_pos(data::AbstractArray{T}, fwd, bg=zero(T)) where {T} = sum(abs2.(sqrt.(max.(eltype(fwd)(0),fwd.+bg)) .- sqrt.(max.(eltype(fwd)(0),data.+bg))))
+
+function get_fgC!(data::AbstractArray{T,N}, ::typeof(loss_anscombe_pos), bg=zero(T)) where {T,N}
+    function fg!(F, G, fwd)
+        if !isnothing(G)
+            G .= one(T) .- sqrt.(max.(eltype(fwd)(0), data.+bg))./sqrt.(max.(fwd .+ bg, max(bg, T(1f-10))))
+        end
+        return (!isnothing(F)) ? mapreduce(abs2, +, sqrt.(max.(eltype(fwd)(0), fwd.+bg)) .- sqrt.(max.(eltype(fwd)(0),data.+bg)); init=zero(T)) : T(0);
+    end
+    return fg!, 1
+end
+
+
+function get_fg!(data::AbstractArray{T,N}, fct; loss = loss_gaussian, bg=zero(real(T))) where {T,N}
     RT = real(eltype(data))
     operator = get_operator(fct)
     by = get_bc_mem(typeof(data), size(data), operator)
     y = by.args
     yv = ntuple((d) -> (@view y[d][:]), Val(N))
 
-    resid = similar(data)
     dy = get_sep_mem(typeof(data), size(data))
     dyv = ntuple((d) -> (@view dy[d][:]), Val(N))
 
+    resid = similar(data) # checkpoint
+
+    loss_fg!, C = get_fgC!(data, loss, bg)
+
     # this function returns the forward value and mutates the gradient G
     function fg!(F, G, vec)
-        bg = hasfield(vec, :bg) ? vec.bg : zero(RT);
-        intensity = hasfield(vec, :intensity) ? vec.intensity : one(RT)
-        off = hasfield(vec, :off) ? vec.off : RT.(sz.÷2 .+1)
-        sca = hasfield(vec, :sca) ? vec.sca : one(RT);
-        args = hasfield(vec, :args) ? (vec.args,) : one(RT)
+        bg = hasproperty(vec, :bg) ? vec.bg : zero(RT);
+        intensity = hasproperty(vec, :intensity) ? vec.intensity : one(RT)
+        off = hasproperty(vec, :off) ? vec.off : RT.(sz.÷2 .+1)
+        sca = hasproperty(vec, :sca) ? vec.sca : one(RT);
+        args = hasproperty(vec, :args) ? (vec.args,) : one(RT)
         sz = size(data)
         # mid = sz .÷ 2 .+ 1
         # off = off .+ mid
@@ -386,11 +448,12 @@ function get_fg!(data::AbstractArray{T,N}, fct) where {T,N}
         # 5kB, result is in by
         calculate_broadcasted_nokw(typeof(data), fct, sz, off, sca, args...; operator=operator, all_axes=by)
 
-        if !isnothing(F) || !isnothing(G)
-            resid .= bg .+ intensity .* by .- data
-        end
+        # if !isnothing(F) || !isnothing(G)
+        #     resid .= bg .+ intensity .* by .- data
+        # end
+        loss = loss_fg!(F, resid, bg .+ intensity .* by)
         if !isnothing(G)
-            G.bg = 2*sum(resid)
+            G.bg = C*sum(resid)
 
             other_dims = ntuple((d)-> (ntuple((n)->n, d-1)..., ntuple((n)->d+n, N-d)...), Val(N))
             # @show other_dims
@@ -411,25 +474,25 @@ function get_fg!(data::AbstractArray{T,N}, fct) where {T,N}
 
             args_1d = ntuple((d)-> pick_n.(d, args), Val(N))
 
-            if hasfield(vec, :off)
-                G.off = optional_convert(off, ntuple((d) -> (-2*intensity*pick_n(d, sca)) .* 
+            if hasproperty(vec, :off)
+                G.off = optional_convert(off, ntuple((d) -> (-C*intensity*pick_n(d, sca)) .* 
                 get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset
             end
 
-            if hasfield(vec, :args)
+            if hasproperty(vec, :args)
                 dargs = ntuple((argno) -> optional_convert(args[argno],
-                (2*intensity).*ntuple((d) -> get_arg_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))), length(args)) # ids @ offset the -1 is since the argument of fct is idx-offset
+                (C*intensity).*ntuple((d) -> get_arg_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))), length(args)) # ids @ offset the -1 is since the argument of fct is idx-offset
                 G.args = dargs[1]
             end
             # dargs = (0f0,0f0)
 
-            if hasfield(vec, :sca)
+            if hasproperty(vec, :sca)
                 # missuse the dy memory
                 for d = 1:N
                     dyv[d] .= dyv[d].* ids_offset_only[d]
                 end
                 G.sca = optional_convert(sca, ntuple((d) -> 
-                (2*intensity).*get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset
+                (C*intensity).*get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset
             end
                 # 1.5 kB:
             # (2*intensity).*get_idx_gradient(fct, yv[d], ids[d], sz[d], dyv[d].* ids_offset_only[d], args_1d[d]...), Val(N))) # ids @ offset the -1 is since the argument of fct is idx-offset
@@ -437,12 +500,12 @@ function get_fg!(data::AbstractArray{T,N}, fct) where {T,N}
         end
 
         # return loss, dfdbg, dfdI, doffset, dscale, dargs...
-        loss = (!isnothing(F)) ? mapreduce(abs2, +, resid; init=zero(T)) : T(0);
+        # loss = (!isnothing(F)) ? mapreduce(abs2, +, resid; init=zero(T)) : T(0);
 
         if !isnothing(G) # forward needs to be calculated
             # resid .*= by # is slower!
             resid .= resid .* by
-            G.intensity = 2*sum(resid)
+            G.intensity = C*sum(resid)
             # G.intensity = 2*sum(resid.*by)
         end
 
diff --git a/src/specific.jl b/src/specific.jl
index e60cb5d..0fe42c1 100644
--- a/src/specific.jl
+++ b/src/specific.jl
@@ -176,7 +176,23 @@ for F in generate_functions_expr()
         # operator=$(F[5])
         # return calculate_separables_nokw(Array{$(F[4])}, fct, sz, args...; all_axes=all_axes), operator
     end
- 
+
+    @eval function $(Symbol(F[1], :_vec))(::Type{TA}, sz::NTuple{N, Int}, vec;
+        all_axes = get_bc_mem(Array{$(F[4])}, sz, $(F[5])) ) where {TA, N}
+        RT = real(eltype(TA))
+        intensity = (hasproperty(vec, :intensity)) ? vec.intensity : one(RT)
+        bg = (hasproperty(vec, :bg)) ? vec.bg : zero(RT)
+        off = (hasproperty(vec, :off)) ? vec.off : sz .÷ 2 .+ 1
+        sca = (hasproperty(vec, :sca)) ? vec.sca : ones(RT, length(sz))
+        args = (hasproperty(vec, :args)) ? (vec.args,) : Tuple([])
+        return bg .+ intensity .* ($(Symbol(F[1], :_nokw_sep))(sz, off, sca, args...; all_axes=all_axes))
+    end
+
+    @eval function $(Symbol(F[1], :_vec))(sz::NTuple{N, Int}, vec;
+        all_axes = get_bc_mem(Array{$(F[4])}, sz, $(F[5])) ) where {N}
+        return $(Symbol(F[1], :_vec))(Array{$(F[4])}, sz, vec; all_axes=all_axes)        
+    end
+
     @eval function $(Symbol(F[1], :_lz))(::Type{TA}, sz::NTuple{N, Int}, args...; kwargs...) where {TA, N}
         fct = $(F[3]) # to assign the function to a symbol
         separable_view(TA, fct, sz, args...; defaults=$(F[2]), operator=$(F[5]), kwargs...)
@@ -191,6 +207,8 @@ for F in generate_functions_expr()
     @eval export $(Symbol(F[1], :_col))
     # separated: a vector of separated contributions is returned and the user has to combine them
     @eval export $(Symbol(F[1], :_sep))
+    # a broadcasted version which accepts a ComponentArray as an input
+    @eval export $(Symbol(F[1], :_vec))
     # lazy: A LazyArray representation is returned
     @eval export $(Symbol(F[1], :_nokw_sep))
     # @eval export $(Symbol(F[1], :_lz))