diff --git a/perf/flame.jl b/perf/flame.jl index b0a7a55c63..936b3fae48 100644 --- a/perf/flame.jl +++ b/perf/flame.jl @@ -50,7 +50,7 @@ allocs_limit["flame_perf_target_threaded"] = 1_276_864 allocs_limit["flame_perf_target_callbacks"] = 398_984 allocs_limit["flame_perf_gw"] = 3_268_961_856 allocs_limit["flame_perf_target_prognostic_edmfx_aquaplanet"] = 299_616 -allocs_limit["flame_gpu_implicit_barowave_moist"] = 381_968 +allocs_limit["flame_gpu_implicit_barowave_moist"] = 658_664 # Ideally, we would like to track all the allocations, but this becomes too # expensive there is too many of them. Here, we set the default sample rate to # 1, but lower it to a smaller value when we expect the job to produce lots of diff --git a/src/cache/diagnostic_edmf_precomputed_quantities.jl b/src/cache/diagnostic_edmf_precomputed_quantities.jl index 4b289ead88..f169c18ca2 100644 --- a/src/cache/diagnostic_edmf_precomputed_quantities.jl +++ b/src/cache/diagnostic_edmf_precomputed_quantities.jl @@ -63,7 +63,8 @@ function set_diagnostic_edmfx_env_quantities_level!( turbconv_model, ) @. u³⁰_halflevel = divide_by_ρa( - ρ_level * u³_halflevel - mapreduce(*, +, ρaʲs_level, u³ʲs_halflevel), + ρ_level * u³_halflevel - + mapreduce_with_init(*, +, ρaʲs_level, u³ʲs_halflevel), ρ_level, ρ_level * u³_halflevel, ρ_level, diff --git a/src/utils/variable_manipulations.jl b/src/utils/variable_manipulations.jl index 69626e83ca..721bc78b0e 100644 --- a/src/utils/variable_manipulations.jl +++ b/src/utils/variable_manipulations.jl @@ -185,7 +185,7 @@ end Computes the total mass-flux subdomain area-weighted density, assuming that the mass-flux subdomain states are stored in `gs.sgsʲs`. """ -ρa⁺(gs) = mapreduce(sgsʲ -> sgsʲ.ρa, +, gs.sgsʲs) +ρa⁺(gs) = mapreduce_with_init(sgsʲ -> sgsʲ.ρa, +, gs.sgsʲs) """ ρah_tot⁺(sgsʲs) @@ -193,7 +193,7 @@ mass-flux subdomain states are stored in `gs.sgsʲs`. Computes the total mass-flux subdomain area-weighted ρh_tot, assuming that the mass-flux subdomain states are stored in `sgsʲs`. """ -ρah_tot⁺(sgsʲs) = mapreduce(sgsʲ -> sgsʲ.ρa * sgsʲ.h_tot, +, sgsʲs) +ρah_tot⁺(sgsʲs) = mapreduce_with_init(sgsʲ -> sgsʲ.ρa * sgsʲ.h_tot, +, sgsʲs) """ ρamse⁺(sgsʲs) @@ -201,7 +201,7 @@ mass-flux subdomain states are stored in `sgsʲs`. Computes the total mass-flux subdomain area-weighted ρmse, assuming that the mass-flux subdomain states are stored in `sgsʲs`. """ -ρamse⁺(sgsʲs) = mapreduce(sgsʲ -> sgsʲ.ρa * sgsʲ.mse, +, sgsʲs) +ρamse⁺(sgsʲs) = mapreduce_with_init(sgsʲ -> sgsʲ.ρa * sgsʲ.mse, +, sgsʲs) """ ρaq_tot⁺(sgsʲs) @@ -209,7 +209,7 @@ mass-flux subdomain states are stored in `sgsʲs`. Computes the total mass-flux subdomain area-weighted ρq_tot, assuming that the mass-flux subdomain states are stored in `sgsʲs`. """ -ρaq_tot⁺(sgsʲs) = mapreduce(sgsʲ -> sgsʲ.ρa * sgsʲ.q_tot, +, sgsʲs) +ρaq_tot⁺(sgsʲs) = mapreduce_with_init(sgsʲ -> sgsʲ.ρa * sgsʲ.q_tot, +, sgsʲs) """ ρa⁰(gs) @@ -217,7 +217,7 @@ mass-flux subdomain states are stored in `sgsʲs`. Computes the environment area-weighted density, assuming that the mass-flux subdomain states are stored in `gs.sgsʲs`. """ -ρa⁰(gs) = gs.ρ - mapreduce(sgsʲ -> sgsʲ.ρa, +, gs.sgsʲs) +ρa⁰(gs) = gs.ρ - mapreduce_with_init(sgsʲ -> sgsʲ.ρa, +, gs.sgsʲs) """ u₃⁺(ρaʲs, u₃ʲs, ρ, u₃, turbconv_model) @@ -229,7 +229,7 @@ are computed from the tuples of subdomain densities and velocities `ρaʲs` and is small. """ u₃⁺(ρaʲs, u₃ʲs, ρ, u₃, turbconv_model) = divide_by_ρa( - mapreduce(*, +, ρaʲs, u₃ʲs), + unrolled_dotproduct(ρaʲs, u₃ʲs), reduce(+, ρaʲs), ρ * u₃, ρ, @@ -247,7 +247,7 @@ environment quantities. The division is computed using `divide_by_ρa` to avoid issues when `a⁰` is small. """ u₃⁰(ρaʲs, u₃ʲs, ρ, u₃, turbconv_model) = divide_by_ρa( - ρ * u₃ - mapreduce(*, +, ρaʲs, u₃ʲs), + ρ * u₃ - unrolled_dotproduct(ρaʲs, u₃ʲs), ρ - reduce(+, ρaʲs), ρ * u₃, ρ, @@ -265,3 +265,25 @@ remove_energy_var(specific_state::NamedTuple) = Base.structdiff(specific_state, NamedTuple{(:e_tot,)}) remove_energy_var(specific_state::Tuple) = map(remove_energy_var, specific_state) + + +import ClimaCore.RecursiveApply: ⊞, ⊠, rzero, rpromote_type +function mapreduce_with_init(f, op, iter...) + r₀ = rzero(rpromote_type(typeof(f(map(first, iter)...)))) + mapreduce(f, op, iter...; init = r₀) +end + +# Inference fails for certain mapreduce calls inside cuda +# kernels, so let's define a recursive unrolled dot product: +promote_type_mul(n::Number, x::Geometry.AxisTensor) = typeof(x) +promote_type_mul(x::Geometry.AxisTensor, n::Number) = typeof(x) +@inline function unrolled_dotproduct(a::Tuple, b::Tuple) + r = rzero(promote_type_mul(first(a), first(b))) + unrolled_dotproduct(r, a, b) +end +@inline unrolled_dotproduct(s, ::Tuple{}, ::Tuple{}) = s +@inline unrolled_dotproduct(s, a::Tuple, b::Tuple) = + s ⊞ (first(a) ⊠ first(b)) ⊞ + unrolled_dotproduct(s, Base.tail(a), Base.tail(b)) +@inline unrolled_dotproduct(s, a::Tuple{<:Any}, b::Tuple{<:Any}) = + s ⊞ (first(a) ⊠ first(b))