diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index af0e5d0d88..f6ed5e4c32 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -766,6 +766,13 @@ steps: slurm_cpus_per_task: 8 # Flame graphs + - label: ":fire: Flame graph: perf target (IO)" + command: > + julia --color=yes --project=perf perf/benchmark_netcdf_io.jl + artifact_paths: "flame_perf_io/*.html" + agents: + slurm_gpus: 1 + - label: ":fire: Flame graph: perf target (default)" command: > julia --color=yes --project=perf perf/flame.jl diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000000..fb9990fa45 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,31 @@ +ClimaAtmos.jl Release Notes +============================ + +Main +------- + +- ![][badge-🚀performance] Reduced the number of allocations in the NetCDF + writer. PRs [#2772](https://github.com/CliMA/ClimaAtmos.jl/pull/2772), + [#2773](https://github.com/CliMA/ClimaAtmos.jl/pull/2773). +- Added a new script, `perf/benchmark_netcdf_io.jl` to test IO performance for + the NetCDF writer. PR [#2773](https://github.com/CliMA/ClimaAtmos.jl/pull/2773). + + + +[badge-🔥behavioralΔ]: https://img.shields.io/badge/🔥behavioralΔ-orange.svg +[badge-🤖precisionΔ]: https://img.shields.io/badge/🤖precisionΔ-black.svg +[badge-💥breaking]: https://img.shields.io/badge/💥BREAKING-red.svg +[badge-🚀performance]: https://img.shields.io/badge/🚀performance-green.svg +[badge-✨feature/enhancement]: https://img.shields.io/badge/feature/enhancement-blue.svg +[badge-🐛bugfix]: https://img.shields.io/badge/🐛bugfix-purple.svg diff --git a/docs/Manifest.toml b/docs/Manifest.toml index 5ecdffe4a3..5fe1cac371 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -317,9 +317,9 @@ version = "0.5.7" [[deps.ClimaCore]] deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "Unrolled"] -git-tree-sha1 = "844afbd6a7c4f112f974e0031ebf4fb13d0b4157" +git-tree-sha1 = "bc6a0154e3bcc1657d3a75f697e216fb70121969" uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.13.1" +version = "0.13.2" weakdeps = ["Krylov"] [deps.ClimaCore.extensions] @@ -327,9 +327,9 @@ weakdeps = ["Krylov"] [[deps.ClimaParams]] deps = ["DocStringExtensions", "TOML", "Test"] -git-tree-sha1 = "323dd6c5423caf31f0da81bb9c288683cbdafb01" +git-tree-sha1 = "ec67949db856e01df4cbf7d6ddafefeda02f93ee" uuid = "5c42b081-d73a-476f-9059-fd94b934656c" -version = "0.10.2" +version = "0.10.3" [[deps.ClimaTimeSteppers]] deps = ["ClimaComms", "Colors", "DataStructures", "DiffEqBase", "DiffEqCallbacks", "KernelAbstractions", "Krylov", "LinearAlgebra", "LinearOperators", "NVTX", "SciMLBase", "StaticArrays"] diff --git a/examples/Manifest.toml b/examples/Manifest.toml index 914a130c24..f4356d44f0 100644 --- a/examples/Manifest.toml +++ b/examples/Manifest.toml @@ -305,9 +305,9 @@ version = "0.5.7" [[deps.ClimaCore]] deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "Unrolled"] -git-tree-sha1 = "844afbd6a7c4f112f974e0031ebf4fb13d0b4157" +git-tree-sha1 = "bc6a0154e3bcc1657d3a75f697e216fb70121969" uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.13.1" +version = "0.13.2" weakdeps = ["Krylov"] [deps.ClimaCore.extensions] @@ -345,9 +345,9 @@ version = "0.7.5" [[deps.ClimaParams]] deps = ["DocStringExtensions", "TOML", "Test"] -git-tree-sha1 = "323dd6c5423caf31f0da81bb9c288683cbdafb01" +git-tree-sha1 = "ec67949db856e01df4cbf7d6ddafefeda02f93ee" uuid = "5c42b081-d73a-476f-9059-fd94b934656c" -version = "0.10.2" +version = "0.10.3" [[deps.ClimaTimeSteppers]] deps = ["ClimaComms", "Colors", "DataStructures", "DiffEqBase", "DiffEqCallbacks", "KernelAbstractions", "Krylov", "LinearAlgebra", "LinearOperators", "NVTX", "SciMLBase", "StaticArrays"] diff --git a/perf/Manifest.toml b/perf/Manifest.toml index 31ea1fa53d..38d4f18055 100644 --- a/perf/Manifest.toml +++ b/perf/Manifest.toml @@ -314,9 +314,9 @@ version = "0.5.7" [[deps.ClimaCore]] deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LinearAlgebra", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "Unrolled"] -git-tree-sha1 = "844afbd6a7c4f112f974e0031ebf4fb13d0b4157" +git-tree-sha1 = "bc6a0154e3bcc1657d3a75f697e216fb70121969" uuid = "d414da3d-4745-48bb-8d80-42e94e092884" -version = "0.13.1" +version = "0.13.2" weakdeps = ["Krylov"] [deps.ClimaCore.extensions] @@ -354,9 +354,9 @@ version = "0.7.5" [[deps.ClimaParams]] deps = ["DocStringExtensions", "TOML", "Test"] -git-tree-sha1 = "323dd6c5423caf31f0da81bb9c288683cbdafb01" +git-tree-sha1 = "ec67949db856e01df4cbf7d6ddafefeda02f93ee" uuid = "5c42b081-d73a-476f-9059-fd94b934656c" -version = "0.10.2" +version = "0.10.3" [[deps.ClimaTimeSteppers]] deps = ["ClimaComms", "Colors", "DataStructures", "DiffEqBase", "DiffEqCallbacks", "KernelAbstractions", "Krylov", "LinearAlgebra", "LinearOperators", "NVTX", "SciMLBase", "StaticArrays"] diff --git a/perf/benchmark_netcdf_io.jl b/perf/benchmark_netcdf_io.jl new file mode 100644 index 0000000000..05a0a1f4e7 --- /dev/null +++ b/perf/benchmark_netcdf_io.jl @@ -0,0 +1,120 @@ +import ClimaAtmos as CA +import ClimaAtmos.Diagnostics as CAD +import ClimaComms +import Profile, ProfileCanvas +import NCDatasets +import Base: rm +using BenchmarkTools + +# This script runs our NetCDF writer and compares its performance with +# NCDatasets. It also produces a flamegraph for IO. + +# Number of target interpolation point along each dimension +const NUM = 100 + +timings = Dict{ClimaComms.AbstractDevice, Any}( + ClimaComms.CPUSingleThreaded() => missing, +) +# If a GPU is available, runs on CPU and GPU +if ClimaComms.device() isa ClimaComms.CUDADevice + timings[ClimaComms.CUDADevice()] = missing +end + +timings_ncdataset = copy(timings) + +function add_nc(nc, outarray) + v = nc["rhoa"] + temporal_size, spatial_size... = size(v) + time_index = temporal_size + 1 + nc["time"][time_index] = 10.0 * time_index + v[time_index, :, :, :] = outarray +end + +for device in keys(timings) + device_name = nameof(typeof(device)) + config = CA.AtmosConfig(; comms_ctx = ClimaComms.context(device)) + config.parsed_args["diagnostics"] = + [Dict("short_name" => "rhoa", "period" => config.parsed_args["dt"])] + config.parsed_args["netcdf_interpolation_num_points"] = [NUM, NUM, NUM] + config.parsed_args["job_id"] = "flame_perf_io" + + simulation = CA.get_simulation(config) + + # Cleanup pre-existing NC files + foreach( + rm, + filter(endswith(".nc"), readdir(simulation.output_dir, join = true)), + ) + + atmos_model = simulation.integrator.p.atmos + cspace = axes(simulation.integrator.u.c) + diagnostics, _ = CA.get_diagnostics(config.parsed_args, atmos_model, cspace) + rhoa_diag = diagnostics[end] + + netcdf_writer = simulation.output_writers[2] + field = simulation.integrator.u.c.ρ + + integrator = simulation.integrator + + # Run once to create the file + CAD.write_field!( + netcdf_writer, + field, + rhoa_diag, + integrator.u, + integrator.p, + integrator.t, + simulation.output_dir, + ) + # Now, profile + @info "Profiling ($device_name)" + prof = Profile.@profile CAD.save_diagnostic_to_disk!( + netcdf_writer, + field, + rhoa_diag, + integrator.u, + integrator.p, + integrator.t, + simulation.output_dir, + ) + results = Profile.fetch() + flame_path = joinpath(simulation.output_dir, "flame_$device_name.html") + ProfileCanvas.html_file(flame_path, results) + @info "Flame saved in $flame_path" + + @info "Benchmarking our NetCDF writer (only IO) ($device_name)" + timings[device] = @benchmark CAD.save_diagnostic_to_disk!( + $netcdf_writer, + $field, + $rhoa_diag, + $(integrator.u), + $(integrator.p), + $(integrator.t), + $(simulation.output_dir), + ) + + @info "Benchmarking NCDatasets ($device_name)" + + output_path = joinpath(simulation.output_dir, "clean_netcdf.nc") + nc = NCDatasets.NCDataset(output_path, "c") + NCDatasets.defDim(nc, "time", Inf) + NCDatasets.defVar(nc, "time", Float32, ("time",)) + NCDatasets.defDim(nc, "x", NUM) + NCDatasets.defDim(nc, "y", NUM) + NCDatasets.defDim(nc, "z", NUM) + v = NCDatasets.defVar(nc, "rhoa", Float32, ("time", "x", "y", "z")) + outarray = Array(netcdf_writer.remappers["rhoa"]._interpolated_values) + v[1, :, :, :] = outarray + + timings_ncdataset[device] = @benchmark $add_nc($nc, $outarray) +end + +for device in keys(timings) + println("DEVICE: ", device) + println("Our writer") + show(stdout, MIME"text/plain"(), timings[device]) + println() + println("NCDatasets") + show(stdout, MIME"text/plain"(), timings_ncdataset[device]) + println() +end diff --git a/src/diagnostics/netcdf_writer.jl b/src/diagnostics/netcdf_writer.jl index 033d4867e4..c9f27d1e94 100644 --- a/src/diagnostics/netcdf_writer.jl +++ b/src/diagnostics/netcdf_writer.jl @@ -459,7 +459,7 @@ and `domain` (e.g., `ClimaCore.Geometry.LatLongPoint`s). """ function hcoords_from_horizontal_space(space, domain, hpts) end -struct NetCDFWriter{T, TS} +struct NetCDFWriter{T, TS, DI} # TODO: At the moment, each variable gets its remapper. This is a little bit of a waste # because we probably only need a handful of remappers since the same remapper can be @@ -488,6 +488,10 @@ struct NetCDFWriter{T, TS} # When disable_vertical_interpolation is true, the num_points on the vertical direction # is ignored. disable_vertical_interpolation::Bool + + # Areas of memory preallocated where the interpolation output is saved. Only the root + # process uses this + preallocated_output_arrays::DI end """ @@ -565,30 +569,30 @@ function NetCDFWriter(; interpolated_physical_z = interpolate(remapper, Fields.coordinate_field(space).z) - return NetCDFWriter{typeof(num_points), typeof(interpolated_physical_z)}( + preallocated_arrays = + ClimaComms.iamroot(ClimaComms.context(space)) ? Dict{String, Array}() : + Dict{String, Nothing}() + + return NetCDFWriter{ + typeof(num_points), + typeof(interpolated_physical_z), + typeof(preallocated_arrays), + }( Dict{String, Remapper}(), num_points, compression_level, interpolated_physical_z, Dict{String, NCDatasets.NCDataset}(), disable_vertical_interpolation, + preallocated_arrays, ) end -function write_field!( - writer::NetCDFWriter, - field, - diagnostic, - u, - p, - t, - output_dir, -) +function interpolate_field!(writer::NetCDFWriter, field, diagnostic, u, p, t) var = diagnostic.variable space = axes(field) - FT = Spaces.undertype(space) horizontal_space = Spaces.horizontal_space(space) @@ -644,10 +648,38 @@ function write_field!( # Now we can interpolate onto the target points # There's an MPI call in here (to aggregate the results) - interpolated_field = interpolate(remapper, field) + # + # The first time we call this, we call interpolate and allocate a new array. + # Future calls are in-place + if haskey(writer.preallocated_output_arrays, var.short_name) + interpolate!( + writer.preallocated_output_arrays[var.short_name], + remapper, + field, + ) + else + writer.preallocated_output_arrays[var.short_name] = + interpolate(remapper, field) + end + return nothing +end +function save_diagnostic_to_disk!( + writer::NetCDFWriter, + field, + diagnostic, + u, + p, + t, + output_dir, +) # Only the root process has to write - ClimaComms.iamroot(ClimaComms.context(field)) || return + ClimaComms.iamroot(ClimaComms.context(field)) || return nothing + + var = diagnostic.variable + interpolated_field = writer.preallocated_output_arrays[var.short_name] + space = axes(field) + FT = Spaces.undertype(space) output_path = joinpath(output_dir, "$(diagnostic.output_short_name).nc") @@ -709,4 +741,19 @@ function write_field!( # Write data to disk NCDatasets.sync(writer.open_files[output_path]) + return nothing +end + +function write_field!( + writer::NetCDFWriter, + field, + diagnostic, + u, + p, + t, + output_dir, +) + interpolate_field!(writer, field, diagnostic, u, p, t) + save_diagnostic_to_disk!(writer, field, diagnostic, u, p, t, output_dir) + return nothing end diff --git a/src/diagnostics/writers.jl b/src/diagnostics/writers.jl index ebf228f9fb..2b26356dfe 100644 --- a/src/diagnostics/writers.jl +++ b/src/diagnostics/writers.jl @@ -3,7 +3,7 @@ # This file defines generic writers for diagnostics with opinionated defaults. import ClimaCore: Hypsography -import ClimaCore.Remapping: Remapper, interpolate, interpolate_array +import ClimaCore.Remapping: Remapper, interpolate, interpolate! import NCDatasets