Skip to content

Commit

Permalink
Merge branch 'main' into sk/add_distributed_gpu_run
Browse files Browse the repository at this point in the history
  • Loading branch information
sriharshakandala committed Sep 27, 2023
2 parents cd7ba92 + bac0b76 commit 1b946b4
Show file tree
Hide file tree
Showing 46 changed files with 2,855 additions and 100 deletions.
22 changes: 22 additions & 0 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,20 @@ steps:
--job_id sphere_baroclinic_wave_rhoe_equilmoist
--out_dir sphere_baroclinic_wave_rhoe_equilmoist
artifact_paths: "sphere_baroclinic_wave_rhoe_equilmoist/*"

- label: ":computer: no lim ARS baroclinic wave (ρe) equilmoist explicit vertdiff"
command: >
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file $CONFIG_PATH/sphere_baroclinic_wave_rhoe_equilmoist_expvdiff.yml
julia --color=yes --project=examples post_processing/remap/remap_pipeline.jl
--data_dir sphere_baroclinic_wave_rhoe_equilmoist_expvdiff
--out_dir sphere_baroclinic_wave_rhoe_equilmoist_expvdiff
julia --color=yes --project=examples post_processing/plot/plot_pipeline.jl
--nc_dir sphere_baroclinic_wave_rhoe_equilmoist_expvdiff
--fig_dir sphere_baroclinic_wave_rhoe_equilmoist_expvdiff --case_name aquaplanet
artifact_paths: "sphere_baroclinic_wave_rhoe_equilmoist_expvdiff/*"

- label: ":computer: SSP zalesak tracer & energy upwind baroclinic wave (ρe_tot) equilmoist"
command: >
Expand Down Expand Up @@ -1023,6 +1037,14 @@ steps:
agents:
slurm_mem: 20GB

- label: ":fire: Flame graph: perf target (diagnostics)"
command: >
julia --color=yes --project=perf perf/flame.jl
--config_file $PERF_CONFIG_PATH/flame/diagnostics.yml
artifact_paths: "flame_perf_diagnostics/*"
agents:
slurm_mem: 20GB

# Inference
- label: ":rocket: JET n-failures (inference)"
command: >
Expand Down
37 changes: 12 additions & 25 deletions .buildkite/scaling/pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,32 +65,28 @@ for i in "${!resolutions[@]}"; do
done

# set up environment and agents
cat << EOM
env:
JULIA_VERSION: "1.9.3"
MPICH_VERSION: "4.0.0"
OPENMPI_VERSION: "4.1.1"
MPI_IMPL: "$mpi_impl"
CUDA_VERSION: "11.3"
OPENBLAS_NUM_THREADS: 1
CLIMATEMACHINE_SETTINGS_FIX_RNG_SEED: "true"
cat << 'EOM'
agents:
config: cpu
queue: central
modules: julia/1.9.3 cuda/11.8 ucx/1.14.1_cuda-11.8 openmpi/4.1.5_cuda-11.8 hdf5/1.12.2-ompi415 nsight-systems/2023.2.1
env:
JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite"
OPENBLAS_NUM_THREADS: 1
JULIA_NVTX_CALLBACKS: gc
OMPI_MCA_opal_warn_on_missing_libcuda: 0
JULIA_MAX_NUM_PRECOMPILE_FILES: 100
JULIA_CPU_TARGET: 'broadwell;skylake'
SLURM_KILL_BAD_EXIT: 1
steps:
- label: "init :computer:"
key: "init_cpu_env"
command:
- echo "--- Configure MPI"
- julia -e 'using Pkg; Pkg.add("MPIPreferences"); using MPIPreferences; use_system_binary()'
- echo "--- Instantiate"
- "julia --project=examples -e 'using Pkg; Pkg.instantiate(;verbose=true)'"
- "julia --project=examples -e 'using Pkg; Pkg.precompile()'"
- "julia --project=examples -e 'using Pkg; Pkg.status()'"
agents:
slurm_cpus_per_task: 8
env:
Expand Down Expand Up @@ -129,11 +125,7 @@ if [[ "$profiling" == "enable" ]]; then
else
cpus_per_proc=1
fi
if [[ "$mpi_impl" == "mpich" ]]; then
launcher="srun --cpu-bind=cores"
else
launcher="mpiexec --map-by node:PE=$cpus_per_proc --bind-to core"
fi
launcher="srun --cpu-bind=cores"

if [[ "$res" == "low" ]]; then
time="04:00:00"
Expand All @@ -160,7 +152,6 @@ cat << EOM
- label: "$nprocs"
key: "$job_id"
command:
- "module load cuda/11.3 nsight-systems/2022.2.1"
- "$launcher $command"
- "find ${job_id} -iname '*.nsys-rep' -printf '%f\\\\n' | sort -V | jq --raw-input --slurp 'split(\"\n\") | .[0:-1] | {files: .} + {\"extension\": \"nsys-view\", \"version\": \"1.0\"}' > ${job_id}/${job_id}.nsys-view"
- "find ${job_id} -iname '*.nsys-*' | sort -V | tar cvzf ${job_id}-nsys.tar.gz -T -"
Expand All @@ -170,8 +161,6 @@ cat << EOM
env:
CLIMACORE_DISTRIBUTED: "MPI"
agents:
config: cpu
queue: central
slurm_time: $time
EOM

Expand Down Expand Up @@ -209,8 +198,6 @@ cat << EOM
- "${res}-*.png"
- "${res}-*.pdf"
agents:
config: cpu
queue: central
slurm_nodes: 1
slurm_tasks_per_node: 1
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ deps/src/
docs/build/
docs/site/

# File generated by docs/make_diagnostic_table.jl
docs/src/available_diagnostics.md

# File generated by Pkg, the package manager, based on a corresponding Project.toml
# It records a fixed state of all packages used by the project. As such, it should not be
# committed for packages, but should be committed for applications that require a static
Expand Down
3 changes: 3 additions & 0 deletions config/default_configs/default_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -249,3 +249,6 @@ override_τ_precip:
log_params:
help: "Log parameters to file [`false` (default), `true`]"
value: false
output_default_diagnostics:
help: "Output the default diagnostics associated to the selected atmospheric model"
value: false
6 changes: 4 additions & 2 deletions config/model_configs/edmfx_adv_test_box.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@ hyperdiff: "true"
kappa_4: 1e8
x_max: 1e4
y_max: 1e4
z_max: 3e4
z_max: 5.5e4
x_elem: 2
y_elem: 2
z_elem: 45
z_elem: 63
dz_bottom: 30.0
dz_top: 3000.0
dt: "10secs"
t_end: "3600secs"
dt_save_to_disk: "100secs"
FLOAT_TYPE: "Float64"
toml: [toml/edmfx_box_advection.toml]
2 changes: 1 addition & 1 deletion config/model_configs/edmfx_trmm_box.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ y_elem: 2
z_elem: 82
z_stretch: false
dt: 1secs
t_end: 6hours
t_end: 2hours
dt_save_to_disk: 10mins
FLOAT_TYPE: "Float64"
toml: [toml/edmfx_box.toml]
2 changes: 1 addition & 1 deletion config/model_configs/sphere_baroclinic_wave_rhoe.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
dt_save_to_disk: "2days"
regression_test: true
initial_condition: "DryBaroclinicWave"
dt: "580secs"
dt: "400secs"
t_end: "10days"
job_id: "sphere_baroclinic_wave_rhoe"
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
precip_model: "0M"
vert_diff: "true"
z_elem: 20
dz_bottom: 100
dt_save_to_disk: "12hours"
initial_condition: "MoistBaroclinicWave"
dt: "40secs"
t_end: "12hours"
job_id: "sphere_baroclinic_wave_rhoe_equilmoist_expvdiff"
moist: "equil"
toml: [toml/sphere_baroclinic_wave_rhoe_equilmoist_expvdiff.toml]
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ viscous_sponge: true
job_id: "sphere_held_suarez_rhoe_equilmoist_hightop_sponge"
moist: "equil"
toml: [toml/sphere_held_suarez_rhoe_equilmoist_hightop_sponge.toml]
output_default_diagnostics: true
2 changes: 1 addition & 1 deletion config/model_configs/sphere_held_suarez_rhoe_hightop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ dt_save_to_disk: "4days"
regression_test: true
t_end: "8days"
forcing: "held_suarez"
dt: "500secs"
dt: "400secs"
z_elem: 25
job_id: "sphere_held_suarez_rhoe_hightop"
z_max: 45000.0
2 changes: 1 addition & 1 deletion config/model_configs/sphere_held_suarez_rhotheta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ dt_save_to_disk: "10days"
regression_test: true
t_end: "20days"
forcing: "held_suarez"
dt: "500secs"
dt: "400secs"
job_id: "sphere_held_suarez_rhotheta"
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
dt_save_to_disk: "5days"
initial_condition: "MoistBaroclinicWave"
max_newton_iters_ode: 4
dt: "500secs"
dt: "400secs"
tracer_upwinding: zalesak
t_end: "5days"
ode_algo: "SSP333"
Expand Down
11 changes: 11 additions & 0 deletions config/perf_configs/flame/diagnostics.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
job_id: "flame_perf_diagnostics"
diagnostics:
- short_name: ua
period: 1secs
reduction_time: average
- short_name: va
period: 1secs
reduction_time: max
- short_name: ta
period: 1secs
reduction_time: max
2 changes: 1 addition & 1 deletion config/perf_configs/gpu_baroclinic_wave_rhoe.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
job_id: "gpu_baroclinic_wave_rhoe"
dt: "580secs"
dt: "400secs"
t_end: "10days"
dt_save_to_disk: "2days"
initial_condition: "DryBaroclinicWave"
Expand Down
2 changes: 1 addition & 1 deletion config/perf_configs/gpu_held_suarez_rhoe_hightop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ z_elem: 25
dz_bottom: 300
forcing: "held_suarez"
job_id: "gpu_held_suarez_rhoe_hightop"
dt: "500secs"
dt: "400secs"
t_end: "8days"
dt_save_to_disk: "4days"
4 changes: 4 additions & 0 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ disable_logging(Base.CoreLogging.Info) # Hide doctest's `@info` printing
doctest(ClimaAtmos)
disable_logging(Base.CoreLogging.BelowMinLevel) # Re-enable all logging

include("make_diagnostic_table.jl")

makedocs(
CitationBibliography(joinpath(@__DIR__, "bibliography.bib")),
modules = [ClimaAtmos],
Expand All @@ -26,6 +28,8 @@ makedocs(
"Contributor Guide" => "contributor_guide.md",
"Equations" => "equations.md",
"EDMF Equations" => "edmf_equations.md",
"Diagnostics" => "diagnostics.md",
"Available Diagnostics" => "available_diagnostics.md",
"Diagnostic EDMF Equations" => "diagnostic_edmf_equations.md",
"Gravity Wave Drag Parameterizations" => "gravity_wave.md",
"Radiative Equilibrium" => "radiative_equilibrium.md",
Expand Down
32 changes: 32 additions & 0 deletions docs/make_diagnostic_table.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import ClimaAtmos as CA

# Read all the diagnostics we know how to compute, and print them into a
# markdown table that is later compiled into the docs.

# Anchor the output path to the directory that contains this script (`docs/`)
# rather than to the current working directory. This way the table is always
# written to `docs/src/available_diagnostics.md`, regardless of where the script
# is run or `include`d from (project root, `docs/`, or anywhere else).
out_path = joinpath(@__DIR__, "src", "available_diagnostics.md")

open(out_path, "w") do file

    write(file, "# Available diagnostic variables\n\n")

    # Table header and the markdown separator row
    write(
        file,
        "| Short name | Long name | Standard name | Units | Comments |\n",
    )
    write(file, "|---|---|---|---|---|\n")

    # Sort the entries by short name so that the generated table is
    # alphabetical and does not depend on the iteration order of the
    # underlying collection.
    # NOTE(review): assumes `short_name` values are mutually comparable
    # (e.g., strings) -- confirm against the diagnostic variable definition.
    diagnostics = sort(
        collect(values(CA.Diagnostics.ALL_DIAGNOSTICS));
        by = d -> d.short_name,
    )

    # One table row per diagnostic variable
    for d in diagnostics
        write(file, "| `$(d.short_name)` ")
        write(file, "| $(d.long_name) ")
        write(file, "| `$(d.standard_name)` ")
        write(file, "| $(d.units) ")
        write(file, "| $(d.comments)|\n")
    end
end
@info "Written $out_path"
Loading

0 comments on commit 1b946b4

Please sign in to comment.