Skip to content

Commit

Permalink
Merge pull request #2852 from CliMA/zs/scaling_diagnostics
Browse files Browse the repository at this point in the history
add a gpu scaling job with diagnostics
  • Loading branch information
szy21 authored Jul 24, 2024
2 parents e047c24 + a1f4d3d commit e3ffce0
Show file tree
Hide file tree
Showing 9 changed files with 54 additions and 9 deletions.
22 changes: 21 additions & 1 deletion .buildkite/gpu_pipeline/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
agents:
queue: clima
slurm_mem: 8G
modules: julia/1.10.0 cuda/julia-pref openmpi/4.1.5-mpitrampoline nsight-systems/2024.2.1
modules: julia/1.10.0 cuda/julia-pref openmpi/4.1.5-mpitrampoline nsight-systems/2024.4.1

env:
JULIA_MPI_HAS_CUDA: "true"
Expand Down Expand Up @@ -123,6 +123,26 @@ steps:
- group: "DYAMOND GPU strong scaling"
steps:

- label: "gpu_aquaplanet_dyamond with diagnostics - strong scaling - 1 GPU"
  # Single-task GPU run of the DYAMOND aquaplanet case with diagnostics,
  # executed under `nsys profile`; the report is written to output_active/.
  command:
    - mkdir -p gpu_aquaplanet_dyamond_diag_1process
    # Folded scalar: the equally indented lines below are joined with spaces
    # into a single shell command.
    - >
      srun --cpu-bind=threads --cpus-per-task=4
      nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_dyamond_diag_1process/output_active/report julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl
      --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_diag_1process.yml
      --job_id gpu_aquaplanet_dyamond_diag_1process
  artifact_paths: "gpu_aquaplanet_dyamond_diag_1process/output_active/*"
  env:
    CLIMACOMMS_DEVICE: "CUDA"
    CLIMACOMMS_CONTEXT: "MPI"
  agents:
    slurm_gpus_per_task: 1
    # Kept equal to --cpus-per-task in the srun command above.
    slurm_cpus_per_task: 4
    slurm_ntasks: 1
    slurm_mem: 32G
    # Quoted so the value stays the string "8:00:00"; YAML 1.1 parsers read a
    # plain digits-and-colons scalar as a base-60 integer.
    slurm_time: "8:00:00"
    # NOTE(review): the empty value looks intentional (a value-less flag for
    # the Slurm launcher) -- confirm before changing it to an explicit value.
    slurm_exclusive:

- label: "gpu_aquaplanet_dyamond - strong scaling - 1 GPU"
command:
- mkdir -p gpu_aquaplanet_dyamond_ss_1process
Expand Down
25 changes: 25 additions & 0 deletions config/gpu_configs/gpu_aquaplanet_dyamond_diag_1process.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Model configuration for the gpu_aquaplanet_dyamond_diag_1process job; the
# GPU pipeline passes this file to examples/hybrid/driver.jl via --config_file.
# Duration-like values are quoted so they are always read as strings.
dt_save_state_to_disk: "12hours"
dt_save_to_sol: "12hours"
h_elem: 30
z_max: 55000.0
z_elem: 63
dz_bottom: 30.0
dz_top: 3000.0
moist: equil
precip_model: "1M"
rad: allskywithclear
idealized_insolation: false
dt_rad: "1hours"
vert_diff: FriersonDiffusion
implicit_diffusion: true
approximate_linear_solve_iters: 2
dt_cloud_fraction: "1hours"
surface_setup: DefaultMoninObukhov
rayleigh_sponge: true
dt: "90secs"
t_end: "1days"
toml: [toml/longrun_aquaplanet.toml]
# Default diagnostics are switched off; only the entry listed under
# `diagnostics` is requested.
output_default_diagnostics: false
diagnostics:
  # `period` must be nested inside the same list entry as `short_name`;
  # at column 0 it would parse as an unrelated top-level key.
  - short_name: ["pfull", "wa", "va", "rv", "ke"]
    period: "12hours"
2 changes: 1 addition & 1 deletion config/gpu_configs/gpu_aquaplanet_dyamond_ss_1process.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ dt_cloud_fraction: 1hours
surface_setup: DefaultMoninObukhov
rayleigh_sponge: true
dt: "90secs"
t_end: 12hours
t_end: 1days
toml: [toml/longrun_aquaplanet.toml]
2 changes: 1 addition & 1 deletion config/gpu_configs/gpu_aquaplanet_dyamond_ss_2process.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ approximate_linear_solve_iters: 2
surface_setup: "DefaultMoninObukhov"
rayleigh_sponge: true
dt: "90secs"
t_end: "12hours"
t_end: "1days"
toml: [toml/longrun_aquaplanet.toml]
2 changes: 1 addition & 1 deletion config/gpu_configs/gpu_aquaplanet_dyamond_ss_4process.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ approximate_linear_solve_iters: 2
surface_setup: "DefaultMoninObukhov"
rayleigh_sponge: true
dt: "90secs"
t_end: "12hours"
t_end: "1days"
toml: [toml/longrun_aquaplanet.toml]
2 changes: 1 addition & 1 deletion config/gpu_configs/gpu_aquaplanet_dyamond_ws_1process.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ approximate_linear_solve_iters: 2
surface_setup: "DefaultMoninObukhov"
rayleigh_sponge: true
dt: "90secs"
t_end: "12hours"
t_end: "1days"
toml: [toml/longrun_aquaplanet.toml]
2 changes: 1 addition & 1 deletion config/gpu_configs/gpu_aquaplanet_dyamond_ws_2process.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ approximate_linear_solve_iters: 2
surface_setup: "DefaultMoninObukhov"
rayleigh_sponge: true
dt: "90secs"
t_end: "12hours"
t_end: "1days"
toml: [toml/longrun_aquaplanet.toml]
2 changes: 1 addition & 1 deletion config/gpu_configs/gpu_aquaplanet_dyamond_ws_4process.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ approximate_linear_solve_iters: 2
surface_setup: "DefaultMoninObukhov"
rayleigh_sponge: true
dt: "90secs"
t_end: "12hours"
t_end: "1days"
toml: [toml/longrun_aquaplanet.toml]
4 changes: 2 additions & 2 deletions config/model_configs/aquaplanet_diagedmf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ edmfx_sgs_mass_flux: true
edmfx_sgs_diffusive_flux: true
cloud_model: diagnostic_edmfx
precip_model: 0M
dt: 90secs
t_end: 12hours
dt: 90secs
t_end: 1days
toml: [toml/diagnostic_edmfx.toml]
ode_algo: ARS343

0 comments on commit e3ffce0

Please sign in to comment.