From 41b6f7987f69cb3e11b8085cd0d3ca7c58db5280 Mon Sep 17 00:00:00 2001 From: sriharshakandala Date: Mon, 4 Mar 2024 14:34:58 -0800 Subject: [PATCH] Add strong and weak scaling jobs for CHAP configuration. --- .buildkite/gpu_pipeline/pipeline.yml | 170 +++++++++++++++--- config/gpu_configs/gpu_aquaplanet_chap.yml | 2 +- .../gpu_aquaplanet_chap_2process.yml | 29 +++ .../gpu_aquaplanet_chap_4process.yml | 29 +++ .../gpu_aquaplanet_chap_ws_1process.yml | 29 +++ .../gpu_aquaplanet_chap_ws_2process.yml | 29 +++ .../gpu_aquaplanet_chap_ws_4process.yml | 29 +++ .../gpu_aquaplanet_dyamond_2process.yml | 20 +++ .../gpu_aquaplanet_dyamond_4process.yml | 20 +++ 9 files changed, 332 insertions(+), 25 deletions(-) create mode 100644 config/gpu_configs/gpu_aquaplanet_chap_2process.yml create mode 100644 config/gpu_configs/gpu_aquaplanet_chap_4process.yml create mode 100644 config/gpu_configs/gpu_aquaplanet_chap_ws_1process.yml create mode 100644 config/gpu_configs/gpu_aquaplanet_chap_ws_2process.yml create mode 100644 config/gpu_configs/gpu_aquaplanet_chap_ws_4process.yml create mode 100644 config/gpu_configs/gpu_aquaplanet_dyamond_2process.yml create mode 100644 config/gpu_configs/gpu_aquaplanet_dyamond_4process.yml diff --git a/.buildkite/gpu_pipeline/pipeline.yml b/.buildkite/gpu_pipeline/pipeline.yml index 31e787a7b5..a8ba755a07 100644 --- a/.buildkite/gpu_pipeline/pipeline.yml +++ b/.buildkite/gpu_pipeline/pipeline.yml @@ -55,30 +55,7 @@ steps: agents: slurm_gpus: 1 slurm_cpus_per_task: 4 - - - label: "gpu_aquaplanet_dyamond" - command: - - mkdir -p gpu_aquaplanet_dyamond - - > - nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_dyamond/report - julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl - --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond.yml - artifact_paths: "gpu_aquaplanet_dyamond/*" - agents: - slurm_gpus: 1 - slurm_cpus_per_task: 4 - - - label: "gpu_aquaplanet_chap" - command: - - mkdir -p gpu_aquaplanet_chap - - > - 
nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_chap/report - julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl - --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_chap.yml - artifact_paths: "gpu_aquaplanet_chap/*" - agents: - slurm_gpus: 1 - slurm_cpus_per_task: 4 + slurm_exclusive: - label: "moist Held-Suarez" key: "gpu_hs_rhoe_equil_55km_nz63_0M" @@ -92,6 +69,7 @@ steps: agents: slurm_gpus: 1 slurm_cpus_per_task: 4 + slurm_exclusive: - label: "moist Held-Suarez - 4 gpus" key: "gpu_hs_rhoe_equil_55km_nz63_0M_4process" @@ -107,6 +85,7 @@ steps: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 slurm_ntasks: 4 + slurm_exclusive: - label: "dry baroclinic wave - 4 gpus" key: "target_gpu_implicit_baroclinic_wave_4process" @@ -122,3 +101,146 @@ steps: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 slurm_ntasks: 4 + slurm_exclusive: + + - group: "CHAP GPU strong scaling" + steps: + + - label: "gpu_aquaplanet_chap - strong scaling - 1 GPU" + command: + - mkdir -p gpu_aquaplanet_chap + - > + julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_chap.yml + artifact_paths: "gpu_aquaplanet_chap/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 1 + slurm_mem: 32G + slurm_exclusive: + + - label: "gpu_aquaplanet_chap - strong scaling - 2 GPUs" + command: + - mkdir -p gpu_aquaplanet_chap_2process + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_chap_2process.yml + artifact_paths: "gpu_aquaplanet_chap_2process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 32G + slurm_exclusive: + + - label: "gpu_aquaplanet_chap - strong scaling - 4 GPUs" + command: + - mkdir -p gpu_aquaplanet_chap_4process + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes 
--project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_chap_4process.yml + artifact_paths: "gpu_aquaplanet_chap_4process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 4 + slurm_mem: 32G + slurm_exclusive: + + - group: "CHAP GPU weak scaling" + steps: + + - label: "gpu_aquaplanet_chap - weak scaling - 1 GPU" + command: + - mkdir -p gpu_aquaplanet_chap_ws_1process + - > + julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_chap_ws_1process.yml + artifact_paths: "gpu_aquaplanet_chap_ws_1process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 1 + slurm_mem: 32G + slurm_exclusive: + + - label: "gpu_aquaplanet_chap - weak scaling - 2 GPUs" + command: + - mkdir -p gpu_aquaplanet_chap_ws_2process + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_chap_ws_2process.yml + artifact_paths: "gpu_aquaplanet_chap_ws_2process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 32G + slurm_time: 8:00:00 + slurm_exclusive: + + - label: "gpu_aquaplanet_chap - weak scaling - 4 GPUs" + command: + - mkdir -p gpu_aquaplanet_chap_ws_4process + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_chap_ws_4process.yml + artifact_paths: "gpu_aquaplanet_chap_ws_4process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 4 + slurm_mem: 32G + slurm_time: 8:00:00 + slurm_exclusive: + + - group: "DYAMOND GPU strong scaling" + steps: + + - label: "gpu_aquaplanet_dyamond - 1 GPU" + command: + - mkdir -p gpu_aquaplanet_dyamond + - > + nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_dyamond/report + 
julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond.yml + artifact_paths: "gpu_aquaplanet_dyamond/*" + agents: + slurm_gpus: 1 + slurm_cpus_per_task: 4 + slurm_exclusive: + + - label: "gpu_aquaplanet_dyamond - 2 GPUs" + command: + - mkdir -p gpu_aquaplanet_dyamond_2process + - > + srun --cpu-bind=threads --cpus-per-task=4 julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_2process.yml + artifact_paths: "gpu_aquaplanet_dyamond_2process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 32G + slurm_time: 8:00:00 + slurm_exclusive: + + - label: "gpu_aquaplanet_dyamond - 4 GPUs" + command: + - mkdir -p gpu_aquaplanet_dyamond_4process + - > + srun --cpu-bind=threads --cpus-per-task=4 julia --threads=3 --color=yes --project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond_4process.yml + artifact_paths: "gpu_aquaplanet_dyamond_4process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 4 + slurm_mem: 32G + slurm_time: 8:00:00 + slurm_exclusive: diff --git a/config/gpu_configs/gpu_aquaplanet_chap.yml b/config/gpu_configs/gpu_aquaplanet_chap.yml index 9841e94ddf..6cb4218066 100644 --- a/config/gpu_configs/gpu_aquaplanet_chap.yml +++ b/config/gpu_configs/gpu_aquaplanet_chap.yml @@ -2,7 +2,7 @@ job_id: gpu_aquaplanet_chap dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" output_default_diagnostics: false -h_elem: 16 +h_elem: 30 z_max: 55000.0 z_elem: 63 dz_bottom: 30.0 diff --git a/config/gpu_configs/gpu_aquaplanet_chap_2process.yml b/config/gpu_configs/gpu_aquaplanet_chap_2process.yml new file mode 100644 index 0000000000..f9b649d7f0 --- /dev/null +++ b/config/gpu_configs/gpu_aquaplanet_chap_2process.yml @@ -0,0 +1,29 @@ +job_id: gpu_aquaplanet_chap_2process +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +output_default_diagnostics: false +h_elem: 30 +z_max: 55000.0 +z_elem: 63 
+dz_bottom: 30.0 +dz_top: 3000.0 +moist: equil +surface_setup: DefaultMoninObukhov +rad: allskywithclear +idealized_insolation: false +dt_rad: 1hours +dt_cloud_fraction: 1hours +turbconv: diagnostic_edmfx +implicit_diffusion: true +approximate_linear_solve_iters: 2 +prognostic_tke: true +edmfx_upwinding: first_order +edmfx_entr_model: "Generalized" +edmfx_detr_model: "Generalized" +edmfx_nh_pressure: true +edmfx_sgs_mass_flux: true +edmfx_sgs_diffusive_flux: true +precip_model: 0M +dt: 100secs +t_end: 1days +toml: [toml/diagnostic_edmfx_box.toml] diff --git a/config/gpu_configs/gpu_aquaplanet_chap_4process.yml b/config/gpu_configs/gpu_aquaplanet_chap_4process.yml new file mode 100644 index 0000000000..0faddbf60f --- /dev/null +++ b/config/gpu_configs/gpu_aquaplanet_chap_4process.yml @@ -0,0 +1,29 @@ +job_id: gpu_aquaplanet_chap_4process +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +output_default_diagnostics: false +h_elem: 30 +z_max: 55000.0 +z_elem: 63 +dz_bottom: 30.0 +dz_top: 3000.0 +moist: equil +surface_setup: DefaultMoninObukhov +rad: allskywithclear +idealized_insolation: false +dt_rad: 1hours +dt_cloud_fraction: 1hours +turbconv: diagnostic_edmfx +implicit_diffusion: true +approximate_linear_solve_iters: 2 +prognostic_tke: true +edmfx_upwinding: first_order +edmfx_entr_model: "Generalized" +edmfx_detr_model: "Generalized" +edmfx_nh_pressure: true +edmfx_sgs_mass_flux: true +edmfx_sgs_diffusive_flux: true +precip_model: 0M +dt: 100secs +t_end: 1days +toml: [toml/diagnostic_edmfx_box.toml] diff --git a/config/gpu_configs/gpu_aquaplanet_chap_ws_1process.yml b/config/gpu_configs/gpu_aquaplanet_chap_ws_1process.yml new file mode 100644 index 0000000000..06f70b2277 --- /dev/null +++ b/config/gpu_configs/gpu_aquaplanet_chap_ws_1process.yml @@ -0,0 +1,29 @@ +job_id: gpu_aquaplanet_chap_ws_1process +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +output_default_diagnostics: false +h_elem: 30 +z_max: 55000.0 +z_elem: 63 +dz_bottom: 30.0 +dz_top: 3000.0 
+moist: equil +surface_setup: DefaultMoninObukhov +rad: allskywithclear +idealized_insolation: false +dt_rad: 1hours +dt_cloud_fraction: 1hours +turbconv: diagnostic_edmfx +implicit_diffusion: true +approximate_linear_solve_iters: 2 +prognostic_tke: true +edmfx_upwinding: first_order +edmfx_entr_model: "Generalized" +edmfx_detr_model: "Generalized" +edmfx_nh_pressure: true +edmfx_sgs_mass_flux: true +edmfx_sgs_diffusive_flux: true +precip_model: 0M +dt: 50secs +t_end: 1days +toml: [toml/diagnostic_edmfx_box.toml] diff --git a/config/gpu_configs/gpu_aquaplanet_chap_ws_2process.yml b/config/gpu_configs/gpu_aquaplanet_chap_ws_2process.yml new file mode 100644 index 0000000000..49f50ce316 --- /dev/null +++ b/config/gpu_configs/gpu_aquaplanet_chap_ws_2process.yml @@ -0,0 +1,29 @@ +job_id: gpu_aquaplanet_chap_ws_2process +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +output_default_diagnostics: false +h_elem: 42 +z_max: 55000.0 +z_elem: 63 +dz_bottom: 30.0 +dz_top: 3000.0 +moist: equil +surface_setup: DefaultMoninObukhov +rad: allskywithclear +idealized_insolation: false +dt_rad: 1hours +dt_cloud_fraction: 1hours +turbconv: diagnostic_edmfx +implicit_diffusion: true +approximate_linear_solve_iters: 2 +prognostic_tke: true +edmfx_upwinding: first_order +edmfx_entr_model: "Generalized" +edmfx_detr_model: "Generalized" +edmfx_nh_pressure: true +edmfx_sgs_mass_flux: true +edmfx_sgs_diffusive_flux: true +precip_model: 0M +dt: 50secs +t_end: 1days +toml: [toml/diagnostic_edmfx_box.toml] diff --git a/config/gpu_configs/gpu_aquaplanet_chap_ws_4process.yml b/config/gpu_configs/gpu_aquaplanet_chap_ws_4process.yml new file mode 100644 index 0000000000..fd666d4c08 --- /dev/null +++ b/config/gpu_configs/gpu_aquaplanet_chap_ws_4process.yml @@ -0,0 +1,29 @@ +job_id: gpu_aquaplanet_chap_ws_4process +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +output_default_diagnostics: false +h_elem: 60 +z_max: 55000.0 +z_elem: 63 +dz_bottom: 30.0 +dz_top: 3000.0 +moist: equil 
+surface_setup: DefaultMoninObukhov +rad: allskywithclear +idealized_insolation: false +dt_rad: 1hours +dt_cloud_fraction: 1hours +turbconv: diagnostic_edmfx +implicit_diffusion: true +approximate_linear_solve_iters: 2 +prognostic_tke: true +edmfx_upwinding: first_order +edmfx_entr_model: "Generalized" +edmfx_detr_model: "Generalized" +edmfx_nh_pressure: true +edmfx_sgs_mass_flux: true +edmfx_sgs_diffusive_flux: true +precip_model: 0M +dt: 50secs +t_end: 1days +toml: [toml/diagnostic_edmfx_box.toml] diff --git a/config/gpu_configs/gpu_aquaplanet_dyamond_2process.yml b/config/gpu_configs/gpu_aquaplanet_dyamond_2process.yml new file mode 100644 index 0000000000..4e7cc8b6bd --- /dev/null +++ b/config/gpu_configs/gpu_aquaplanet_dyamond_2process.yml @@ -0,0 +1,20 @@ +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +output_default_diagnostics: false +h_elem: 30 +z_max: 55000.0 +z_elem: 63 +dz_bottom: 30.0 +dz_top: 3000.0 +moist: "equil" +precip_model: "0M" +rad: "allskywithclear" +idealized_insolation: false +dt_rad: "1hours" +vert_diff: "true" +surface_setup: "DefaultMoninObukhov" +rayleigh_sponge: true +dt: "50secs" +t_end: "12hours" +job_id: "gpu_aquaplanet_dyamond_2process" +toml: [toml/longrun_aquaplanet_dyamond.toml] diff --git a/config/gpu_configs/gpu_aquaplanet_dyamond_4process.yml b/config/gpu_configs/gpu_aquaplanet_dyamond_4process.yml new file mode 100644 index 0000000000..8b671cb541 --- /dev/null +++ b/config/gpu_configs/gpu_aquaplanet_dyamond_4process.yml @@ -0,0 +1,20 @@ +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +output_default_diagnostics: false +h_elem: 30 +z_max: 55000.0 +z_elem: 63 +dz_bottom: 30.0 +dz_top: 3000.0 +moist: "equil" +precip_model: "0M" +rad: "allskywithclear" +idealized_insolation: false +dt_rad: "1hours" +vert_diff: "true" +surface_setup: "DefaultMoninObukhov" +rayleigh_sponge: true +dt: "50secs" +t_end: "12hours" +job_id: "gpu_aquaplanet_dyamond_4process" +toml: [toml/longrun_aquaplanet_dyamond.toml]