From e9538ec238bac20d2b28805901bb774fa2775f5d Mon Sep 17 00:00:00 2001 From: sriharshakandala Date: Wed, 27 Sep 2023 15:37:23 -0700 Subject: [PATCH] Add moist Held-Suarez and distributed baroclinic wave simulations to GPU pipeline --- .buildkite/gpu_pipeline/pipeline.yml | 46 ++++++++++++++++--- ...hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml | 16 +++++++ ..._gpu_implicit_baroclinic_wave_4process.yml | 7 +++ 3 files changed, 62 insertions(+), 7 deletions(-) create mode 100644 config/gpu_configs/gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml create mode 100644 config/gpu_configs/target_gpu_implicit_baroclinic_wave_4process.yml diff --git a/.buildkite/gpu_pipeline/pipeline.yml b/.buildkite/gpu_pipeline/pipeline.yml index ad065e1a0b..e4d401a82b 100644 --- a/.buildkite/gpu_pipeline/pipeline.yml +++ b/.buildkite/gpu_pipeline/pipeline.yml @@ -1,13 +1,14 @@ agents: queue: clima slurm_mem: 8G - modules: julia/1.9.3 cuda/julia-pref openmpi/4.1.5-cuda + modules: julia/1.9.3 cuda/julia-pref openmpi/4.1.5 env: OPENBLAS_NUM_THREADS: 1 JULIA_NVTX_CALLBACKS: gc JULIA_MAX_NUM_PRECOMPILE_FILES: 100 GPU_CONFIG_PATH: "config/gpu_configs/" + SLURM_GPU_BIND: none # https://github.com/open-mpi/ompi/issues/11949#issuecomment-1737712291 steps: - label: "init :GPU:" @@ -21,6 +22,7 @@ steps: - echo "--- Configure CUDA" # force the initialization of the CUDA runtime as it is lazily loaded by default - "julia --project -e 'using CUDA; CUDA.precompile_runtime()'" + - julia --project -e 'using CUDA; CUDA.versioninfo()' - echo "--- Instantiate examples" - "julia --project=examples -e 'using Pkg; Pkg.instantiate(;verbose=true)'" @@ -29,6 +31,7 @@ steps: - echo "--- Download artifacts" - "julia --project=examples artifacts/download_artifacts.jl" + agents: slurm_gpus: 1 env: @@ -40,13 +43,42 @@ steps: - group: "GPU target simulations" steps: - - label: "target_gpu_implicit_baroclinic_wave" - command: > - julia --project -e 'using CUDA; CUDA.versioninfo()' - - nsys profile --trace=nvtx,cuda --output=target_gpu_implicit_baroclinic_wave/report julia --color=yes --project=examples examples/hybrid/driver.jl - --config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave.yml + - label: "dry baroclinic wave" + key: "target_gpu_implicit_baroclinic_wave" + command: + - mkdir -p target_gpu_implicit_baroclinic_wave + - > + nsys profile --trace=nvtx,cuda --output=target_gpu_implicit_baroclinic_wave/report + julia --color=yes --project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave.yml artifact_paths: "target_gpu_implicit_baroclinic_wave/*" agents: slurm_gpus: 1 slurm_time: 23:00:00 + + - label: "moist Held-Suarez" + key: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km" + command: + - mkdir -p gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km + - > + nsys profile --trace=nvtx,cuda --output=gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/report + julia --color=yes --project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml + artifact_paths: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/*" + agents: + slurm_gpus: 1 + slurm_time: 23:00:00 + + - label: "dry baroclinic wave - 4 gpus" + key: "target_gpu_implicit_baroclinic_wave_4process" + command: + - mkdir -p target_gpu_implicit_baroclinic_wave_4process + - > + srun + nsys profile --trace=nvtx,cuda,mpi --output=target_gpu_implicit_baroclinic_wave_4process/report-%q{PMI_RANK} + julia --color=yes --project=examples examples/hybrid/driver.jl + --config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave_4process.yml + artifact_paths: "target_gpu_implicit_baroclinic_wave_4process/*" + agents: + slurm_gpus_per_task: 1 + slurm_ntasks: 4 diff --git a/config/gpu_configs/gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml b/config/gpu_configs/gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml new file mode 100644 index 0000000000..4a2eb8b36d --- /dev/null +++ b/config/gpu_configs/gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml @@ -0,0 +1,16 @@ +dt_save_to_disk: "10days" +dt: "150secs" +t_end: "300days" +h_elem: 16 +z_elem: 63 +dz_bottom: 30.0 +dz_top: 3000.0 +z_max: 55000.0 +kappa_4: 2.0e16 +vert_diff: "true" +moist: "equil" +precip_model: "0M" +rayleigh_sponge: true +forcing: "held_suarez" +job_id: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km" +toml: [toml/longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.toml] diff --git a/config/gpu_configs/target_gpu_implicit_baroclinic_wave_4process.yml b/config/gpu_configs/target_gpu_implicit_baroclinic_wave_4process.yml new file mode 100644 index 0000000000..58206770cb --- /dev/null +++ b/config/gpu_configs/target_gpu_implicit_baroclinic_wave_4process.yml @@ -0,0 +1,7 @@ +h_elem: 30 +initial_condition: "DryBaroclinicWave" +t_end: "1days" +z_elem: 45 +dt: "150secs" +dt_save_to_sol: "Inf" +job_id: "target_gpu_implicit_baroclinic_wave_4process"