From 844ebd8eb877ebf2a6a819bc94b58fc18c6c3a14 Mon Sep 17 00:00:00 2001
From: Zhaoyi Shen <11598433+szy21@users.noreply.github.com>
Date: Fri, 15 Dec 2023 15:07:55 -0800
Subject: [PATCH] add gpu longrun pipeline

---
Reviewer notes (free text in this position is not part of the commit message):
  * .buildkite/gpu_pipeline: adds a top-level `timeout_in_minutes: 1440`.
  * .buildkite/longruns: removes four jobs (longrun_bw_rhoe_highres,
    longrun_bw_rhoe_equil_highres, longrun_hs_rhoe_dry_nz63_55km_rs35km,
    longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km).
  * .buildkite/longruns_gpu: new pipeline that re-adds the same four jobs,
    each requesting `slurm_gpus: 1` / `slurm_cpus_per_task: 4` in place of
    `slurm_ntasks` (and without CLIMACORE_DISTRIBUTED / slurm_mem_per_cpu).
  * config/longrun_configs/..._0M_55km_rs35km.yml: kappa_4 2.0e16 -> 1.0e16.
  * NOTE(review): the three leading context lines of the longruns hunk at
    old line 49 are taken to be blank / `    steps:` / blank, as implied by
    the hunk's line counts; confirm against the pre-image if it fails to apply.

 .buildkite/gpu_pipeline/pipeline.yml          |  2 +
 .buildkite/longruns/pipeline.yml              | 52 ----------
 .buildkite/longruns_gpu/pipeline.yml          | 95 +++++++++++++++++++
 ...hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml |  2 +-
 4 files changed, 98 insertions(+), 53 deletions(-)
 create mode 100644 .buildkite/longruns_gpu/pipeline.yml

diff --git a/.buildkite/gpu_pipeline/pipeline.yml b/.buildkite/gpu_pipeline/pipeline.yml
index 194030502fc..896e2c48d8f 100644
--- a/.buildkite/gpu_pipeline/pipeline.yml
+++ b/.buildkite/gpu_pipeline/pipeline.yml
@@ -18,6 +18,8 @@ env:
   MPI_CONFIG_PATH: "config/mpi_configs"
   CLIMAATMOS_GC_NSTEPS: 10
 
+timeout_in_minutes: 1440
+
 steps:
   - label: "init :GPU:"
     key: "init_gpu_env"
diff --git a/.buildkite/longruns/pipeline.yml b/.buildkite/longruns/pipeline.yml
index 56441e4600c..6fc22bbfff6 100644
--- a/.buildkite/longruns/pipeline.yml
+++ b/.buildkite/longruns/pipeline.yml
@@ -49,30 +49,6 @@ steps:
 
     steps:
 
-      - label: ":computer: baroclinic wave (ρe_tot) high resolution"
-        command:
-          - srun julia --project=examples examples/hybrid/driver.jl --config_file $CONFIG_PATH/$$JOB_NAME.yml
-          - julia --color=yes --project=examples post_processing/remap/remap_pipeline.jl --data_dir $$JOB_NAME --out_dir $$JOB_NAME
-          - julia --color=yes --project=examples post_processing/plot/plot_pipeline.jl --nc_dir $$JOB_NAME --fig_dir $$JOB_NAME --case_name dry_baroclinic_wave
-        artifact_paths: "$$JOB_NAME/*"
-        agents:
-          slurm_ntasks: 32
-          slurm_time: 24:00:00
-        env:
-          JOB_NAME: "longrun_bw_rhoe_highres"
-
-      - label: ":computer: no lim ARS baroclinic wave (ρe_tot) equilmoist high resolution"
-        command:
-          - srun julia --project=examples examples/hybrid/driver.jl --config_file $CONFIG_PATH/$$JOB_NAME.yml
-          - julia --color=yes --project=examples post_processing/remap/remap_pipeline.jl --data_dir $$JOB_NAME --out_dir $$JOB_NAME
-          - julia --color=yes --project=examples post_processing/plot/plot_pipeline.jl --nc_dir $$JOB_NAME --fig_dir $$JOB_NAME --case_name moist_baroclinic_wave
-        artifact_paths: "$$JOB_NAME/*"
-        agents:
-          slurm_ntasks: 32
-          slurm_time: 24:00:00
-        env:
-          JOB_NAME: "longrun_bw_rhoe_equil_highres"
-
       - label: ":computer: lim ARS zalesak baroclinic wave (ρe_tot) equilmoist high resolution"
         command:
           - srun julia --project=examples examples/hybrid/driver.jl --config_file $CONFIG_PATH/$$JOB_NAME.yml
@@ -98,34 +74,6 @@ steps:
         env:
           JOB_NAME: "longrun_ssp_bw_rhoe_equil_highres"
 
-      - label: ":computer: held-suarez, dry, high-topped (55km), high-sponge (35km), helem_16 np_3"
-        command:
-          - srun julia --project=examples examples/hybrid/driver.jl --config_file $CONFIG_PATH/$$JOB_NAME.yml
-          - julia --color=yes --project=examples post_processing/remap/remap_pipeline.jl --data_dir $$JOB_NAME --out_dir $$JOB_NAME
-          - julia --color=yes --project=examples post_processing/plot/plot_pipeline.jl --nc_dir $$JOB_NAME --fig_dir $$JOB_NAME --case_name dry_held_suarez
-        artifact_paths: "$$JOB_NAME/*"
-        env:
-          CLIMACORE_DISTRIBUTED: "MPI"
-          JOB_NAME: "longrun_hs_rhoe_dry_nz63_55km_rs35km"
-        agents:
-          slurm_ntasks: 64
-          slurm_mem_per_cpu: 16GB
-          slurm_time: 24:00:00
-
-      - label: ":computer: held-suarez, equilmoist, high-topped (55km), high-sponge (35km), helem_16 np_3"
-        command:
-          - srun julia --project=examples examples/hybrid/driver.jl --config_file $CONFIG_PATH/$$JOB_NAME.yml
-          - julia --color=yes --project=examples post_processing/remap/remap_pipeline.jl --data_dir $$JOB_NAME --out_dir $$JOB_NAME
-          - julia --color=yes --project=examples post_processing/plot/plot_pipeline.jl --nc_dir $$JOB_NAME --fig_dir $$JOB_NAME --case_name aquaplanet
-        artifact_paths: "$$JOB_NAME/*"
-        env:
-          CLIMACORE_DISTRIBUTED: "MPI"
-          JOB_NAME: "longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
-        agents:
-          slurm_ntasks: 64
-          slurm_mem_per_cpu: 16GB
-          slurm_time: 24:00:00
-
       - label: ":computer: aquaplanet, equilmoist, high-topped (55km), gray-radiation, vertdiff, high-sponge (35km), helem_16 np_3"
         command:
           - srun julia --project=examples examples/hybrid/driver.jl --config_file $CONFIG_PATH/$$JOB_NAME.yml
diff --git a/.buildkite/longruns_gpu/pipeline.yml b/.buildkite/longruns_gpu/pipeline.yml
new file mode 100644
index 00000000000..c4d2c596184
--- /dev/null
+++ b/.buildkite/longruns_gpu/pipeline.yml
@@ -0,0 +1,95 @@
+agents:
+  queue: clima
+  slurm_mem: 8G
+  modules: julia/1.9.4 cuda/julia-pref openmpi/4.1.5-mpitrampoline nsight-systems/2023.4.1
+
+env:
+  JULIA_CUDA_MEMORY_POOL: none
+  JULIA_MPI_HAS_CUDA: "true"
+  JULIA_NVTX_CALLBACKS: gc
+  JULIA_MAX_NUM_PRECOMPILE_FILES: 100
+  OPENBLAS_NUM_THREADS: 1
+  OMPI_MCA_opal_warn_on_missing_libcuda: 0
+  SLURM_KILL_BAD_EXIT: 1
+  SLURM_GPU_BIND: none # https://github.com/open-mpi/ompi/issues/11949#issuecomment-1737712291
+  CONFIG_PATH: "config/longrun_configs"
+  CLIMAATMOS_GC_NSTEPS: 10
+
+timeout_in_minutes: 1440
+
+steps:
+  - label: "init :GPU:"
+    key: "init_gpu_env"
+    command:
+      - echo "--- Instantiate examples"
+      - julia --project=examples -e 'using Pkg; Pkg.instantiate(;verbose=true)'
+      - julia --project=examples -e 'using Pkg; Pkg.precompile()'
+      - julia --project=examples -e 'using CUDA; CUDA.precompile_runtime()'
+      - julia --project=examples -e 'using Pkg; Pkg.status()'
+
+      - echo "--- Download artifacts"
+      - julia --project=examples artifacts/download_artifacts.jl
+
+    agents:
+      slurm_gpus: 1
+      slurm_cpus_per_task: 8
+    env:
+      JULIA_NUM_PRECOMPILE_TASKS: 8
+      JULIA_MAX_NUM_PRECOMPILE_FILES: 50
+
+  - wait
+
+  - group: "Targeted resolution AMIP long runs"
+    steps:
+
+      - label: ":computer: baroclinic wave (ρe_tot) high resolution"
+        command:
+          - srun julia --project=examples examples/hybrid/driver.jl --config_file $CONFIG_PATH/$$JOB_NAME.yml
+          - julia --color=yes --project=examples post_processing/remap/remap_pipeline.jl --data_dir $$JOB_NAME --out_dir $$JOB_NAME
+          - julia --color=yes --project=examples post_processing/plot/plot_pipeline.jl --nc_dir $$JOB_NAME --fig_dir $$JOB_NAME --case_name dry_baroclinic_wave
+        artifact_paths: "$$JOB_NAME/*"
+        agents:
+          slurm_gpus: 1
+          slurm_cpus_per_task: 4
+          slurm_time: 24:00:00
+        env:
+          JOB_NAME: "longrun_bw_rhoe_highres"
+
+      - label: ":computer: no lim ARS baroclinic wave (ρe_tot) equilmoist high resolution"
+        command:
+          - srun julia --project=examples examples/hybrid/driver.jl --config_file $CONFIG_PATH/$$JOB_NAME.yml
+          - julia --color=yes --project=examples post_processing/remap/remap_pipeline.jl --data_dir $$JOB_NAME --out_dir $$JOB_NAME
+          - julia --color=yes --project=examples post_processing/plot/plot_pipeline.jl --nc_dir $$JOB_NAME --fig_dir $$JOB_NAME --case_name moist_baroclinic_wave
+        artifact_paths: "$$JOB_NAME/*"
+        agents:
+          slurm_gpus: 1
+          slurm_cpus_per_task: 4
+          slurm_time: 24:00:00
+        env:
+          JOB_NAME: "longrun_bw_rhoe_equil_highres"
+
+      - label: ":computer: held-suarez, dry, high-topped (55km), high-sponge (35km), helem_16 np_3"
+        command:
+          - srun julia --project=examples examples/hybrid/driver.jl --config_file $CONFIG_PATH/$$JOB_NAME.yml
+          - julia --color=yes --project=examples post_processing/remap/remap_pipeline.jl --data_dir $$JOB_NAME --out_dir $$JOB_NAME
+          - julia --color=yes --project=examples post_processing/plot/plot_pipeline.jl --nc_dir $$JOB_NAME --fig_dir $$JOB_NAME --case_name dry_held_suarez
+        artifact_paths: "$$JOB_NAME/*"
+        agents:
+          slurm_gpus: 1
+          slurm_cpus_per_task: 4
+          slurm_time: 24:00:00
+        env:
+          JOB_NAME: "longrun_hs_rhoe_dry_nz63_55km_rs35km"
+
+      - label: ":computer: held-suarez, equilmoist, high-topped (55km), high-sponge (35km), helem_16 np_3"
+        command:
+          - srun julia --project=examples examples/hybrid/driver.jl --config_file $CONFIG_PATH/$$JOB_NAME.yml
+          - julia --color=yes --project=examples post_processing/remap/remap_pipeline.jl --data_dir $$JOB_NAME --out_dir $$JOB_NAME
+          - julia --color=yes --project=examples post_processing/plot/plot_pipeline.jl --nc_dir $$JOB_NAME --fig_dir $$JOB_NAME --case_name aquaplanet
+        artifact_paths: "$$JOB_NAME/*"
+        agents:
+          slurm_gpus: 1
+          slurm_cpus_per_task: 4
+          slurm_time: 24:00:00
+        env:
+          JOB_NAME: "longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
diff --git a/config/longrun_configs/longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml b/config/longrun_configs/longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml
index 0be31066aaf..01b9251124e 100644
--- a/config/longrun_configs/longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml
+++ b/config/longrun_configs/longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml
@@ -6,7 +6,7 @@ z_elem: 63
 dz_bottom: 30.0
 dz_top: 3000.0
 z_max: 55000.0
-kappa_4: 2.0e16
+kappa_4: 1.0e16
 vert_diff: "true"
 moist: "equil"
 precip_model: "0M"