From 8aeaeed60eae3857a1f970da23e97235229ecfae Mon Sep 17 00:00:00 2001
From: Julia Sloan
Date: Wed, 6 Mar 2024 09:57:56 -0800
Subject: [PATCH] add strong scaling GPU AMIP

---
Review notes (this section is commentary and is not part of the commit):
 * Adds a GPU Buildkite pipeline with 1-, 2- and 4-task AMIP runs, one run
   config per task count, and makes coupler_driver.jl log simulated years
   per day (SYPD) computed from the walltime returned by solve_coupler!.
 * All three scaling steps launch experiments/AMIP/coupler_driver.jl. The
   4-GPU step originally pointed at experiments/AMIP/hybrid/driver.jl, a
   path this patch neither creates nor modifies, so it would not have run
   the driver that reports SYPD.
 * solve_coupler! now returns `walltime` instead of `cs`.
   NOTE(review): confirm no other caller relies on the old return value.
 * NOTE(review): line breaks, indentation and blank context lines were
   reconstructed from a whitespace-collapsed copy of this patch, and the
   `index` hash for pipeline.yml predates the driver-path correction.
   Verify with `git apply --check` before applying.

 .buildkite/gpu/pipeline.yml                   | 88 +++++++++++++++++++
 config/gpu_configs/gpu_amip_chap.yml          | 22 +++++
 config/gpu_configs/gpu_amip_chap_2process.yml | 22 +++++
 config/gpu_configs/gpu_amip_chap_4process.yml | 22 +++++
 experiments/AMIP/coupler_driver.jl            |  8 +-
 5 files changed, 160 insertions(+), 2 deletions(-)
 create mode 100644 .buildkite/gpu/pipeline.yml
 create mode 100644 config/gpu_configs/gpu_amip_chap.yml
 create mode 100644 config/gpu_configs/gpu_amip_chap_2process.yml
 create mode 100644 config/gpu_configs/gpu_amip_chap_4process.yml

diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml
new file mode 100644
index 0000000000..e1a952b0dd
--- /dev/null
+++ b/.buildkite/gpu/pipeline.yml
@@ -0,0 +1,88 @@
+agents:
+  queue: clima
+  slurm_mem: 8G
+  modules: common nsight-systems/2023.4.1
+
+env:
+  JULIA_CUDA_MEMORY_POOL: none
+  JULIA_MPI_HAS_CUDA: "true"
+  JULIA_NVTX_CALLBACKS: gc
+  JULIA_MAX_NUM_PRECOMPILE_FILES: 100
+  OPENBLAS_NUM_THREADS: 1
+  OMPI_MCA_opal_warn_on_missing_libcuda: 0
+  SLURM_KILL_BAD_EXIT: 1
+  SLURM_GPU_BIND: none # https://github.com/open-mpi/ompi/issues/11949#issuecomment-1737712291
+  GPU_CONFIG_PATH: "config/gpu_configs"
+  CLIMAATMOS_GC_NSTEPS: 10
+
+steps:
+  - label: "init :GPU:"
+    key: "init_gpu_env"
+    command:
+      - echo "--- Instantiate experiments/AMIP"
+      - julia --project=experiments/AMIP -e 'using Pkg; Pkg.instantiate(;verbose=true)'
+      - julia --project=experiments/AMIP -e 'using Pkg; Pkg.precompile()'
+      - julia --project=experiments/AMIP -e 'using CUDA; CUDA.precompile_runtime()'
+      - julia --project=experiments/AMIP -e 'using Pkg; Pkg.status()'
+
+      - echo "--- Download artifacts"
+      - "julia --project=artifacts -e 'using Pkg; Pkg.instantiate(;verbose=true)'"
+      - "julia --project=artifacts -e 'using Pkg; Pkg.precompile()'"
+      - "julia --project=artifacts -e 'using Pkg; Pkg.status()'"
+      - "julia --project=artifacts artifacts/download_artifacts.jl"
+
+    agents:
+      slurm_gpus: 1
+      slurm_cpus_per_task: 8
+    env:
+      JULIA_NUM_PRECOMPILE_TASKS: 8
+      JULIA_MAX_NUM_PRECOMPILE_FILES: 50
+
+  - wait
+
+  - group: "CHAP GPU strong scaling"
+    steps:
+
+      - label: "GPU AMIP CHAP - strong scaling - 1 GPU"
+        key: "gpu_amip_chap"
+        command:
+          - >
+            julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl
+            --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml
+        artifact_paths: "gpu_amip_chap/*"
+        agents:
+          slurm_gpus_per_task: 1
+          slurm_cpus_per_task: 4
+          slurm_ntasks: 1
+          slurm_mem: 32G
+          slurm_exclusive:
+
+      - label: "GPU AMIP CHAP - strong scaling - 2 GPUs"
+        key: "gpu_amip_chap_2process"
+        command:
+          - >
+            srun --cpu-bind=threads --cpus-per-task=4
+            julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl
+            --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml
+        artifact_paths: "gpu_amip_chap_2process/*"
+        agents:
+          slurm_gpus_per_task: 1
+          slurm_cpus_per_task: 4
+          slurm_ntasks: 2
+          slurm_mem: 32G
+          slurm_exclusive:
+
+      - label: "GPU AMIP CHAP - strong scaling - 4 GPUs"
+        key: "gpu_amip_chap_4process"
+        command:
+          - >
+            srun --cpu-bind=threads --cpus-per-task=4
+            julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl
+            --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml
+        artifact_paths: "gpu_amip_chap_4process/*"
+        agents:
+          slurm_gpus_per_task: 1
+          slurm_cpus_per_task: 4
+          slurm_ntasks: 4
+          slurm_mem: 32G
+          slurm_exclusive:
diff --git a/config/gpu_configs/gpu_amip_chap.yml b/config/gpu_configs/gpu_amip_chap.yml
new file mode 100644
index 0000000000..c0a122f78a
--- /dev/null
+++ b/config/gpu_configs/gpu_amip_chap.yml
@@ -0,0 +1,22 @@
+anim: false
+apply_limiter: false
+atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap.yml"
+dt: "100secs"
+dt_cloud_fraction: "1hours"
+dt_cpl: 100
+dt_rad: "1hours"
+dt_save_state_to_disk: "Inf"
+dt_save_to_sol: "Inf"
+energy_check: false
+evolving_ocean: false
+hourly_checkpoint: false
+job_id: "gpu_amip_chap"
+land_albedo_type: "map_static"
+mode_name: "amip"
+mono_surface: false
+run_name: "gpu_amip_chap"
+start_date: "19790301"
+surface_setup: "PrescribedSurface"
+t_end: "1days"
+turb_flux_partition: "CombinedStateFluxes"
+vert_diff: "true"
diff --git a/config/gpu_configs/gpu_amip_chap_2process.yml b/config/gpu_configs/gpu_amip_chap_2process.yml
new file mode 100644
index 0000000000..bb8075fae1
--- /dev/null
+++ b/config/gpu_configs/gpu_amip_chap_2process.yml
@@ -0,0 +1,22 @@
+anim: false
+apply_limiter: false
+atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap_2process.yml"
+dt: "100secs"
+dt_cloud_fraction: "1hours"
+dt_cpl: 100
+dt_rad: "1hours"
+dt_save_state_to_disk: "Inf"
+dt_save_to_sol: "Inf"
+energy_check: false
+evolving_ocean: false
+hourly_checkpoint: false
+job_id: "gpu_amip_chap_2process"
+land_albedo_type: "map_static"
+mode_name: "amip"
+mono_surface: false
+run_name: "gpu_amip_chap_2process"
+start_date: "19790301"
+surface_setup: "PrescribedSurface"
+t_end: "1days"
+turb_flux_partition: "CombinedStateFluxes"
+vert_diff: "true"
diff --git a/config/gpu_configs/gpu_amip_chap_4process.yml b/config/gpu_configs/gpu_amip_chap_4process.yml
new file mode 100644
index 0000000000..000bf8e85d
--- /dev/null
+++ b/config/gpu_configs/gpu_amip_chap_4process.yml
@@ -0,0 +1,22 @@
+anim: false
+apply_limiter: false
+atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap_4process.yml"
+dt: "100secs"
+dt_cloud_fraction: "1hours"
+dt_cpl: 100
+dt_rad: "1hours"
+dt_save_state_to_disk: "Inf"
+dt_save_to_sol: "Inf"
+energy_check: false
+evolving_ocean: false
+hourly_checkpoint: false
+job_id: "gpu_amip_chap_4process"
+land_albedo_type: "map_static"
+mode_name: "amip"
+mono_surface: false
+run_name: "gpu_amip_chap_4process"
+start_date: "19790301"
+surface_setup: "PrescribedSurface"
+t_end: "1days"
+turb_flux_partition: "CombinedStateFluxes"
+vert_diff: "true"
diff --git a/experiments/AMIP/coupler_driver.jl b/experiments/AMIP/coupler_driver.jl
index 15301cae09..a4aea6f38f 100644
--- a/experiments/AMIP/coupler_driver.jl
+++ b/experiments/AMIP/coupler_driver.jl
@@ -678,7 +678,7 @@ function solve_coupler!(cs)
     end
     @show walltime
 
-    return cs
+    return walltime
 end
 
 ## exit if running performance anaysis #hide
@@ -687,7 +687,11 @@ if haskey(ENV, "CI_PERF_SKIP_COUPLED_RUN") #hide
 end #hide
 
 ## run the coupled simulation
-solve_coupler!(cs);
+walltime = solve_coupler!(cs);
+
+# Show the simulated years per day of the simulation
+es = CA.EfficiencyStats(tspan, walltime)
+@info "SYPD: $(CA.simulated_years_per_day(es))"
 
 #=
 ## Postprocessing