Skip to content

Commit

Permalink
Merge #443
Browse files: browse the repository at this point in the history.
443: unify buildkite pipelines r=juliasloan25 a=juliasloan25



Co-authored-by: Julia Sloan <jsloan@caltech.edu>
  • Loading branch information
bors[bot] and juliasloan25 authored Oct 5, 2023
2 parents (2970957 + 4cce0a6); commit 15314cf
Showing 6 changed files with 34 additions and 35 deletions.
55 changes: 29 additions & 26 deletions .buildkite/longruns/pipeline.yml
Original file line number | Diff line number | Diff line change
@@ -1,22 +1,18 @@
agents:
queue: central
slurm_time: 24:00:00
modules: julia/1.9.3 cuda/12.2 ucx/1.14.1_cuda-12.2 openmpi/4.1.5_cuda-12.2 hdf5/1.12.2-ompi415 nsight-systems/2023.2.1

env:
JULIA_VERSION: "1.9.3"
MPI_IMPL: "openmpi"
OPENMPI_VERSION: "4.1.1"
CUDA_VERSION: "12.2"
JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite"
OPENBLAS_NUM_THREADS: 1
CLIMATEMACHINE_SETTINGS_FIX_RNG_SEED: "true"
BUILDKITE_COMMIT: "${BUILDKITE_COMMIT}"
BUILDKITE_BRANCH: "${BUILDKITE_BRANCH}"
JULIA_NVTX_CALLBACKS: gc
OMPI_MCA_opal_warn_on_missing_libcuda: 0
JULIA_MAX_NUM_PRECOMPILE_FILES: 100
GKSwstype: 100

CONFIG_PATH: "config/longrun_configs"
PERF_CONFIG_PATH: "config/perf_configs"
# JULIA_DEPOT_PATH: "${BUILDKITE_BUILD_PATH}/${BUILDKITE_PIPELINE_SLUG}/depot/cpu"

agents:
config: cpu
queue: central
slurm_ntasks: 1
slurm_time: 24:00:00

timeout_in_minutes: 1440

@@ -63,7 +59,8 @@ steps:
env:
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks: 1
slurm_ntasks_per_node: 1
slurm_nodes: 1
slurm_mem_per_cpu: 16G

# - label: "MPI AMIP FINE: longrun" # unstable after 6 months
@@ -74,7 +71,8 @@ steps:
# CLIMACORE_DISTRIBUTED: "MPI"
# BUILD_HISTORY_HANDLE: ""
# agents:
# slurm_ntasks: 64
# slurm_ntasks_per_node: 16
# slurm_nodes: 4
# slurm_mem_per_cpu: 16G

- label: "MPI AMIP FINE: target longrun"
@@ -85,7 +83,8 @@ steps:
CLIMACORE_DISTRIBUTED: "MPI"
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks: 32
slurm_ntasks_per_node: 16
slurm_nodes: 4
slurm_mem_per_cpu: 16G

# MPI performance scaling (10 days)
@@ -97,9 +96,9 @@ steps:
CLIMACORE_DISTRIBUTED: "MPI"
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks: 64
slurm_ntasks_per_node: 16
slurm_nodes: 4
slurm_mem_per_cpu: 16G
slurm_tasks_per_node: 8

- label: "MPI AMIP FINE: n32"
key: "mpi_amip_fine_n32"
@@ -109,9 +108,9 @@ steps:
CLIMACORE_DISTRIBUTED: "MPI"
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks: 32
slurm_ntasks_per_node: 8
slurm_nodes: 4
slurm_mem_per_cpu: 16G
slurm_tasks_per_node: 8

- label: "MPI AMIP FINE: n8"
key: "mpi_amip_fine_n8"
@@ -121,9 +120,9 @@ steps:
CLIMACORE_DISTRIBUTED: "MPI"
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks: 8
slurm_ntasks_per_node: 8
slurm_nodes: 1
slurm_mem_per_cpu: 16G
slurm_tasks_per_node: 8

- label: "MPI AMIP FINE: n2" # 10d take 21h, so reducing to 1d
key: "mpi_amip_fine_n2"
@@ -133,9 +132,9 @@ steps:
CLIMACORE_DISTRIBUTED: "MPI"
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks: 2
slurm_ntasks_per_node: 2
slurm_nodes: 1
slurm_mem_per_cpu: 16G
slurm_tasks_per_node: 2

- label: "MPI AMIP FINE: n1" # also reported by longruns with a flame graph; 10d take 21h, so reducing to 1d
key: "mpi_amip_fine_n1"
@@ -144,14 +143,18 @@ steps:
env:
BUILD_HISTORY_HANDLE: ""
agents:
slurm_ntasks_per_node: 1
slurm_nodes: 1
slurm_mem_per_cpu: 16G

# mpi_amip_fine_n1 flame graph report (NB: arguments passed from the ci pipeline.yml)
- label: ":rocket: performance: flame graph diff: perf_target_amip_n1_shortrun"
command: "julia --color=yes --project=perf perf/flame_diff.jl --config_file $PERF_CONFIG_PATH/perf_diff_target_amip_n1_shortrun.yml"
artifact_paths: "perf/output/perf_diff_target_amip_n1_shortrun/*"
agents:
slurm_mem: 20GB
slurm_ntasks_per_node: 1
slurm_nodes: 1
slurm_mem_per_cpu: 16GB

- wait

4 changes: 0 additions & 4 deletions .buildkite/pipeline.yml
Original file line number | Diff line number | Diff line change
@@ -10,16 +10,12 @@ env:
JULIA_NVTX_CALLBACKS: gc
OMPI_MCA_opal_warn_on_missing_libcuda: 0
JULIA_MAX_NUM_PRECOMPILE_FILES: 100
JULIA_DEPOT_PATH: "${BUILDKITE_BUILD_PATH}/${BUILDKITE_PIPELINE_SLUG}/depot/cpu"
GKSwstype: 100

CONFIG_PATH: "config/model_configs"
PERF_CONFIG_PATH: "config/perf_configs"
MPI_CONFIG_PATH: "config/mpi_configs"

BUILDKITE_COMMIT: "${BUILDKITE_COMMIT}"
BUILDKITE_BRANCH: "${BUILDKITE_BRANCH}"

timeout_in_minutes: 1440

steps:
4 changes: 2 additions & 2 deletions config/longrun_configs/amip_longrun_target.yml
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,8 @@ kappa_4: 3e16
rayleigh_sponge: true
alpha_rayleigh_uh: 0
dt: "150secs"
t_end: "140days"
t_end: "100days" # TODO this has been decreased from 140 days to avoid instability
job_id: "amip_longrun_target"
dt_save_to_sol: "5days"
dt_save_to_disk: "1days"
apply_limiter: true
apply_limiter: true
2 changes: 1 addition & 1 deletion config/longrun_configs/amip_n2_shortrun.yml
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,6 @@ energy_check: false
mode_name: "amip"
t_end: "1days"
dt_save_to_sol: "100days"
mono_surface: "false"
mono_surface: false
apply_limiter: true
precip_model: "0M"
2 changes: 1 addition & 1 deletion config/longrun_configs/amip_n32_shortrun.yml
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
job_ide: "amip_n32_shortrun"
job_id: "amip_n32_shortrun"
run_name: "amip_n32_shortrun"
moist: "equil"
vert_diff: "true"
2 changes: 1 addition & 1 deletion config/longrun_configs/amip_n8_shortrun.yml
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,6 @@ energy_check: false
mode_name: "amip"
t_end: "10days"
dt_save_to_sol: "100days"
mono_surface: "false"
mono_surface: false
apply_limiter: true
precip_model: "0M"

0 comments on commit 15314cf

Please sign in to comment.