diff --git a/.buildkite/longruns/pipeline.yml b/.buildkite/longruns/pipeline.yml index a1c9cd8ab2..aec6781103 100644 --- a/.buildkite/longruns/pipeline.yml +++ b/.buildkite/longruns/pipeline.yml @@ -1,6 +1,5 @@ agents: queue: central - slurm_mem: 20G slurm_time: 24:00:00 modules: julia/1.9.3 cuda/12.2 ucx/1.14.1_cuda-12.2 openmpi/4.1.5_cuda-12.2 hdf5/1.12.2-ompi415 nsight-systems/2023.2.1 @@ -14,7 +13,6 @@ env: CONFIG_PATH: "config/longrun_configs" PERF_CONFIG_PATH: "config/perf_configs" - timeout_in_minutes: 1440 steps: @@ -63,6 +61,7 @@ steps: agents: slurm_ntasks: 1 slurm_time: 24:00:00 + slurm_mem_per_cpu: 20G # DYAMOND AMIP: 1 day (convection resolving) @@ -76,6 +75,7 @@ steps: BUILD_HISTORY_HANDLE: "" agents: slurm_time: 24:00:00 + slurm_mem_per_cpu: 20G # mid-resolution AMIP: longrun (140 days) - label: "MPI AMIP FINE: target longrun" @@ -88,6 +88,7 @@ steps: agents: slurm_ntasks: 32 slurm_time: 24:00:00 + slurm_mem_per_cpu: 20G # mid-resolution AMIP: MPI performance scaling (10 days) - label: "MPI AMIP FINE: n64" @@ -101,6 +102,7 @@ steps: slurm_ntasks: 64 slurm_time: 24:00:00 slurm_tasks_per_node: 8 + slurm_mem_per_cpu: 20G - label: "MPI AMIP FINE: n32" key: "mpi_amip_fine_n32" @@ -113,6 +115,7 @@ steps: slurm_ntasks: 32 slurm_time: 24:00:00 slurm_tasks_per_node: 8 + slurm_mem_per_cpu: 20G - label: "MPI AMIP FINE: n8" key: "mpi_amip_fine_n8" @@ -125,6 +128,7 @@ steps: slurm_ntasks: 8 slurm_time: 24:00:00 slurm_tasks_per_node: 8 + slurm_mem_per_cpu: 20G - label: "MPI AMIP FINE: n2" # 10d take 21h, so reducing to 1d key: "mpi_amip_fine_n2" @@ -137,6 +141,7 @@ steps: slurm_ntasks: 2 slurm_time: 24:00:00 slurm_tasks_per_node: 2 + slurm_mem_per_cpu: 20G - label: "MPI AMIP FINE: n1" # also reported by longruns with a flame graph; 10d take 21h, so reducing to 1d key: "mpi_amip_fine_n1" @@ -147,6 +152,7 @@ steps: agents: slurm_ntasks: 1 slurm_time: 24:00:00 + slurm_mem_per_cpu: 20G - label: "MPI AMIP FINE: n1 no couple" # sim time = Δt_cpl (~ benchmarking with standalone models) key: "mpi_amip_fine_n1_nocouple" @@ -157,6 +163,7 @@ steps: agents: slurm_ntasks: 1 slurm_time: 24:00:00 + slurm_mem_per_cpu: 20G # mpi_amip_fine_n1 flame graph report (NB: arguments passed from the ci pipeline.yml) - label: ":rocket: performance: flame graph diff: perf_target_amip_n1_shortrun" @@ -165,6 +172,7 @@ steps: agents: slurm_ntasks: 1 slurm_time: 24:00:00 + slurm_mem_per_cpu: 20G - wait