From e579d8466e8a4bf75e9a42399c7248032e2a2e26 Mon Sep 17 00:00:00 2001 From: LenkaNovak Date: Mon, 18 Sep 2023 20:32:14 -0700 Subject: [PATCH] srun --- .buildkite/pipeline.yml | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index c91ccfba8b..30aed55522 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,3 +1,8 @@ +agents: + queue: central + slurm_mem: 8G + modules: julia/1.9.3 cuda/11.8 ucx/1.14.1_cuda-11.8 openmpi/4.1.5_cuda-11.8 hdf5/1.12.2-ompi415 nsight-systems/2023.2.1 + env: JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite" OPENBLAS_NUM_THREADS: 1 @@ -9,19 +14,9 @@ env: CONFIG_PATH: "config/model_configs" PERF_CONFIG_PATH: "config/perf_configs" MPI_CONFIG_PATH: "config/mpi_configs" + SLURM_KILL_BAD_EXIT: 1 BUILDKITE_COMMIT: "${BUILDKITE_COMMIT}" BUILDKITE_BRANCH: "${BUILDKITE_BRANCH}" - SLURM_KILL_BAD_EXIT: 1 - -agents: - config: cpu - queue: central - modules: julia/1.9.3 cuda/11.8 ucx/1.14.1_cuda-11.8 openmpi/4.1.5_cuda-11.8 hdf5/1.12.2-ompi415 nsight-systems/2023.2.1 - slurm_ntasks: 1 - slurm_time: 24:00:00 - slurm_mem: 8G - slurm_gpus: 1 - slurm_cpus_per_task: 8 timeout_in_minutes: 1440 @@ -68,6 +63,7 @@ steps: agents: slurm_cpus_per_task: 8 + slurm_gpus: 1 env: JULIA_NUM_PRECOMPILE_TASKS: 8 JULIA_MAX_NUM_PRECOMPILE_FILES: 50 @@ -104,15 +100,13 @@ steps: - label: "MPI Checkpointer unit tests" key: "checkpointer_mpi_tests" - command: "mpiexec julia --color=yes --project=test/ test/mpi_tests/checkpointer_mpi_tests.jl --run_name checkpointer_mpi --job_id checkpointer_mpi" + command: "srun julia --color=yes --project=test/ test/mpi_tests/checkpointer_mpi_tests.jl --run_name checkpointer_mpi --job_id checkpointer_mpi" timeout_in_minutes: 20 env: CLIMACORE_DISTRIBUTED: "MPI" agents: - config: cpu - queue: central - slurm_nodes: 3 - slurm_tasks_per_node: 1 + slurm_ntasks: 2 + slurm_mem: 16GB - label: "Perf flame graph diff tests" command: "julia --color=yes --project=perf/ perf/flame_test.jl --run_name flame_test --job_id flame_perf_target"