From b9ad06a68e0cd73f5810dd6509f1a7ca0ea17fc1 Mon Sep 17 00:00:00 2001 From: Nat Efrat-Henrici <60049837+nefrathenrici@users.noreply.github.com> Date: Fri, 21 Jun 2024 14:28:08 -0700 Subject: [PATCH] Add Clima GPU server test (#93) * Add Clima GPU server test * remove partition * Tweak slurm job control * Try manually unsetting env vars --- .buildkite/clima_server_test/pipeline.yml | 28 +++++++++++++++++++ .buildkite/pipeline.yml | 2 +- src/backends.jl | 20 +++++++------ src/ekp_interface.jl | 2 +- src/slurm.jl | 26 +++++++++-------- ...altech_hpc_e2e.jl => slurm_backend_e2e.jl} | 10 ++----- test/slurm_unit_tests.jl | 10 ++----- 7 files changed, 62 insertions(+), 36 deletions(-) create mode 100644 .buildkite/clima_server_test/pipeline.yml rename test/{caltech_hpc_e2e.jl => slurm_backend_e2e.jl} (88%) diff --git a/.buildkite/clima_server_test/pipeline.yml b/.buildkite/clima_server_test/pipeline.yml new file mode 100644 index 00000000..d711c824 --- /dev/null +++ b/.buildkite/clima_server_test/pipeline.yml @@ -0,0 +1,28 @@ +agents: + queue: clima + modules: julia/1.10.0 cuda/julia-pref openmpi/4.1.5-mpitrampoline nsight-systems/2024.2.1 + +env: + JULIA_MPI_HAS_CUDA: "true" + JULIA_NVTX_CALLBACKS: gc + JULIA_MAX_NUM_PRECOMPILE_FILES: 100 + OPENBLAS_NUM_THREADS: 1 + OMPI_MCA_opal_warn_on_missing_libcuda: 0 + SLURM_KILL_BAD_EXIT: 1 + SLURM_GRES_FLAGS: "allow-task-sharing" + JULIA_DEPOT_PATH: "${BUILDKITE_BUILD_PATH}/${BUILDKITE_PIPELINE_SLUG}/depot/default" + +steps: + - label: "init :computer:" + key: "init_cpu_env" + command: + - echo "--- Instantiate SurfaceFluxes calibration project" + - julia --project=experiments/surface_fluxes_perfect_model -e 'using Pkg; Pkg.precompile()' + + - wait + - label: "SurfaceFluxes perfect model calibration" + command: julia --project=experiments/surface_fluxes_perfect_model test/slurm_backend_e2e.jl + artifact_paths: output/surface_fluxes_perfect_model/* + + - label: "Slurm job controller unit tests" + command: julia --project=experiments/surface_fluxes_perfect_model test/slurm_unit_tests.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index b196ad06..e5b54a93 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -17,7 +17,7 @@ steps: - wait - label: "SurfaceFluxes perfect model calibration" - command: julia --project=experiments/surface_fluxes_perfect_model test/caltech_hpc_e2e.jl + command: julia --project=experiments/surface_fluxes_perfect_model test/slurm_backend_e2e.jl artifact_paths: output/surface_fluxes_perfect_model/* - label: "Slurm job controller unit tests" diff --git a/src/backends.jl b/src/backends.jl index 992fd9d6..6c9bfa3a 100644 --- a/src/backends.jl +++ b/src/backends.jl @@ -12,15 +12,17 @@ Get ideal backend for deploying forward model runs. Each backend is found via `gethostname()`. Defaults to JuliaBackend if none is found. 
""" function get_backend() - hostname = gethostname() - if occursin(r"^hpc-(\d\d)-(\d\d).cm.cluster$", hostname) || - occursin(r"^login[1-4].cm.cluster$match", hostname) - return CaltechHPCBackend - elseif hostname == "clima.gps.caltech.edu" - return ClimaGPUBackend - else - return JuliaBackend + HOSTNAMES = [ + (r"^clima.gps.caltech.edu$", ClimaGPUBackend), + (r"^login[1-4].cm.cluster$", CaltechHPCBackend), + (r"^hpc-(\d\d)-(\d\d).cm.cluster$", CaltechHPCBackend), + ] + + for (pattern, backend) in HOSTNAMES + !isnothing(match(pattern, gethostname())) && return backend end + + return JuliaBackend end """ @@ -36,7 +38,7 @@ end function module_load_string(::Type{ClimaGPUBackend}) return """module purge - modules load julia/1.10.0 cuda/julia-pref openmpi/4.1.5-mpitrampoline""" + module load julia/1.10.0 cuda/julia-pref openmpi/4.1.5-mpitrampoline""" end """ diff --git a/src/ekp_interface.jl b/src/ekp_interface.jl index 59ebd822..bd1c811e 100644 --- a/src/ekp_interface.jl +++ b/src/ekp_interface.jl @@ -241,7 +241,7 @@ function initialize( param_dict = get_param_dict(prior) save_parameter_ensemble( - EKP.get_u_final(eki), # constraints applied when saving + EKP.get_u_final(eki), # constraints applied when saving prior, param_dict, output_dir, diff --git a/src/slurm.jl b/src/slurm.jl index 22d5b587..77074487 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -28,13 +28,12 @@ function generate_sbatch_script( slurm_directives = map(collect(slurm_kwargs)) do (k, v) "#SBATCH --$(replace(string(k), "_" => "-"))=$(replace(string(v), "_" => "-"))" end - slurm_directives_str = join(slurm_directives, "\n") sbatch_contents = """ #!/bin/bash #SBATCH --job-name=run_$(iter)_$(member) #SBATCH --output=$member_log - $slurm_directives_str + $(join(slurm_directives, "\n")) $module_load_str srun --output=$member_log --open-mode=append julia --project=$experiment_dir -e ' @@ -188,33 +187,38 @@ function report_iteration_status(statuses, output_dir, iter) end end -function submit_sbatch_job(sbatch_filepath; debug = false, env = ENV) +function submit_sbatch_job(sbatch_filepath; debug = false, env = deepcopy(ENV)) + unset_env_vars = + ("SLURM_MEM_PER_CPU", "SLURM_MEM_PER_GPU", "SLURM_MEM_PER_NODE") + for k in unset_env_vars + haskey(env, k) && delete!(env, k) + end jobid = readchomp(setenv(`sbatch --parsable $sbatch_filepath`, env)) debug || rm(sbatch_filepath) return parse(Int, jobid) end -job_running(status) = status == "RUNNING" -job_success(status) = status == "COMPLETED" -job_failed(status) = status == "FAILED" +job_running(status) = status == :RUNNING +job_success(status) = status == :COMPLETED +job_failed(status) = status == :FAILED job_completed(status) = job_failed(status) || job_success(status) """ job_status(jobid) -Parse the slurm jobid's state and return one of three status strings: "COMPLETED", "FAILED", or "RUNNING" +Parse the slurm jobid's state and return one of three status symbols: :COMPLETED, :FAILED, or :RUNNING. 
""" -function job_status(jobid) +function job_status(jobid::Int) failure_statuses = ("FAILED", "CANCELLED+", "CANCELLED") output = readchomp(`sacct -j $jobid --format=State --noheader`) # Jobs usually have multiple statuses statuses = strip.(split(output, "\n")) if all(s -> s == "COMPLETED", statuses) - return "COMPLETED" + return :COMPLETED elseif any(s -> s in failure_statuses, statuses) - return "FAILED" + return :FAILED else - return "RUNNING" + return :RUNNING end end diff --git a/test/caltech_hpc_e2e.jl b/test/slurm_backend_e2e.jl similarity index 88% rename from test/caltech_hpc_e2e.jl rename to test/slurm_backend_e2e.jl index 169b00a1..3cef7c92 100644 --- a/test/caltech_hpc_e2e.jl +++ b/test/slurm_backend_e2e.jl @@ -2,8 +2,7 @@ # To run, open the REPL: julia --project=experiments/surface_fluxes_perfect_model test/caltech # And include this file -import ClimaCalibrate: - get_backend, CaltechHPCBackend, JuliaBackend, calibrate, get_prior, kwargs +import ClimaCalibrate: get_backend, JuliaBackend, calibrate, get_prior, kwargs using Test import EnsembleKalmanProcesses: get_ϕ_mean_final, get_g_mean_final @@ -33,12 +32,9 @@ function test_sf_calibration_output(eki, prior) end end -# Test Caltech HPC backend -backend = get_backend() -@test backend == CaltechHPCBackend +@assert get_backend() != JuliaBackend eki = calibrate( - backend, experiment_dir; model_interface, slurm_kwargs = kwargs(time = 5), @@ -46,7 +42,7 @@ eki = calibrate( ) test_sf_calibration_output(eki, prior) -# Pure Julia Backend +# Pure Julia calibration, this should run anywhere eki = calibrate(JuliaBackend, experiment_dir) test_sf_calibration_output(eki, prior) diff --git a/test/slurm_unit_tests.jl b/test/slurm_unit_tests.jl index b7fe3b17..7e45b649 100644 --- a/test/slurm_unit_tests.jl +++ b/test/slurm_unit_tests.jl @@ -7,7 +7,6 @@ const ITER = 1 const MEMBER = 1 const TIME_LIMIT = 90 const NTASKS = 1 -const PARTITION = "expansion" const CPUS_PER_TASK = 16 const GPUS_PER_TASK = 1 const EXPERIMENT_DIR = "exp/dir" @@ -15,7 +14,6 @@ const MODEL_INTERFACE = "model_interface.jl" const MODULE_LOAD_STR = CAL.module_load_string(CAL.CaltechHPCBackend) const slurm_kwargs = CAL.kwargs( time = TIME_LIMIT, - partition = PARTITION, cpus_per_task = CPUS_PER_TASK, gpus_per_task = GPUS_PER_TASK, ) @@ -41,7 +39,6 @@ expected_sbatch_contents = """ #!/bin/bash #SBATCH --job-name=run_1_1 #SBATCH --output=test/iteration_001/member_001/model_log.txt -#SBATCH --partition=expansion #SBATCH --gpus-per-task=1 #SBATCH --cpus-per-task=16 #SBATCH --time=01:30:00 @@ -77,16 +74,15 @@ end test_cmd = """ #!/bin/bash #SBATCH --time=00:00:10 -#SBATCH --partition=expansion sleep 10 """ jobid = submit_cmd_helper(test_cmd) -@test CAL.job_status(jobid) == "RUNNING" +@test CAL.job_status(jobid) == :RUNNING @test CAL.job_running(CAL.job_status(jobid)) sleep(180) # Ensure job finishes. To debug, lower sleep time or comment out the code block -@test CAL.job_status(jobid) == "COMPLETED" +@test CAL.job_status(jobid) == :COMPLETED @test CAL.job_completed(CAL.job_status(jobid)) @test CAL.job_success(CAL.job_status(jobid)) @@ -94,7 +90,7 @@ sleep(180) # Ensure job finishes. To debug, lower sleep time or comment out the jobid = submit_cmd_helper(test_cmd) CAL.kill_slurm_job(jobid) sleep(1) -@test CAL.job_status(jobid) == "FAILED" +@test CAL.job_status(jobid) == :FAILED @test CAL.job_completed(CAL.job_status(jobid)) && CAL.job_failed(CAL.job_status(jobid))