Skip to content

Commit

Permalink
Add Clima GPU server test (#93)
Browse files Browse the repository at this point in the history
* Add Clima GPU server test

* remove partition

* Tweak slurm job control

* Try manually unsetting env vars
  • Loading branch information
nefrathenrici authored Jun 21, 2024
1 parent 3c023b7 commit b9ad06a
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 36 deletions.
28 changes: 28 additions & 0 deletions .buildkite/clima_server_test/pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Buildkite pipeline for the Slurm-backed calibration tests, run on the `clima` agent queue.
agents:
  queue: clima
  modules: julia/1.10.0 cuda/julia-pref openmpi/4.1.5-mpitrampoline nsight-systems/2024.2.1

env:
  JULIA_MPI_HAS_CUDA: "true"
  JULIA_NVTX_CALLBACKS: gc
  JULIA_MAX_NUM_PRECOMPILE_FILES: 100
  OPENBLAS_NUM_THREADS: 1
  OMPI_MCA_opal_warn_on_missing_libcuda: 0
  SLURM_KILL_BAD_EXIT: 1
  SLURM_GRES_FLAGS: "allow-task-sharing"
  # The depot path is keyed by pipeline slug, not by build number.
  JULIA_DEPOT_PATH: "${BUILDKITE_BUILD_PATH}/${BUILDKITE_PIPELINE_SLUG}/depot/default"

steps:
  # Precompile the experiment project once before the test steps start.
  - label: "init :computer:"
    key: "init_cpu_env"
    command:
      - echo "--- Instantiate SurfaceFluxes calibration project"
      - julia --project=experiments/surface_fluxes_perfect_model -e 'using Pkg; Pkg.precompile()'

  # Block the remaining steps until the init step has finished.
  - wait
  - label: "SurfaceFluxes perfect model calibration"
    command: julia --project=experiments/surface_fluxes_perfect_model test/slurm_backend_e2e.jl
    artifact_paths: output/surface_fluxes_perfect_model/*

  - label: "Slurm job controller unit tests"
    command: julia --project=experiments/surface_fluxes_perfect_model test/slurm_unit_tests.jl
2 changes: 1 addition & 1 deletion .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ steps:

- wait
- label: "SurfaceFluxes perfect model calibration"
command: julia --project=experiments/surface_fluxes_perfect_model test/caltech_hpc_e2e.jl
command: julia --project=experiments/surface_fluxes_perfect_model test/slurm_backend_e2e.jl
artifact_paths: output/surface_fluxes_perfect_model/*

- label: "Slurm job controller unit tests"
Expand Down
20 changes: 11 additions & 9 deletions src/backends.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,17 @@ Get ideal backend for deploying forward model runs.
Each backend is found via `gethostname()`. Defaults to JuliaBackend if none is found.
"""
function get_backend()
    # Ordered (hostname pattern, backend) pairs; the first pattern that matches wins.
    # Dots are escaped so that `.` only matches a literal dot in the hostname,
    # and every pattern is anchored at both ends.
    hostname_backends = (
        (r"^clima\.gps\.caltech\.edu$", ClimaGPUBackend),
        (r"^login[1-4]\.cm\.cluster$", CaltechHPCBackend),
        (r"^hpc-(\d\d)-(\d\d)\.cm\.cluster$", CaltechHPCBackend),
    )

    # The hostname does not depend on the loop variable, so query it once.
    hostname = gethostname()
    for (pattern, backend) in hostname_backends
        occursin(pattern, hostname) && return backend
    end

    # No known cluster matched: fall back to the pure-Julia backend.
    return JuliaBackend
end

"""
Expand All @@ -36,7 +38,7 @@ end

function module_load_string(::Type{ClimaGPUBackend})
    # Shell snippet for the Clima GPU server: reset the module environment, then
    # load the Julia, CUDA and MPI modules. The command is `module load`;
    # `modules load` is not a valid command and must not appear in the output.
    return """module purge
    module load julia/1.10.0 cuda/julia-pref openmpi/4.1.5-mpitrampoline"""
end

"""
Expand Down
2 changes: 1 addition & 1 deletion src/ekp_interface.jl
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ function initialize(
param_dict = get_param_dict(prior)

save_parameter_ensemble(
EKP.get_u_final(eki), # constraints applied when saving
EKP.get_u_final(eki), # constraints applied when saving
prior,
param_dict,
output_dir,
Expand Down
26 changes: 15 additions & 11 deletions src/slurm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,12 @@ function generate_sbatch_script(
slurm_directives = map(collect(slurm_kwargs)) do (k, v)
"#SBATCH --$(replace(string(k), "_" => "-"))=$(replace(string(v), "_" => "-"))"
end
slurm_directives_str = join(slurm_directives, "\n")

sbatch_contents = """
#!/bin/bash
#SBATCH --job-name=run_$(iter)_$(member)
#SBATCH --output=$member_log
$slurm_directives_str
$(join(slurm_directives, "\n"))
$module_load_str
srun --output=$member_log --open-mode=append julia --project=$experiment_dir -e '
Expand Down Expand Up @@ -188,33 +187,38 @@ function report_iteration_status(statuses, output_dir, iter)
end
end

"""
    submit_sbatch_job(sbatch_filepath; debug = false, env = ENV)

Submit the script at `sbatch_filepath` with `sbatch --parsable` and return the job id
as an `Int`.

# Keywords
- `debug`: when `false` (the default), the sbatch script is deleted after submission.
- `env`: environment passed to `sbatch`. It is copied before being filtered, so neither
  the caller's dictionary nor the process-wide `ENV` is modified.
"""
function submit_sbatch_job(sbatch_filepath; debug = false, env = ENV)
    # Work on an independent copy. `deepcopy(ENV)` does not help here: `ENV` is a
    # fieldless singleton, so deleting keys from its "copy" would strip them from the
    # running process. `copy` returns a plain dictionary instead.
    sbatch_env = copy(env)

    # Drop memory settings inherited from an enclosing Slurm allocation.
    # NOTE(review): presumably these conflict with the memory options of the
    # submitted script — confirm against the cluster configuration.
    for k in ("SLURM_MEM_PER_CPU", "SLURM_MEM_PER_GPU", "SLURM_MEM_PER_NODE")
        delete!(sbatch_env, k)
    end

    output = readchomp(setenv(`sbatch --parsable $sbatch_filepath`, sbatch_env))
    debug || rm(sbatch_filepath)

    # `--parsable` prints either "jobid" or "jobid;cluster"; keep only the id.
    return parse(Int, first(split(output, ';')))
end

# Predicates over the status symbols returned by `job_status`
# (`:RUNNING`, `:COMPLETED`, `:FAILED`). Each predicate is defined exactly once.
job_running(status) = status == :RUNNING
job_success(status) = status == :COMPLETED
job_failed(status) = status == :FAILED
# A job counts as completed once it has either failed or succeeded.
job_completed(status) = job_failed(status) || job_success(status)

"""
    job_status(jobid)

Parse the slurm jobid's state and return one of three status symbols: `:COMPLETED`, `:FAILED`, or `:RUNNING`.
"""
function job_status(jobid::Integer)
    # States from which a job will not recover. Without the timeout / node / memory
    # entries such jobs would be reported as :RUNNING indefinitely.
    # NOTE(review): entries ending in `+` are presumably the abbreviated forms that
    # `sacct` prints when a state name exceeds the column width — confirm.
    failure_statuses = (
        "FAILED",
        "CANCELLED",
        "CANCELLED+",
        "TIMEOUT",
        "NODE_FAIL",
        "BOOT_FAIL",
        "DEADLINE",
        "OUT_OF_MEMORY",
        "OUT_OF_ME+",
    )
    output = readchomp(`sacct -j $jobid --format=State --noheader`)
    # Jobs usually have multiple statuses, one per output line.
    statuses = strip.(split(output, "\n"))
    if all(==("COMPLETED"), statuses)
        return :COMPLETED
    elseif any(in(failure_statuses), statuses)
        return :FAILED
    else
        # Covers pending/running states and empty `sacct` output
        # (an empty string splits into a single blank entry).
        return :RUNNING
    end
end

Expand Down
10 changes: 3 additions & 7 deletions test/caltech_hpc_e2e.jl → test/slurm_backend_e2e.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
# To run, open the REPL: julia --project=experiments/surface_fluxes_perfect_model
# and include this file (test/slurm_backend_e2e.jl)

import ClimaCalibrate:
get_backend, CaltechHPCBackend, JuliaBackend, calibrate, get_prior, kwargs
import ClimaCalibrate: get_backend, JuliaBackend, calibrate, get_prior, kwargs
using Test
import EnsembleKalmanProcesses: get_ϕ_mean_final, get_g_mean_final

Expand Down Expand Up @@ -33,20 +32,17 @@ function test_sf_calibration_output(eki, prior)
end
end

# Test Caltech HPC backend
backend = get_backend()
@test backend == CaltechHPCBackend
@assert get_backend() != JuliaBackend

eki = calibrate(
backend,
experiment_dir;
model_interface,
slurm_kwargs = kwargs(time = 5),
verbose = true,
)
test_sf_calibration_output(eki, prior)

# Pure Julia Backend
# Pure Julia calibration, this should run anywhere
eki = calibrate(JuliaBackend, experiment_dir)
test_sf_calibration_output(eki, prior)

Expand Down
10 changes: 3 additions & 7 deletions test/slurm_unit_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,13 @@ const ITER = 1
const MEMBER = 1
const TIME_LIMIT = 90
const NTASKS = 1
const PARTITION = "expansion"
const CPUS_PER_TASK = 16
const GPUS_PER_TASK = 1
const EXPERIMENT_DIR = "exp/dir"
const MODEL_INTERFACE = "model_interface.jl"
const MODULE_LOAD_STR = CAL.module_load_string(CAL.CaltechHPCBackend)
const slurm_kwargs = CAL.kwargs(
time = TIME_LIMIT,
partition = PARTITION,
cpus_per_task = CPUS_PER_TASK,
gpus_per_task = GPUS_PER_TASK,
)
Expand All @@ -41,7 +39,6 @@ expected_sbatch_contents = """
#!/bin/bash
#SBATCH --job-name=run_1_1
#SBATCH --output=test/iteration_001/member_001/model_log.txt
#SBATCH --partition=expansion
#SBATCH --gpus-per-task=1
#SBATCH --cpus-per-task=16
#SBATCH --time=01:30:00
Expand Down Expand Up @@ -77,24 +74,23 @@ end
test_cmd = """
#!/bin/bash
#SBATCH --time=00:00:10
#SBATCH --partition=expansion
sleep 10
"""

jobid = submit_cmd_helper(test_cmd)
@test CAL.job_status(jobid) == "RUNNING"
@test CAL.job_status(jobid) == :RUNNING
@test CAL.job_running(CAL.job_status(jobid))

sleep(180) # Ensure job finishes. To debug, lower sleep time or comment out the code block
@test CAL.job_status(jobid) == "COMPLETED"
@test CAL.job_status(jobid) == :COMPLETED
@test CAL.job_completed(CAL.job_status(jobid))
@test CAL.job_success(CAL.job_status(jobid))

# Test job cancellation
jobid = submit_cmd_helper(test_cmd)
CAL.kill_slurm_job(jobid)
sleep(1)
@test CAL.job_status(jobid) == "FAILED"
@test CAL.job_status(jobid) == :FAILED
@test CAL.job_completed(CAL.job_status(jobid)) &&
CAL.job_failed(CAL.job_status(jobid))

Expand Down

0 comments on commit b9ad06a

Please sign in to comment.