From 3dc9114382299ca39fd5900a5336bf9e96b04391 Mon Sep 17 00:00:00 2001 From: nefrathenrici Date: Thu, 20 Jun 2024 14:04:30 -0700 Subject: [PATCH] Add Clima GPU backend, need gpu pipeline --- .buildkite/pipeline.yml | 2 +- docs/src/quickstart.md | 4 +-- src/backends.jl | 57 +++++++++++++++++++++++++++++----------- src/slurm.jl | 23 +++++++--------- test/caltech_hpc_e2e.jl | 6 ++--- test/slurm_unit_tests.jl | 12 +++------ 6 files changed, 60 insertions(+), 44 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 2dc61ddc..b196ad06 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -13,7 +13,7 @@ steps: key: "init_cpu_env" command: - echo "--- Instantiate SurfaceFluxes calibration project" - - julia --project=experiments/surface_fluxes_perfect_model -e 'using Pkg; Pkg.build("ClimaCalibrate"); Pkg.precompile()' + - julia --project=experiments/surface_fluxes_perfect_model -e 'using Pkg; Pkg.precompile()' - wait - label: "SurfaceFluxes perfect model calibration" diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md index 2ce9596e..20743979 100644 --- a/docs/src/quickstart.md +++ b/docs/src/quickstart.md @@ -37,14 +37,14 @@ To run this experiment: 3. Start julia: `julia --project=experiments/surace_fluxes_perfect_model` 4. 
Run the following: ```julia -import ClimaCalibrate: CaltechHPC, calibrate +import ClimaCalibrate: CaltechHPCBackend, calibrate experiment_dir = dirname(Base.active_project()) include(joinpath(experiment_dir, "generate_data.jl")) model_interface = joinpath(experiment_dir, "model_interface.jl") include(joinpath(experiment_dir, "observation_map.jl")) -eki = calibrate(CaltechHPC, experiment_dir; +eki = calibrate(CaltechHPCBackend, experiment_dir; time_limit = 3, model_interface) include(joinpath(experiment_dir, "postprocessing.jl")) diff --git a/src/backends.jl b/src/backends.jl index 29becec6..992fd9d6 100644 --- a/src/backends.jl +++ b/src/backends.jl @@ -1,20 +1,42 @@ abstract type AbstractBackend end struct JuliaBackend <: AbstractBackend end -struct CaltechHPC <: AbstractBackend end +abstract type SlurmBackend <: AbstractBackend end +struct CaltechHPCBackend <: SlurmBackend end +struct ClimaGPUBackend <: SlurmBackend end """ get_backend() -Determine the appropriate backend using relevant system information. +Return the backend type best suited for deploying forward model runs. +The backend is selected by matching `gethostname()`; defaults to `JuliaBackend` when no known host matches. """ function get_backend() - backend = JuliaBackend - if isfile("/etc/redhat-release") && - occursin("Red Hat", read("/etc/redhat-release", String)) - backend = CaltechHPC + hostname = gethostname() + if occursin(r"^hpc-(\d\d)-(\d\d)\.cm\.cluster$", hostname) || + occursin(r"^login[1-4]\.cm\.cluster$", hostname) + return CaltechHPCBackend + elseif hostname == "clima.gps.caltech.edu" + return ClimaGPUBackend + else + return JuliaBackend end - return backend +end + +""" + module_load_string(::Type{<:SlurmBackend}) + +Return a string that loads the correct modules for a given backend when executed via bash. 
+""" +function module_load_string(::Type{CaltechHPCBackend}) + return """export MODULEPATH=/groups/esm/modules:\$MODULEPATH + module purge + module load climacommon/2024_05_27""" +end + +function module_load_string(::Type{ClimaGPUBackend}) + return """module purge + module load julia/1.10.0 cuda/julia-pref openmpi/4.1.5-mpitrampoline""" end """ @@ -77,8 +99,8 @@ function calibrate( end """ - calibrate(::Type{CaltechHPC}, config::ExperimentConfig; kwargs...) - calibrate(::Type{CaltechHPC}, experiment_dir; kwargs...) + calibrate(::Type{<:SlurmBackend}, config::ExperimentConfig; kwargs...) + calibrate(::Type{<:SlurmBackend}, experiment_dir; kwargs...) Run a full calibration, scheduling the forward model runs on Caltech's HPC cluster. @@ -93,7 +115,7 @@ Takes either an ExperimentConfig or an experiment folder. # Usage Open julia: `julia --project=experiments/surface_fluxes_perfect_model` ```julia -import ClimaCalibrate: CaltechHPC, calibrate +import ClimaCalibrate: CaltechHPCBackend, calibrate experiment_dir = dirname(Base.active_project()) model_interface = joinpath(experiment_dir, "model_interface.jl") @@ -104,11 +126,11 @@ include(joinpath(experiment_dir, "observation_map.jl")) include(model_interface) slurm_kwargs = kwargs(time = 3) -eki = calibrate(CaltechHPC, experiment_dir; model_interface, slurm_kwargs); +eki = calibrate(CaltechHPCBackend, experiment_dir; model_interface, slurm_kwargs); ``` """ function calibrate( - b::Type{CaltechHPC}, + b::Type{<:SlurmBackend}, experiment_dir::AbstractString; slurm_kwargs, ekp_kwargs..., @@ -117,7 +139,7 @@ function calibrate( end function calibrate( - ::Type{CaltechHPC}, + b::Type{<:SlurmBackend}, config::ExperimentConfig; experiment_dir = dirname(Base.active_project()), model_interface = abspath( @@ -133,6 +155,7 @@ function calibrate( initialize(config; ekp_kwargs...) 
eki = nothing + module_load_str = module_load_string(b) for iter in 0:(n_iterations - 1) @info "Iteration $iter" jobids = map(1:ensemble_size) do member @@ -142,7 +165,8 @@ function calibrate( member, output_dir, experiment_dir, - model_interface; + model_interface, + module_load_str; slurm_kwargs, ) end @@ -152,9 +176,10 @@ function calibrate( output_dir, iter, experiment_dir, - model_interface; - verbose, + model_interface, + module_load_str; slurm_kwargs, + verbose, ) report_iteration_status(statuses, output_dir, iter) @info "Completed iteration $iter, updating ensemble" diff --git a/src/slurm.jl b/src/slurm.jl index 591928cc..22d5b587 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -16,8 +16,8 @@ function generate_sbatch_script( member, output_dir, experiment_dir, - model_interface; - module_load_str, + model_interface, + module_load_str; slurm_kwargs, ) member_log = path_to_model_log(output_dir, iter, member) @@ -35,7 +35,6 @@ function generate_sbatch_script( #SBATCH --job-name=run_$(iter)_$(member) #SBATCH --output=$member_log $slurm_directives_str - $module_load_str srun --output=$member_log --open-mode=append julia --project=$experiment_dir -e ' @@ -76,17 +75,13 @@ function sbatch_model_run( member, output_dir, experiment_dir, - model_interface; + model_interface, + module_load_str; slurm_kwargs = Dict{Symbol, Any}( :time => 45, :ntasks => 1, :cpus_per_task => 1, ), - module_load_str = """ - export MODULEPATH=/groups/esm/modules:\$MODULEPATH - module purge - module load climacommon/2024_05_27 - """, kwargs..., ) sbatch_contents = generate_sbatch_script( @@ -94,9 +89,9 @@ function sbatch_model_run( member, output_dir, experiment_dir, - model_interface; + model_interface, + module_load_str; slurm_kwargs, - module_load_str, kwargs..., ) @@ -112,7 +107,8 @@ function wait_for_jobs( output_dir, iter, experiment_dir, - model_interface; + model_interface, + module_load_str; verbose, slurm_kwargs, ) @@ -135,7 +131,8 @@ function wait_for_jobs( m, output_dir, 
experiment_dir, - model_interface; + model_interface, + module_load_str; slurm_kwargs, ) push!(rerun_jobs, m) diff --git a/test/caltech_hpc_e2e.jl b/test/caltech_hpc_e2e.jl index 29fdabc2..169b00a1 100644 --- a/test/caltech_hpc_e2e.jl +++ b/test/caltech_hpc_e2e.jl @@ -1,9 +1,9 @@ # Tests for SurfaceFluxes example calibration on slurm, used in buildkite testing -# To run, open the REPL: julia --project=experiments/surface_fluxes_perfect_model +# To run, open the REPL: julia --project=experiments/surface_fluxes_perfect_model test/caltech_hpc_e2e.jl # And include this file import ClimaCalibrate: - get_backend, CaltechHPC, JuliaBackend, calibrate, get_prior, kwargs + get_backend, CaltechHPCBackend, JuliaBackend, calibrate, get_prior, kwargs using Test import EnsembleKalmanProcesses: get_ϕ_mean_final, get_g_mean_final @@ -35,7 +35,7 @@ end # Test Caltech HPC backend backend = get_backend() -@test backend == CaltechHPC +@test backend == CaltechHPCBackend eki = calibrate( backend, diff --git a/test/slurm_unit_tests.jl b/test/slurm_unit_tests.jl index db25e674..b7fe3b17 100644 --- a/test/slurm_unit_tests.jl +++ b/test/slurm_unit_tests.jl @@ -12,7 +12,7 @@ const CPUS_PER_TASK = 16 const GPUS_PER_TASK = 1 const EXPERIMENT_DIR = "exp/dir" const MODEL_INTERFACE = "model_interface.jl" - +const MODULE_LOAD_STR = CAL.module_load_string(CAL.CaltechHPCBackend) const slurm_kwargs = CAL.kwargs( time = TIME_LIMIT, partition = PARTITION, @@ -32,13 +32,9 @@ sbatch_file = CAL.generate_sbatch_script( MEMBER, OUTPUT_DIR, EXPERIMENT_DIR, - MODEL_INTERFACE; + MODEL_INTERFACE, + MODULE_LOAD_STR; slurm_kwargs, - module_load_str = """ - export MODULEPATH=/groups/esm/modules:\$MODULEPATH - module purge - module load climacommon/2024_05_27 - """, ) expected_sbatch_contents = """ @@ -49,12 +45,10 @@ expected_sbatch_contents = """ #SBATCH --gpus-per-task=1 #SBATCH --cpus-per-task=16 #SBATCH --time=01:30:00 - export MODULEPATH=/groups/esm/modules:\$MODULEPATH module purge module load climacommon/2024_05_27 - 
srun --output=test/iteration_001/member_001/model_log.txt --open-mode=append julia --project=exp/dir -e ' import ClimaCalibrate as CAL iteration = 1; member = 1