Skip to content

Commit

Permalink
Merge pull request #87 from CliMA/ne/climagpu
Browse files Browse the repository at this point in the history
Add ClimaGPU Backend
  • Loading branch information
nefrathenrici authored Jun 20, 2024
2 parents 8d7ea1c + 3dc9114 commit 3c023b7
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 44 deletions.
2 changes: 1 addition & 1 deletion .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ steps:
key: "init_cpu_env"
command:
- echo "--- Instantiate SurfaceFluxes calibration project"
- julia --project=experiments/surface_fluxes_perfect_model -e 'using Pkg; Pkg.build("ClimaCalibrate"); Pkg.precompile()'
- julia --project=experiments/surface_fluxes_perfect_model -e 'using Pkg; Pkg.precompile()'

- wait
- label: "SurfaceFluxes perfect model calibration"
Expand Down
4 changes: 2 additions & 2 deletions docs/src/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,14 @@ To run this experiment:
3. Start julia: `julia --project=experiments/surface_fluxes_perfect_model`
4. Run the following:
```julia
import ClimaCalibrate: CaltechHPC, calibrate
import ClimaCalibrate: CaltechHPCBackend, calibrate

experiment_dir = dirname(Base.active_project())

include(joinpath(experiment_dir, "generate_data.jl"))
model_interface = joinpath(experiment_dir, "model_interface.jl")
include(joinpath(experiment_dir, "observation_map.jl"))
eki = calibrate(CaltechHPC, experiment_dir;
eki = calibrate(CaltechHPCBackend, experiment_dir;
time_limit = 3, model_interface)

include(joinpath(experiment_dir, "postprocessing.jl"))
Expand Down
57 changes: 41 additions & 16 deletions src/backends.jl
Original file line number Diff line number Diff line change
@@ -1,20 +1,42 @@
abstract type AbstractBackend end

struct JuliaBackend <: AbstractBackend end
struct CaltechHPC <: AbstractBackend end
abstract type SlurmBackend <: AbstractBackend end
struct CaltechHPCBackend <: SlurmBackend end
struct ClimaGPUBackend <: SlurmBackend end

"""
get_backend()
Determine the appropriate backend using relevant system information.
Get ideal backend for deploying forward model runs.
Each backend is found via `gethostname()`. Defaults to JuliaBackend if none is found.
"""
function get_backend()
    # Backend selection is driven purely by the machine's hostname.
    hostname = gethostname()
    # Each pattern is anchored at both ends, and the dots are escaped so they
    # match a literal "." rather than any character. Nothing may follow the
    # `$` anchor: trailing text after it makes the pattern unmatchable.
    if occursin(r"^hpc-(\d\d)-(\d\d)\.cm\.cluster$", hostname) ||
       occursin(r"^login[1-4]\.cm\.cluster$", hostname)
        return CaltechHPCBackend
    elseif hostname == "clima.gps.caltech.edu"
        return ClimaGPUBackend
    else
        # Unrecognized host: fall back to the plain Julia backend.
        return JuliaBackend
    end
end

"""
module_load_string(::Type{<:SlurmBackend})
Return a string that loads the correct modules for a given backend when executed via bash.
"""
function module_load_string(::Type{CaltechHPCBackend})
    # Shell lines in execution order: prepend /groups/esm/modules to
    # MODULEPATH, unload every loaded module, then load the pinned
    # climacommon module.
    shell_lines = (
        "export MODULEPATH=/groups/esm/modules:\$MODULEPATH",
        "module purge",
        "module load climacommon/2024_05_27",
    )
    # Joined without a trailing newline.
    return join(shell_lines, "\n")
end

# Return the bash snippet that sets up modules for the ClimaGPU backend:
# unload every loaded module, then load Julia, the CUDA preference module and
# the MPItrampoline OpenMPI build. The command is `module load` (singular),
# the same command used by `module purge` on the line before it.
function module_load_string(::Type{ClimaGPUBackend})
    # The continuation line's indentation is stripped by triple-quote
    # dedenting, so the result is two lines with no leading spaces.
    return """module purge
    module load julia/1.10.0 cuda/julia-pref openmpi/4.1.5-mpitrampoline"""
end

"""
Expand Down Expand Up @@ -77,8 +99,8 @@ function calibrate(
end

"""
calibrate(::Type{CaltechHPC}, config::ExperimentConfig; kwargs...)
calibrate(::Type{CaltechHPC}, experiment_dir; kwargs...)
calibrate(::Type{<:SlurmBackend}, config::ExperimentConfig; kwargs...)
calibrate(::Type{<:SlurmBackend}, experiment_dir; kwargs...)
Run a full calibration, scheduling the forward model runs as Slurm jobs on the cluster selected by the backend (e.g. `CaltechHPCBackend` or `ClimaGPUBackend`).
Expand All @@ -93,7 +115,7 @@ Takes either an ExperimentConfig or an experiment folder.
# Usage
Open julia: `julia --project=experiments/surface_fluxes_perfect_model`
```julia
import ClimaCalibrate: CaltechHPC, calibrate
import ClimaCalibrate: CaltechHPCBackend, calibrate
experiment_dir = dirname(Base.active_project())
model_interface = joinpath(experiment_dir, "model_interface.jl")
Expand All @@ -104,11 +126,11 @@ include(joinpath(experiment_dir, "observation_map.jl"))
include(model_interface)
slurm_kwargs = kwargs(time = 3)
eki = calibrate(CaltechHPC, experiment_dir; model_interface, slurm_kwargs);
eki = calibrate(CaltechHPCBackend, experiment_dir; model_interface, slurm_kwargs);
```
"""
function calibrate(
b::Type{CaltechHPC},
b::Type{<:SlurmBackend},
experiment_dir::AbstractString;
slurm_kwargs,
ekp_kwargs...,
Expand All @@ -117,7 +139,7 @@ function calibrate(
end

function calibrate(
::Type{CaltechHPC},
b::Type{<:SlurmBackend},
config::ExperimentConfig;
experiment_dir = dirname(Base.active_project()),
model_interface = abspath(
Expand All @@ -133,6 +155,7 @@ function calibrate(
initialize(config; ekp_kwargs...)

eki = nothing
module_load_str = module_load_string(b)
for iter in 0:(n_iterations - 1)
@info "Iteration $iter"
jobids = map(1:ensemble_size) do member
Expand All @@ -142,7 +165,8 @@ function calibrate(
member,
output_dir,
experiment_dir,
model_interface;
model_interface,
module_load_str;
slurm_kwargs,
)
end
Expand All @@ -152,9 +176,10 @@ function calibrate(
output_dir,
iter,
experiment_dir,
model_interface;
verbose,
model_interface,
module_load_str;
slurm_kwargs,
verbose,
)
report_iteration_status(statuses, output_dir, iter)
@info "Completed iteration $iter, updating ensemble"
Expand Down
23 changes: 10 additions & 13 deletions src/slurm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ function generate_sbatch_script(
member,
output_dir,
experiment_dir,
model_interface;
module_load_str,
model_interface,
module_load_str;
slurm_kwargs,
)
member_log = path_to_model_log(output_dir, iter, member)
Expand All @@ -35,7 +35,6 @@ function generate_sbatch_script(
#SBATCH --job-name=run_$(iter)_$(member)
#SBATCH --output=$member_log
$slurm_directives_str
$module_load_str
srun --output=$member_log --open-mode=append julia --project=$experiment_dir -e '
Expand Down Expand Up @@ -76,27 +75,23 @@ function sbatch_model_run(
member,
output_dir,
experiment_dir,
model_interface;
model_interface,
module_load_str;
slurm_kwargs = Dict{Symbol, Any}(
:time => 45,
:ntasks => 1,
:cpus_per_task => 1,
),
module_load_str = """
export MODULEPATH=/groups/esm/modules:\$MODULEPATH
module purge
module load climacommon/2024_05_27
""",
kwargs...,
)
sbatch_contents = generate_sbatch_script(
iter,
member,
output_dir,
experiment_dir,
model_interface;
model_interface,
module_load_str;
slurm_kwargs,
module_load_str,
kwargs...,
)

Expand All @@ -112,7 +107,8 @@ function wait_for_jobs(
output_dir,
iter,
experiment_dir,
model_interface;
model_interface,
module_load_str;
verbose,
slurm_kwargs,
)
Expand All @@ -135,7 +131,8 @@ function wait_for_jobs(
m,
output_dir,
experiment_dir,
model_interface;
model_interface,
module_load_str;
slurm_kwargs,
)
push!(rerun_jobs, m)
Expand Down
6 changes: 3 additions & 3 deletions test/caltech_hpc_e2e.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Tests for SurfaceFluxes example calibration on slurm, used in buildkite testing
# To run, open the REPL: julia --project=experiments/surface_fluxes_perfect_model
# To run, open the REPL: julia --project=experiments/surface_fluxes_perfect_model test/caltech
# And include this file

import ClimaCalibrate:
get_backend, CaltechHPC, JuliaBackend, calibrate, get_prior, kwargs
get_backend, CaltechHPCBackend, JuliaBackend, calibrate, get_prior, kwargs
using Test
import EnsembleKalmanProcesses: get_ϕ_mean_final, get_g_mean_final

Expand Down Expand Up @@ -35,7 +35,7 @@ end

# Test Caltech HPC backend
backend = get_backend()
@test backend == CaltechHPC
@test backend == CaltechHPCBackend

eki = calibrate(
backend,
Expand Down
12 changes: 3 additions & 9 deletions test/slurm_unit_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ const CPUS_PER_TASK = 16
const GPUS_PER_TASK = 1
const EXPERIMENT_DIR = "exp/dir"
const MODEL_INTERFACE = "model_interface.jl"

const MODULE_LOAD_STR = CAL.module_load_string(CAL.CaltechHPCBackend)
const slurm_kwargs = CAL.kwargs(
time = TIME_LIMIT,
partition = PARTITION,
Expand All @@ -32,13 +32,9 @@ sbatch_file = CAL.generate_sbatch_script(
MEMBER,
OUTPUT_DIR,
EXPERIMENT_DIR,
MODEL_INTERFACE;
MODEL_INTERFACE,
MODULE_LOAD_STR;
slurm_kwargs,
module_load_str = """
export MODULEPATH=/groups/esm/modules:\$MODULEPATH
module purge
module load climacommon/2024_05_27
""",
)

expected_sbatch_contents = """
Expand All @@ -49,12 +45,10 @@ expected_sbatch_contents = """
#SBATCH --gpus-per-task=1
#SBATCH --cpus-per-task=16
#SBATCH --time=01:30:00
export MODULEPATH=/groups/esm/modules:\$MODULEPATH
module purge
module load climacommon/2024_05_27
srun --output=test/iteration_001/member_001/model_log.txt --open-mode=append julia --project=exp/dir -e '
import ClimaCalibrate as CAL
iteration = 1; member = 1
Expand Down

0 comments on commit 3c023b7

Please sign in to comment.