diff --git a/src/slurm_interface.jl b/src/slurm_interface.jl index 0552b408..ce571041 100644 --- a/src/slurm_interface.jl +++ b/src/slurm_interface.jl @@ -1,4 +1,29 @@ +""" + slurm_calibration(; + experiment_dir = dirname(Base.active_project()), + model_interface_path = joinpath(experiment_dir, "..", "..", "model_interface.jl"), + time_limit = "0:45:00", + ntasks = 1, + cpus_per_task = 1, + gpus_per_task = 0, + verbose = false, + ) + +Runs a full calibration, scheduling forward model runs on the slurm cluster using `srun_model`. +This function makes heavy assumptions and requires some setup. + - The correct project must be selected, and the observation map has already been `include`d + - The session is not running in an existing slurm job, and is running on the Resnick central cluster. + +Input arguments: + - `experiment_dir`: The directory storing relevant experiment information. (Default: dirname(Base.active_project())) + - `model_interface_path`: Model interface file to be included during the model run. (Default: joinpath(experiment_dir, "..", "..", "model_interface.jl")) + - `time_limit`: Slurm time limit + - `ntasks`: Slurm ntasks + - `cpus_per_task`: Slurm CPUs per task + - `gpus_per_task`: Slurm GPUs per task + - `verbose`: Turn on verbose model logging +""" function slurm_calibration(; experiment_dir = dirname(Base.active_project()), model_interface_path = abspath( @@ -37,7 +62,8 @@ function slurm_calibration(; verbose, ) end - handle_ensemble_procs(procs, iter, output_dir) + + handle_ensemble_procs(procs, iter, output_dir, verbose) @info "Completed iteration $iter, updating ensemble" G_ensemble = observation_map(Val(Symbol(config.id)), iter) save_G_ensemble(config, iter, G_ensemble) @@ -46,7 +72,13 @@ function slurm_calibration(; return eki end -function handle_ensemble_procs(procs, iteration, output_dir) +""" + handle_ensemble_procs(procs, iteration, output_dir, verbose) + +Helper function for `slurm_calibration`. + Handles the ensemble of processes running the forward model via slurm. +""" +function handle_ensemble_procs(procs, iteration, output_dir, verbose) # Initial try handles InterruptException try asyncmap(enumerate(procs)) do (member, p) @@ -57,18 +89,17 @@ function handle_ensemble_procs(procs, iteration, output_dir) try wait(p) if p.exitcode != 0 - warn_on_member_error(member, member_log) + warn_on_member_error(member, member_log, verbose) end catch e - warn_on_member_error(member, member_log) + warn_on_member_error(member, member_log, verbose) end end catch e - e isa InterruptException && map(p -> kill(p), procs) + e isa InterruptException && foreach(kill, procs) end # Wait for processes to be killed sleep(0.25) - exit_codes = map(x -> getproperty(x, :exitcode), procs) if !any(x -> x == 0, exit_codes) error("Full ensemble for iteration $iteration has failed. See model logs in $(path_to_iteration(output_dir, iteration)) for details.") @@ -78,19 +109,32 @@ function handle_ensemble_procs(procs, iteration, output_dir) end function warn_on_member_error(member, member_log, verbose = false) - warn_str = if verbose - """ - Ensemble member $member raised an error. See model log at $member_log for stacktrace: - $(replace(readchomp(member_log), "\\n" => "\n")) - """ - else - """ - Ensemble member $member raised an error. See model log at $member_log for stacktrace. - """ + warn_str = "Ensemble member $member raised an error. See model log at $member_log for stacktrace" + if verbose + stacktrace = replace(readchomp(member_log), "\\n" => "\n") + warn_str = warn_str * ": \n$stacktrace" end @warn warn_str end +""" + srun_model(; + output_dir, + iter, + member, + time_limit, + ntasks, + partition, + cpus_per_task, + gpus_per_task, + experiment_dir, + model_interface_path, + verbose, + ) + +Runs a single forward model ensemble member. Constructs the `srun` command, then +runs it in a separate process. +""" function srun_model(; output_dir, iter, diff --git a/test/e2e_test.jl b/test/e2e_test.jl index f943233e..e2411908 100644 --- a/test/e2e_test.jl +++ b/test/e2e_test.jl @@ -35,7 +35,6 @@ experiment_config = ExperimentConfig( prior, output_dir, false, - false, ) # Model interface