Skip to content

Commit

Permalink
Add nice docstrings, fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
nefrathenrici committed Apr 22, 2024
1 parent 3638932 commit f73d5eb
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 16 deletions.
74 changes: 59 additions & 15 deletions src/slurm_interface.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,29 @@

"""
slurm_calibration(;
experiment_dir = dirname(Base.active_project()),
model_interface_path = joinpath(experiment_dir, "..", "..", "model_interface.jl"),
time_limit = "0:45:00",
ntasks = 1,
cpus_per_task = 1,
gpus_per_task = 0,
verbose = false,
)
Runs a full calibration, scheduling forward model runs on the Slurm cluster using `srun_model`.
This function makes strong assumptions and requires some setup:
- The correct project must be selected, and the observation map must already have been `include`d.
- The session must not be running inside an existing Slurm job, and must be running on the Resnick central cluster.
Keyword arguments:
- `experiment_dir`: The directory storing relevant experiment information. (Default: `dirname(Base.active_project())`)
- `model_interface_path`: Model interface file to be included during the model run. (Default: `joinpath(experiment_dir, "..", "..", "model_interface.jl")`)
- `time_limit`: Slurm time limit. (Default: `"0:45:00"`)
- `ntasks`: Slurm ntasks. (Default: `1`)
- `cpus_per_task`: Slurm CPUs per task. (Default: `1`)
- `gpus_per_task`: Slurm GPUs per task. (Default: `0`)
- `verbose`: Turn on verbose model logging. (Default: `false`)
"""
function slurm_calibration(;
experiment_dir = dirname(Base.active_project()),
model_interface_path = abspath(
Expand Down Expand Up @@ -37,7 +62,8 @@ function slurm_calibration(;
verbose,
)
end
handle_ensemble_procs(procs, iter, output_dir)

handle_ensemble_procs(procs, iter, output_dir, verbose)
@info "Completed iteration $iter, updating ensemble"
G_ensemble = observation_map(Val(Symbol(config.id)), iter)
save_G_ensemble(config, iter, G_ensemble)
Expand All @@ -46,7 +72,13 @@ function slurm_calibration(;
return eki
end

function handle_ensemble_procs(procs, iteration, output_dir)
"""
handle_ensemble_procs(procs, iteration, output_dir, verbose)
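Helper function for `slurm_calibration`.
Waits on the ensemble of processes running the forward model via Slurm, warns about each member that fails,
kills all processes if interrupted, and raises an error if every member of the ensemble has failed.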
"""
function handle_ensemble_procs(procs, iteration, output_dir, verbose)
# Initial try handles InterruptException
try
asyncmap(enumerate(procs)) do (member, p)
Expand All @@ -57,18 +89,17 @@ function handle_ensemble_procs(procs, iteration, output_dir)
try
wait(p)
if p.exitcode != 0
warn_on_member_error(member, member_log)
warn_on_member_error(member, member_log, verbose)
end
catch e
warn_on_member_error(member, member_log)
warn_on_member_error(member, member_log, verbose)
end
end
catch e
e isa InterruptException && map(p -> kill(p), procs)
e isa InterruptException && foreach(kill, procs)
end
# Wait for processes to be killed
sleep(0.25)

exit_codes = map(x -> getproperty(x, :exitcode), procs)
if !any(x -> x == 0, exit_codes)
error("Full ensemble for iteration $iteration has failed. See model logs in $(path_to_iteration(output_dir, iteration)) for details.")
Expand All @@ -78,19 +109,32 @@ function handle_ensemble_procs(procs, iteration, output_dir)
end

"""
    warn_on_member_error(member, member_log, verbose = false)

Emit a warning that ensemble member `member` raised an error, pointing the user to the
model log at `member_log`.

If `verbose` is `true` and `member_log` exists, the contents of the log are appended to
the warning, with escaped newline sequences in the log expanded into real newlines.
If the log file does not exist, only the short warning is emitted.
"""
function warn_on_member_error(member, member_log, verbose = false)
    warn_str = "Ensemble member $member raised an error. See model log at $member_log for stacktrace"
    # This helper is called from error-handling paths, so it must not throw itself:
    # a member can fail before its log file has been created.
    if verbose && isfile(member_log)
        log_contents = replace(readchomp(member_log), "\\n" => "\n")
        warn_str = warn_str * ": \n$log_contents"
    end
    @warn warn_str
    return nothing
end

"""
srun_model(;
output_dir,
iter,
member,
time_limit,
ntasks,
partition,
cpus_per_task,
gpus_per_task,
experiment_dir,
model_interface_path,
verbose,
)
Runs a single forward model ensemble member. Constructs the `srun` command, then
runs it in a separate process.
"""
function srun_model(;
output_dir,
iter,
Expand Down
1 change: 0 additions & 1 deletion test/e2e_test.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ experiment_config = ExperimentConfig(
prior,
output_dir,
false,
false,
)

# Model interface
Expand Down

0 comments on commit f73d5eb

Please sign in to comment.