From 2e6e77f4803ca9cf22eff1b736867ec244827ce6 Mon Sep 17 00:00:00 2001 From: nefrathenrici Date: Wed, 31 Jul 2024 10:58:55 -0700 Subject: [PATCH] Add pmap for backend --- .buildkite/clima_server_test/pipeline.yml | 2 +- .buildkite/pipeline.yml | 2 +- Project.toml | 3 ++- src/backends.jl | 33 ++++++++++++++++------- test/Project.toml | 3 +++ 5 files changed, 30 insertions(+), 13 deletions(-) diff --git a/.buildkite/clima_server_test/pipeline.yml b/.buildkite/clima_server_test/pipeline.yml index 90c24eb1..02faaaad 100644 --- a/.buildkite/clima_server_test/pipeline.yml +++ b/.buildkite/clima_server_test/pipeline.yml @@ -17,7 +17,7 @@ steps: key: "init_cpu_env" command: - echo "--- Instantiate SurfaceFluxes calibration project" - - julia --project=experiments/surface_fluxes_perfect_model -e 'using Pkg; Pkg.precompile()' + - julia --project=experiments/surface_fluxes_perfect_model -e 'using Pkg; Pkg.develop(;path="."); Pkg.precompile()' - wait - label: "SurfaceFluxes perfect model calibration" diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 085008e6..7288490b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -13,7 +13,7 @@ steps: key: "init_cpu_env" command: - echo "--- Instantiate SurfaceFluxes calibration project" - - julia --project=experiments/surface_fluxes_perfect_model -e 'using Pkg; Pkg.precompile()' + - julia --project=experiments/surface_fluxes_perfect_model -e 'using Pkg; Pkg.develop(;path="."); Pkg.precompile()' - wait - label: "SurfaceFluxes perfect model calibration" diff --git a/Project.toml b/Project.toml index 12eae9ba..8da120ce 100644 --- a/Project.toml +++ b/Project.toml @@ -1,9 +1,10 @@ name = "ClimaCalibrate" uuid = "4347a170-ebd6-470c-89d3-5c705c0cacc2" authors = ["Climate Modeling Alliance"] -version = "0.0.2" +version = "0.0.3" [deps] +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" EnsembleKalmanProcesses = "aa8a2aa5-91d8-4396-bcef-d4f2ec43552d" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" diff --git a/src/backends.jl b/src/backends.jl index b544d08f..af7646a4 100644 --- a/src/backends.jl +++ b/src/backends.jl @@ -1,3 +1,5 @@ +using Distributed + export get_backend, calibrate, model_run abstract type AbstractBackend end @@ -58,28 +60,39 @@ function module_load_string(::Type{DerechoBackend}) """ end -calibrate(config::ExperimentConfig; ekp_kwargs...) = - calibrate(get_backend(), config; ekp_kwargs...) +calibrate(config::ExperimentConfig; reruns = 0, ekp_kwargs...) = + calibrate(get_backend(), config; reruns, ekp_kwargs...) -calibrate(experiment_dir::AbstractString; ekp_kwargs...) = - calibrate(get_backend(), ExperimentConfig(experiment_dir); ekp_kwargs...) +calibrate(experiment_dir::AbstractString; reruns = 0, ekp_kwargs...) = + calibrate( + get_backend(), + ExperimentConfig(experiment_dir); + reruns, + ekp_kwargs..., + ) calibrate( b::Type{JuliaBackend}, experiment_dir::AbstractString; + reruns = 0, ekp_kwargs..., -) = calibrate(b, ExperimentConfig(experiment_dir); ekp_kwargs...) +) = calibrate(b, ExperimentConfig(experiment_dir); reruns, ekp_kwargs...) function calibrate( ::Type{JuliaBackend}, config::ExperimentConfig; + reruns = 0, ekp_kwargs..., ) (; n_iterations, ensemble_size) = config eki = initialize(config; ekp_kwargs...) + on_error(e::InterruptException) = rethrow(e) + on_error(e) = + @error "Single ensemble member has errored. See stacktrace" exception = + (e, catch_backtrace()) for i in 0:(n_iterations - 1) @info "Running iteration $i" - for m in 1:ensemble_size + pmap(1:ensemble_size; retry_delays = reruns, on_error) do m run_forward_model(set_up_forward_model(m, i, config)) @info "Completed member $m" end @@ -100,11 +113,11 @@ Takes either an ExperimentConfig or an experiment folder. Available Backends: CaltechHPCBackend, ClimaGPUBackend, DerechoBackend, JuliaBackend - # Keyword Arguments - `experiment_dir: Directory containing experiment configurations. - `model_interface: Path to the model interface file. - `hpc_kwargs`: Dictionary of resource arguments, passed to the job scheduler. +- `reruns`: Number of times to retry a failed ensemble member. - `verbose::Bool`: Enable verbose logging. # Usage @@ -168,7 +181,7 @@ function calibrate( ) end - statuses = wait_for_jobs( + wait_for_jobs( jobids, output_dir, iter, @@ -206,7 +219,7 @@ Arguments: - hpc_kwargs: Dictionary containing the resources for the job. Easily generated using [`kwargs`](@ref). """ model_run( - b::Type{<:SlurmBackend}, + ::Type{<:SlurmBackend}, iter, member, output_dir, @@ -224,7 +237,7 @@ model_run( hpc_kwargs, ) model_run( - b::Type{DerechoBackend}, + ::Type{DerechoBackend}, iter, member, output_dir, diff --git a/test/Project.toml b/test/Project.toml index be47dac3..b878c3b5 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -11,3 +11,6 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[compat] +EnsembleKalmanProcesses = "< 1.1.6" \ No newline at end of file