From caed9da6728ca201ffab891bb0ab725809b49007 Mon Sep 17 00:00:00 2001 From: nefrathenrici Date: Thu, 5 Dec 2024 17:39:30 -0800 Subject: [PATCH 1/3] Add SlurmManager, simplify model interface --- .github/workflows/ci.yml | 2 +- docs/src/api.md | 3 +- .../Manifest.toml | 416 ++++++++++-------- .../experiment_config.yml | 2 +- .../generate_data.jl | 7 +- .../model_config.yml | 3 - .../model_interface.jl | 78 +--- .../postprocessing.jl | 24 +- src/ClimaCalibrate.jl | 3 + src/backends.jl | 2 +- src/ekp_interface.jl | 23 +- src/model_interface.jl | 28 +- src/pbs.jl | 2 +- src/slurm.jl | 39 +- src/slurm_workers.jl | 307 +++++++++++++ test/hpc_backend_e2e.jl | 34 +- test/model_interface.jl | 41 +- test/pbs_unit_tests.jl | 2 +- test/pure_julia_e2e.jl | 27 +- test/slurm_unit_tests.jl | 3 +- test/slurm_workers.jl | 87 ++++ 21 files changed, 762 insertions(+), 371 deletions(-) delete mode 100644 experiments/surface_fluxes_perfect_model/model_config.yml create mode 100644 src/slurm_workers.jl create mode 100644 test/slurm_workers.jl diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 796f73ee..62092a99 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,7 +48,7 @@ jobs: max_attempts: 2 timeout_minutes: 10 command: | - julia --color=yes --project=test -e 'using Pkg; Pkg.instantiate()' + julia --color=yes --project=test -e 'using Pkg; Pkg.develop(;path="."); Pkg.instantiate()' julia --project=test -e 'using Conda; Conda.add("scipy=1.14.1"); Conda.add("scikit-learn=1.5.1")' env: PYTHON: "" diff --git a/docs/src/api.md b/docs/src/api.md index a55eb4bd..04393628 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -3,8 +3,7 @@ ## Model Interface ```@docs -ClimaCalibrate.set_up_forward_model -ClimaCalibrate.run_forward_model +ClimaCalibrate.forward_model ClimaCalibrate.observation_map ``` diff --git a/experiments/surface_fluxes_perfect_model/Manifest.toml b/experiments/surface_fluxes_perfect_model/Manifest.toml index 8dcdb428..58331155 100644 --- a/experiments/surface_fluxes_perfect_model/Manifest.toml +++ b/experiments/surface_fluxes_perfect_model/Manifest.toml @@ -28,9 +28,9 @@ version = "0.4.5" [[deps.Adapt]] deps = ["LinearAlgebra", "Requires"] -git-tree-sha1 = "6a55b747d1812e699320963ffde36f1ebdda4099" +git-tree-sha1 = "50c3c56a52972d78e8be9fd135bfb91c9574c140" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "4.0.4" +version = "4.1.1" weakdeps = ["StaticArrays"] [deps.Adapt.extensions] @@ -49,9 +49,9 @@ version = "1.1.3" [[deps.Animations]] deps = ["Colors"] -git-tree-sha1 = "e81c509d2c8e49592413bfb0bb3b08150056c79d" +git-tree-sha1 = "e092fa223bf66a3c41f9c022bd074d916dc303e7" uuid = "27a7e980-b3e6-11e9-2bcd-0b925532e340" -version = "0.4.1" +version = "0.4.2" [[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" @@ -71,15 +71,16 @@ version = "3.5.1+1" [[deps.ArrayInterface]] deps = ["Adapt", "LinearAlgebra"] -git-tree-sha1 = "3640d077b6dafd64ceb8fd5c1ec76f7ca53bcf76" +git-tree-sha1 = "d5140b60b87473df18cf4fe66382b7c3596df047" uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" -version = "7.16.0" +version = "7.17.1" [deps.ArrayInterface.extensions] ArrayInterfaceBandedMatricesExt = "BandedMatrices" ArrayInterfaceBlockBandedMatricesExt = "BlockBandedMatrices" ArrayInterfaceCUDAExt = "CUDA" ArrayInterfaceCUDSSExt = "CUDSS" + ArrayInterfaceChainRulesCoreExt = "ChainRulesCore" ArrayInterfaceChainRulesExt = "ChainRules" ArrayInterfaceGPUArraysCoreExt = "GPUArraysCore" ArrayInterfaceReverseDiffExt = "ReverseDiff" @@ -93,6 +94,7 @@ version = "7.16.0" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e" ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2" + ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" @@ -105,15 +107,25 @@ version = "1.11.0" [[deps.Atomix]] deps = ["UnsafeAtomics"] -git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" +git-tree-sha1 = "c3b238aa28c1bebd4b5ea4988bebf27e9a01b72b" uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" -version = "0.1.0" +version = "1.0.1" + + [deps.Atomix.extensions] + AtomixCUDAExt = "CUDA" + AtomixMetalExt = "Metal" + AtomixoneAPIExt = "oneAPI" + + [deps.Atomix.weakdeps] + CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" + Metal = "dde4c033-4e86-420c-a63e-0dd931031962" + oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [[deps.Automa]] -deps = ["PrecompileTools", "TranscodingStreams"] -git-tree-sha1 = "014bc22d6c400a7703c0f5dc1fdc302440cf88be" +deps = ["PrecompileTools", "SIMD", "TranscodingStreams"] +git-tree-sha1 = "a8f503e8e1a5f583fbef15a8440c8c7e32185df2" uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" -version = "1.0.4" +version = "1.1.0" [[deps.AxisAlgorithms]] deps = ["LinearAlgebra", "Random", "SparseArrays", "WoodburyMatrices"] @@ -139,9 +151,9 @@ version = "1.5.0" [[deps.Bzip2_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "9e2a6b69137e6969bab0152632dcb3bc108c8bdd" +git-tree-sha1 = "8873e196c2eb87962a2048b3b8e08946535864a1" uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" -version = "1.0.8+1" +version = "1.0.8+2" [[deps.CEnum]] git-tree-sha1 = "389ad5c84de1ae7cf0e28e381131c98ea87d54fc" @@ -160,9 +172,9 @@ version = "1.0.1+0" [[deps.Cairo]] deps = ["Cairo_jll", "Colors", "Glib_jll", "Graphics", "Libdl", "Pango_jll"] -git-tree-sha1 = "7b6ad8c35f4bc3bca8eb78127c8b99719506a5fb" +git-tree-sha1 = "71aa551c5c33f1a4415867fe06b7844faadb0ae9" uuid = "159f3aea-2a34-519c-b102-8c37f9878175" -version = "1.1.0" +version = "1.1.1" [[deps.CairoMakie]] deps = ["CRC32c", "Cairo", "Cairo_jll", "Colors", "FileIO", "FreeType", "GeometryBasics", "LinearAlgebra", "Makie", "PrecompileTools"] @@ -172,9 +184,9 @@ version = "0.12.12" [[deps.Cairo_jll]] deps = ["Artifacts", "Bzip2_jll", "CompilerSupportLibraries_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"] -git-tree-sha1 = "a2f1c8c668c8e3cb4cca4e57a8efdb09067bb3fd" +git-tree-sha1 = "009060c9a6168704143100f36ab08f06c2af4642" uuid = "83423d85-b0ee-5818-9007-b63ccbeb887a" -version = "1.18.0+2" +version = "1.18.2+1" [[deps.ChainRulesCore]] deps = ["Compat", "LinearAlgebra"] @@ -187,10 +199,10 @@ weakdeps = ["SparseArrays"] ChainRulesCoreSparseArraysExt = "SparseArrays" [[deps.ClimaCalibrate]] -deps = ["Distributed", "Distributions", "EnsembleKalmanProcesses", "JLD2", "Random", "TOML", "YAML"] +deps = ["ClusterManagers", "Distributed", "Distributions", "EnsembleKalmanProcesses", "JLD2", "Random", "TOML", "YAML"] path = "../.." uuid = "4347a170-ebd6-470c-89d3-5c705c0cacc2" -version = "0.0.4" +version = "0.0.5" [deps.ClimaCalibrate.extensions] CESExt = "CalibrateEmulateSample" @@ -204,6 +216,12 @@ git-tree-sha1 = "b43ca371c435056129295445122ea87fd843b505" uuid = "5c42b081-d73a-476f-9059-fd94b934656c" version = "0.10.14" +[[deps.ClusterManagers]] +deps = ["Distributed", "Logging", "Pkg", "Sockets"] +git-tree-sha1 = "6a678b98d5ea4d2773e92c7ae607cf7371043684" +uuid = "34f1f09b-3a8b-5176-ab39-66d58a4d544e" +version = "0.4.6" + [[deps.CodecBzip2]] deps = ["Bzip2_jll", "TranscodingStreams"] git-tree-sha1 = "e7c529cc31bb85b97631b922fa2e6baf246f5905" @@ -224,9 +242,9 @@ version = "0.4.0" [[deps.ColorSchemes]] deps = ["ColorTypes", "ColorVectorSpace", "Colors", "FixedPointNumbers", "PrecompileTools", "Random"] -git-tree-sha1 = "b5278586822443594ff615963b0c09755771b3e0" +git-tree-sha1 = "c785dfb1b3bfddd1da557e861b919819b82bbe5b" uuid = "35d6a980-a343-548e-a6ea-1d62b119f2f4" -version = "3.26.0" +version = "3.27.1" [[deps.ColorTypes]] deps = ["FixedPointNumbers", "Random"] @@ -289,9 +307,9 @@ version = "0.6.3" [[deps.Convex]] deps = ["AbstractTrees", "BenchmarkTools", "LDLFactorizations", "LinearAlgebra", "MathOptInterface", "OrderedCollections", "SparseArrays", "Test"] -git-tree-sha1 = "dac1878b4996fa56292d2c3bd28f2498b980bb93" +git-tree-sha1 = "e84e371b9206bdd678fe7a8cf809c7dec949e88f" uuid = "f65535da-76fb-5f13-bab9-19810c17039a" -version = "0.16.3" +version = "0.15.4" [[deps.DataAPI]] git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe" @@ -315,10 +333,10 @@ uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" version = "1.11.0" [[deps.DelaunayTriangulation]] -deps = ["AdaptivePredicates", "EnumX", "ExactPredicates", "PrecompileTools", "Random"] -git-tree-sha1 = "668bb97ea6df5e654e6288d87d2243591fe68665" +deps = ["AdaptivePredicates", "EnumX", "ExactPredicates", "Random"] +git-tree-sha1 = "e1371a23fd9816080c828d0ce04373857fe73d33" uuid = "927a84f5-c5f4-47a5-9785-b46e178433df" -version = "1.6.0" +version = "1.6.3" [[deps.DiffResults]] deps = ["StaticArraysCore"] @@ -371,10 +389,10 @@ uuid = "5ae413db-bbd1-5e63-b57d-d24a61df00f5" version = "2.2.4+0" [[deps.EnsembleKalmanProcesses]] -deps = ["Convex", "Distributions", "DocStringExtensions", "GaussianRandomFields", "Interpolations", "LinearAlgebra", "MathOptInterface", "Optim", "QuadGK", "Random", "RecipesBase", "SCS", "SparseArrays", "Statistics", "StatsBase", "TOML"] -git-tree-sha1 = "00bb94ff704d7aeed9c72d4a2a05d6abf6cb7946" +deps = ["Convex", "Distributions", "DocStringExtensions", "GaussianRandomFields", "LinearAlgebra", "MathOptInterface", "Optim", "QuadGK", "Random", "RecipesBase", "SCS", "SparseArrays", "Statistics", "StatsBase", "TOML"] +git-tree-sha1 = "b67e9cc4cd50415c17388696c4ec208b02fceba2" uuid = "aa8a2aa5-91d8-4396-bcef-d4f2ec43552d" -version = "2.0.1" +version = "1.1.5" [[deps.EnumX]] git-tree-sha1 = "bdb1942cd4c45e3c678fd11569d5cccd80976237" @@ -389,9 +407,9 @@ version = "2.2.8" [[deps.Expat_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "1c6317308b9dc757616f0b5cb379db10494443a7" +git-tree-sha1 = "e51db81749b0777b2147fbe7b783ee79045b8e99" uuid = "2e619515-83b5-522b-bb60-26c02a35a201" -version = "2.6.2+0" +version = "2.6.4+1" [[deps.Extents]] git-tree-sha1 = "81023caa0021a41712685887db1fc03db26f41f5" @@ -424,9 +442,15 @@ version = "0.5.1" [[deps.FileIO]] deps = ["Pkg", "Requires", "UUIDs"] -git-tree-sha1 = "82d8afa92ecf4b52d78d869f038ebfb881267322" +git-tree-sha1 = "2dd20384bf8c6d411b5c7370865b1e9b26cb2ea3" uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" -version = "1.16.3" +version = "1.16.6" + + [deps.FileIO.extensions] + HTTPExt = "HTTP" + + [deps.FileIO.weakdeps] + HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" [[deps.FilePaths]] deps = ["FilePathsBase", "MacroTools", "Reexport", "Requires"] @@ -462,19 +486,21 @@ weakdeps = ["PDMats", "SparseArrays", "Statistics"] FillArraysStatisticsExt = "Statistics" [[deps.FiniteDiff]] -deps = ["ArrayInterface", "LinearAlgebra", "Setfield", "SparseArrays"] -git-tree-sha1 = "f9219347ebf700e77ca1d48ef84e4a82a6701882" +deps = ["ArrayInterface", "LinearAlgebra", "Setfield"] +git-tree-sha1 = "84e3a47db33be7248daa6274b287507dd6ff84e8" uuid = "6a86dc24-6348-571c-b903-95158fe2bd41" -version = "2.24.0" +version = "2.26.2" [deps.FiniteDiff.extensions] FiniteDiffBandedMatricesExt = "BandedMatrices" FiniteDiffBlockBandedMatricesExt = "BlockBandedMatrices" + FiniteDiffSparseArraysExt = "SparseArrays" FiniteDiffStaticArraysExt = "StaticArrays" [deps.FiniteDiff.weakdeps] BandedMatrices = "aae01518-5342-5314-be14-df237901396f" BlockBandedMatrices = "ffab5731-97b5-5995-9138-79e8c1846df0" + SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" [[deps.FixedPointNumbers]] @@ -485,9 +511,9 @@ version = "0.8.5" [[deps.Fontconfig_jll]] deps = ["Artifacts", "Bzip2_jll", "Expat_jll", "FreeType2_jll", "JLLWrappers", "Libdl", "Libuuid_jll", "Zlib_jll"] -git-tree-sha1 = "db16beca600632c95fc8aca29890d83788dd8b23" +git-tree-sha1 = "21fac3c77d7b5a9fc03b0ec503aa1a6392c34d2b" uuid = "a3f928ae-7b40-5064-980b-68af3947d34b" -version = "2.13.96+0" +version = "2.15.0+0" [[deps.Format]] git-tree-sha1 = "9c68794ef81b08086aeb32eeaf33531668d5f5fc" @@ -496,9 +522,9 @@ version = "1.3.7" [[deps.ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "LogExpFunctions", "NaNMath", "Preferences", "Printf", "Random", "SpecialFunctions"] -git-tree-sha1 = "cf0fe81336da9fb90944683b8c41984b08793dad" +git-tree-sha1 = "a2df1b776752e3f344e5116c06d75a10436ab853" uuid = "f6369f11-7733-5829-9624-2563aa707210" -version = "0.10.36" +version = "0.10.38" weakdeps = ["StaticArrays"] [deps.ForwardDiff.extensions] @@ -512,15 +538,15 @@ version = "4.1.1" [[deps.FreeType2_jll]] deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Zlib_jll"] -git-tree-sha1 = "5c1d8ae0efc6c2e7b1fc502cbe25def8f661b7bc" +git-tree-sha1 = "786e968a8d2fb167f2e4880baba62e0e26bd8e4e" uuid = "d7e528f0-a631-5988-bf34-fe36492bcfd7" -version = "2.13.2+0" +version = "2.13.3+1" [[deps.FreeTypeAbstraction]] deps = ["ColorVectorSpace", "Colors", "FreeType", "GeometryBasics"] -git-tree-sha1 = "2493cdfd0740015955a8e46de4ef28f49460d8bc" +git-tree-sha1 = "d52e255138ac21be31fa633200b65e4e71d26802" uuid = "663a7486-cb36-511b-a19d-713bb74d65c9" -version = "0.10.3" +version = "0.10.6" [[deps.FriBidi_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -546,9 +572,9 @@ version = "0.4.2" [[deps.GeoInterface]] deps = ["Extents", "GeoFormatTypes"] -git-tree-sha1 = "2f6fce56cdb8373637a6614e14a5768a88450de2" +git-tree-sha1 = "826b4fd69438d9ce4d2b19de6bc2f970f45f0f88" uuid = "cf35fbd7-0cd7-5166-be24-54bfbe79505f" -version = "1.3.7" +version = "1.3.8" [[deps.GeometryBasics]] deps = ["EarCut_jll", "Extents", "GeoInterface", "IterTools", "LinearAlgebra", "StaticArrays", "StructArrays", "Tables"] @@ -562,29 +588,35 @@ git-tree-sha1 = "9b02998aba7bf074d14de89f9d37ca24a1a0b046" uuid = "78b55507-aeef-58d4-861c-77aaff3498b1" version = "0.21.0+0" +[[deps.Giflib_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "0224cce99284d997f6880a42ef715a37c99338d1" +uuid = "59f7168a-df46-5410-90c8-f2779963d0ec" +version = "5.2.2+0" + [[deps.Glib_jll]] deps = ["Artifacts", "Gettext_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Libiconv_jll", "Libmount_jll", "PCRE2_jll", "Zlib_jll"] -git-tree-sha1 = "674ff0db93fffcd11a3573986e550d66cd4fd71f" +git-tree-sha1 = "48b5d4c75b2c9078ead62e345966fa51a25c05ad" uuid = "7746bdde-850d-59dc-9ae8-88ece973131d" -version = "2.80.5+0" +version = "2.82.2+1" [[deps.Graphics]] deps = ["Colors", "LinearAlgebra", "NaNMath"] -git-tree-sha1 = "d61890399bc535850c4bf08e4e0d3a7ad0f21cbd" +git-tree-sha1 = "a641238db938fff9b2f60d08ed9030387daf428c" uuid = "a2bd30eb-e257-5431-a919-1863eab51364" -version = "1.1.2" +version = "1.1.3" [[deps.Graphite2_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "344bf40dcab1073aca04aa0df4fb092f920e4011" +git-tree-sha1 = "01979f9b37367603e2848ea225918a3b3861b606" uuid = "3b182d85-2403-5c21-9c21-1e1f0cc25472" -version = "1.3.14+0" +version = "1.3.14+1" [[deps.GridLayoutBase]] deps = ["GeometryBasics", "InteractiveUtils", "Observables"] -git-tree-sha1 = "fc713f007cff99ff9e50accba6373624ddd33588" +git-tree-sha1 = "dc6bed05c15523624909b3953686c5f5ffa10adc" uuid = "3955a311-db13-416c-9275-1d80ed98e5e9" -version = "0.11.0" +version = "0.11.1" [[deps.Grisu]] git-tree-sha1 = "53bb909d1151e57e2484c3d1b53e19552b887fb2" @@ -593,21 +625,21 @@ version = "1.0.2" [[deps.HarfBuzz_jll]] deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "Graphite2_jll", "JLLWrappers", "Libdl", "Libffi_jll"] -git-tree-sha1 = "401e4f3f30f43af2c8478fc008da50096ea5240f" +git-tree-sha1 = "55c53be97790242c29031e5cd45e8ac296dadda3" uuid = "2e76f6c2-a576-52d4-95c1-20adfe4de566" -version = "8.3.1+0" +version = "8.5.0+0" [[deps.HypergeometricFunctions]] deps = ["LinearAlgebra", "OpenLibm_jll", "SpecialFunctions"] -git-tree-sha1 = "7c4195be1649ae622304031ed46a2f4df989f1eb" +git-tree-sha1 = "b1c2585431c382e3fe5805874bda6aea90a95de9" uuid = "34004b35-14d8-5ef3-9330-4cdb6864b03a" -version = "0.3.24" +version = "0.3.25" [[deps.ImageAxes]] deps = ["AxisArrays", "ImageBase", "ImageCore", "Reexport", "SimpleTraits"] -git-tree-sha1 = "2e4520d67b0cef90865b3ef727594d2a58e0e1f8" +git-tree-sha1 = "e12629406c6c4442539436581041d372d69c55ba" uuid = "2803e5a7-5153-5ecf-9a86-9b4c37f5f5ac" -version = "0.6.11" +version = "0.6.12" [[deps.ImageBase]] deps = ["ImageCore", "Reexport"] @@ -617,21 +649,21 @@ version = "0.1.7" [[deps.ImageCore]] deps = ["ColorVectorSpace", "Colors", "FixedPointNumbers", "MappedArrays", "MosaicViews", "OffsetArrays", "PaddedViews", "PrecompileTools", "Reexport"] -git-tree-sha1 = "b2a7eaa169c13f5bcae8131a83bc30eff8f71be0" +git-tree-sha1 = "8c193230235bbcee22c8066b0374f63b5683c2d3" uuid = "a09fc81d-aa75-5fe9-8630-4744c3626534" -version = "0.10.2" +version = "0.10.5" [[deps.ImageIO]] -deps = ["FileIO", "IndirectArrays", "JpegTurbo", "LazyModules", "Netpbm", "OpenEXR", "PNGFiles", "QOI", "Sixel", "TiffImages", "UUIDs"] -git-tree-sha1 = "437abb322a41d527c197fa800455f79d414f0a3c" +deps = ["FileIO", "IndirectArrays", "JpegTurbo", "LazyModules", "Netpbm", "OpenEXR", "PNGFiles", "QOI", "Sixel", "TiffImages", "UUIDs", "WebP"] +git-tree-sha1 = "696144904b76e1ca433b886b4e7edd067d76cbf7" uuid = "82e4d734-157c-48bb-816b-45c225c6df19" -version = "0.6.8" +version = "0.6.9" [[deps.ImageMetadata]] deps = ["AxisArrays", "ImageAxes", "ImageBase", "ImageCore"] -git-tree-sha1 = "355e2b974f2e3212a75dfb60519de21361ad3cb7" +git-tree-sha1 = "2a81c3897be6fbcde0802a0ebe6796d0562f63ec" uuid = "bc367c6b-8a6b-528e-b4bd-a4b897500b49" -version = "0.9.9" +version = "0.9.10" [[deps.Imath_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -671,17 +703,16 @@ weakdeps = ["Unitful"] InterpolationsUnitfulExt = "Unitful" [[deps.IntervalArithmetic]] -deps = ["CRlibm_jll", "MacroTools", "RoundingEmulator"] -git-tree-sha1 = "8e125d40cae3a9f4276cdfeb4fcdb1828888a4b3" +deps = ["CRlibm_jll", "LinearAlgebra", "MacroTools", "RoundingEmulator"] +git-tree-sha1 = "24c095b1ec7ee58b936985d31d5df92f9b9cfebb" uuid = "d1acc4aa-44c8-5952-acd4-ba5d80a2a253" -version = "0.22.17" -weakdeps = ["DiffRules", "ForwardDiff", "IntervalSets", "LinearAlgebra", "RecipesBase"] +version = "0.22.19" +weakdeps = ["DiffRules", "ForwardDiff", "IntervalSets", "RecipesBase"] [deps.IntervalArithmetic.extensions] IntervalArithmeticDiffRulesExt = "DiffRules" IntervalArithmeticForwardDiffExt = "ForwardDiff" IntervalArithmeticIntervalSetsExt = "IntervalSets" - IntervalArithmeticLinearAlgebraExt = "LinearAlgebra" IntervalArithmeticRecipesBaseExt = "RecipesBase" [[deps.IntervalSets]] @@ -724,9 +755,9 @@ version = "0.4.53" [[deps.JLLWrappers]] deps = ["Artifacts", "Preferences"] -git-tree-sha1 = "f389674c99bfcde17dc57454011aa44d5a260a40" +git-tree-sha1 = "be3dc50a92e5a386872a493a10050136d4703f9b" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.6.0" +version = "1.6.1" [[deps.JSON]] deps = ["Dates", "Mmap", "Parsers", "Unicode"] @@ -747,10 +778,10 @@ uuid = "aacddb02-875f-59d6-b918-886e6ef4fbf8" version = "3.0.4+0" [[deps.KernelAbstractions]] -deps = ["Adapt", "Atomix", "InteractiveUtils", "MacroTools", "PrecompileTools", "Requires", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] -git-tree-sha1 = "04e52f596d0871fa3890170fa79cb15e481e4cd8" +deps = ["Adapt", "Atomix", "InteractiveUtils", "MacroTools", "PrecompileTools", "Requires", "StaticArrays", "UUIDs"] +git-tree-sha1 = "b9a838cd3028785ac23822cded5126b3da394d1a" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -version = "0.9.28" +version = "0.9.31" [deps.KernelAbstractions.extensions] EnzymeExt = "EnzymeCore" @@ -780,23 +811,11 @@ git-tree-sha1 = "70f582b446a1c3ad82cf87e62b878668beef9d13" uuid = "40e66cde-538c-5869-a4ad-c39174c6795b" version = "0.10.1" -[[deps.LLVM]] -deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Requires", "Unicode"] -git-tree-sha1 = "4ad43cb0a4bb5e5b1506e1d1f48646d7e0c80363" -uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "9.1.2" - - [deps.LLVM.extensions] - BFloat16sExt = "BFloat16s" - - [deps.LLVM.weakdeps] - BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" - -[[deps.LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] -git-tree-sha1 = "05a8bd5a42309a9ec82f700876903abce1017dd3" -uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.34+0" +[[deps.LERC_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "36bdbc52f13a7d1dcb0f3cd694e01677a515655b" +uuid = "88015f11-f218-50d7-93a8-a6af411a945d" +version = "4.0.0+0" [[deps.LLVMOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -811,9 +830,9 @@ uuid = "dd4b983a-f0e5-5f8d-a1b7-129d4a5fb1ac" version = "2.10.2+1" [[deps.LaTeXStrings]] -git-tree-sha1 = "50901ebc375ed41dbf8058da26f9de442febbbec" +git-tree-sha1 = "dda21b8cbd6a6c40d9d02a73230f9d70fed6918c" uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" -version = "1.3.1" +version = "1.4.0" [[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] @@ -862,33 +881,45 @@ version = "3.2.2+1" [[deps.Libgcrypt_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll"] -git-tree-sha1 = "9fd170c4bbfd8b935fdc5f8b7aa33532c991a673" +git-tree-sha1 = "8be878062e0ffa2c3f67bb58a595375eda5de80b" uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" -version = "1.8.11+0" +version = "1.11.0+0" + +[[deps.Libglvnd_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libX11_jll", "Xorg_libXext_jll"] +git-tree-sha1 = "ff3b4b9d35de638936a525ecd36e86a8bb919d11" +uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" +version = "1.7.0+0" [[deps.Libgpg_error_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "fbb1f2bef882392312feb1ede3615ddc1e9b99ed" +git-tree-sha1 = "c6ce1e19f3aec9b59186bdf06cdf3c4fc5f5f3e6" uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" -version = "1.49.0+0" +version = "1.50.0+0" [[deps.Libiconv_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175" +git-tree-sha1 = "61dfdba58e585066d8bce214c5a51eaa0539f269" uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" -version = "1.17.0+0" +version = "1.17.0+1" [[deps.Libmount_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "0c4f9c4f1a50d8f35048fa0532dabbadf702f81e" +git-tree-sha1 = "84eef7acd508ee5b3e956a2ae51b05024181dee0" uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9" -version = "2.40.1+0" +version = "2.40.2+0" + +[[deps.Libtiff_jll]] +deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"] +git-tree-sha1 = "b404131d06f7886402758c9ce2214b636eb4d54a" +uuid = "89763e89-9b03-5906-acba-b20f662cd828" +version = "4.7.0+0" [[deps.Libuuid_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "5ee6203157c120d79034c748a2acba45b82b8807" +git-tree-sha1 = "edbf5309f9ddf1cab25afc344b1e8150b7c832f9" uuid = "38a345b3-de98-5d2b-a5d3-14cd9215e700" -version = "2.40.1+0" +version = "2.40.2+0" [[deps.LineSearches]] deps = ["LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "Printf"] @@ -903,9 +934,9 @@ version = "1.11.0" [[deps.LogExpFunctions]] deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] -git-tree-sha1 = "a2d09619db4e765091ee5c6ffe8872849de0feea" +git-tree-sha1 = "13ca9e2586b89836fd20cccf56e57e2b9ae7f38f" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" -version = "0.3.28" +version = "0.3.29" [deps.LogExpFunctions.extensions] LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" @@ -957,15 +988,15 @@ version = "1.11.0" [[deps.MathOptInterface]] deps = ["BenchmarkTools", "CodecBzip2", "CodecZlib", "DataStructures", "ForwardDiff", "JSON", "LinearAlgebra", "MutableArithmetics", "NaNMath", "OrderedCollections", "PrecompileTools", "Printf", "SparseArrays", "SpecialFunctions", "Test", "Unicode"] -git-tree-sha1 = "5b246fca5420ae176d65ed43a2d0ee5897775216" +git-tree-sha1 = "e065ca5234f53fd6f920efaee4940627ad991fb4" uuid = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" -version = "1.31.2" +version = "1.34.0" [[deps.MathTeXEngine]] deps = ["AbstractTrees", "Automa", "DataStructures", "FreeTypeAbstraction", "GeometryBasics", "LaTeXStrings", "REPL", "RelocatableFolders", "UnicodeFun"] -git-tree-sha1 = "e1641f32ae592e415e3dbae7f4a188b5316d4b62" +git-tree-sha1 = "f45c8916e8385976e1ccd055c9874560c257ab13" uuid = "0a4f8689-d25c-4efe-a92b-7142dfc1aa53" -version = "0.6.1" +version = "0.6.2" [[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] @@ -994,9 +1025,9 @@ version = "2023.12.12" [[deps.MutableArithmetics]] deps = ["LinearAlgebra", "SparseArrays", "Test"] -git-tree-sha1 = "3eba928678787843e504c153a9b8e80d7d73ab17" +git-tree-sha1 = "a2710df6b0931f987530f59427441b21245d8f5e" uuid = "d8a4904e-b15c-11e9-3269-09a3773c0cb0" -version = "1.5.0" +version = "1.6.0" [[deps.NLSolversBase]] deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"] @@ -1026,9 +1057,9 @@ uuid = "510215fc-4207-5dde-b226-833fc4488ee2" version = "0.5.5" [[deps.OffsetArrays]] -git-tree-sha1 = "1a27764e945a152f7ca7efa04de513d473e9542e" +git-tree-sha1 = "39d000d9c33706b8364817d8894fae1548f40295" uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" -version = "1.14.1" +version = "1.14.2" weakdeps = ["Adapt"] [deps.OffsetArrays.extensions] @@ -1053,9 +1084,9 @@ version = "0.3.27+1" [[deps.OpenEXR]] deps = ["Colors", "FileIO", "OpenEXR_jll"] -git-tree-sha1 = "327f53360fdb54df7ecd01e96ef1983536d1e633" +git-tree-sha1 = "97db9e07fe2091882c765380ef58ec553074e9c7" uuid = "52e1d378-f018-4a11-a4be-720524705ac7" -version = "0.3.2" +version = "0.3.3" [[deps.OpenEXR_jll]] deps = ["Artifacts", "Imath_jll", "JLLWrappers", "Libdl", "Zlib_jll"] @@ -1082,9 +1113,9 @@ version = "0.5.5+0" [[deps.Optim]] deps = ["Compat", "FillArrays", "ForwardDiff", "LineSearches", "LinearAlgebra", "NLSolversBase", "NaNMath", "Parameters", "PositiveFactorizations", "Printf", "SparseArrays", "StatsBase"] -git-tree-sha1 = "d9b79c4eed437421ac4285148fcadf42e0700e89" +git-tree-sha1 = "ab7edad78cdef22099f43c54ef77ac63c2c9cc64" uuid = "429524aa-4258-5aef-a3af-852621145aeb" -version = "1.9.4" +version = "1.10.0" weakdeps = ["MathOptInterface"] [deps.Optim.extensions] @@ -1097,9 +1128,9 @@ uuid = "91d4177d-7536-5919-b921-800302f37372" version = "1.3.3+0" [[deps.OrderedCollections]] -git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5" +git-tree-sha1 = "12f1439c4f986bb868acda6ea33ebc78e19b95ad" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.6.3" +version = "1.7.0" [[deps.PCRE2_jll]] deps = ["Artifacts", "Libdl"] @@ -1120,9 +1151,9 @@ version = "0.4.3" [[deps.Packing]] deps = ["GeometryBasics"] -git-tree-sha1 = "ec3edfe723df33528e085e632414499f26650501" +git-tree-sha1 = "bc5bf2ea3d5351edf285a06b0016788a121ce92c" uuid = "19eb6ba3-879d-56ad-ad62-d5c202156566" -version = "0.5.0" +version = "0.5.1" [[deps.PaddedViews]] deps = ["OffsetArrays"] @@ -1170,10 +1201,10 @@ uuid = "eebad327-c553-4316-9ea0-9fa01ccd7688" version = "0.3.3" [[deps.PlotUtils]] -deps = ["ColorSchemes", "Colors", "Dates", "PrecompileTools", "Printf", "Random", "Reexport", "Statistics"] -git-tree-sha1 = "7b1a9df27f072ac4c9c7cbe5efb198489258d1f5" +deps = ["ColorSchemes", "Colors", "Dates", "PrecompileTools", "Printf", "Random", "Reexport", "StableRNGs", "Statistics"] +git-tree-sha1 = "3ca9a356cd2e113c420f2c13bea19f8d3fb1cb18" uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" -version = "1.4.1" +version = "1.4.3" [[deps.PolygonOps]] git-tree-sha1 = "77b3d3605fc1cd0b42d95eba87dfcd2bf67d5ff6" @@ -1220,9 +1251,9 @@ version = "1.2.1" [[deps.QOI]] deps = ["ColorTypes", "FileIO", "FixedPointNumbers"] -git-tree-sha1 = "18e8f4d1426e965c7b532ddd260599e1510d26ce" +git-tree-sha1 = "8b3fc30bc0390abdce15f8822c889f669baed73d" uuid = "4b34888f-f399-49d4-9bb3-47ed5cae4e65" -version = "1.0.0" +version = "1.0.1" [[deps.QuadGK]] deps = ["DataStructures", "LinearAlgebra"] @@ -1308,24 +1339,22 @@ uuid = "5eaf0fd0-dfba-4ccb-bf02-d820a40db705" version = "0.2.1" [[deps.SCS]] -deps = ["MathOptInterface", "Requires", "SCS_jll", "SparseArrays"] -git-tree-sha1 = "0dfe49eaa058ce905a4199af379b8e411e6126e5" +deps = ["MathOptInterface", "Requires", "SCS_GPU_jll", "SCS_jll", "SparseArrays"] +git-tree-sha1 = "8d908b7c81e199ee92d17b6192849e8c43d2f31d" uuid = "c946c3f1-0d1f-5ce8-9dea-7daa1f7e2d13" -version = "2.0.1" - - [deps.SCS.extensions] - SCSSCS_GPU_jllExt = ["SCS_GPU_jll"] - SCSSCS_MKL_jllExt = ["SCS_MKL_jll"] +version = "1.1.2" - [deps.SCS.weakdeps] - SCS_GPU_jll = "af6e375f-46ec-5fa0-b791-491b0dfa44a4" - SCS_MKL_jll = "3f2553a9-4106-52be-b7dd-865123654657" +[[deps.SCS_GPU_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "OpenBLAS32_jll", "Pkg"] +git-tree-sha1 = "f912271ecccb00acaddfab2943e9b33d5ec36d3b" +uuid = "af6e375f-46ec-5fa0-b791-491b0dfa44a4" +version = "3.2.0+0" [[deps.SCS_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LLVMOpenMP_jll", "Libdl", "OpenBLAS32_jll"] -git-tree-sha1 = "668bcf4b25cf992564321ccb70b205f9a7487cfa" +deps = ["Artifacts", "JLLWrappers", "Libdl", "OpenBLAS32_jll", "Pkg"] +git-tree-sha1 = "ba5c0d3b23220d3598d2877b4cf913e3fcf8add3" uuid = "f4f2fc5b-1d94-523c-97ea-2ab488bedf4b" -version = "3.2.6+0" +version = "3.2.0+0" [[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -1333,9 +1362,9 @@ version = "0.7.0" [[deps.SIMD]] deps = ["PrecompileTools"] -git-tree-sha1 = "98ca7c29edd6fc79cd74c61accb7010a4e7aee33" +git-tree-sha1 = "52af86e35dd1b177d051b12681e1c581f53c281b" uuid = "fdea26ae-647d-5447-a871-4b548cad5224" -version = "3.6.0" +version = "3.7.0" [[deps.Scratch]] deps = ["Dates"] @@ -1405,14 +1434,20 @@ version = "1.11.0" [[deps.SpecialFunctions]] deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "2f5d4697f21388cbe1ff299430dd169ef97d7e14" +git-tree-sha1 = "64cca0c26b4f31ba18f13f6c12af7c85f478cfde" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.4.0" +version = "2.5.0" weakdeps = ["ChainRulesCore"] [deps.SpecialFunctions.extensions] SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" +[[deps.StableRNGs]] +deps = ["Random"] +git-tree-sha1 = "83e6cce8324d49dfaf9ef059227f91ed4441a8e5" +uuid = "860ef19b-820b-49d6-a774-d7a799459cd3" +version = "1.0.2" + [[deps.StackViews]] deps = ["OffsetArrays"] git-tree-sha1 = "46e589465204cd0c08b4bd97385e4fa79a0c770c" @@ -1421,9 +1456,9 @@ version = "0.1.1" [[deps.StaticArrays]] deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"] -git-tree-sha1 = "eeafab08ae20c62c44c8399ccb9354a04b80db50" +git-tree-sha1 = "777657803913ffc7e8cc20f0fd04b634f871af8f" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.9.7" +version = "1.9.8" weakdeps = ["ChainRulesCore", "Statistics"] [deps.StaticArrays.extensions] @@ -1479,19 +1514,22 @@ version = "0.3.7" [[deps.StructArrays]] deps = ["ConstructionBase", "DataAPI", "Tables"] -git-tree-sha1 = "f4dc295e983502292c4c3f951dbb4e985e35b3be" +git-tree-sha1 = "9537ef82c42cdd8c5d443cbc359110cbb36bae10" uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" -version = "0.6.18" +version = "0.6.21" [deps.StructArrays.extensions] StructArraysAdaptExt = "Adapt" - StructArraysGPUArraysCoreExt = "GPUArraysCore" + StructArraysGPUArraysCoreExt = ["GPUArraysCore", "KernelAbstractions"] + StructArraysLinearAlgebraExt = "LinearAlgebra" StructArraysSparseArraysExt = "SparseArrays" StructArraysStaticArraysExt = "StaticArrays" [deps.StructArrays.weakdeps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" + KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" + LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" @@ -1563,14 +1601,14 @@ weakdeps = ["ClimaParams"] [[deps.TiffImages]] deps = ["ColorTypes", "DataStructures", "DocStringExtensions", "FileIO", "FixedPointNumbers", "IndirectArrays", "Inflate", "Mmap", "OffsetArrays", "PkgVersion", "ProgressMeter", "SIMD", "UUIDs"] -git-tree-sha1 = "bc7fd5c91041f44636b2c134041f7e5263ce58ae" +git-tree-sha1 = "0248b1b2210285652fbc67fd6ced9bf0394bcfec" uuid = "731e570b-9d59-4bfa-96dc-6df516fadf69" -version = "0.10.0" +version = "0.11.1" [[deps.TranscodingStreams]] -git-tree-sha1 = "e84b3a11b9bece70d14cce63406bbc79ed3464d2" +git-tree-sha1 = "0c45878dcfdcfa8480052b6ab162cdd138781742" uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.11.2" +version = "0.11.3" [[deps.TriplotBase]] git-tree-sha1 = "4d4ed7f294cda19382ff7de4c137d24d16adc89b" @@ -1599,9 +1637,9 @@ version = "0.4.1" [[deps.Unitful]] deps = ["Dates", "LinearAlgebra", "Random"] -git-tree-sha1 = "d95fe458f26209c66a187b1114df96fd70839efd" +git-tree-sha1 = "01915bfcd62be15329c9a07235447a89d588327c" uuid = "1986cc42-f94f-5a68-af5c-568840ba703d" -version = "1.21.0" +version = "1.21.1" [deps.Unitful.extensions] ConstructionBaseUnitfulExt = "ConstructionBase" @@ -1616,11 +1654,11 @@ git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" version = "0.2.1" -[[deps.UnsafeAtomicsLLVM]] -deps = ["LLVM", "UnsafeAtomics"] -git-tree-sha1 = "2d17fabcd17e67d7625ce9c531fb9f40b7c42ce4" -uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" -version = "0.2.1" +[[deps.WebP]] +deps = ["CEnum", "ColorTypes", "FileIO", "FixedPointNumbers", "ImageCore", "libwebp_jll"] +git-tree-sha1 = "aa1ca3c47f119fbdae8770c29820e5e6119b83f2" +uuid = "e3aaa7dc-3e4b-44e0-be63-ffb868ccd7c1" +version = "0.1.3" [[deps.WoodburyMatrices]] deps = ["LinearAlgebra", "SparseArrays"] @@ -1630,39 +1668,45 @@ version = "1.0.0" [[deps.XML2_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"] -git-tree-sha1 = "1165b0443d0eca63ac1e32b8c0eb69ed2f4f8127" +git-tree-sha1 = "a2fccc6559132927d4c5dc183e3e01048c6dcbd6" uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" -version = "2.13.3+0" +version = "2.13.5+0" [[deps.XSLT_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Libgpg_error_jll", "Libiconv_jll", "XML2_jll", "Zlib_jll"] -git-tree-sha1 = "a54ee957f4c86b526460a720dbc882fa5edcbefc" +git-tree-sha1 = "7d1671acbe47ac88e981868a078bd6b4e27c5191" uuid = "aed1982a-8fda-507f-9586-7b0439959a61" -version = "1.1.41+0" +version = "1.1.42+0" + +[[deps.XZ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "15e637a697345f6743674f1322beefbc5dcd5cfc" +uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" +version = "5.6.3+0" [[deps.Xorg_libX11_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] -git-tree-sha1 = "afead5aba5aa507ad5a3bf01f58f82c8d1403495" +git-tree-sha1 = "9dafcee1d24c4f024e7edc92603cedba72118283" uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" -version = "1.8.6+0" +version = "1.8.6+1" [[deps.Xorg_libXau_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "6035850dcc70518ca32f012e46015b9beeda49d8" +git-tree-sha1 = "2b0e27d52ec9d8d483e2ca0b72b3cb1a8df5c27a" uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" -version = "1.0.11+0" +version = "1.0.11+1" [[deps.Xorg_libXdmcp_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "34d526d318358a859d7de23da945578e8e8727b7" +git-tree-sha1 = "02054ee01980c90297412e4c809c8694d7323af3" uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" -version = "1.1.4+0" +version = "1.1.4+1" [[deps.Xorg_libXext_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libX11_jll"] -git-tree-sha1 = "d2d1a5c49fae4ba39983f63de6afcbea47194e85" +git-tree-sha1 = "d7155fea91a4123ef59f42c4afb5ab3b4ca95058" uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" -version = "1.3.6+0" +version = "1.3.6+1" [[deps.Xorg_libXrender_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libX11_jll"] @@ -1672,21 +1716,21 @@ version = "0.9.11+0" [[deps.Xorg_libpthread_stubs_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "8fdda4c692503d44d04a0603d9ac0982054635f9" +git-tree-sha1 = "fee57a273563e273f0f53275101cd41a8153517a" uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" -version = "0.1.1+0" +version = "0.1.1+1" [[deps.Xorg_libxcb_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] -git-tree-sha1 = "bcd466676fef0878338c61e655629fa7bbc69d8e" +git-tree-sha1 = "1a74296303b6524a0472a8cb12d3d87a78eb3612" uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" -version = "1.17.0+0" +version = "1.17.0+1" [[deps.Xorg_xtrans_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "e92a1a012a10506618f10b7047e478403a046c77" +git-tree-sha1 = "b9ead2d2bdb27330545eb14234a2e300da61232e" uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" -version = "1.5.0+0" +version = "1.5.0+1" [[deps.YAML]] deps = ["Base64", "Dates", "Printf", "StringEncodings"] @@ -1699,6 +1743,12 @@ deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" version = "1.2.13+1" +[[deps.Zstd_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "555d1076590a6cc2fdee2ef1469451f872d8b41b" +uuid = "3161d3a3-bdf6-5164-811a-617609db77b4" +version = "1.5.6+1" + [[deps.isoband_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "51b5eeb3f98367157a7a12a1fb0aa5328946c03c" @@ -1746,6 +1796,12 @@ git-tree-sha1 = "490376214c4721cdaca654041f635213c6165cb3" uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a" version = "1.3.7+2" +[[deps.libwebp_jll]] +deps = ["Artifacts", "Giflib_jll", "JLLWrappers", "JpegTurbo_jll", "Libdl", "Libglvnd_jll", "Libtiff_jll", "libpng_jll"] +git-tree-sha1 = "ccbb625a89ec6195856a50aa2b668a5c08712c94" +uuid = "c5f90fcd-3b7e-5836-afba-fc50a0988cb2" +version = "1.4.0+0" + [[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" diff --git a/experiments/surface_fluxes_perfect_model/experiment_config.yml b/experiments/surface_fluxes_perfect_model/experiment_config.yml index c4cbf698..2448ffe6 100644 --- a/experiments/surface_fluxes_perfect_model/experiment_config.yml +++ b/experiments/surface_fluxes_perfect_model/experiment_config.yml @@ -1,6 +1,6 @@ output_dir: output/surface_fluxes_perfect_model prior: prior.toml -ensemble_size: 10 +ensemble_size: 20 n_iterations: 6 observations: data/obs_mean.jld2 noise: data/obs_noise_cov.jld2 diff --git a/experiments/surface_fluxes_perfect_model/generate_data.jl b/experiments/surface_fluxes_perfect_model/generate_data.jl index 4b7ccc17..2127dd04 100644 --- a/experiments/surface_fluxes_perfect_model/generate_data.jl +++ b/experiments/surface_fluxes_perfect_model/generate_data.jl @@ -4,13 +4,11 @@ import SurfaceFluxes as SF import SurfaceFluxes.Parameters as SFPP import SurfaceFluxes.UniversalFunctions as UF import Thermodynamics as TD -using YAML import SurfaceFluxes.Parameters: SurfaceFluxesParameters using ClimaCalibrate pkg_dir = pkgdir(ClimaCalibrate) -experiment_path = - joinpath(pkg_dir, "experiments", "surface_fluxes_perfect_model") +experiment_path = dirname(Base.active_project()) data_path = joinpath(experiment_path, "data") include(joinpath(experiment_path, "model_interface.jl")) @@ -81,7 +79,8 @@ Generate synthetic observed y from the model truth. """ function synthetic_observed_y(x_inputs; data_path = "data", apply_noise = false) FT = typeof(x_inputs.profiles_int[1].T) - config = YAML.load_file("$experiment_path/model_config.yml") + config = Dict() + config["toml"] = [] config["output_dir"] = data_path y = obtain_ustar(FT, x_inputs, config, return_ustar = true) if apply_noise diff --git a/experiments/surface_fluxes_perfect_model/model_config.yml b/experiments/surface_fluxes_perfect_model/model_config.yml deleted file mode 100644 index 23b8f1e7..00000000 --- a/experiments/surface_fluxes_perfect_model/model_config.yml +++ /dev/null @@ -1,3 +0,0 @@ -output_dir: output/surface_fluxes_perfect_model -x_data_file: experiments/surface_fluxes_perfect_model/data/synthetic_profile_data.jld2 -toml: [] \ No newline at end of file diff --git a/experiments/surface_fluxes_perfect_model/model_interface.jl b/experiments/surface_fluxes_perfect_model/model_interface.jl index 6c9806f1..d62772b9 100644 --- a/experiments/surface_fluxes_perfect_model/model_interface.jl +++ b/experiments/surface_fluxes_perfect_model/model_interface.jl @@ -1,13 +1,10 @@ import EnsembleKalmanProcesses as EKP using ClimaCalibrate -import ClimaCalibrate: set_up_forward_model, run_forward_model, ExperimentConfig -import YAML +import ClimaCalibrate: forward_model -""" - SurfaceFluxModel - -A type representing the surface fluxes perfect model. +pkgdir_CC = pkgdir(ClimaCalibrate) +""" We are using the inverse of the following problem y(x) = G(θ, x) + ε to obtain the posterior distribution of θ given y, x, and G. @@ -25,69 +22,30 @@ We need to follow the following steps for the calibration: - we let the profiles to be the input data x, while the roughness length are stationary model preliminaries (uncalibrated stationary parameters) 3. obtain the observed data y (in this case of a perfect model, we are generating it using model G. We add some noise so we can see slightly slower convergence as we calibrate the model. In a real world scenario, we would obtain this from observations where each y vector observation would have an x input associated with it.) 4. define the prior distributions for θ (this is subjective and can be based on expert knowledge or previous studies) - """ -experiment_dir = joinpath( - pkgdir(ClimaCalibrate), - "experiments", - "surface_fluxes_perfect_model", -) +experiment_dir = + joinpath(pkgdir_CC, "experiments", "surface_fluxes_perfect_model") include(joinpath(experiment_dir, "sf_model.jl")) include(joinpath(experiment_dir, "observation_map.jl")) -function set_up_forward_model(member, iteration, experiment_dir::AbstractString) - return set_up_forward_model( - member, - iteration, - ExperimentConfig(experiment_dir), - ) -end - -""" - set_up_forward_model(member, iteration, experiment_dir::AbstractString) - set_up_forward_model(member, iteration, experiment_config::ExperimentConfig) - -Returns an config dictionary object for the given member and iteration. -Given an experiment dir, it will load the ExperimentConfig -This assumes that the config dictionary has the `output_dir` key. -""" -function set_up_forward_model( - member, - iteration, - experiment_config::ExperimentConfig, -) +function forward_model(iteration, member) # Specify member path for output_dir - model_config = YAML.load_file( - joinpath( - "experiments", - "surface_fluxes_perfect_model", - "model_config.yml", - ), - ) - output_dir = (experiment_config.output_dir) + model_config = Dict() + output_dir = joinpath(pkgdir_CC, "output", "surface_fluxes_perfect_model") # Set TOML to use EKP parameter(s) member_path = EKP.TOMLInterface.path_to_ensemble_member(output_dir, iteration, member) model_config["output_dir"] = member_path - parameter_path = joinpath(member_path, "parameters.toml") - if haskey(model_config, "toml") - push!(model_config["toml"], parameter_path) - else - model_config["toml"] = [parameter_path] - end - - return model_config -end - -""" - run_forward_model(config::AbstractDict) - -Runs the model with the given an AbstractDict object. -""" - -function run_forward_model(config::AbstractDict) - x_inputs = load_profiles(config["x_data_file"]) + model_config["toml"] = [joinpath(member_path, "parameters.toml")] + x_data_file = joinpath( + pkgdir_CC, + "experiments", + "surface_fluxes_perfect_model", + "data", + "synthetic_profile_data.jld2", + ) + x_inputs = load_profiles(x_data_file) FT = typeof(x_inputs.profiles_int[1].T) - obtain_ustar(FT, x_inputs, config) + obtain_ustar(FT, x_inputs, model_config) end diff --git a/experiments/surface_fluxes_perfect_model/postprocessing.jl b/experiments/surface_fluxes_perfect_model/postprocessing.jl index d5b0ba19..7680b484 100644 --- a/experiments/surface_fluxes_perfect_model/postprocessing.jl +++ b/experiments/surface_fluxes_perfect_model/postprocessing.jl @@ -12,17 +12,17 @@ using Statistics using ClimaCalibrate experiment_dir = dirname(Base.active_project()) -experiment_config = ClimaCalibrate.ExperimentConfig(experiment_dir) -output_dir = experiment_config.output_dir +experiment_config = ExperimentConfig(experiment_dir) N_iter = experiment_config.n_iterations N_mem = experiment_config.ensemble_size +output_dir = experiment_config.output_dir function convergence_plot( eki, prior, theta_star_vec, param_names, - output_dir = experiment_config.output_dir, + output_dir = output_dir, ) # per parameter @@ -97,12 +97,12 @@ function convergence_plot( Makie.hlines!(ax, [theta_star], color = :red, linestyle = :dash) Makie.save(joinpath(output_dir, "convergence_$param_name.png"), f) + println(joinpath(output_dir, "convergence_$param_name.png")) end end pkg_dir = pkgdir(ClimaCalibrate) -model_config = YAML.load_file(joinpath(experiment_dir, "model_config.yml")) eki_path = joinpath( ClimaCalibrate.path_to_iteration(output_dir, N_iter), @@ -110,7 +110,6 @@ eki_path = joinpath( ); eki = JLD2.load_object(eki_path); EKP.get_u(eki) -prior = experiment_config.prior theta_star_vec = (; coefficient_a_m_businger = 4.7, coefficient_a_h_businger = 4.7) @@ -130,11 +129,21 @@ include(joinpath(experiment_dir, "model_interface.jl")) f = Makie.Figure() ax = Makie.Axis(f[1, 1], xlabel = "Iteration", ylabel = "Model Ustar") ustar_obs = JLD2.load_object( - joinpath(pkg_dir, "$experiment_dir/data/synthetic_ustar_array_noisy.jld2"), + joinpath(experiment_dir, "data", "synthetic_ustar_array_noisy.jld2"), +) + +x_data_file = joinpath( + pkgdir_CC, + "experiments", + "surface_fluxes_perfect_model", + "data", + "synthetic_profile_data.jld2", ) -x_inputs = load_profiles(model_config["x_data_file"]) +x_inputs = load_profiles(x_data_file) ustar_mod = 0 +model_config = Dict() +model_config["output_dir"] = output_dir for iter in 0:N_iter for i in 1:N_mem model_config["toml"] = [ @@ -173,3 +182,4 @@ Makie.lines!( ) Makie.save(joinpath(output_dir, "scatter_iter.png"), f) +println(joinpath(output_dir, "scatter_iter.png")) diff --git a/src/ClimaCalibrate.jl b/src/ClimaCalibrate.jl index f4cfae51..bff9697a 100644 --- a/src/ClimaCalibrate.jl +++ b/src/ClimaCalibrate.jl @@ -1,10 +1,13 @@ module ClimaCalibrate +project_dir() = dirname(Base.active_project()) + include("ekp_interface.jl") include("model_interface.jl") include("slurm.jl") include("pbs.jl") include("backends.jl") include("emulate_sample.jl") +include("slurm_workers.jl") end # module ClimaCalibrate diff --git a/src/backends.jl b/src/backends.jl index fc1c90cc..87f17d7b 100644 --- a/src/backends.jl +++ b/src/backends.jl @@ -100,7 +100,7 @@ function calibrate( for i in 0:(n_iterations - 1) @info "Running iteration $i" pmap(1:ensemble_size; retry_delays = reruns, on_error) do m - run_forward_model(set_up_forward_model(m, i, config)) + forward_model(i, m) @info "Completed member $m" end G_ensemble = observation_map(i) diff --git a/src/ekp_interface.jl b/src/ekp_interface.jl index a9625893..db944c57 100644 --- a/src/ekp_interface.jl +++ b/src/ekp_interface.jl @@ -82,7 +82,7 @@ end """ path_to_ensemble_member(output_dir, iteration, member) -Constructs the path to an ensemble member's directory for a given iteration and member number. +Return the path to an ensemble member's directory for a given iteration and member number. """ path_to_ensemble_member(output_dir, iteration, member) = EKP.TOMLInterface.path_to_ensemble_member(output_dir, iteration, member) @@ -90,7 +90,7 @@ path_to_ensemble_member(output_dir, iteration, member) = """ path_to_model_log(output_dir, iteration, member) -Constructs the path to an ensemble member's forward model log for a given iteration and member number. +Return the path to an ensemble member's forward model log for a given iteration and member number. """ path_to_model_log(output_dir, iteration, member) = joinpath( path_to_ensemble_member(output_dir, iteration, member), @@ -100,7 +100,7 @@ path_to_model_log(output_dir, iteration, member) = joinpath( """ path_to_iteration(output_dir, iteration) -Creates the path to the directory for a specific iteration within the specified output directory. +Return the path to the directory for a given iteration within the specified output directory. """ path_to_iteration(output_dir, iteration) = joinpath(output_dir, join(["iteration", lpad(iteration, 3, "0")], "_")) @@ -268,14 +268,23 @@ function _initialize( initial_ensemble = EKP.construct_initial_ensemble(rng_ekp, prior, ensemble_size) - ekp_str_kwargs = Dict([string(k) => v for (k, v) in ekp_kwargs]) - eki_constructor = + # EKP 2.0 and later require the `default_options_dict` + eki_constructor = if hasproperty(EKP, :default_options_dict) + ekp_kwargs = Dict([string(k) => v for (k, v) in ekp_kwargs]) (args...) -> EKP.EnsembleKalmanProcess( args..., - merge(EKP.default_options_dict(EKP.Inversion()), ekp_str_kwargs); + merge(EKP.default_options_dict(EKP.Inversion()), ekp_kwargs); rng = rng_ekp, ) - + else + eki_constructor = + (args...) -> EKP.EnsembleKalmanProcess( + args...; + rng = rng_ekp, + failure_handler_method = EKP.SampleSuccGauss(), + ekp_kwargs..., + ) + end eki = if isnothing(noise) eki_constructor(initial_ensemble, observations, EKP.Inversion()) else diff --git a/src/model_interface.jl b/src/model_interface.jl index f0bc50f4..522fa80e 100644 --- a/src/model_interface.jl +++ b/src/model_interface.jl @@ -1,33 +1,19 @@ import EnsembleKalmanProcesses as EKP import YAML -export set_up_forward_model, run_forward_model, observation_map +export forward_model, observation_map """ - set_up_forward_model(member, iteration, experiment_dir::AbstractString) - set_up_forward_model(member, iteration, experiment_config::ExperimentConfig) - -Set up and configure a single member's forward model. Used in conjunction with `run_forward_model`. - -This function must be overriden by a component's model interface and -should set things like the parameter path and other member-specific settings. -""" -set_up_forward_model(member, iteration, experiment_dir::AbstractString) = - set_up_forward_model(member, iteration, ExperimentConfig(experiment_dir)) - -set_up_forward_model(member, iteration, experiment_config::ExperimentConfig) = - error("set_up_forward_model not implemented") - -""" - run_forward_model(model_config) + forward_model(iteration, member) Execute the forward model simulation with the given configuration. -This function should be overridden with model-specific implementation details. -`config` should be obtained from `set_up_forward_model`: -`run_forward_model(set_up_forward_model(member, iter, experiment_dir))` +This function must be overridden by a component's model interface and +should set things like the parameter path and other member-specific settings. """ -run_forward_model(model_config) = error("run_forward_model not implemented") +function forward_model(iteration, member) + error("forward_model not implemented") +end """ observation_map(iteration) diff --git a/src/pbs.jl b/src/pbs.jl index 261cebff..8a1da7e8 100644 --- a/src/pbs.jl +++ b/src/pbs.jl @@ -71,7 +71,7 @@ export CLIMACOMMS_CONTEXT="MPI" julia_script = """\ import ClimaCalibrate as CAL include("$(abspath(model_interface))") - CAL.run_forward_model(CAL.set_up_forward_model($member, $iter, "$experiment_dir")) + CAL.forward_model($iter, $member) """ return pbs_script, julia_script end diff --git a/src/slurm.jl b/src/slurm.jl index a7686b95..2afcbfc3 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -141,15 +141,36 @@ Submit a job to the Slurm scheduler using sbatch, removing unwanted environment Unset variables: "SLURM_MEM_PER_CPU", "SLURM_MEM_PER_GPU", "SLURM_MEM_PER_NODE" """ -function submit_slurm_job(sbatch_filepath; env = deepcopy(ENV)) - # Ensure that we don't inherit unwanted environment variables - unset_env_vars = - ("SLURM_MEM_PER_CPU", "SLURM_MEM_PER_GPU", "SLURM_MEM_PER_NODE") - for k in unset_env_vars - haskey(env, k) && delete!(env, k) +function submit_slurm_job(sbatch_filepath; env=deepcopy(ENV)) + # List of SLURM environment variables to unset + unset_env_vars = [ + "SLURM_MEM_PER_CPU", + "SLURM_MEM_PER_GPU", + "SLURM_MEM_PER_NODE", + "SLURM_CPUS_PER_TASK", + "SLURM_NTASKS", + "SLURM_JOB_NAME", + "SLURM_SUBMIT_DIR", + "SLURM_JOB_ID" + ] + # Create a new environment without the SLURM variables + for var in unset_env_vars + delete!(clean_env, var) + end + + try + cmd = `sbatch --parsable $sbatch_filepath` + output = readchomp(setenv(cmd, clean_env)) + # Parse job ID, handling potential format issues + jobid = match(r"^\d+", output) + if jobid === nothing + error("Failed to parse job ID from output: $output") + end + + return parse(Int, jobid.match) + catch e + error("Failed to submit SLURM job: $e") end - jobid = readchomp(setenv(`sbatch --parsable $sbatch_filepath`, env)) - return parse(Int, jobid) end """ @@ -205,7 +226,7 @@ function generate_sbatch_script( model_interface = "$model_interface"; include(model_interface) experiment_dir = "$experiment_dir" - CAL.run_forward_model(CAL.set_up_forward_model(member, iteration, experiment_dir))' + CAL.forward_model(iteration, member)' exit 0 """ return sbatch_contents diff --git a/src/slurm_workers.jl b/src/slurm_workers.jl new file mode 100644 index 00000000..69556060 --- /dev/null +++ b/src/slurm_workers.jl @@ -0,0 +1,307 @@ +using Distributed +import EnsembleKalmanProcesses as EKP +export worker_calibrate, add_slurm_workers + +function run_iteration( + iter, + ensemble_size, + output_dir; + worker_pool, + failure_rate, +) + # Create a channel to collect results + results = Channel{Any}(ensemble_size) + nfailures = 0 + @sync begin + for m in 1:(ensemble_size) + @async begin + worker = take!(worker_pool) + @info "Running particle $m on worker $worker" + try + remotecall_wait(forward_model, worker, iter, m) + catch e + @warn "Error running member $m" exception = e + nfailures += 1 + finally + # Always return worker to pool + put!(worker_pool, worker) + end + end + end + end + iter_failure_rate = nfailures / ensemble_size + if iter_failure_rate > failure_rate + error( + "Ensemble for iter $iter had a $(iter_failure_rate * 100)% failure rate", + ) + end +end + +function worker_calibrate( + config; + failure_rate = 0.5, + worker_pool = default_worker_pool(), + ekp_kwargs..., +) + (; ensemble_size, n_iterations, observations, noise, prior, output_dir) = + config + return worker_calibrate( + ensemble_size, + n_iterations, + observations, + noise, + prior, + output_dir; + failure_rate, + worker_pool, + ekp_kwargs..., + ) +end + +function worker_calibrate( + ensemble_size, + n_iterations, + observations, + noise, + prior, + output_dir; + failure_rate = 0.5, + worker_pool = default_worker_pool(), + ekp_kwargs..., +) + initialize( + ensemble_size, + observations, + noise, + prior, + output_dir; + rng_seed = 1234, + ekp_kwargs..., + ) + for iter in 0:(n_iterations) + (; time) = @timed run_iteration( + iter, + ensemble_size, + output_dir; + worker_pool, + failure_rate, + ) + @info "Iteration $iter time: $time" + # Process results + G_ensemble = observation_map(iter) + save_G_ensemble(output_dir, iter, G_ensemble) + update_ensemble(output_dir, iter, prior) + iter_path = path_to_iteration(output_dir, iter) + end + return JLD2.load_object( + joinpath(path_to_iteration(output_dir, n_iterations), "eki_file.jld2"), + ) +end + +function worker_calibrate( + ekp::EKP.EnsembleKalmanProcess, + ensemble_size, + n_iterations, + observations, + noise, + prior, + output_dir; + failure_rate = 0.5, + worker_pool = default_worker_pool(), + ekp_kwargs..., +) + initialize(ekp, prior, output_dir; rng_seed = 1234) + for iter in 0:n_iterations + (; time) = @timed run_iteration( + iter, + ensemble_size, + output_dir; + worker_pool, + failure_rate, + ) + @info "Iteration $iter time: $time" + # Process results + G_ensemble = observation_map(iter) + save_G_ensemble(output_dir, iter, G_ensemble) + update_ensemble(output_dir, iter, prior) + iter_path = path_to_iteration(output_dir, iter) + end + return JLD2.load_object(path_to_iteration(output_dir, n_iterations)) +end + +worker_cookie() = begin + Distributed.init_multi() + cluster_cookie() +end +worker_arg() = `--worker=$(worker_cookie())` + +struct SlurmManager <: ClusterManager + ntasks::Integer + + function SlurmManager( + ntasks::Integer = parse(Int, get(ENV, "SLURM_NTASKS", "1")), + ) + new(ntasks) + end +end + +function Distributed.manage( + manager::SlurmManager, + id::Integer, + config::WorkerConfig, + op::Symbol, +) + # This function needs to exist, but so far we don't do anything +end + +# Main SlurmManager function, mostly copied from the unmaintained ClusterManagers.jl +# Original code: https://github.com/JuliaParallel/ClusterManagers.jl +# TODO: Log per member +function Distributed.launch( + sm::SlurmManager, + params::Dict, + instances_arr::Array, + c::Condition, +) + default_params = Distributed.default_addprocs_params() + params = merge(default_params, Dict{Symbol, Any}(params)) + exehome = params[:dir] + exename = params[:exename] + exeflags = params[:exeflags] + + exeflags = exeflags == `` ? "--project=$(project_dir())" : exeflags + + stdkeys = keys(Distributed.default_addprocs_params()) + slurm_params = + filter(x -> (!(x[1] in stdkeys) && x[1] != :job_file_loc), params) + srunargs = [] + + for (k, v) in slurm_params + if length(string(k)) == 1 + push!(srunargs, "-$k") + if length(v) > 0 + push!(srunargs, v) + end + else + k2 = replace(string(k), "_" => "-") + if length(v) > 0 + push!(srunargs, "--$k2=$v)") + else + push!(srunargs, "--$k2") + end + end + end + + # Get job file location from parameter dictionary + job_file_loc = joinpath(exehome, get(params, :job_file_loc, ".")) + + # Make directory if not already made + if !isdir(job_file_loc) + mkdir(job_file_loc) + end + # Check for given output file name + jobname = "julia-$(getpid())" + + default_template = ".$jobname-$(trunc(Int, Base.time() * 10))" + default_output(x) = joinpath(job_file_loc, "$default_template-$x.out") + + # Set output name + has_output_name = + any(arg -> occursin("-o", arg) || occursin("--output", arg), srunargs) + if has_output_name + # if has_output_name, ensure there is only one output arg + locs = findall( + x -> startswith(x, "-o") || startswith(x, "--output"), + srunargs, + ) + length(locs) > 1 && + error("Slurm Error: Multiple output files specified: $srunargs") + job_output_file = srunargs[locs[1] + 1] + else + # Slurm interpolates %4t to the task ID padded with up to four zeros + push!(srunargs, "-o", default_output("%4t")) + end + + ntasks = sm.ntasks + srun_cmd = `srun -J $jobname -n $ntasks -D $exehome $(srunargs) $exename $exeflags $(worker_arg())` + + @info "Starting SLURM job $jobname: $srun_cmd" + srun_proc = open(srun_cmd) + # This Regex will match the worker's socket and IP address + # Example: julia_worker:9015#169.254.3.1 + slurm_spec_regex = r"([\w]+):([\d]+)#(\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3})" + could_not_connect_regex = r"could not connect" + exiting_regex = r"exiting." + + # Wait for workers to start + t_start = time() + t_waited = round(Int, time() - t_start) + retry_delays = ExponentialBackOff(10, 1.0, 512.0, 2.0, 0.1) + for i in 0:(ntasks - 1) + slurm_spec_match = nothing + worker_errors = String[] + if !has_output_name + job_output_file = default_output(lpad(i, 4, "0")) + end + for retry_delay in push!(collect(retry_delays), 0) + t_waited = round(Int, time() - t_start) + + # Wait for output log to be created and populated, then parse + if isfile(job_output_file) + if filesize(job_output_file) > 0 + open(job_output_file) do f + # Due to error and warning messages, we need to check + # for a regex match on each line + for line in eachline(f) + re_match = match(slurm_spec_regex, line) + if !isnothing(re_match) + slurm_spec_match = re_match + break # We have found the match + end + for expr in [could_not_connect_regex, exiting_regex] + if !isnothing(match(expr, line)) + slurm_spec_match = nothing + push!(worker_errors, line) + end + end + end + end + end + if !isempty(worker_errors) || !isnothing(slurm_spec_match) + break # break if error or specification found + else + @info "Worker $i (after $t_waited s): Output file found, but no connection details yet" + end + else + @info "Worker $i (after $t_waited s): No output file \"$job_output_file\" yet" + end + + # Sleep for some time to limit resource usage while waiting for the job to start + sleep(retry_delay) + end + + if !isempty(worker_errors) + throw( + ErrorException( + "Worker $i failed after $t_waited s: $(join(worker_errors, " "))", + ), + ) + elseif isnothing(slurm_spec_match) + throw( + ErrorException( + "Timeout after $t_waited s while waiting for worker $i to get ready.", + ), + ) + end + + config = WorkerConfig() + config.port = parse(Int, slurm_spec_match[2]) + config.host = strip(slurm_spec_match[3]) + @info "Worker $i ready after $t_waited s on host $(config.host), port $(config.port)" + # Keep a reference to the proc, so it's properly closed once + # the last worker exits. + config.userdata = srun_proc + push!(instances_arr, config) + notify(c) + end +end diff --git a/test/hpc_backend_e2e.jl b/test/hpc_backend_e2e.jl index 865a2031..c93aa2d4 100644 --- a/test/hpc_backend_e2e.jl +++ b/test/hpc_backend_e2e.jl @@ -14,6 +14,7 @@ import ClimaCalibrate: DerechoBackend using Test import EnsembleKalmanProcesses: get_ϕ_mean_final, get_g_mean_final +import Statistics: var experiment_dir = dirname(Base.active_project()) model_interface = joinpath(experiment_dir, "model_interface.jl") @@ -23,31 +24,30 @@ include(joinpath(experiment_dir, "generate_data.jl")) include(joinpath(experiment_dir, "observation_map.jl")) include(model_interface) -prior = get_prior(joinpath(experiment_dir, "prior.toml")) +experiment_config = ExperimentConfig(experiment_dir) +(; observations, prior) = experiment_config function test_sf_calibration_output(eki, prior) @testset "End to end test using file config (surface fluxes perfect model)" begin - parameter_values = get_ϕ_mean_final(prior, eki) - test_parameter_values = [4.778584250117946, 3.7295665619234697] - @test all( - isapprox.(parameter_values, test_parameter_values; rtol = 1e-3), - ) + params = EKP.get_ϕ(prior, eki) + spread = map(var, params) + + # Spread should be heavily decreased as particles have converged + @test last(spread) / first(spread) < 0.15 forward_model_output = get_g_mean_final(eki) - test_model_output = [0.05228473730385304] - @test all( - isapprox.(forward_model_output, test_model_output; rtol = 1e-3), - ) + @show forward_model_output + @test all(isapprox.(forward_model_output, observations; rtol = 1e-2)) end end -@assert get_backend() <: HPCBackend -hpc_kwargs = kwargs(time = 5, ntasks = 1, cpus_per_task = 1) -if get_backend() == DerechoBackend - hpc_kwargs[:queue] = "develop" -end -eki = calibrate(experiment_dir; model_interface, hpc_kwargs, verbose = true) -test_sf_calibration_output(eki, prior) +# @assert get_backend() <: HPCBackend +# hpc_kwargs = kwargs(time = 5, ntasks = 1, cpus_per_task = 1) +# if get_backend() == DerechoBackend +# hpc_kwargs[:queue] = "develop" +# end +# eki = calibrate(experiment_dir; model_interface, hpc_kwargs, verbose = true) +# test_sf_calibration_output(eki, prior) # Pure Julia calibration, this should run anywhere eki = calibrate(JuliaBackend, experiment_dir) diff --git a/test/model_interface.jl b/test/model_interface.jl index a52be523..95538753 100644 --- a/test/model_interface.jl +++ b/test/model_interface.jl @@ -7,38 +7,11 @@ using Test # otherwise ClimaCalibrate will throw an error. @testset "Model Interface stubs" begin - - @testset "set_up_forward_model" begin - prior_path = joinpath( - pkgdir(ClimaCalibrate), - "experiments", - "surface_fluxes_perfect_model", - "prior.toml", - ) - experiment_dir = ClimaCalibrate.ExperimentConfig( - 1, - 1, - [1], - [1], - ClimaCalibrate.get_prior(prior_path), - "output", - ) - @test_throws ErrorException("set_up_forward_model not implemented") ClimaCalibrate.set_up_forward_model( - 1, - 1, - experiment_dir, - ) - end - - @testset "run_forward_model" begin - @test_throws ErrorException("run_forward_model not implemented") ClimaCalibrate.run_forward_model( - nothing, - ) - end - - @testset "observation_map" begin - @test_throws ErrorException("observation_map not implemented") ClimaCalibrate.observation_map( - 1, - ) - end + @test_throws ErrorException("forward_model not implemented") ClimaCalibrate.forward_model( + 1, + 1, + ) + @test_throws ErrorException("observation_map not implemented") ClimaCalibrate.observation_map( + 1, + ) end diff --git a/test/pbs_unit_tests.jl b/test/pbs_unit_tests.jl index c6d029ca..158e9bbf 100644 --- a/test/pbs_unit_tests.jl +++ b/test/pbs_unit_tests.jl @@ -67,7 +67,7 @@ end original_julia_file = """\ import ClimaCalibrate as CAL include("/glade/u/home/nefrathe/clima/ClimaCalibrate.jl/model_interface.jl") -CAL.run_forward_model(CAL.set_up_forward_model(1, 1, "exp/dir")) +CAL.forward_model(1, 1) """ @test julia_file == original_julia_file diff --git a/test/pure_julia_e2e.jl b/test/pure_julia_e2e.jl index f2698a10..3cf2031a 100644 --- a/test/pure_julia_e2e.jl +++ b/test/pure_julia_e2e.jl @@ -6,11 +6,11 @@ using EnsembleKalmanProcesses.TOMLInterface import ClimaParams as CP import ClimaCalibrate: - run_forward_model, - set_up_forward_model, + forward_model, JuliaBackend, ExperimentConfig, calibrate, + project_dir, observation_map import JLD2 @@ -22,7 +22,7 @@ n_iterations = 1 ensemble_size = 20 observations = [20.0] noise = [0.01;;] -output_dir = joinpath("test", "e2e_test_output") +output_dir = mktempdir() experiment_config = ExperimentConfig( n_iterations, @@ -36,25 +36,12 @@ experiment_config = ExperimentConfig( # Model interface # This "model" just samples parameters and returns them, we are checking that the # results are reproducible. -function set_up_forward_model( - member, - iteration, - experiment_config::ExperimentConfig, -) - model_config = Dict() - output_dir = (experiment_config.output_dir) +function forward_model(iteration, member) member_path = path_to_ensemble_member(output_dir, iteration, member) - model_config["output_dir"] = member_path parameter_path = joinpath(member_path, "parameters.toml") - model_config["toml"] = parameter_path - return model_config -end - -function run_forward_model(config) - toml_dict = CP.create_toml_dict(Float64; override_file = config["toml"]) + toml_dict = CP.create_toml_dict(Float64; override_file = parameter_path) (; test_param) = CP.get_parameter_values(toml_dict, "test_param") - output = test_param - JLD2.save_object(joinpath(config["output_dir"], output_file), output) + JLD2.save_object(joinpath(member_path, output_file), test_param) end function observation_map(iteration) @@ -79,5 +66,3 @@ ekp = calibrate(JuliaBackend, experiment_config) @test parameter_values[1][1] ≈ 8.507 rtol = 0.01 @test parameter_values[end][1] ≈ 11.852161842745355 rtol = 0.01 end - -rm(output_dir; recursive = true) diff --git a/test/slurm_unit_tests.jl b/test/slurm_unit_tests.jl index 92ca6c36..ce3adfa1 100644 --- a/test/slurm_unit_tests.jl +++ b/test/slurm_unit_tests.jl @@ -55,12 +55,13 @@ iteration = 1; member = 1 model_interface = "model_interface.jl"; include(model_interface) experiment_dir = "exp/dir" -CAL.run_forward_model(CAL.set_up_forward_model(member, iteration, experiment_dir))' +CAL.forward_model(iteration, member)' exit 0 """ for (generated_str, test_str) in zip(split(sbatch_file, "\n"), split(expected_sbatch_contents, "\n")) + # Test one line at a time to see discrepancies @test generated_str == test_str end diff --git a/test/slurm_workers.jl b/test/slurm_workers.jl new file mode 100644 index 00000000..0d57798b --- /dev/null +++ b/test/slurm_workers.jl @@ -0,0 +1,87 @@ +# Tests for SurfaceFluxes example calibration on HPC, used in buildkite testing +# To run, open the REPL: julia --project=experiments/surface_fluxes_perfect_model test/hpc_backend_e2e.jl + +using ClimaCalibrate +using Distributed +using Test +import EnsembleKalmanProcesses: get_ϕ_mean_final, get_g_mean_final + +function test_sf_calibration_output(eki, prior) + @testset "End to end test using file config (surface fluxes perfect model)" begin + parameter_values = get_ϕ_mean_final(prior, eki) + test_parameter_values = [4.778584250117946, 3.7295665619234697] + @test all( + isapprox.(parameter_values, test_parameter_values; rtol = 1e-3), + ) + + forward_model_output = get_g_mean_final(eki) + test_model_output = [0.05228473730385304] + @test all( + isapprox.(forward_model_output, test_model_output; rtol = 1e-3), + ) + end +end + +experiment_dir = dirname(Base.active_project()) +addprocs( + ClimaCalibrate.SlurmManager(10); + exeflags = "--project=$(dirname(Base.active_project()))", +) +include(joinpath(experiment_dir, "generate_data.jl")) + +@everywhere begin + using ClimaCalibrate + experiment_dir = dirname(Base.active_project()) + output_dir = joinpath("output", "surface_fluxes_perfect_model") + prior = get_prior(joinpath(experiment_dir, "prior.toml")) + ensemble_size = 10 + n_iterations = 6 +end + +@everywhere begin + include(joinpath(experiment_dir, "observation_map.jl")) + ustar = JLD2.load_object( + joinpath(experiment_dir, "data", "synthetic_ustar_array_noisy.jld2"), + ) + (; observation, variance) = + process_member_data(ustar; output_variance = true) + + model_interface = joinpath(experiment_dir, "model_interface.jl") + include(model_interface) +end + +eki = worker_calibrate( + ensemble_size, + n_iterations, + observation, + variance, + prior, + output_dir, +) + +test_sf_calibration_output(eki, prior) + +include(joinpath(experiment_dir, "postprocessing.jl")) + +# Slurm Worker Unit Tests +@testset "Slurm Worker Unit Tests" begin + out_file = "my_slurm_job.out" + p = addprocs(ClimaCalibrate.SlurmManager(1); o = out_file) + @test nprocs() == 2 + @test workers() == p + @test fetch(@spawnat :any myid()) == p[1] + @test remotecall_fetch(+, p[1], 1, 1) == 2 + rmprocs(p) + @test nprocs() == 1 + @test workers() == [1] + + # Check output file creation + @test isfile(out_file) + rm(out_file) + + @test_throws TaskFailedException p = addprocs( + ClimaCalibrate.SlurmManager(1); + o = out_file, + output = out_file, + ) +end From 4c8e0571a2e689bc22bd5891535651313ce5a94b Mon Sep 17 00:00:00 2001 From: nefrathenrici Date: Mon, 16 Dec 2024 10:25:37 -0800 Subject: [PATCH 2/3] Fix surface fluxes calibration --- docs/src/atmos_setup_guide.md | 2 +- src/ClimaCalibrate.jl | 4 +++- src/slurm.jl | 9 +++++---- src/{slurm_workers.jl => workers.jl} | 2 +- test/slurm_workers.jl | 30 +++++++++++++--------------- 5 files changed, 24 insertions(+), 23 deletions(-) rename src/{slurm_workers.jl => workers.jl} (99%) diff --git a/docs/src/atmos_setup_guide.md b/docs/src/atmos_setup_guide.md index e827afce..6dfb191d 100644 --- a/docs/src/atmos_setup_guide.md +++ b/docs/src/atmos_setup_guide.md @@ -185,7 +185,7 @@ Example YAML file: ``` output_dir: output/sphere_held_suarez_rhoe_equilmoist prior: prior.toml -ensemble_size: 10 +ensemble_size: 20 n_iterations: 3 observations: obs_mean.jld2 noise: obs_noise_cov.jld2 diff --git a/src/ClimaCalibrate.jl b/src/ClimaCalibrate.jl index bff9697a..e758a082 100644 --- a/src/ClimaCalibrate.jl +++ b/src/ClimaCalibrate.jl @@ -1,13 +1,15 @@ module ClimaCalibrate +export project_dir + project_dir() = dirname(Base.active_project()) include("ekp_interface.jl") include("model_interface.jl") include("slurm.jl") include("pbs.jl") +include("workers.jl") include("backends.jl") include("emulate_sample.jl") -include("slurm_workers.jl") end # module ClimaCalibrate diff --git a/src/slurm.jl b/src/slurm.jl index 2afcbfc3..6faffb09 100644 --- a/src/slurm.jl +++ b/src/slurm.jl @@ -141,7 +141,8 @@ Submit a job to the Slurm scheduler using sbatch, removing unwanted environment Unset variables: "SLURM_MEM_PER_CPU", "SLURM_MEM_PER_GPU", "SLURM_MEM_PER_NODE" """ -function submit_slurm_job(sbatch_filepath; env=deepcopy(ENV)) +function submit_slurm_job(sbatch_filepath; env = ENV) + clean_env = deepcopy(env) # List of SLURM environment variables to unset unset_env_vars = [ "SLURM_MEM_PER_CPU", @@ -151,13 +152,13 @@ function submit_slurm_job(sbatch_filepath; env=deepcopy(ENV)) "SLURM_NTASKS", "SLURM_JOB_NAME", "SLURM_SUBMIT_DIR", - "SLURM_JOB_ID" + "SLURM_JOB_ID", ] # Create a new environment without the SLURM variables for var in unset_env_vars delete!(clean_env, var) end - + try cmd = `sbatch --parsable $sbatch_filepath` output = readchomp(setenv(cmd, clean_env)) @@ -166,7 +167,7 @@ function submit_slurm_job(sbatch_filepath; env=deepcopy(ENV)) if jobid === nothing error("Failed to parse job ID from output: $output") end - + return parse(Int, jobid.match) catch e error("Failed to submit SLURM job: $e") diff --git a/src/slurm_workers.jl b/src/workers.jl similarity index 99% rename from src/slurm_workers.jl rename to src/workers.jl index 69556060..1c298c2f 100644 --- a/src/slurm_workers.jl +++ b/src/workers.jl @@ -1,6 +1,6 @@ using Distributed import EnsembleKalmanProcesses as EKP -export worker_calibrate, add_slurm_workers +export worker_calibrate, SlurmManager function run_iteration( iter, diff --git a/test/slurm_workers.jl b/test/slurm_workers.jl index 0d57798b..3e6a1497 100644 --- a/test/slurm_workers.jl +++ b/test/slurm_workers.jl @@ -1,32 +1,30 @@ # Tests for SurfaceFluxes example calibration on HPC, used in buildkite testing -# To run, open the REPL: julia --project=experiments/surface_fluxes_perfect_model test/hpc_backend_e2e.jl +# To run, open the REPL: julia --project=experiments/surface_fluxes_perfect_model test/slurm_workers.jl using ClimaCalibrate using Distributed using Test -import EnsembleKalmanProcesses: get_ϕ_mean_final, get_g_mean_final +import EnsembleKalmanProcesses: get_ϕ, get_g_mean_final +import Statistics: var -function test_sf_calibration_output(eki, prior) +function test_sf_calibration_output(eki, prior, observation) @testset "End to end test using file config (surface fluxes perfect model)" begin - parameter_values = get_ϕ_mean_final(prior, eki) - test_parameter_values = [4.778584250117946, 3.7295665619234697] - @test all( - isapprox.(parameter_values, test_parameter_values; rtol = 1e-3), - ) + params = get_ϕ(prior, eki) + spread = map(var, params) + + # Spread should be heavily decreased as particles have converged + @test last(spread) / first(spread) < 0.15 forward_model_output = get_g_mean_final(eki) - test_model_output = [0.05228473730385304] + @show forward_model_output @test all( - isapprox.(forward_model_output, test_model_output; rtol = 1e-3), + isapprox.(forward_model_output, observation; rtol = 1e-2), ) end end experiment_dir = dirname(Base.active_project()) -addprocs( - ClimaCalibrate.SlurmManager(10); - exeflags = "--project=$(dirname(Base.active_project()))", -) +addprocs(ClimaCalibrate.SlurmManager(10)) include(joinpath(experiment_dir, "generate_data.jl")) @everywhere begin @@ -34,7 +32,7 @@ include(joinpath(experiment_dir, "generate_data.jl")) experiment_dir = dirname(Base.active_project()) output_dir = joinpath("output", "surface_fluxes_perfect_model") prior = get_prior(joinpath(experiment_dir, "prior.toml")) - ensemble_size = 10 + ensemble_size = 20 n_iterations = 6 end @@ -59,7 +57,7 @@ eki = worker_calibrate( output_dir, ) -test_sf_calibration_output(eki, prior) +test_sf_calibration_output(eki, prior, observation) include(joinpath(experiment_dir, "postprocessing.jl")) From 0b0ef01c88a397dd378133b5cb18bf1d9381d430 Mon Sep 17 00:00:00 2001 From: nefrathenrici Date: Mon, 16 Dec 2024 16:41:40 -0800 Subject: [PATCH 3/3] Add WorkerBackend, clean up constructors --- src/backends.jl | 271 ++++++++++++++++++++++++++++++---------- src/ekp_interface.jl | 135 ++++++++++---------- src/workers.jl | 98 +-------------- test/hpc_backend_e2e.jl | 28 ++--- test/slurm_workers.jl | 61 +++++---- 5 files changed, 310 insertions(+), 283 deletions(-) diff --git a/src/backends.jl b/src/backends.jl index 87f17d7b..b165a867 100644 --- a/src/backends.jl +++ b/src/backends.jl @@ -4,6 +4,9 @@ import EnsembleKalmanProcesses as EKP export get_backend, calibrate, model_run +export JuliaBackend, WorkerBackend +export HPCBackend, ClimaGPUBackend, DerechoBackend, CaltechHPCBackend + abstract type AbstractBackend end struct JuliaBackend <: AbstractBackend end @@ -16,6 +19,8 @@ struct ClimaGPUBackend <: SlurmBackend end struct DerechoBackend <: HPCBackend end +struct WorkerBackend <: AbstractBackend end + """ get_backend() @@ -62,44 +67,52 @@ function module_load_string(::Type{DerechoBackend}) """ end -calibrate(config::ExperimentConfig; reruns = 0, ekp_kwargs...) = - calibrate(get_backend(), config; reruns, ekp_kwargs...) +calibrate( + config::ExperimentConfig; + model_interface = nothing, + hpc_kwargs = Dict(), + ekp_kwargs..., +) = calibrate(get_backend(), config; model_interface, hpc_kwargs, ekp_kwargs...) -calibrate(experiment_dir::AbstractString; reruns = 0, ekp_kwargs...) = - calibrate( +function calibrate( + ensemble_size::Int, + n_iterations::Int, + observations, + noise, + prior, + output_dir; + model_interface = nothing, + hpc_kwargs = Dict(), + ekp_kwargs..., +) + return calibrate( get_backend(), - ExperimentConfig(experiment_dir); - reruns, + ensemble_size, + n_iterations, + observations, + noise, + prior, + output_dir; + model_interface, + hpc_kwargs, ekp_kwargs..., ) - -calibrate( - b::Type{JuliaBackend}, - experiment_dir::AbstractString; - reruns = 0, - ekp_kwargs..., -) = calibrate(b, ExperimentConfig(experiment_dir); reruns, ekp_kwargs...) +end function calibrate( ::Type{JuliaBackend}, config::ExperimentConfig; - reruns = 0, - ekp = nothing, ekp_kwargs..., ) (; n_iterations, output_dir, ensemble_size) = config - ekp = if ekp isa EKP.EnsembleKalmanProcess - initialize(ekp, prior, output_dir) - else - initialize(config; ekp_kwargs...) - end + ekp = initialize(config; ekp_kwargs...) on_error(e::InterruptException) = rethrow(e) on_error(e) = @error "Single ensemble member has errored. See stacktrace" exception = (e, catch_backtrace()) for i in 0:(n_iterations - 1) @info "Running iteration $i" - pmap(1:ensemble_size; retry_delays = reruns, on_error) do m + pmap(1:ensemble_size; retry_delays = 0, on_error) do m forward_model(i, m) @info "Completed member $m" end @@ -113,73 +126,195 @@ function calibrate( return ekp end +const DEFAULT_FAILURE_RATE = 0.5 + """ - calibrate(::Type{AbstractBackend}, config::ExperimentConfig; kwargs...) - calibrate(::Type{AbstractBackend}, experiment_dir; kwargs...) - calibrate(::Type{AbstractBackend}, ekp::EnsembleKalmanProcess, experiment_dir; kwargs...) + calibrate(backend, ensemble_size, n_iterations, observations, noise, prior, output_dir; ekp_kwargs...) + calibrate(backend, ekp::EnsembleKalmanProcess, ensemble_size, n_iterations, observations, noise, prior, output_dir) + calibrate(backend, config::ExperimentConfig; ekp_kwargs...) + +Run a full calibration on the given backend. -Run a full calibration, scheduling the forward model runs on Caltech's HPC cluster. +If the EKP struct is not given, it will be constructed upon initialization. +The experiment configuration (ensemble size, prior, observations, etc) can be +wrapped in an ExperimentConfig or passed in as arguments to the function. -Takes either an ExperimentConfig or an experiment folder. +Available Backends: WorkerBackend, CaltechHPCBackend, ClimaGPUBackend, DerechoBackend, JuliaBackend -Available Backends: CaltechHPCBackend, ClimaGPUBackend, DerechoBackend, JuliaBackend +Derecho, ClimaGPU, and CaltechHPC backends are designed to run on a specific high-performance computing cluster. +WorkerBackend uses Distributed.jl to run the forward model on workers. -# Keyword Arguments -- `experiment_dir: Directory containing experiment configurations. +## Keyword Arguments for HPC backends - `model_interface: Path to the model interface file. -- `hpc_kwargs`: Dictionary of resource arguments, passed to the job scheduler. -- `reruns`: Number of times to retry a failed ensemble member. +- `hpc_kwargs`: Dictionary of resource arguments for HPC clusters, passed to the job scheduler. - `verbose::Bool`: Enable verbose logging. - Any keyword arguments for the EnsembleKalmanProcess constructor, such as `scheduler` +""" +function calibrate( + b::Type{WorkerBackend}, + config::ExperimentConfig; + failure_rate = DEFAULT_FAILURE_RATE, + worker_pool = default_worker_pool(), + ekp_kwargs..., +) + (; ensemble_size, n_iterations, observations, noise, prior, output_dir) = + config + return calibrate( + b, + ensemble_size, + n_iterations, + observations, + noise, + prior, + output_dir; + failure_rate, + worker_pool, + ekp_kwargs..., + ) +end -# Usage -Open julia: `julia --project=experiments/surface_fluxes_perfect_model` -```julia -using ClimaCalibrate +function calibrate( + b::Type{WorkerBackend}, + ensemble_size::Int, + n_iterations::Int, + observations, + noise, + prior, + output_dir; + failure_rate = DEFAULT_FAILURE_RATE, + worker_pool = default_worker_pool(), + ekp_kwargs..., +) + eki = ekp_constructor(ensemble_size, prior, observations, noise) + return calibrate( + b, + eki, + ensemble_size, + n_iterations, + observations, + noise, + prior, + output_dir; + worker_pool, + ekp_kwargs..., + ) +end -experiment_dir = joinpath(pkgdir(ClimaCalibrate), "experiments", "surface_fluxes_perfect_model") -model_interface = joinpath(experiment_dir, "model_interface.jl") +function calibrate( + b::Type{WorkerBackend}, + ekp::EKP.EnsembleKalmanProcess, + ensemble_size, + n_iterations, + observations, + noise, + prior, + output_dir; + failure_rate = DEFAULT_FAILURE_RATE, + worker_pool = default_worker_pool(), +) + initialize(ekp, prior, output_dir) + for iter in 0:n_iterations + (; time) = @timed run_worker_iteration( + iter, + ensemble_size, + output_dir; + worker_pool, + failure_rate, + ) + @info "Iteration $iter time: $time" + # Process results + G_ensemble = observation_map(iter) + save_G_ensemble(output_dir, iter, G_ensemble) + update_ensemble(output_dir, iter, prior) + iter_path = path_to_iteration(output_dir, iter) + end + return JLD2.load_object( + joinpath(path_to_iteration(output_dir, n_iterations), "eki_file.jld2"), + ) +end -# Generate observational data and load interface -include(joinpath(experiment_dir, "generate_data.jl")) -include(joinpath(experiment_dir, "observation_map.jl")) -include(model_interface) +function calibrate( + b::Type{<:HPCBackend}, + config::ExperimentConfig; + experiment_dir = project_dir(), + model_interface = abspath( + joinpath(experiment_dir, "..", "..", "model_interface.jl"), + ), + verbose = false, + hpc_kwargs = Dict(), + ekp_kwargs..., +) + (; ensemble_size, n_iterations, observations, noise, prior, output_dir) = + config + return calibrate( + b, + ensemble_size, + n_iterations, + observations, + noise, + prior, + output_dir; + model_interface, + verbose, + hpc_kwargs, + ekp_kwargs..., + ) +end -hpc_kwargs = kwargs(time = 3) -backend = get_backend() -eki = calibrate(backend, experiment_dir; model_interface, hpc_kwargs); -``` -""" function calibrate( b::Type{<:HPCBackend}, - experiment_dir::AbstractString; + ensemble_size::Int, + n_iterations::Int, + observations, + noise, + prior, + output_dir; + experiment_dir = project_dir(), + model_interface = abspath( + joinpath(experiment_dir, "..", "..", "model_interface.jl"), + ), + verbose = false, hpc_kwargs, ekp_kwargs..., ) - calibrate(b, ExperimentConfig(experiment_dir); hpc_kwargs, ekp_kwargs...) + ekp = ekp_constructor(ensemble_size, prior, observations, noise) + return calibrate( + b, + ekp, + ensemble_size, + n_iterations, + observations, + noise, + prior, + output_dir; + experiment_dir, + model_interface, + verbose, + hpc_kwargs, + ekp_kwargs..., + ) end function calibrate( b::Type{<:HPCBackend}, - config::ExperimentConfig; - experiment_dir = dirname(Base.active_project()), + ekp::EKP.EnsembleKalmanProcess, + ensemble_size, + n_iterations, + observations, + noise, + prior, + output_dir; + experiment_dir = project_dir(), model_interface = abspath( joinpath(experiment_dir, "..", "..", "model_interface.jl"), ), verbose = false, - reruns = 1, - ekp = nothing, hpc_kwargs, ekp_kwargs..., ) - (; n_iterations, output_dir, prior, ensemble_size) = config @info "Initializing calibration" n_iterations ensemble_size output_dir - ekp = if ekp isa EKP.EnsembleKalmanProcess - initialize(ekp, prior, output_dir) - else - initialize(config; ekp_kwargs...) - end + initialize(ekp, prior, output_dir) module_load_str = module_load_string(b) for i in 0:(n_iterations - 1) @info "Iteration $i" @@ -206,12 +341,12 @@ function calibrate( module_load_str; hpc_kwargs, verbose, - reruns, + reruns = 0, ) @info "Completed iteration $i, updating ensemble" G_ensemble = observation_map(i) - save_G_ensemble(config, i, G_ensemble) - terminate = update_ensemble(config, i) + save_G_ensemble(output_dir, i, G_ensemble) + terminate = update_ensemble(output_dir, i, prior) !isnothing(terminate) && break iter_path = path_to_iteration(output_dir, i + 1) ekp = JLD2.load_object(joinpath(iter_path, "eki_file.jld2")) @@ -223,7 +358,7 @@ end # Scheduler interfaces should not depend on backend struct """ model_run(backend, iter, member, output_dir, experiment_dir; model_interface, verbose, hpc_kwargs) - + Construct and execute a command to run a single forward model on a given job scheduler. Dispatches on `backend` to run [`slurm_model_run`](@ref) or [`pbs_model_run`](@ref). @@ -232,8 +367,8 @@ Arguments: - iter: Iteration number - member: Member number - output_dir: Calibration experiment output directory -- experiment_dir: Directory containing the experiment's Project.toml -- model_interface: File containing the model interface +- project_dir: Directory containing the experiment's Project.toml +- model_interface: Model interface file - module_load_str: Commands which load the necessary modules - hpc_kwargs: Dictionary containing the resources for the job. Easily generated using [`kwargs`](@ref). """ @@ -242,7 +377,7 @@ model_run( iter, member, output_dir, - experiment_dir, + project_dir, model_interface, module_load_str; hpc_kwargs, @@ -250,7 +385,7 @@ model_run( iter, member, output_dir, - experiment_dir, + project_dir, model_interface, module_load_str; hpc_kwargs, @@ -260,7 +395,7 @@ model_run( iter, member, output_dir, - experiment_dir, + project_dir, model_interface, module_load_str; hpc_kwargs, @@ -268,7 +403,7 @@ model_run( iter, member, output_dir, - experiment_dir, + project_dir, model_interface, module_load_str; hpc_kwargs, diff --git a/src/ekp_interface.jl b/src/ekp_interface.jl index db944c57..e194f0eb 100644 --- a/src/ekp_interface.jl +++ b/src/ekp_interface.jl @@ -188,11 +188,7 @@ function env_member_number(env = ENV) end """ - initialize(ensemble_size, observations, noise, prior, output_dir; kwargs...) - initialize(ensemble_size, observations, prior, output_dir; kwargs...) - initialize(eki::EnsembleKalmanProcess, prior, output_dir) - initialize(config::ExperimentConfig; kwargs...) - initialize(filepath::AbstractString; kwargs...) + ekp_constructor(ensemble_size, prior, observations, noise = nothing) Initialize the EnsembleKalmanProcess object and parameter files. @@ -203,73 +199,23 @@ Noise is optional when the observation is an EKP.ObservationSeries. Additional kwargs may be passed through to the EnsembleKalmanProcess constructor. """ -initialize(filepath::AbstractString; kwargs...) = - initialize(ExperimentConfig(filepath); kwargs...) - -initialize(config::ExperimentConfig; kwargs...) = initialize( - config.ensemble_size, - config.observations, - config.noise, - config.prior, - config.output_dir; - kwargs..., -) -initialize( +function ekp_constructor( ensemble_size, - observations, prior, - output_dir; - rng_seed = 1234, - ekp_kwargs..., -) = _initialize( - ensemble_size, observations, - prior, - output_dir; - rng_seed, - ekp_kwargs..., -) - -initialize( - ensemble_size, - observations, - noise, - prior, - output_dir; + noise = nothing; rng_seed = 1234, ekp_kwargs..., -) = _initialize( - ensemble_size, - observations, - prior, - output_dir; - noise, - rng_seed, - ekp_kwargs..., ) -function initialize(eki::EKP.EnsembleKalmanProcess, prior, output_dir) - save_eki_state(eki, output_dir, 0, prior) - return eki -end - -function _initialize( - ensemble_size, - observations, - prior, - output_dir; - noise = nothing, - rng_seed, - ekp_kwargs..., -) Random.seed!(rng_seed) rng_ekp = Random.MersenneTwister(rng_seed) initial_ensemble = EKP.construct_initial_ensemble(rng_ekp, prior, ensemble_size) # EKP 2.0 and later require the `default_options_dict` - eki_constructor = if hasproperty(EKP, :default_options_dict) + eki_constr = if hasproperty(EKP, :default_options_dict) ekp_kwargs = Dict([string(k) => v for (k, v) in ekp_kwargs]) (args...) -> EKP.EnsembleKalmanProcess( args..., @@ -277,30 +223,76 @@ function _initialize( rng = rng_ekp, ) else - eki_constructor = - (args...) -> EKP.EnsembleKalmanProcess( - args...; - rng = rng_ekp, - failure_handler_method = EKP.SampleSuccGauss(), - ekp_kwargs..., - ) + (args...) -> EKP.EnsembleKalmanProcess( + args...; + rng = rng_ekp, + failure_handler_method = EKP.SampleSuccGauss(), + ekp_kwargs..., + ) end + eki = if isnothing(noise) - eki_constructor(initial_ensemble, observations, EKP.Inversion()) + eki_constr(initial_ensemble, observations, EKP.Inversion()) else - eki_constructor(initial_ensemble, observations, noise, EKP.Inversion()) + eki_constr(initial_ensemble, observations, noise, EKP.Inversion()) end + return eki +end + +""" + initialize(ensemble_size, observations, noise, prior, output_dir) + initialize(eki::EKP.EnsembleKalmanProcess, prior, output_dir) + initialize(config) + +Initialize a calibration, saving the initial parameter ensemble to a folder within `output_dir`. + +If no EKP struct is given, construct an EKP struct and return it. +""" +function initialize(config::ExperimentConfig; rng_seed = 1234, ekp_kwargs...) + (; ensemble_size, observations, noise, prior, output_dir) = config + return initialize( + ensemble_size, + observations, + noise, + prior, + output_dir; + ekp_kwargs..., + rng_seed, + ) +end - save_eki_state(eki, output_dir, 0, prior) +function initialize( + ensemble_size, + observations, + noise, + prior, + output_dir; + rng_seed = 1234, + ekp_kwargs..., +) + eki = ekp_constructor( + ensemble_size, + prior, + observations, + noise; + rng_seed, + ekp_kwargs..., + ) + save_eki_and_parameters(eki, output_dir, 0, prior) + return eki +end + +function initialize(eki::EKP.EnsembleKalmanProcess, prior, output_dir) + save_eki_and_parameters(eki, output_dir, 0, prior) return eki end """ - save_eki_state(eki, output_dir, iteration, prior) + save_eki_and_parameters(eki, output_dir, iteration, prior) Save EKI state and parameters. Helper function for [`initialize`](@ref) and [`update_ensemble`](@ref) """ -function save_eki_state(eki, output_dir, iteration, prior) +function save_eki_and_parameters(eki, output_dir, iteration, prior) param_dict = get_param_dict(prior) save_parameter_ensemble( EKP.get_u_final(eki), @@ -338,6 +330,7 @@ function update_ensemble(output_dir::AbstractString, iteration, prior) G_ens = JLD2.load_object(joinpath(iter_path, "G_ensemble.jld2")) terminate = EKP.update_ensemble!(eki, G_ens) - save_eki_state(eki, output_dir, iteration + 1, prior) + save_eki_and_parameters(eki, output_dir, iteration + 1, prior) + # TODO: Return EKI struct again return terminate end diff --git a/src/workers.jl b/src/workers.jl index 1c298c2f..d00bd661 100644 --- a/src/workers.jl +++ b/src/workers.jl @@ -1,8 +1,10 @@ using Distributed import EnsembleKalmanProcesses as EKP -export worker_calibrate, SlurmManager +export SlurmManager, default_worker_pool -function run_iteration( +default_worker_pool() = WorkerPool(workers()) + +function run_worker_iteration( iter, ensemble_size, output_dir; @@ -37,98 +39,6 @@ function run_iteration( end end -function worker_calibrate( - config; - failure_rate = 0.5, - worker_pool = default_worker_pool(), - ekp_kwargs..., -) - (; ensemble_size, n_iterations, observations, noise, prior, output_dir) = - config - return worker_calibrate( - ensemble_size, - n_iterations, - observations, - noise, - prior, - output_dir; - failure_rate, - worker_pool, - ekp_kwargs..., - ) -end - -function worker_calibrate( - ensemble_size, - n_iterations, - observations, - noise, - prior, - output_dir; - failure_rate = 0.5, - worker_pool = default_worker_pool(), - ekp_kwargs..., -) - initialize( - ensemble_size, - observations, - noise, - prior, - output_dir; - rng_seed = 1234, - ekp_kwargs..., - ) - for iter in 0:(n_iterations) - (; time) = @timed run_iteration( - iter, - ensemble_size, - output_dir; - worker_pool, - failure_rate, - ) - @info "Iteration $iter time: $time" - # Process results - G_ensemble = observation_map(iter) - save_G_ensemble(output_dir, iter, G_ensemble) - update_ensemble(output_dir, iter, prior) - iter_path = path_to_iteration(output_dir, iter) - end - return JLD2.load_object( - joinpath(path_to_iteration(output_dir, n_iterations), "eki_file.jld2"), - ) -end - -function worker_calibrate( - ekp::EKP.EnsembleKalmanProcess, - ensemble_size, - n_iterations, - observations, - noise, - prior, - output_dir; - failure_rate = 0.5, - worker_pool = default_worker_pool(), - ekp_kwargs..., -) - initialize(ekp, prior, output_dir; rng_seed = 1234) - for iter in 0:n_iterations - (; time) = @timed run_iteration( - iter, - ensemble_size, - output_dir; - worker_pool, - failure_rate, - ) - @info "Iteration $iter time: $time" - # Process results - G_ensemble = observation_map(iter) - save_G_ensemble(output_dir, iter, G_ensemble) - update_ensemble(output_dir, iter, prior) - iter_path = path_to_iteration(output_dir, iter) - end - return JLD2.load_object(path_to_iteration(output_dir, n_iterations)) -end - worker_cookie() = begin Distributed.init_multi() cluster_cookie() diff --git a/test/hpc_backend_e2e.jl b/test/hpc_backend_e2e.jl index c93aa2d4..0aea2b50 100644 --- a/test/hpc_backend_e2e.jl +++ b/test/hpc_backend_e2e.jl @@ -1,17 +1,7 @@ # Tests for SurfaceFluxes example calibration on HPC, used in buildkite testing # To run, open the REPL: julia --project=experiments/surface_fluxes_perfect_model test/hpc_backend_e2e.jl -using Pkg -Pkg.instantiate(; verbose = true) - -import ClimaCalibrate: - get_backend, - HPCBackend, - JuliaBackend, - calibrate, - get_prior, - kwargs, - DerechoBackend +using ClimaCalibrate using Test import EnsembleKalmanProcesses: get_ϕ_mean_final, get_g_mean_final import Statistics: var @@ -41,16 +31,16 @@ function test_sf_calibration_output(eki, prior) end end -# @assert get_backend() <: HPCBackend -# hpc_kwargs = kwargs(time = 5, ntasks = 1, cpus_per_task = 1) -# if get_backend() == DerechoBackend -# hpc_kwargs[:queue] = "develop" -# end -# eki = calibrate(experiment_dir; model_interface, hpc_kwargs, verbose = true) -# test_sf_calibration_output(eki, prior) +@assert get_backend() <: HPCBackend +hpc_kwargs = kwargs(time = 5, ntasks = 1, cpus_per_task = 1) +if get_backend() == DerechoBackend + hpc_kwargs[:queue] = "develop" +end +eki = calibrate(experiment_config; model_interface, hpc_kwargs, verbose = true) +test_sf_calibration_output(eki, prior) # Pure Julia calibration, this should run anywhere -eki = calibrate(JuliaBackend, experiment_dir) +eki = calibrate(JuliaBackend, experiment_config) test_sf_calibration_output(eki, prior) include(joinpath(experiment_dir, "postprocessing.jl")) diff --git a/test/slurm_workers.jl b/test/slurm_workers.jl index 3e6a1497..4365c645 100644 --- a/test/slurm_workers.jl +++ b/test/slurm_workers.jl @@ -1,30 +1,51 @@ # Tests for SurfaceFluxes example calibration on HPC, used in buildkite testing # To run, open the REPL: julia --project=experiments/surface_fluxes_perfect_model test/slurm_workers.jl - +# TODO: Unify test/slurm_workers.jl and test/hpc_backend_e2e.jl using ClimaCalibrate using Distributed using Test import EnsembleKalmanProcesses: get_ϕ, get_g_mean_final import Statistics: var +# Slurm Worker Unit Tests +@testset "Slurm Worker Unit Tests" begin + out_file = "my_slurm_job.out" + p = addprocs(ClimaCalibrate.SlurmManager(1); o = out_file) + @test nprocs() == 2 + @test workers() == p + @test fetch(@spawnat :any myid()) == p[1] + @test remotecall_fetch(+, p[1], 1, 1) == 2 + rmprocs(p) + @test nprocs() == 1 + @test workers() == [1] + + # Check output file creation + @test isfile(out_file) + rm(out_file) + + @test_throws TaskFailedException p = addprocs( + ClimaCalibrate.SlurmManager(1); + o = out_file, + output = out_file, + ) +end + function test_sf_calibration_output(eki, prior, observation) @testset "End to end test using file config (surface fluxes perfect model)" begin params = get_ϕ(prior, eki) spread = map(var, params) - + # Spread should be heavily decreased as particles have converged @test last(spread) / first(spread) < 0.15 forward_model_output = get_g_mean_final(eki) @show forward_model_output - @test all( - isapprox.(forward_model_output, observation; rtol = 1e-2), - ) + @test all(isapprox.(forward_model_output, observation; rtol = 1e-2)) end end -experiment_dir = dirname(Base.active_project()) -addprocs(ClimaCalibrate.SlurmManager(10)) +experiment_dir = project_dir() +addprocs(SlurmManager(10)) include(joinpath(experiment_dir, "generate_data.jl")) @everywhere begin @@ -48,7 +69,8 @@ end include(model_interface) end -eki = worker_calibrate( +eki = calibrate( + WorkerBackend, ensemble_size, n_iterations, observation, @@ -60,26 +82,3 @@ eki = worker_calibrate( test_sf_calibration_output(eki, prior, observation) include(joinpath(experiment_dir, "postprocessing.jl")) - -# Slurm Worker Unit Tests -@testset "Slurm Worker Unit Tests" begin - out_file = "my_slurm_job.out" - p = addprocs(ClimaCalibrate.SlurmManager(1); o = out_file) - @test nprocs() == 2 - @test workers() == p - @test fetch(@spawnat :any myid()) == p[1] - @test remotecall_fetch(+, p[1], 1, 1) == 2 - rmprocs(p) - @test nprocs() == 1 - @test workers() == [1] - - # Check output file creation - @test isfile(out_file) - rm(out_file) - - @test_throws TaskFailedException p = addprocs( - ClimaCalibrate.SlurmManager(1); - o = out_file, - output = out_file, - ) -end