From 41dff6f821a3f21bb0db2a96fbd9973fedd56956 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Wed, 6 Mar 2024 09:57:56 -0800 Subject: [PATCH 01/28] add strong scaling GPU AMIP --- .buildkite/gpu/pipeline.yml | 87 +++++++++++++++++++ config/gpu_configs/gpu_amip_chap.yml | 22 +++++ config/gpu_configs/gpu_amip_chap_2process.yml | 22 +++++ config/gpu_configs/gpu_amip_chap_4process.yml | 22 +++++ experiments/AMIP/coupler_driver.jl | 8 +- 5 files changed, 159 insertions(+), 2 deletions(-) create mode 100644 .buildkite/gpu/pipeline.yml create mode 100644 config/gpu_configs/gpu_amip_chap.yml create mode 100644 config/gpu_configs/gpu_amip_chap_2process.yml create mode 100644 config/gpu_configs/gpu_amip_chap_4process.yml diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml new file mode 100644 index 0000000000..95901fae9d --- /dev/null +++ b/.buildkite/gpu/pipeline.yml @@ -0,0 +1,87 @@ +agents: + queue: clima + slurm_mem: 8G + modules: common nsight-systems/2023.4.1 + +env: + JULIA_CUDA_MEMORY_POOL: none + JULIA_MPI_HAS_CUDA: "true" + JULIA_NVTX_CALLBACKS: gc + JULIA_MAX_NUM_PRECOMPILE_FILES: 100 + OPENBLAS_NUM_THREADS: 1 + OMPI_MCA_opal_warn_on_missing_libcuda: 0 + SLURM_KILL_BAD_EXIT: 1 + SLURM_GPU_BIND: none # https://github.com/open-mpi/ompi/issues/11949#issuecomment-1737712291 + GPU_CONFIG_PATH: "config/gpu_configs" + CLIMAATMOS_GC_NSTEPS: 10 + +steps: + - label: "init :GPU:" + key: "init_gpu_env" + command: + - echo "--- Instantiate experiments/AMIP" + - julia --project=experiments/AMIP -e 'using Pkg; Pkg.instantiate(;verbose=true)' + - julia --project=experiments/AMIP -e 'using Pkg; Pkg.precompile()' + - julia --project=experiments/AMIP -e 'using Pkg; Pkg.status()' + + - echo "--- Download artifacts" + - "julia --project=artifacts -e 'using Pkg; Pkg.instantiate(;verbose=true)'" + - "julia --project=artifacts -e 'using Pkg; Pkg.precompile()'" + - "julia --project=artifacts -e 'using Pkg; Pkg.status()'" + - "julia --project=artifacts artifacts/download_artifacts.jl" + + agents: + slurm_gpus: 1 + slurm_cpus_per_task: 8 + env: + JULIA_NUM_PRECOMPILE_TASKS: 8 + JULIA_MAX_NUM_PRECOMPILE_FILES: 50 + + - wait + + - group: "CHAP GPU strong scaling" + steps: + + - label: "GPU AMIP CHAP - strong scaling - 1 GPU" + key: "gpu_amip_chap" + command: + - > + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml + artifact_paths: "gpu_amip_chap/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 1 + slurm_mem: 32G + slurm_exclusive: + + - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" + key: "gpu_amip_chap_2process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml + artifact_paths: "gpu_amip_chap_2process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 32G + slurm_exclusive: + + - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" + key: "gpu_amip_chap_4process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/hybrid/driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml + artifact_paths: "gpu_amip_chap_4process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 4 + slurm_mem: 32G + slurm_exclusive: diff --git a/config/gpu_configs/gpu_amip_chap.yml b/config/gpu_configs/gpu_amip_chap.yml new file mode 100644 index 0000000000..c0a122f78a --- /dev/null +++ b/config/gpu_configs/gpu_amip_chap.yml @@ -0,0 +1,22 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap.yml" +dt: "100secs" +dt_cloud_fraction: "1hours" +dt_cpl: 100 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +hourly_checkpoint: false +job_id: "gpu_amip_chap" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_chap" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "1days" +turb_flux_partition: "CombinedStateFluxes" +vert_diff: "true" diff --git a/config/gpu_configs/gpu_amip_chap_2process.yml b/config/gpu_configs/gpu_amip_chap_2process.yml new file mode 100644 index 0000000000..bb8075fae1 --- /dev/null +++ b/config/gpu_configs/gpu_amip_chap_2process.yml @@ -0,0 +1,22 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap_2process.yml" +dt: "100secs" +dt_cloud_fraction: "1hours" +dt_cpl: 100 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +hourly_checkpoint: false +job_id: "gpu_amip_chap_2process" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_chap_2process" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "1days" +turb_flux_partition: "CombinedStateFluxes" +vert_diff: "true" diff --git a/config/gpu_configs/gpu_amip_chap_4process.yml b/config/gpu_configs/gpu_amip_chap_4process.yml new file mode 100644 index 0000000000..000bf8e85d --- /dev/null +++ b/config/gpu_configs/gpu_amip_chap_4process.yml @@ -0,0 +1,22 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap_4process.yml" +dt: "100secs" +dt_cloud_fraction: "1hours" +dt_cpl: 100 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +hourly_checkpoint: false +job_id: "gpu_amip_chap_4process" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_chap_4process" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "1days" +turb_flux_partition: "CombinedStateFluxes" +vert_diff: "true" diff --git a/experiments/AMIP/coupler_driver.jl b/experiments/AMIP/coupler_driver.jl index 15301cae09..a4aea6f38f 100644 --- a/experiments/AMIP/coupler_driver.jl +++ b/experiments/AMIP/coupler_driver.jl @@ -678,7 +678,7 @@ function solve_coupler!(cs) end @show walltime - return cs + return walltime end ## exit if running performance anaysis #hide @@ -687,7 +687,11 @@ if haskey(ENV, "CI_PERF_SKIP_COUPLED_RUN") #hide end #hide ## run the coupled simulation -solve_coupler!(cs); +walltime = solve_coupler!(cs); + +# Show the simulated years per day of the simulation +es = CA.EfficiencyStats(tspan, walltime) +@info "SYPD: $(CA.simulated_years_per_day(es))" #= ## Postprocessing From 3d8f4bb2ba8ccaab08273b8de596139b0201d8d8 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Wed, 6 Mar 2024 10:55:35 -0800 Subject: [PATCH 02/28] add weak scaling --- .buildkite/gpu/pipeline.yml | 49 +++++++++++++++++++ config/gpu_configs/gpu_amip_chap_ws.yml | 22 +++++++++ .../gpu_configs/gpu_amip_chap_ws_2process.yml | 22 +++++++++ .../gpu_configs/gpu_amip_chap_ws_4process.yml | 22 +++++++++ 4 files changed, 115 insertions(+) create mode 100644 config/gpu_configs/gpu_amip_chap_ws.yml create mode 100644 config/gpu_configs/gpu_amip_chap_ws_2process.yml create mode 100644 config/gpu_configs/gpu_amip_chap_ws_4process.yml diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index 95901fae9d..694eb2628c 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -85,3 +85,52 @@ steps: slurm_ntasks: 4 slurm_mem: 32G slurm_exclusive: + + - group: "CHAP GPU weak scaling" + steps: + + - label: "GPU AMIP CHAP - weak scaling - 1 GPU" + key: "gpu_amip_chap_ws" + command: + - > + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws.yml + artifact_paths: "gpu_amip_chap_ws/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 1 + slurm_mem: 32G + slurm_exclusive: + + - label: "GPU AMIP CHAP - weak scaling - 2 GPUs" + key: "gpu_amip_chap_ws_2process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_2process.yml + artifact_paths: "gpu_amip_chap_ws_2process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 32G + slurm_time: 8:00:00 + slurm_exclusive: + + - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" + key: "gpu_amip_chap_ws_4process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/hybrid/driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml + artifact_paths: "gpu_amip_chap_ws_4process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 4 + slurm_mem: 32G + slurm_time: 8:00:00 + slurm_exclusive: diff --git a/config/gpu_configs/gpu_amip_chap_ws.yml b/config/gpu_configs/gpu_amip_chap_ws.yml new file mode 100644 index 0000000000..b665a49223 --- /dev/null +++ b/config/gpu_configs/gpu_amip_chap_ws.yml @@ -0,0 +1,22 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap_ws_1process.yml" +dt: "100secs" +dt_cloud_fraction: "1hours" +dt_cpl: 100 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +hourly_checkpoint: false +job_id: "gpu_amip_chap_ws" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_chap_ws" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "1days" +turb_flux_partition: "CombinedStateFluxes" +vert_diff: "true" diff --git a/config/gpu_configs/gpu_amip_chap_ws_2process.yml b/config/gpu_configs/gpu_amip_chap_ws_2process.yml new file mode 100644 index 0000000000..99e81bcd2e --- /dev/null +++ b/config/gpu_configs/gpu_amip_chap_ws_2process.yml @@ -0,0 +1,22 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap_ws_2process.yml" +dt: "50secs" +dt_cloud_fraction: "1hours" +dt_cpl: 50 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +hourly_checkpoint: false +job_id: "gpu_amip_chap_ws_2process" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_chap_ws_2process" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "1days" +turb_flux_partition: "CombinedStateFluxes" +vert_diff: "true" diff --git a/config/gpu_configs/gpu_amip_chap_ws_4process.yml b/config/gpu_configs/gpu_amip_chap_ws_4process.yml new file mode 100644 index 0000000000..a22e858bf6 --- /dev/null +++ b/config/gpu_configs/gpu_amip_chap_ws_4process.yml @@ -0,0 +1,22 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap_ws_4process.yml" +dt: "50secs" +dt_cloud_fraction: "1hours" +dt_cpl: 50 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +hourly_checkpoint: false +job_id: "gpu_amip_chap_ws_4process" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_chap_ws_4process" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "1days" +turb_flux_partition: "CombinedStateFluxes" +vert_diff: "true" From e9b769c698f5841b57271defc866f84ae651131a Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Wed, 6 Mar 2024 13:38:59 -0800 Subject: [PATCH 03/28] use atmos branch --- experiments/AMIP/Manifest.toml | 85 ++++++++++++++++------------------ 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/experiments/AMIP/Manifest.toml b/experiments/AMIP/Manifest.toml index 517b986272..f00efd13d4 100644 --- a/experiments/AMIP/Manifest.toml +++ b/experiments/AMIP/Manifest.toml @@ -80,9 +80,9 @@ version = "7.7.1" [[deps.ArrayLayouts]] deps = ["FillArrays", "LinearAlgebra"] -git-tree-sha1 = "64d582bcb9c93ac741234789eeb4f16812413efb" +git-tree-sha1 = "e46675dbc095ddfdf2b5fba247d5a25f34e1f8a2" uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" -version = "1.6.0" +version = "1.6.1" weakdeps = ["SparseArrays"] [deps.ArrayLayouts.extensions] @@ -213,9 +213,9 @@ version = "0.2.4" [[deps.CSV]] deps = ["CodecZlib", "Dates", "FilePathsBase", "InlineStrings", "Mmap", "Parsers", "PooledArrays", "PrecompileTools", "SentinelArrays", "Tables", "Unicode", "WeakRefStrings", "WorkerUtilities"] -git-tree-sha1 = "679e69c611fff422038e9e21e270c4197d49d918" +git-tree-sha1 = "a44910ceb69b0d44fe262dd451ab11ead3ed0be8" uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" -version = "0.10.12" +version = "0.10.13" [[deps.CUDA]] deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LLVMLoopInfo", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "Statistics", "UnsafeAtomicsLLVM"] @@ -266,9 +266,9 @@ version = "1.63.0" [[deps.ChainRulesCore]] deps = ["Compat", "LinearAlgebra"] -git-tree-sha1 = "aef70bb349b20aa81a82a19704c3ef339d4ee494" +git-tree-sha1 = "575cd02e080939a33b6df6c5853d14924c08e35b" uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.22.1" +version = "1.23.0" weakdeps = ["SparseArrays"] [deps.ChainRulesCore.extensions] @@ -276,8 +276,8 @@ weakdeps = ["SparseArrays"] [[deps.ClimaAtmos]] deps = ["Adapt", "ArgParse", "ArtifactWrappers", "Artifacts", "AtmosphericProfilesLibrary", "CLIMAParameters", "ClimaComms", "ClimaCore", "ClimaTimeSteppers", "CloudMicrophysics", "Colors", "Dates", "Dierckx", "DiffEqBase", "DiffEqCallbacks", "DocStringExtensions", "FastGaussQuadrature", "Insolation", "Interpolations", "IntervalSets", "Krylov", "LinearAlgebra", "Logging", "NCDatasets", "NVTX", "Pkg", "Printf", "RRTMGP", "Random", "RootSolvers", "SciMLBase", "StaticArrays", "Statistics", "StatsBase", "SurfaceFluxes", "Thermodynamics", "YAML"] -git-tree-sha1 = "7cc34a68cd660aa134d8e8a49c9c8dcd2e556ba7" -repo-rev = "main" +git-tree-sha1 = "fce749aa788c7ccb5a2c7d812d02abc7cc327d6b" +repo-rev = "fdf1df4" repo-url = "https://github.com/CliMA/ClimaAtmos.jl.git" uuid = "b2c96348-7fb7-4fe0-8da9-78d88439e717" version = "0.21.0" @@ -300,15 +300,15 @@ weakdeps = ["Krylov"] [[deps.ClimaCorePlots]] deps = ["ClimaCore", "RecipesBase", "StaticArrays", "TriplotBase"] -git-tree-sha1 = "e86fd9242e89b526c9fb29e05db3071ce64e3a8e" +git-tree-sha1 = "ded3e0f3e7069f7c807f7b56caff232921bc2f5f" uuid = "cf7c7e5a-b407-4c48-9047-11a94a308626" -version = "0.2.7" +version = "0.2.8" [[deps.ClimaCoreTempestRemap]] deps = ["ClimaComms", "ClimaCore", "CommonDataModel", "Dates", "LinearAlgebra", "NCDatasets", "PkgVersion", "TempestRemap_jll"] -git-tree-sha1 = "2267e018c34f44fa8300b8d550d59f3eecef6094" +git-tree-sha1 = "ac11cc8ad2c043ab753d6888c224c7e2f35f42c0" uuid = "d934ef94-cdd4-4710-83d6-720549644b70" -version = "0.3.13" +version = "0.3.14" [[deps.ClimaCoupler]] deps = ["CLIMAParameters", "ClimaAtmos", "ClimaComms", "ClimaCore", "ClimaCoreTempestRemap", "ClimaLand", "Dates", "DocStringExtensions", "Insolation", "JLD2", "NCDatasets", "Plots", "SciMLBase", "StaticArrays", "Statistics", "SurfaceFluxes", "TempestRemap_jll", "Thermodynamics"] @@ -328,9 +328,9 @@ weakdeps = ["CLIMAParameters"] [[deps.ClimaTimeSteppers]] deps = ["ClimaComms", "Colors", "DataStructures", "DiffEqBase", "DiffEqCallbacks", "KernelAbstractions", "Krylov", "LinearAlgebra", "LinearOperators", "NVTX", "SciMLBase", "StaticArrays"] -git-tree-sha1 = "96bbba6d14467a2b9512ba0a536395350bb361ff" +git-tree-sha1 = "9c203f39784c968700c55f555754a7771b3410df" uuid = "595c0a79-7f3d-439a-bc5a-b232dc3bde79" -version = "0.7.17" +version = "0.7.19" [[deps.CloseOpenIntervals]] deps = ["Static", "StaticArrayInterface"] @@ -479,9 +479,9 @@ version = "1.6.1" [[deps.DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "1fb174f0d48fe7d142e1109a10636bc1d14f5ac2" +git-tree-sha1 = "0f4b5d62a88d8f59003e43c25a8a90de9eb76317" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.17" +version = "0.18.18" [[deps.DataValueInterfaces]] git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" @@ -517,9 +517,9 @@ version = "0.1.0+0" [[deps.DiffEqBase]] deps = ["ArrayInterface", "DataStructures", "DocStringExtensions", "EnumX", "EnzymeCore", "FastBroadcast", "ForwardDiff", "FunctionWrappers", "FunctionWrappersWrappers", "LinearAlgebra", "Logging", "Markdown", "MuladdMacro", "Parameters", "PreallocationTools", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "Setfield", "SparseArrays", "Static", "StaticArraysCore", "Statistics", "Tricks", "TruncatedStacktraces"] -git-tree-sha1 = "2ad3a2dcd5f28f535aa884d199cc2f0a9d335729" +git-tree-sha1 = "aee5798bd4f1ed1260bd1741221f9589e5ee8a90" uuid = "2b5f629d-d688-5b77-993f-72d75c75574e" -version = "6.147.1" +version = "6.147.3" [deps.DiffEqBase.extensions] DiffEqBaseChainRulesCoreExt = "ChainRulesCore" @@ -610,9 +610,9 @@ uuid = "4e289a0a-7415-4d19-859d-a7e5c4648b56" version = "1.0.4" [[deps.EnzymeCore]] -git-tree-sha1 = "59c44d8fbc651c0395d8a6eda64b05ce316f58b4" +git-tree-sha1 = "496c5455d6a61c2a6f2233ce07c1fcdbe4995ab6" uuid = "f151be2c-9106-41f4-ab19-57ee4f262869" -version = "0.6.5" +version = "0.7.0" weakdeps = ["Adapt"] [deps.EnzymeCore.extensions] @@ -771,11 +771,10 @@ git-tree-sha1 = "21efd19106a55620a188615da6d3d06cd7f6ee03" uuid = "a3f928ae-7b40-5064-980b-68af3947d34b" version = "2.13.93+0" -[[deps.Formatting]] -deps = ["Logging", "Printf"] -git-tree-sha1 = "fb409abab2caf118986fc597ba84b50cbaf00b87" -uuid = "59287772-0a20-5a39-b81b-1366585eb4c0" -version = "0.4.3" +[[deps.Format]] +git-tree-sha1 = "f3cf88025f6d03c194d73f5d13fee9004a108329" +uuid = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8" +version = "1.3.6" [[deps.ForwardDiff]] deps = ["CommonSubexpressions", "DiffResults", "DiffRules", "LinearAlgebra", "LogExpFunctions", "NaNMath", "Preferences", "Printf", "Random", "SpecialFunctions"] @@ -924,9 +923,9 @@ version = "1.14.3+1" [[deps.HTTP]] deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"] -git-tree-sha1 = "ac7b73d562b8f4287c3b67b4c66a5395a19c1ae8" +git-tree-sha1 = "db864f2d91f68a5912937af80327d288ea1f3aee" uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" -version = "1.10.2" +version = "1.10.3" [[deps.HarfBuzz_jll]] deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "Graphite2_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Pkg"] @@ -1062,9 +1061,9 @@ version = "0.2.4" [[deps.KernelAbstractions]] deps = ["Adapt", "Atomix", "InteractiveUtils", "LinearAlgebra", "MacroTools", "PrecompileTools", "Requires", "SparseArrays", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] -git-tree-sha1 = "c7753cc3febe006708ce6798482004241f7d890b" +git-tree-sha1 = "ed7167240f40e62d97c1f5f7735dea6de3cc5c49" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" -version = "0.9.17" +version = "0.9.18" weakdeps = ["EnzymeCore"] [deps.KernelAbstractions.extensions] @@ -1148,10 +1147,10 @@ uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" version = "1.3.1" [[deps.Latexify]] -deps = ["Formatting", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "OrderedCollections", "Printf", "Requires"] -git-tree-sha1 = "f428ae552340899a935973270b8d98e5a31c49fe" +deps = ["Format", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "OrderedCollections", "Requires"] +git-tree-sha1 = "cad560042a7cc108f5a4c24ea1431a9221f22c1b" uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316" -version = "0.16.1" +version = "0.16.2" [deps.Latexify.extensions] DataFramesExt = "DataFrames" @@ -1433,20 +1432,18 @@ version = "4.5.1" [[deps.NNlib]] deps = ["Adapt", "Atomix", "ChainRulesCore", "GPUArraysCore", "KernelAbstractions", "LinearAlgebra", "Pkg", "Random", "Requires", "Statistics"] -git-tree-sha1 = "877f15c331337d54cf24c797d5bcb2e48ce21221" +git-tree-sha1 = "6e4e90c2e2ef091ef50b91af65fa4bb09c3d0728" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" -version = "0.9.12" +version = "0.9.6" [deps.NNlib.extensions] NNlibAMDGPUExt = "AMDGPU" NNlibCUDACUDNNExt = ["CUDA", "cuDNN"] NNlibCUDAExt = "CUDA" - NNlibEnzymeCoreExt = "EnzymeCore" [deps.NNlib.weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" - EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd" [[deps.NVTX]] @@ -1534,9 +1531,9 @@ version = "4.1.6+0" [[deps.OpenSSL]] deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "OpenSSL_jll", "Sockets"] -git-tree-sha1 = "51901a49222b09e3743c65b8847687ae5fc78eb2" +git-tree-sha1 = "af81a32750ebc831ee28bdaaba6e1067decef51e" uuid = "4d8831e6-92b7-49fb-bdf8-b643e874388c" -version = "1.4.1" +version = "1.4.2" [[deps.OpenSSL_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -1620,9 +1617,9 @@ version = "3.1.0" [[deps.PlotUtils]] deps = ["ColorSchemes", "Colors", "Dates", "PrecompileTools", "Printf", "Random", "Reexport", "Statistics"] -git-tree-sha1 = "862942baf5663da528f66d24996eb6da85218e76" +git-tree-sha1 = "7b1a9df27f072ac4c9c7cbe5efb198489258d1f5" uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" -version = "1.4.0" +version = "1.4.1" [[deps.Plots]] deps = ["Base64", "Contour", "Dates", "Downloads", "FFMPEG", "FixedPointNumbers", "GR", "JLFzf", "JSON", "LaTeXStrings", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "PrecompileTools", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "RelocatableFolders", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs", "UnicodeFun", "UnitfulLatexify", "Unzip"] @@ -1682,9 +1679,9 @@ version = "1.2.0" [[deps.Preferences]] deps = ["TOML"] -git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e" +git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.4.1" +version = "1.4.3" [[deps.PrettyPrint]] git-tree-sha1 = "632eb4abab3449ab30c5e1afaa874f0b98b586e4" @@ -2142,9 +2139,9 @@ uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" version = "0.5.23" [[deps.TranscodingStreams]] -git-tree-sha1 = "54194d92959d8ebaa8e26227dbe3cdefcdcd594f" +git-tree-sha1 = "3caa21522e7efac1ba21834a03734c57b4611c7e" uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.10.3" +version = "0.10.4" weakdeps = ["Random", "Test"] [deps.TranscodingStreams.extensions] From 9fec2659d781e52dcb04abd4f3c356f1b7988ef9 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Wed, 6 Mar 2024 13:46:46 -0800 Subject: [PATCH 04/28] don't use vert_diff: true [skip ci] --- config/gpu_configs/gpu_amip_chap.yml | 1 - config/gpu_configs/gpu_amip_chap_2process.yml | 1 - config/gpu_configs/gpu_amip_chap_4process.yml | 1 - config/gpu_configs/gpu_amip_chap_ws.yml | 1 - config/gpu_configs/gpu_amip_chap_ws_2process.yml | 1 - config/gpu_configs/gpu_amip_chap_ws_4process.yml | 1 - 6 files changed, 6 deletions(-) diff --git a/config/gpu_configs/gpu_amip_chap.yml b/config/gpu_configs/gpu_amip_chap.yml index c0a122f78a..099141f9ea 100644 --- a/config/gpu_configs/gpu_amip_chap.yml +++ b/config/gpu_configs/gpu_amip_chap.yml @@ -19,4 +19,3 @@ start_date: "19790301" surface_setup: "PrescribedSurface" t_end: "1days" turb_flux_partition: "CombinedStateFluxes" -vert_diff: "true" diff --git a/config/gpu_configs/gpu_amip_chap_2process.yml b/config/gpu_configs/gpu_amip_chap_2process.yml index bb8075fae1..1b33937817 100644 --- a/config/gpu_configs/gpu_amip_chap_2process.yml +++ b/config/gpu_configs/gpu_amip_chap_2process.yml @@ -19,4 +19,3 @@ start_date: "19790301" surface_setup: "PrescribedSurface" t_end: "1days" turb_flux_partition: "CombinedStateFluxes" -vert_diff: "true" diff --git a/config/gpu_configs/gpu_amip_chap_4process.yml b/config/gpu_configs/gpu_amip_chap_4process.yml index 000bf8e85d..2d95d784e6 100644 --- a/config/gpu_configs/gpu_amip_chap_4process.yml +++ b/config/gpu_configs/gpu_amip_chap_4process.yml @@ -19,4 +19,3 @@ start_date: "19790301" surface_setup: "PrescribedSurface" t_end: "1days" turb_flux_partition: "CombinedStateFluxes" -vert_diff: "true" diff --git a/config/gpu_configs/gpu_amip_chap_ws.yml b/config/gpu_configs/gpu_amip_chap_ws.yml index b665a49223..bcb7a54060 100644 --- a/config/gpu_configs/gpu_amip_chap_ws.yml +++ b/config/gpu_configs/gpu_amip_chap_ws.yml @@ -19,4 +19,3 @@ start_date: "19790301" surface_setup: "PrescribedSurface" t_end: "1days" turb_flux_partition: "CombinedStateFluxes" -vert_diff: "true" diff --git a/config/gpu_configs/gpu_amip_chap_ws_2process.yml b/config/gpu_configs/gpu_amip_chap_ws_2process.yml index 99e81bcd2e..5641cef39e 100644 --- a/config/gpu_configs/gpu_amip_chap_ws_2process.yml +++ b/config/gpu_configs/gpu_amip_chap_ws_2process.yml @@ -19,4 +19,3 @@ start_date: "19790301" surface_setup: "PrescribedSurface" t_end: "1days" turb_flux_partition: "CombinedStateFluxes" -vert_diff: "true" diff --git a/config/gpu_configs/gpu_amip_chap_ws_4process.yml b/config/gpu_configs/gpu_amip_chap_ws_4process.yml index a22e858bf6..0453144643 100644 --- a/config/gpu_configs/gpu_amip_chap_ws_4process.yml +++ b/config/gpu_configs/gpu_amip_chap_ws_4process.yml @@ -19,4 +19,3 @@ start_date: "19790301" surface_setup: "PrescribedSurface" t_end: "1days" turb_flux_partition: "CombinedStateFluxes" -vert_diff: "true" From 416b053b8beaa165a62f56b51c991341956b2572 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Wed, 6 Mar 2024 14:30:00 -0800 Subject: [PATCH 05/28] use correct driver [skip ci] --- .buildkite/gpu/pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index 694eb2628c..ca141cdbe3 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -76,7 +76,7 @@ steps: command: - > srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/hybrid/driver.jl + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml artifact_paths: "gpu_amip_chap_4process/*" agents: @@ -124,7 +124,7 @@ steps: command: - > srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/hybrid/driver.jl + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml artifact_paths: "gpu_amip_chap_ws_4process/*" agents: From 9d6991313f34ffaf19dc01c017a0e9cb115b88cc Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Wed, 6 Mar 2024 16:32:05 -0800 Subject: [PATCH 06/28] wait after each job [skip ci] --- .buildkite/gpu/pipeline.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index ca141cdbe3..99f5167b3f 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -55,6 +55,7 @@ steps: slurm_ntasks: 1 slurm_mem: 32G slurm_exclusive: + - wait - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" key: "gpu_amip_chap_2process" @@ -70,6 +71,7 @@ steps: slurm_ntasks: 2 slurm_mem: 32G slurm_exclusive: + - wait - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" key: "gpu_amip_chap_4process" @@ -85,6 +87,8 @@ steps: slurm_ntasks: 4 slurm_mem: 32G slurm_exclusive: + - wait + - wait - group: "CHAP GPU weak scaling" steps: @@ -102,6 +106,7 @@ steps: slurm_ntasks: 1 slurm_mem: 32G slurm_exclusive: + - wait - label: "GPU AMIP CHAP - weak scaling - 2 GPUs" key: "gpu_amip_chap_ws_2process" @@ -118,6 +123,7 @@ steps: slurm_mem: 32G slurm_time: 8:00:00 slurm_exclusive: + - wait - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" key: "gpu_amip_chap_ws_4process" @@ -134,3 +140,4 @@ steps: slurm_mem: 32G slurm_time: 8:00:00 slurm_exclusive: + - wait From 50c45ccbd6360adacd0dd120b8decc9a53608411 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Wed, 6 Mar 2024 22:30:26 -0800 Subject: [PATCH 07/28] weak scaling only [skip ci] --- .buildkite/gpu/pipeline.yml | 94 ++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index 99f5167b3f..c5ddb579fc 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -39,56 +39,56 @@ steps: - wait - - group: "CHAP GPU strong scaling" - steps: + # - group: "CHAP GPU strong scaling" + # steps: - - label: "GPU AMIP CHAP - strong scaling - 1 GPU" - key: "gpu_amip_chap" - command: - - > - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml - artifact_paths: "gpu_amip_chap/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 1 - slurm_mem: 32G - slurm_exclusive: - - wait + # - label: "GPU AMIP CHAP - strong scaling - 1 GPU" + # key: "gpu_amip_chap" + # command: + # - > + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml + # artifact_paths: "gpu_amip_chap/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 1 + # slurm_mem: 32G + # slurm_exclusive: + # - wait - - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" - key: "gpu_amip_chap_2process" - command: - - > - srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml - artifact_paths: "gpu_amip_chap_2process/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 2 - slurm_mem: 32G - slurm_exclusive: - - wait + # - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" + # key: "gpu_amip_chap_2process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml + # artifact_paths: "gpu_amip_chap_2process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 2 + # slurm_mem: 32G + # slurm_exclusive: + # - wait - - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" - key: "gpu_amip_chap_4process" - command: - - > - srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml - artifact_paths: "gpu_amip_chap_4process/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 4 - slurm_mem: 32G - slurm_exclusive: - - wait - - wait + # - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" + # key: "gpu_amip_chap_4process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml + # artifact_paths: "gpu_amip_chap_4process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 4 + # slurm_mem: 32G + # slurm_exclusive: + # - wait + # - wait - group: "CHAP GPU weak scaling" steps: From 9e0df961838d2a254f573c8ec52cfc31039a4058 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Thu, 7 Mar 2024 09:55:41 -0800 Subject: [PATCH 08/28] add barrier --- experiments/AMIP/coupler_driver.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/experiments/AMIP/coupler_driver.jl b/experiments/AMIP/coupler_driver.jl index a4aea6f38f..0962c58a1d 100644 --- a/experiments/AMIP/coupler_driver.jl +++ b/experiments/AMIP/coupler_driver.jl @@ -553,6 +553,7 @@ elseif config_dict["turb_flux_partition"] == "CombinedStateFluxes" else error("turb_flux_partition must be either PartitionedStateFluxes or CombinedStateFluxes") end +ClimaComms.barrier(comms_ctx) # 1) coupler combines surface states and calculates rho_sfc using surface and atmos variables update_surface_fractions!(cs) From 9d994e4d1eb1e0bc28dcd31b56631eea85d66c51 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Thu, 7 Mar 2024 09:56:00 -0800 Subject: [PATCH 09/28] strong scaling only [skip ci] --- .buildkite/gpu/pipeline.yml | 132 ++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index c5ddb579fc..bc831f3362 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -39,67 +39,16 @@ steps: - wait - # - group: "CHAP GPU strong scaling" - # steps: - - # - label: "GPU AMIP CHAP - strong scaling - 1 GPU" - # key: "gpu_amip_chap" - # command: - # - > - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml - # artifact_paths: "gpu_amip_chap/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 1 - # slurm_mem: 32G - # slurm_exclusive: - # - wait - - # - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" - # key: "gpu_amip_chap_2process" - # command: - # - > - # srun --cpu-bind=threads --cpus-per-task=4 - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml - # artifact_paths: "gpu_amip_chap_2process/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 2 - # slurm_mem: 32G - # slurm_exclusive: - # - wait - - # - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" - # key: "gpu_amip_chap_4process" - # command: - # - > - # srun --cpu-bind=threads --cpus-per-task=4 - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml - # artifact_paths: "gpu_amip_chap_4process/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 4 - # slurm_mem: 32G - # slurm_exclusive: - # - wait - # - wait - - - group: "CHAP GPU weak scaling" + - group: "CHAP GPU strong scaling" steps: - - label: "GPU AMIP CHAP - weak scaling - 1 GPU" - key: "gpu_amip_chap_ws" + - label: "GPU AMIP CHAP - strong scaling - 1 GPU" + key: "gpu_amip_chap" command: - > julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws.yml - artifact_paths: "gpu_amip_chap_ws/*" + --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml + artifact_paths: "gpu_amip_chap/*" agents: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 @@ -108,36 +57,87 @@ steps: slurm_exclusive: - wait - - label: "GPU AMIP CHAP - weak scaling - 2 GPUs" - key: "gpu_amip_chap_ws_2process" + - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" + key: "gpu_amip_chap_2process" command: - > srun --cpu-bind=threads --cpus-per-task=4 julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_2process.yml - artifact_paths: "gpu_amip_chap_ws_2process/*" + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml + artifact_paths: "gpu_amip_chap_2process/*" agents: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 slurm_ntasks: 2 slurm_mem: 32G - slurm_time: 8:00:00 slurm_exclusive: - wait - - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" - key: "gpu_amip_chap_ws_4process" + - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" + key: "gpu_amip_chap_4process" command: - > srun --cpu-bind=threads --cpus-per-task=4 julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml - artifact_paths: "gpu_amip_chap_ws_4process/*" + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml + artifact_paths: "gpu_amip_chap_4process/*" agents: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 slurm_ntasks: 4 slurm_mem: 32G - slurm_time: 8:00:00 slurm_exclusive: - wait + # - wait + + # - group: "CHAP GPU weak scaling" + # steps: + + # - label: "GPU AMIP CHAP - weak scaling - 1 GPU" + # key: "gpu_amip_chap_ws" + # command: + # - > + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws.yml + # artifact_paths: "gpu_amip_chap_ws/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 1 + # slurm_mem: 32G + # slurm_exclusive: + # - wait + + # - label: "GPU AMIP CHAP - weak scaling - 2 GPUs" + # key: "gpu_amip_chap_ws_2process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_2process.yml + # artifact_paths: "gpu_amip_chap_ws_2process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 2 + # slurm_mem: 32G + # slurm_time: 8:00:00 + # slurm_exclusive: + # - wait + + # - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" + # key: "gpu_amip_chap_ws_4process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml + # artifact_paths: "gpu_amip_chap_ws_4process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 4 + # slurm_mem: 32G + # slurm_time: 8:00:00 + # slurm_exclusive: + # - wait From 686f44ca8908e0f957cdb314afede5dfeb425d4b Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Thu, 7 Mar 2024 17:07:26 -0800 Subject: [PATCH 10/28] no waits; update ws resolutions [skip ci] --- .buildkite/gpu/pipeline.yml | 99 +++++++++---------- config/gpu_configs/gpu_amip_chap_ws.yml | 1 + .../gpu_configs/gpu_amip_chap_ws_2process.yml | 1 + .../gpu_configs/gpu_amip_chap_ws_4process.yml | 1 + 4 files changed, 49 insertions(+), 53 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index bc831f3362..1543377c2c 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -11,7 +11,7 @@ env: OPENBLAS_NUM_THREADS: 1 OMPI_MCA_opal_warn_on_missing_libcuda: 0 SLURM_KILL_BAD_EXIT: 1 - SLURM_GPU_BIND: none # https://github.com/open-mpi/ompi/issues/11949#issuecomment-1737712291 + SLURM_GRES_FLAGS: "allow-task-sharing" GPU_CONFIG_PATH: "config/gpu_configs" CLIMAATMOS_GC_NSTEPS: 10 @@ -55,7 +55,6 @@ steps: slurm_ntasks: 1 slurm_mem: 32G slurm_exclusive: - - wait - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" key: "gpu_amip_chap_2process" @@ -71,7 +70,6 @@ steps: slurm_ntasks: 2 slurm_mem: 32G slurm_exclusive: - - wait - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" key: "gpu_amip_chap_4process" @@ -87,57 +85,52 @@ steps: slurm_ntasks: 4 slurm_mem: 32G slurm_exclusive: - - wait - # - wait - # - group: "CHAP GPU weak scaling" - # steps: + - group: "CHAP GPU weak scaling" + steps: - # - label: "GPU AMIP CHAP - weak scaling - 1 GPU" - # key: "gpu_amip_chap_ws" - # command: - # - > - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws.yml - # artifact_paths: "gpu_amip_chap_ws/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 1 - # slurm_mem: 32G - # slurm_exclusive: - # - wait + - label: "GPU AMIP CHAP - weak scaling - 1 GPU" + key: "gpu_amip_chap_ws" + command: + - > + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws.yml + artifact_paths: "gpu_amip_chap_ws/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 1 + slurm_mem: 32G + slurm_exclusive: - # - label: "GPU AMIP CHAP - weak scaling - 2 GPUs" - # key: "gpu_amip_chap_ws_2process" - # command: - # - > - # srun --cpu-bind=threads --cpus-per-task=4 - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_2process.yml - # artifact_paths: "gpu_amip_chap_ws_2process/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 2 - # slurm_mem: 32G - # slurm_time: 8:00:00 - # slurm_exclusive: - # - wait + - label: "GPU AMIP CHAP - weak scaling - 2 GPUs" + key: "gpu_amip_chap_ws_2process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_2process.yml + artifact_paths: "gpu_amip_chap_ws_2process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 32G + slurm_time: 8:00:00 + slurm_exclusive: - # - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" - # key: "gpu_amip_chap_ws_4process" - # command: - # - > - # srun --cpu-bind=threads --cpus-per-task=4 - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml - # artifact_paths: "gpu_amip_chap_ws_4process/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 4 - # slurm_mem: 32G - # slurm_time: 8:00:00 - # slurm_exclusive: - # - wait + - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" + key: "gpu_amip_chap_ws_4process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml + artifact_paths: "gpu_amip_chap_ws_4process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 4 + slurm_mem: 32G + slurm_time: 8:00:00 + slurm_exclusive: diff --git a/config/gpu_configs/gpu_amip_chap_ws.yml b/config/gpu_configs/gpu_amip_chap_ws.yml index bcb7a54060..f76cf72963 100644 --- a/config/gpu_configs/gpu_amip_chap_ws.yml +++ b/config/gpu_configs/gpu_amip_chap_ws.yml @@ -9,6 +9,7 @@ dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false +h_elem: 42 hourly_checkpoint: false job_id: "gpu_amip_chap_ws" land_albedo_type: "map_static" diff --git a/config/gpu_configs/gpu_amip_chap_ws_2process.yml b/config/gpu_configs/gpu_amip_chap_ws_2process.yml index 5641cef39e..5f65d1ac15 100644 --- a/config/gpu_configs/gpu_amip_chap_ws_2process.yml +++ b/config/gpu_configs/gpu_amip_chap_ws_2process.yml @@ -9,6 +9,7 @@ dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false +h_elem: 60 hourly_checkpoint: false job_id: "gpu_amip_chap_ws_2process" land_albedo_type: "map_static" diff --git a/config/gpu_configs/gpu_amip_chap_ws_4process.yml b/config/gpu_configs/gpu_amip_chap_ws_4process.yml index 0453144643..45443df254 100644 --- a/config/gpu_configs/gpu_amip_chap_ws_4process.yml +++ b/config/gpu_configs/gpu_amip_chap_ws_4process.yml @@ -9,6 +9,7 @@ dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false +h_elem: 86 hourly_checkpoint: false job_id: "gpu_amip_chap_ws_4process" land_albedo_type: "map_static" From 33811a66ee7324e24e00cfc992a76e4e0db4f09f Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 8 Mar 2024 10:11:58 -0800 Subject: [PATCH 11/28] decrease ws 1 GPU dt [skip ci] --- .buildkite/gpu/pipeline.yml | 146 ++++++++++++------------ config/gpu_configs/gpu_amip_chap_ws.yml | 4 +- 2 files changed, 75 insertions(+), 75 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index 1543377c2c..e701a6e08c 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -39,52 +39,52 @@ steps: - wait - - group: "CHAP GPU strong scaling" - steps: + # - group: "CHAP GPU strong scaling" + # steps: - - label: "GPU AMIP CHAP - strong scaling - 1 GPU" - key: "gpu_amip_chap" - command: - - > - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml - artifact_paths: "gpu_amip_chap/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 1 - slurm_mem: 32G - slurm_exclusive: + # - label: "GPU AMIP CHAP - strong scaling - 1 GPU" + # key: "gpu_amip_chap" + # command: + # - > + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml + # artifact_paths: "gpu_amip_chap/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 1 + # slurm_mem: 32G + # slurm_exclusive: - - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" - key: "gpu_amip_chap_2process" - command: - - > - srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml - artifact_paths: "gpu_amip_chap_2process/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 2 - slurm_mem: 32G - slurm_exclusive: + # - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" + # key: "gpu_amip_chap_2process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml + # artifact_paths: "gpu_amip_chap_2process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 2 + # slurm_mem: 32G + # slurm_exclusive: - - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" - key: "gpu_amip_chap_4process" - command: - - > - srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml - artifact_paths: "gpu_amip_chap_4process/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 4 - slurm_mem: 32G - slurm_exclusive: + # - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" + # key: "gpu_amip_chap_4process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml + # artifact_paths: "gpu_amip_chap_4process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 4 + # slurm_mem: 32G + # slurm_exclusive: - group: "CHAP GPU weak scaling" steps: @@ -103,34 +103,34 @@ steps: slurm_mem: 32G slurm_exclusive: - - label: "GPU AMIP CHAP - weak scaling - 2 GPUs" - key: "gpu_amip_chap_ws_2process" - command: - - > - srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_2process.yml - artifact_paths: "gpu_amip_chap_ws_2process/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 2 - slurm_mem: 32G - slurm_time: 8:00:00 - slurm_exclusive: + # - label: "GPU AMIP CHAP - weak scaling - 2 GPUs" + # key: "gpu_amip_chap_ws_2process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_2process.yml + # artifact_paths: "gpu_amip_chap_ws_2process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 2 + # slurm_mem: 32G + # slurm_time: 8:00:00 + # slurm_exclusive: - - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" - key: "gpu_amip_chap_ws_4process" - command: - - > - srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml - artifact_paths: "gpu_amip_chap_ws_4process/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 4 - slurm_mem: 32G - slurm_time: 8:00:00 - slurm_exclusive: + # - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" + # key: "gpu_amip_chap_ws_4process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml + # artifact_paths: "gpu_amip_chap_ws_4process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 4 + # slurm_mem: 32G + # slurm_time: 8:00:00 + # slurm_exclusive: diff --git a/config/gpu_configs/gpu_amip_chap_ws.yml b/config/gpu_configs/gpu_amip_chap_ws.yml index f76cf72963..d22cad8259 100644 --- a/config/gpu_configs/gpu_amip_chap_ws.yml +++ b/config/gpu_configs/gpu_amip_chap_ws.yml @@ -1,9 +1,9 @@ anim: false apply_limiter: false atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap_ws_1process.yml" -dt: "100secs" +dt: "50secs" dt_cloud_fraction: "1hours" -dt_cpl: 100 +dt_cpl: 50 dt_rad: "1hours" dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" From 91be5d30a416f71fa17a05981503de71a9d805e7 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 8 Mar 2024 11:24:35 -0800 Subject: [PATCH 12/28] show surface fractions [skip ci] --- .buildkite/gpu/pipeline.yml | 60 ++++++++++++++++++------------------- src/Regridder.jl | 3 ++ 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index e701a6e08c..b2a7fb6922 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -39,8 +39,8 @@ steps: - wait - # - group: "CHAP GPU strong scaling" - # steps: + - group: "CHAP GPU strong scaling" + steps: # - label: "GPU AMIP CHAP - strong scaling - 1 GPU" # key: "gpu_amip_chap" @@ -56,53 +56,53 @@ steps: # slurm_mem: 32G # slurm_exclusive: - # - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" - # key: "gpu_amip_chap_2process" + - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" + key: "gpu_amip_chap_2process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml + artifact_paths: "gpu_amip_chap_2process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 32G + slurm_exclusive: + + # - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" + # key: "gpu_amip_chap_4process" # command: # - > # srun --cpu-bind=threads --cpus-per-task=4 # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml - # artifact_paths: "gpu_amip_chap_2process/*" + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml + # artifact_paths: "gpu_amip_chap_4process/*" # agents: # slurm_gpus_per_task: 1 # slurm_cpus_per_task: 4 - # slurm_ntasks: 2 + # slurm_ntasks: 4 # slurm_mem: 32G # slurm_exclusive: - # - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" - # key: "gpu_amip_chap_4process" + # - group: "CHAP GPU weak scaling" + # steps: + + # - label: "GPU AMIP CHAP - weak scaling - 1 GPU" + # key: "gpu_amip_chap_ws" # command: # - > - # srun --cpu-bind=threads --cpus-per-task=4 # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml - # artifact_paths: "gpu_amip_chap_4process/*" + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws.yml + # artifact_paths: "gpu_amip_chap_ws/*" # agents: # slurm_gpus_per_task: 1 # slurm_cpus_per_task: 4 - # slurm_ntasks: 4 + # slurm_ntasks: 1 # slurm_mem: 32G # slurm_exclusive: - - group: "CHAP GPU weak scaling" - steps: - - - label: "GPU AMIP CHAP - weak scaling - 1 GPU" - key: "gpu_amip_chap_ws" - command: - - > - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws.yml - artifact_paths: "gpu_amip_chap_ws/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 1 - slurm_mem: 32G - slurm_exclusive: - # - label: "GPU AMIP CHAP - weak scaling - 2 GPUs" # key: "gpu_amip_chap_ws_2process" # command: diff --git a/src/Regridder.jl b/src/Regridder.jl index ce166a9323..5b6556ed42 100644 --- a/src/Regridder.jl +++ b/src/Regridder.jl @@ -507,6 +507,9 @@ function update_surface_fractions!(cs::CoupledSimulation) cs.surface_fractions.ice .= max.(min.(ice_d, FT(1) .- land_s), FT(0)) cs.surface_fractions.ocean .= max.(FT(1) .- (cs.surface_fractions.ice .+ land_s), FT(0)) + @show cs.surface_fractions.ice + @show cs.surface_fractions.land + @show cs.surface_fractions.ocean @assert minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) @assert maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) From 7b521dbc03e86268014db07678530d533f6e5404 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 8 Mar 2024 11:53:21 -0800 Subject: [PATCH 13/28] show surface fraction sums [skip ci] --- src/Regridder.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Regridder.jl b/src/Regridder.jl index 5b6556ed42..5beebb5e99 100644 --- a/src/Regridder.jl +++ b/src/Regridder.jl @@ -510,6 +510,9 @@ function update_surface_fractions!(cs::CoupledSimulation) @show cs.surface_fractions.ice @show cs.surface_fractions.land @show cs.surface_fractions.ocean + @show cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean + @show minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) + @show maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) @assert minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) @assert maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) From ed2768d8572a80381fae12cfd3c0d32042b57ca0 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 8 Mar 2024 13:01:58 -0800 Subject: [PATCH 14/28] more barrier [skip ci] --- src/Regridder.jl | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/Regridder.jl b/src/Regridder.jl index 5beebb5e99..1737e7c789 100644 --- a/src/Regridder.jl +++ b/src/Regridder.jl @@ -507,12 +507,17 @@ function update_surface_fractions!(cs::CoupledSimulation) cs.surface_fractions.ice .= max.(min.(ice_d, FT(1) .- land_s), FT(0)) cs.surface_fractions.ocean .= max.(FT(1) .- (cs.surface_fractions.ice .+ land_s), FT(0)) - @show cs.surface_fractions.ice - @show cs.surface_fractions.land - @show cs.surface_fractions.ocean - @show cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean - @show minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) - @show maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) + # sf_sum = cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean + # if abs(minimum(sf_sum) - FT(1)) > eps(FT) || abs(maximum(sf_sum) - FT(1)) > eps(FT) + # @show cs.surface_fractions.ice + # @show cs.surface_fractions.land + # @show cs.surface_fractions.ocean + # @show cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean + # @show minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) + # @show maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) + # end + comms_ctx = axes(land_s).grid.topology.context + ClimaComms.barrier(comms_ctx) @assert minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) @assert maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) From 0b22c9ba0d155891f7828e3959b096ccf2c776b6 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 8 Mar 2024 14:05:27 -0800 Subject: [PATCH 15/28] add scaling plot [skip ci] --- experiments/AMIP/plot_sypd.jl | 43 +++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 experiments/AMIP/plot_sypd.jl diff --git a/experiments/AMIP/plot_sypd.jl b/experiments/AMIP/plot_sypd.jl new file mode 100644 index 0000000000..2033dc2e92 --- /dev/null +++ b/experiments/AMIP/plot_sypd.jl @@ -0,0 +1,43 @@ +using Plots + +num_gpus = [1, 2, 4] + +# h_elem: 42, 60, 84 +ws_sypd = [0.8, 1.2, 2.1] +ws_res = ["77 km" "55 km" "39 km"] + +# h_elem: 30 +ss_30_sypd = [1.0, 1.7, 3.2] +ss_30_res = 110 + +# h_elem: 60 +ss_60_sypd = [0.7, 1.5, 2.8] +ss_60_res = 55 + + +scatter(num_gpus, + ws_sypd, + title = "AMIP GPU scaling", + xlabel = "Number of GPUs", + ylabel = "Simulated years per day (SYPD)", + labels = "Weak Scaling [various res.]", + right_margin = 20Plots.mm, +) +annotate!(num_gpus[1] + 0.38, ws_sypd[1], text(ws_res[1], 1, :right, 10)) +annotate!(num_gpus[2] + 0.38, ws_sypd[2], text(ws_res[2], 1, :right, 10)) +annotate!(num_gpus[3] + 0.38, ws_sypd[3], text(ws_res[3], 1, :right, 10)) + +scatter!(num_gpus, + ss_30_sypd, + label = "Strong Scaling [$ss_30_res km]", + color = 3, +) +# annotate!(num_gpus[1] + 0.52, ss_sypd[1], text(ss_res[1], 1, :right, 10)) +# annotate!(num_gpus[2] + 0.52, ss_sypd[2], text(ss_res[2], 1, :right, 10)) +# annotate!(num_gpus[3] + 0.52, ss_sypd[3], text(ss_res[3], 1, :right, 10)) + +scatter!(num_gpus, + ss_60_sypd, + label = "Strong Scaling [$ss_60_res km]", + color = 4, +) From 1f675c1c05e645a92cdc6fefa90f574b132381a0 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 8 Mar 2024 14:54:20 -0800 Subject: [PATCH 16/28] ws 1 GPU h_elem 30 [skip ci] --- .buildkite/gpu/pipeline.yml | 61 ++++++++++++------------- config/gpu_configs/gpu_amip_chap_ws.yml | 6 +-- 2 files changed, 33 insertions(+), 34 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index b2a7fb6922..ee78539191 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -39,8 +39,8 @@ steps: - wait - - group: "CHAP GPU strong scaling" - steps: + # - group: "CHAP GPU strong scaling" + # steps: # - label: "GPU AMIP CHAP - strong scaling - 1 GPU" # key: "gpu_amip_chap" @@ -56,20 +56,20 @@ steps: # slurm_mem: 32G # slurm_exclusive: - - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" - key: "gpu_amip_chap_2process" - command: - - > - srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml - artifact_paths: "gpu_amip_chap_2process/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 2 - slurm_mem: 32G - slurm_exclusive: + # - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" + # key: "gpu_amip_chap_2process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml + # artifact_paths: "gpu_amip_chap_2process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 2 + # slurm_mem: 32G + # slurm_exclusive: # - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" # key: "gpu_amip_chap_4process" @@ -86,22 +86,21 @@ steps: # slurm_mem: 32G # slurm_exclusive: - # - group: "CHAP GPU weak scaling" - # steps: + - group: "CHAP GPU weak scaling" + steps: - # - label: "GPU AMIP CHAP - weak scaling - 1 GPU" - # key: "gpu_amip_chap_ws" - # command: - # - > - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws.yml - # artifact_paths: "gpu_amip_chap_ws/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 1 - # slurm_mem: 32G - # slurm_exclusive: + - label: "GPU AMIP CHAP - weak scaling - 1 GPU" + key: "gpu_amip_chap_ws" + command: + - > + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws.yml + artifact_paths: "gpu_amip_chap_ws/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 1 + slurm_mem: 32G + slurm_exclusive: # - label: "GPU AMIP CHAP - weak scaling - 2 GPUs" # key: "gpu_amip_chap_ws_2process" diff --git a/config/gpu_configs/gpu_amip_chap_ws.yml b/config/gpu_configs/gpu_amip_chap_ws.yml index d22cad8259..e24b359bd9 100644 --- a/config/gpu_configs/gpu_amip_chap_ws.yml +++ b/config/gpu_configs/gpu_amip_chap_ws.yml @@ -1,15 +1,15 @@ anim: false apply_limiter: false atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap_ws_1process.yml" -dt: "50secs" +dt: "100secs" dt_cloud_fraction: "1hours" -dt_cpl: 50 +dt_cpl: 100 dt_rad: "1hours" dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false -h_elem: 42 +h_elem: 30 hourly_checkpoint: false job_id: "gpu_amip_chap_ws" land_albedo_type: "map_static" From 9031045358c1d058f3105081a5958ee5c6ef4a5f Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 8 Mar 2024 15:37:37 -0800 Subject: [PATCH 17/28] 2 gpu ss sum before max [skip ci] --- .buildkite/gpu/pipeline.yml | 60 ++++++++++++++++++------------------- src/Regridder.jl | 25 ++++++++++------ 2 files changed, 46 insertions(+), 39 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index ee78539191..5007acbaa6 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -39,8 +39,8 @@ steps: - wait - # - group: "CHAP GPU strong scaling" - # steps: + - group: "CHAP GPU strong scaling" + steps: # - label: "GPU AMIP CHAP - strong scaling - 1 GPU" # key: "gpu_amip_chap" @@ -56,20 +56,20 @@ steps: # slurm_mem: 32G # slurm_exclusive: - # - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" - # key: "gpu_amip_chap_2process" - # command: - # - > - # srun --cpu-bind=threads --cpus-per-task=4 - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml - # artifact_paths: "gpu_amip_chap_2process/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 2 - # slurm_mem: 32G - # slurm_exclusive: + - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" + key: "gpu_amip_chap_2process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml + artifact_paths: "gpu_amip_chap_2process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 32G + slurm_exclusive: # - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" # key: "gpu_amip_chap_4process" @@ -86,21 +86,21 @@ steps: # slurm_mem: 32G # slurm_exclusive: - - group: "CHAP GPU weak scaling" - steps: + # - group: "CHAP GPU weak scaling" + # steps: - - label: "GPU AMIP CHAP - weak scaling - 1 GPU" - key: "gpu_amip_chap_ws" - command: - - > - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws.yml - artifact_paths: "gpu_amip_chap_ws/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 1 - slurm_mem: 32G - slurm_exclusive: + # - label: "GPU AMIP CHAP - weak scaling - 1 GPU" + # key: "gpu_amip_chap_ws" + # command: + # - > + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws.yml + # artifact_paths: "gpu_amip_chap_ws/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 1 + # slurm_mem: 32G + # slurm_exclusive: # - label: "GPU AMIP CHAP - weak scaling - 2 GPUs" # key: "gpu_amip_chap_ws_2process" diff --git a/src/Regridder.jl b/src/Regridder.jl index 1737e7c789..dbc8e91f34 100644 --- a/src/Regridder.jl +++ b/src/Regridder.jl @@ -507,17 +507,24 @@ function update_surface_fractions!(cs::CoupledSimulation) cs.surface_fractions.ice .= max.(min.(ice_d, FT(1) .- land_s), FT(0)) cs.surface_fractions.ocean .= max.(FT(1) .- (cs.surface_fractions.ice .+ land_s), FT(0)) - # sf_sum = cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean - # if abs(minimum(sf_sum) - FT(1)) > eps(FT) || abs(maximum(sf_sum) - FT(1)) > eps(FT) - # @show cs.surface_fractions.ice - # @show cs.surface_fractions.land - # @show cs.surface_fractions.ocean - # @show cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean - # @show minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) - # @show maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) - # end + sf_sum = cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean + if abs(minimum(sf_sum) - FT(1)) > eps(FT) || abs(maximum(sf_sum) - FT(1)) > eps(FT) + @show minimum(FT(1) .- (cs.surface_fractions.ice .+ land_s)) + @show maximum(FT(1) .- (cs.surface_fractions.ice .+ land_s)) + # @show cs.surface_fractions.ice + # @show cs.surface_fractions.land + # @show cs.surface_fractions.ocean + # @show cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean + # @show minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) + # @show maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) + end comms_ctx = axes(land_s).grid.topology.context ClimaComms.barrier(comms_ctx) + + @show minimum(cs.surface_fractions.ice) >= FT(0) + @show minimum(cs.surface_fractions.land) >= FT(0) + @show minimum(cs.surface_fractions.ocean) >= FT(0) + @assert minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) @assert maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) From 1d9206fe7a7a82860e8da42a0e11f2c87f94dfd0 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 8 Mar 2024 16:52:16 -0800 Subject: [PATCH 18/28] strong scaling only h_elem 30 [skip ci] --- .buildkite/gpu/pipeline.yml | 53 +++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index 5007acbaa6..07c0708d8c 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -42,19 +42,18 @@ steps: - group: "CHAP GPU strong scaling" steps: - # - label: "GPU AMIP CHAP - strong scaling - 1 GPU" - # key: "gpu_amip_chap" - # command: - # - > - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml - # artifact_paths: "gpu_amip_chap/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 1 - # slurm_mem: 32G - # slurm_exclusive: + - label: "GPU AMIP CHAP - strong scaling - 1 GPU" + key: "gpu_amip_chap" + command: + - > + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml + artifact_paths: "gpu_amip_chap/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 1 + slurm_mem: 32G - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" key: "gpu_amip_chap_2process" @@ -69,22 +68,20 @@ steps: slurm_cpus_per_task: 4 slurm_ntasks: 2 slurm_mem: 32G - slurm_exclusive: - # - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" - # key: "gpu_amip_chap_4process" - # command: - # - > - # srun --cpu-bind=threads --cpus-per-task=4 - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml - # artifact_paths: "gpu_amip_chap_4process/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 4 - # slurm_mem: 32G - # slurm_exclusive: + - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" + key: "gpu_amip_chap_4process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml + artifact_paths: "gpu_amip_chap_4process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 4 + slurm_mem: 32G # - group: "CHAP GPU weak scaling" # steps: From 885737d94bc1f15dcfd3cb47a0a6cc04f33f9d6f Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 8 Mar 2024 16:53:45 -0800 Subject: [PATCH 19/28] strong scaling h_elem 60, dt 50 --- config/gpu_configs/gpu_amip_chap.yml | 5 +++-- config/gpu_configs/gpu_amip_chap_2process.yml | 5 +++-- config/gpu_configs/gpu_amip_chap_4process.yml | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/config/gpu_configs/gpu_amip_chap.yml b/config/gpu_configs/gpu_amip_chap.yml index 099141f9ea..3b4ee9f0db 100644 --- a/config/gpu_configs/gpu_amip_chap.yml +++ b/config/gpu_configs/gpu_amip_chap.yml @@ -1,14 +1,15 @@ anim: false apply_limiter: false atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap.yml" -dt: "100secs" +dt: "50secs" dt_cloud_fraction: "1hours" -dt_cpl: 100 +dt_cpl: 50 dt_rad: "1hours" dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false +h_elem: 60 hourly_checkpoint: false job_id: "gpu_amip_chap" land_albedo_type: "map_static" diff --git a/config/gpu_configs/gpu_amip_chap_2process.yml b/config/gpu_configs/gpu_amip_chap_2process.yml index 1b33937817..a16827f88e 100644 --- a/config/gpu_configs/gpu_amip_chap_2process.yml +++ b/config/gpu_configs/gpu_amip_chap_2process.yml @@ -1,14 +1,15 @@ anim: false apply_limiter: false atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap_2process.yml" -dt: "100secs" +dt: "50secs" dt_cloud_fraction: "1hours" -dt_cpl: 100 +dt_cpl: 50 dt_rad: "1hours" dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false +h_elem: 60 hourly_checkpoint: false job_id: "gpu_amip_chap_2process" land_albedo_type: "map_static" diff --git a/config/gpu_configs/gpu_amip_chap_4process.yml b/config/gpu_configs/gpu_amip_chap_4process.yml index 2d95d784e6..14cc457e43 100644 --- a/config/gpu_configs/gpu_amip_chap_4process.yml +++ b/config/gpu_configs/gpu_amip_chap_4process.yml @@ -1,14 +1,15 @@ anim: false apply_limiter: false atmos_config_file: "config/gpu_configs/gpu_aquaplanet_chap_4process.yml" -dt: "100secs" +dt: "50secs" dt_cloud_fraction: "1hours" -dt_cpl: 100 +dt_cpl: 50 dt_rad: "1hours" dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false +h_elem: 60 hourly_checkpoint: false job_id: "gpu_amip_chap_4process" land_albedo_type: "map_static" From f77dfabe034540c438e083117212198413ff2501 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Sun, 10 Mar 2024 21:16:37 -0700 Subject: [PATCH 20/28] weak scaling h_elem 84 dt 50 --- .buildkite/gpu/pipeline.yml | 114 +++++++++--------- .../gpu_configs/gpu_amip_chap_ws_4process.yml | 2 +- src/Regridder.jl | 15 --- 3 files changed, 58 insertions(+), 73 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index 07c0708d8c..67fb964d8a 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -39,52 +39,52 @@ steps: - wait - - group: "CHAP GPU strong scaling" - steps: + # - group: "CHAP GPU strong scaling" + # steps: - - label: "GPU AMIP CHAP - strong scaling - 1 GPU" - key: "gpu_amip_chap" - command: - - > - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml - artifact_paths: "gpu_amip_chap/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 1 - slurm_mem: 32G + # - label: "GPU AMIP CHAP - strong scaling - 1 GPU" + # key: "gpu_amip_chap" + # command: + # - > + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml + # artifact_paths: "gpu_amip_chap/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 1 + # slurm_mem: 32G - - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" - key: "gpu_amip_chap_2process" - command: - - > - srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml - artifact_paths: "gpu_amip_chap_2process/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 2 - slurm_mem: 32G + # - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" + # key: "gpu_amip_chap_2process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml + # artifact_paths: "gpu_amip_chap_2process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 2 + # slurm_mem: 32G - - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" - key: "gpu_amip_chap_4process" - command: - - > - srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml - artifact_paths: "gpu_amip_chap_4process/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 4 - slurm_mem: 32G + # - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" + # key: "gpu_amip_chap_4process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml + # artifact_paths: "gpu_amip_chap_4process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 4 + # slurm_mem: 32G - # - group: "CHAP GPU weak scaling" - # steps: + - group: "CHAP GPU weak scaling" + steps: # - label: "GPU AMIP CHAP - weak scaling - 1 GPU" # key: "gpu_amip_chap_ws" @@ -115,18 +115,18 @@ steps: # slurm_time: 8:00:00 # slurm_exclusive: - # - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" - # key: "gpu_amip_chap_ws_4process" - # command: - # - > - # srun --cpu-bind=threads --cpus-per-task=4 - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml - # artifact_paths: "gpu_amip_chap_ws_4process/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 4 - # slurm_mem: 32G - # slurm_time: 8:00:00 - # slurm_exclusive: + - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" + key: "gpu_amip_chap_ws_4process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml + artifact_paths: "gpu_amip_chap_ws_4process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 4 + slurm_mem: 32G + slurm_time: 8:00:00 + slurm_exclusive: diff --git a/config/gpu_configs/gpu_amip_chap_ws_4process.yml b/config/gpu_configs/gpu_amip_chap_ws_4process.yml index 45443df254..8d7dc5ccad 100644 --- a/config/gpu_configs/gpu_amip_chap_ws_4process.yml +++ b/config/gpu_configs/gpu_amip_chap_ws_4process.yml @@ -9,7 +9,7 @@ dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false -h_elem: 86 +h_elem: 84 hourly_checkpoint: false job_id: "gpu_amip_chap_ws_4process" land_albedo_type: "map_static" diff --git a/src/Regridder.jl b/src/Regridder.jl index dbc8e91f34..64b78cb4d6 100644 --- a/src/Regridder.jl +++ b/src/Regridder.jl @@ -507,24 +507,9 @@ function update_surface_fractions!(cs::CoupledSimulation) cs.surface_fractions.ice .= max.(min.(ice_d, FT(1) .- land_s), FT(0)) cs.surface_fractions.ocean .= max.(FT(1) .- (cs.surface_fractions.ice .+ land_s), FT(0)) - sf_sum = cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean - if abs(minimum(sf_sum) - FT(1)) > eps(FT) || abs(maximum(sf_sum) - FT(1)) > eps(FT) - @show minimum(FT(1) .- (cs.surface_fractions.ice .+ land_s)) - @show maximum(FT(1) .- (cs.surface_fractions.ice .+ land_s)) - # @show cs.surface_fractions.ice - # @show cs.surface_fractions.land - # @show cs.surface_fractions.ocean - # @show cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean - # @show minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) - # @show maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) - end comms_ctx = axes(land_s).grid.topology.context ClimaComms.barrier(comms_ctx) - @show minimum(cs.surface_fractions.ice) >= FT(0) - @show minimum(cs.surface_fractions.land) >= FT(0) - @show minimum(cs.surface_fractions.ocean) >= FT(0) - @assert minimum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) @assert maximum(cs.surface_fractions.ice .+ cs.surface_fractions.land .+ cs.surface_fractions.ocean) ≈ FT(1) From dbbdfee91775b3d3aab1e12cbb5b85fef7920d34 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Mon, 11 Mar 2024 11:18:53 -0700 Subject: [PATCH 21/28] ss 1 gpu @ 60, 4 gpu @ 42 --- .buildkite/gpu/pipeline.yml | 88 +++++++++---------- config/gpu_configs/gpu_amip_chap_4process.yml | 2 +- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index 67fb964d8a..9f0ff0774a 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -39,21 +39,21 @@ steps: - wait - # - group: "CHAP GPU strong scaling" - # steps: + - group: "CHAP GPU strong scaling" + steps: - # - label: "GPU AMIP CHAP - strong scaling - 1 GPU" - # key: "gpu_amip_chap" - # command: - # - > - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml - # artifact_paths: "gpu_amip_chap/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 1 - # slurm_mem: 32G + - label: "GPU AMIP CHAP - strong scaling - 1 GPU" + key: "gpu_amip_chap" + command: + - > + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml + artifact_paths: "gpu_amip_chap/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 1 + slurm_mem: 32G # - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" # key: "gpu_amip_chap_2process" @@ -69,22 +69,22 @@ steps: # slurm_ntasks: 2 # slurm_mem: 32G - # - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" - # key: "gpu_amip_chap_4process" - # command: - # - > - # srun --cpu-bind=threads --cpus-per-task=4 - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml - # artifact_paths: "gpu_amip_chap_4process/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 4 - # slurm_mem: 32G + - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" + key: "gpu_amip_chap_4process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml + artifact_paths: "gpu_amip_chap_4process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 4 + slurm_mem: 32G - - group: "CHAP GPU weak scaling" - steps: + # - group: "CHAP GPU weak scaling" + # steps: # - label: "GPU AMIP CHAP - weak scaling - 1 GPU" # key: "gpu_amip_chap_ws" @@ -115,18 +115,18 @@ steps: # slurm_time: 8:00:00 # slurm_exclusive: - - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" - key: "gpu_amip_chap_ws_4process" - command: - - > - srun --cpu-bind=threads --cpus-per-task=4 - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml - artifact_paths: "gpu_amip_chap_ws_4process/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 4 - slurm_mem: 32G - slurm_time: 8:00:00 - slurm_exclusive: + # - label: "GPU AMIP CHAP - weak scaling - 4 GPUs" + # key: "gpu_amip_chap_ws_4process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_ws_4process.yml + # artifact_paths: "gpu_amip_chap_ws_4process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 4 + # slurm_mem: 32G + # slurm_time: 8:00:00 + # slurm_exclusive: diff --git a/config/gpu_configs/gpu_amip_chap_4process.yml b/config/gpu_configs/gpu_amip_chap_4process.yml index 14cc457e43..d76fdcfdc3 100644 --- a/config/gpu_configs/gpu_amip_chap_4process.yml +++ b/config/gpu_configs/gpu_amip_chap_4process.yml @@ -9,7 +9,7 @@ dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false -h_elem: 60 +h_elem: 42 hourly_checkpoint: false job_id: "gpu_amip_chap_4process" land_albedo_type: "map_static" From c64b44201703d1aa18466b120a9db93510002763 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Mon, 11 Mar 2024 22:25:33 -0700 Subject: [PATCH 22/28] dyamond strong scaling [skip ci] --- .buildkite/gpu/pipeline.yml | 89 ++++++++++++++----- .../gpu_dyamond/gpu_amip_dyamond.yml | 21 +++++ .../gpu_dyamond/gpu_amip_dyamond_2process.yml | 21 +++++ .../gpu_dyamond/gpu_amip_dyamond_4process.yml | 21 +++++ 4 files changed, 130 insertions(+), 22 deletions(-) create mode 100644 config/gpu_configs/gpu_dyamond/gpu_amip_dyamond.yml create mode 100644 config/gpu_configs/gpu_dyamond/gpu_amip_dyamond_2process.yml create mode 100644 config/gpu_configs/gpu_dyamond/gpu_amip_dyamond_4process.yml diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index 9f0ff0774a..8716d7cca7 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -13,6 +13,7 @@ env: SLURM_KILL_BAD_EXIT: 1 SLURM_GRES_FLAGS: "allow-task-sharing" GPU_CONFIG_PATH: "config/gpu_configs" + GPU_DYAMOND_CONFIG_PATH: "config/gpu_configs/gpu_dyamond" CLIMAATMOS_GC_NSTEPS: 10 steps: @@ -39,50 +40,94 @@ steps: - wait - - group: "CHAP GPU strong scaling" + - group: "DYAMOND GPU strong scaling" steps: - - label: "GPU AMIP CHAP - strong scaling - 1 GPU" - key: "gpu_amip_chap" + - label: "GPU AMIP DYAMOND - strong scaling - 1 GPU" + key: "gpu_amip_dyamond" command: - > julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml - artifact_paths: "gpu_amip_chap/*" + --config_file $GPU_DYAMOND_CONFIG_PATH/gpu_amip_dyamond.yml + artifact_paths: "gpu_amip_dyamond/*" agents: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 slurm_ntasks: 1 slurm_mem: 32G - # - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" - # key: "gpu_amip_chap_2process" - # command: - # - > - # srun --cpu-bind=threads --cpus-per-task=4 - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml - # artifact_paths: "gpu_amip_chap_2process/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 2 - # slurm_mem: 32G + - label: "GPU AMIP DYAMOND - strong scaling - 2 GPUs" + key: "gpu_amip_dyamond_2process" + command: + - > + srun --cpu-bind=threads --cpus-per-task=4 + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_DYAMOND_CONFIG_PATH/gpu_amip_dyamond_2process.yml + artifact_paths: "gpu_amip_dyamond_2process/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 2 + slurm_mem: 32G - - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" - key: "gpu_amip_chap_4process" + - label: "GPU AMIP DYAMOND - strong scaling - 4 GPUs" + key: "gpu_amip_dyamond_4process" command: - > srun --cpu-bind=threads --cpus-per-task=4 julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml - artifact_paths: "gpu_amip_chap_4process/*" + --config_file $GPU_DYAMOND_CONFIG_PATH/gpu_amip_dyamond_4process.yml + artifact_paths: "gpu_amip_dyamond_4process/*" agents: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 slurm_ntasks: 4 slurm_mem: 32G + # - group: "CHAP GPU strong scaling" + # steps: + + # - label: "GPU AMIP CHAP - strong scaling - 1 GPU" + # key: "gpu_amip_chap" + # command: + # - > + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap.yml + # artifact_paths: "gpu_amip_chap/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 1 + # slurm_mem: 32G + + # - label: "GPU AMIP CHAP - strong scaling - 2 GPUs" + # key: "gpu_amip_chap_2process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_2process.yml + # artifact_paths: "gpu_amip_chap_2process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 2 + # slurm_mem: 32G + + # - label: "GPU AMIP CHAP - strong scaling - 4 GPUs" + # key: "gpu_amip_chap_4process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_CONFIG_PATH/gpu_amip_chap_4process.yml + # artifact_paths: "gpu_amip_chap_4process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 4 + # slurm_mem: 32G + # - group: "CHAP GPU weak scaling" # steps: diff --git a/config/gpu_configs/gpu_dyamond/gpu_amip_dyamond.yml b/config/gpu_configs/gpu_dyamond/gpu_amip_dyamond.yml new file mode 100644 index 0000000000..eb0709ba99 --- /dev/null +++ b/config/gpu_configs/gpu_dyamond/gpu_amip_dyamond.yml @@ -0,0 +1,21 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond.yml" +dt: "100secs" +dt_cpl: 100 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +h_elem: 30 +hourly_checkpoint: false +job_id: "gpu_amip_dyamond" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_dyamond" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "12hours" +turb_flux_partition: "CombinedStateFluxes" diff --git a/config/gpu_configs/gpu_dyamond/gpu_amip_dyamond_2process.yml b/config/gpu_configs/gpu_dyamond/gpu_amip_dyamond_2process.yml new file mode 100644 index 0000000000..b0a3f8a1ac --- /dev/null +++ b/config/gpu_configs/gpu_dyamond/gpu_amip_dyamond_2process.yml @@ -0,0 +1,21 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond_2process.yml" +dt: "100secs" +dt_cpl: 100 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +h_elem: 30 +hourly_checkpoint: false +job_id: "gpu_amip_dyamond" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_dyamond" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "12hours" +turb_flux_partition: "CombinedStateFluxes" diff --git a/config/gpu_configs/gpu_dyamond/gpu_amip_dyamond_4process.yml b/config/gpu_configs/gpu_dyamond/gpu_amip_dyamond_4process.yml new file mode 100644 index 0000000000..2e9263706e --- /dev/null +++ b/config/gpu_configs/gpu_dyamond/gpu_amip_dyamond_4process.yml @@ -0,0 +1,21 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond_4process.yml" +dt: "100secs" +dt_cpl: 100 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +h_elem: 30 +hourly_checkpoint: false +job_id: "gpu_amip_dyamond" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_dyamond" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "12hours" +turb_flux_partition: "CombinedStateFluxes" From 6c220463fa3721d8f635834e7cacfd11338a60b7 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Mon, 11 Mar 2024 23:24:42 -0700 Subject: [PATCH 23/28] DYAMOND ws [skip ci] --- .buildkite/gpu/pipeline.yml | 87 ++++++++++++++----- .../gpu_dyamond_ws/gpu_amip_dyamond_ws.yml | 21 +++++ .../gpu_amip_dyamond_ws_2process.yml | 21 +++++ .../gpu_amip_dyamond_ws_4process.yml | 21 +++++ 4 files changed, 129 insertions(+), 21 deletions(-) create mode 100644 config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml create mode 100644 config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml create mode 100644 config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index 8716d7cca7..c6ee119010 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -14,6 +14,7 @@ env: SLURM_GRES_FLAGS: "allow-task-sharing" GPU_CONFIG_PATH: "config/gpu_configs" GPU_DYAMOND_CONFIG_PATH: "config/gpu_configs/gpu_dyamond" + GPU_DYAMOND_WS_CONFIG_PATH: "config/gpu_configs/gpu_dyamond_ws" CLIMAATMOS_GC_NSTEPS: 10 steps: @@ -40,44 +41,88 @@ steps: - wait - - group: "DYAMOND GPU strong scaling" + # - group: "DYAMOND GPU strong scaling" + # steps: + + # - label: "GPU AMIP DYAMOND - strong scaling - 1 GPU" + # key: "gpu_amip_dyamond" + # command: + # - > + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_DYAMOND_CONFIG_PATH/gpu_amip_dyamond.yml + # artifact_paths: "gpu_amip_dyamond/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 1 + # slurm_mem: 32G + + # - label: "GPU AMIP DYAMOND - strong scaling - 2 GPUs" + # key: "gpu_amip_dyamond_2process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_DYAMOND_CONFIG_PATH/gpu_amip_dyamond_2process.yml + # artifact_paths: "gpu_amip_dyamond_2process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 2 + # slurm_mem: 32G + + # - label: "GPU AMIP DYAMOND - strong scaling - 4 GPUs" + # key: "gpu_amip_dyamond_4process" + # command: + # - > + # srun --cpu-bind=threads --cpus-per-task=4 + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_DYAMOND_CONFIG_PATH/gpu_amip_dyamond_4process.yml + # artifact_paths: "gpu_amip_dyamond_4process/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 4 + # slurm_mem: 32G + + - group: "DYAMOND GPU weak scaling" steps: - - label: "GPU AMIP DYAMOND - strong scaling - 1 GPU" - key: "gpu_amip_dyamond" - command: - - > - julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_DYAMOND_CONFIG_PATH/gpu_amip_dyamond.yml - artifact_paths: "gpu_amip_dyamond/*" - agents: - slurm_gpus_per_task: 1 - slurm_cpus_per_task: 4 - slurm_ntasks: 1 - slurm_mem: 32G + # - label: "GPU AMIP DYAMOND - weak scaling - 1 GPU" + # key: "gpu_amip_dyamond_ws" + # command: + # - > + # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + # --config_file $GPU_DYAMOND_CONFIG_PATH_WS/gpu_amip_dyamond_ws.yml + # artifact_paths: "gpu_amip_dyamond_ws/*" + # agents: + # slurm_gpus_per_task: 1 + # slurm_cpus_per_task: 4 + # slurm_ntasks: 1 + # slurm_mem: 32G - - label: "GPU AMIP DYAMOND - strong scaling - 2 GPUs" - key: "gpu_amip_dyamond_2process" + - label: "GPU AMIP DYAMOND - weak scaling - 2 GPUs" + key: "gpu_amip_dyamond_ws_2process" command: - > srun --cpu-bind=threads --cpus-per-task=4 julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_DYAMOND_CONFIG_PATH/gpu_amip_dyamond_2process.yml - artifact_paths: "gpu_amip_dyamond_2process/*" + --config_file $GPU_DYAMOND_CONFIG_PATH_WS/gpu_amip_dyamond_ws_2process.yml + artifact_paths: "gpu_amip_dyamond_ws_2process/*" agents: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 slurm_ntasks: 2 slurm_mem: 32G - - label: "GPU AMIP DYAMOND - strong scaling - 4 GPUs" - key: "gpu_amip_dyamond_4process" + - label: "GPU AMIP DYAMOND - weak scaling - 4 GPUs" + key: "gpu_amip_dyamond_ws_4process" command: - > srun --cpu-bind=threads --cpus-per-task=4 julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_DYAMOND_CONFIG_PATH/gpu_amip_dyamond_4process.yml - artifact_paths: "gpu_amip_dyamond_4process/*" + --config_file $GPU_DYAMOND_CONFIG_PATH_WS/gpu_amip_dyamond_ws_4process.yml + artifact_paths: "gpu_amip_dyamond_ws_4process/*" agents: slurm_gpus_per_task: 1 slurm_cpus_per_task: 4 diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml new file mode 100644 index 0000000000..eb0709ba99 --- /dev/null +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml @@ -0,0 +1,21 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond.yml" +dt: "100secs" +dt_cpl: 100 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +h_elem: 30 +hourly_checkpoint: false +job_id: "gpu_amip_dyamond" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_dyamond" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "12hours" +turb_flux_partition: "CombinedStateFluxes" diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml new file mode 100644 index 0000000000..123af4d8be --- /dev/null +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml @@ -0,0 +1,21 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond_2process.yml" +dt: "100secs" +dt_cpl: 100 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +h_elem: 42 +hourly_checkpoint: false +job_id: "gpu_amip_dyamond" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_dyamond" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "12hours" +turb_flux_partition: "CombinedStateFluxes" diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml new file mode 100644 index 0000000000..1cad3532b9 --- /dev/null +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml @@ -0,0 +1,21 @@ +anim: false +apply_limiter: false +atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond_4process.yml" +dt: "100secs" +dt_cpl: 100 +dt_rad: "1hours" +dt_save_state_to_disk: "Inf" +dt_save_to_sol: "Inf" +energy_check: false +evolving_ocean: false +h_elem: 60 +hourly_checkpoint: false +job_id: "gpu_amip_dyamond" +land_albedo_type: "map_static" +mode_name: "amip" +mono_surface: false +run_name: "gpu_amip_dyamond" +start_date: "19790301" +surface_setup: "PrescribedSurface" +t_end: "12hours" +turb_flux_partition: "CombinedStateFluxes" From 4a21815d42ef2ecd4cf00a80625a754d50130bdb Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Mon, 11 Mar 2024 23:26:23 -0700 Subject: [PATCH 24/28] dyamond ws higher res [skip ci] --- config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml | 6 +++--- .../gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml | 6 +++--- .../gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml index eb0709ba99..226d4081dc 100644 --- a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml @@ -1,14 +1,14 @@ anim: false apply_limiter: false atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond.yml" -dt: "100secs" -dt_cpl: 100 +dt: "50secs" +dt_cpl: 50 dt_rad: "1hours" dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false -h_elem: 30 +h_elem: 42 hourly_checkpoint: false job_id: "gpu_amip_dyamond" land_albedo_type: "map_static" diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml index 123af4d8be..2eb6629f6b 100644 --- a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml @@ -1,14 +1,14 @@ anim: false apply_limiter: false atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond_2process.yml" -dt: "100secs" -dt_cpl: 100 +dt: "50secs" +dt_cpl: 50 dt_rad: "1hours" dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false -h_elem: 42 +h_elem: 60 hourly_checkpoint: false job_id: "gpu_amip_dyamond" land_albedo_type: "map_static" diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml index 1cad3532b9..aa99ec5f7b 100644 --- a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml @@ -1,14 +1,14 @@ anim: false apply_limiter: false atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond_4process.yml" -dt: "100secs" -dt_cpl: 100 +dt: "50secs" +dt_cpl: 50 dt_rad: "1hours" dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false -h_elem: 60 +h_elem: 84 hourly_checkpoint: false job_id: "gpu_amip_dyamond" land_albedo_type: "map_static" From f963c31a8c03fa4e0b29168b2a07e966cb6dd76a Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Mon, 11 Mar 2024 23:45:09 -0700 Subject: [PATCH 25/28] fix pipeline [skip ci] --- .buildkite/gpu/pipeline.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index c6ee119010..fadb5223ea 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -93,7 +93,7 @@ steps: # command: # - > # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_DYAMOND_CONFIG_PATH_WS/gpu_amip_dyamond_ws.yml + # --config_file $GPU_DYAMOND_WS_CONFIG_PATH/gpu_amip_dyamond_ws.yml # artifact_paths: "gpu_amip_dyamond_ws/*" # agents: # slurm_gpus_per_task: 1 @@ -107,7 +107,7 @@ steps: - > srun --cpu-bind=threads --cpus-per-task=4 julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_DYAMOND_CONFIG_PATH_WS/gpu_amip_dyamond_ws_2process.yml + --config_file $GPU_DYAMOND_WS_CONFIG_PATH/gpu_amip_dyamond_ws_2process.yml artifact_paths: "gpu_amip_dyamond_ws_2process/*" agents: slurm_gpus_per_task: 1 @@ -121,7 +121,7 @@ steps: - > srun --cpu-bind=threads --cpus-per-task=4 julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - --config_file $GPU_DYAMOND_CONFIG_PATH_WS/gpu_amip_dyamond_ws_4process.yml + --config_file $GPU_DYAMOND_WS_CONFIG_PATH/gpu_amip_dyamond_ws_4process.yml artifact_paths: "gpu_amip_dyamond_ws_4process/*" agents: slurm_gpus_per_task: 1 From 55c6dc12187dbe1e093fd36912d447019b48a17c Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Mon, 11 Mar 2024 23:47:05 -0700 Subject: [PATCH 26/28] include 1 GPU [skip ci] --- .buildkite/gpu/pipeline.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.buildkite/gpu/pipeline.yml b/.buildkite/gpu/pipeline.yml index fadb5223ea..b95d5ac97a 100644 --- a/.buildkite/gpu/pipeline.yml +++ b/.buildkite/gpu/pipeline.yml @@ -88,18 +88,18 @@ steps: - group: "DYAMOND GPU weak scaling" steps: - # - label: "GPU AMIP DYAMOND - weak scaling - 1 GPU" - # key: "gpu_amip_dyamond_ws" - # command: - # - > - # julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl - # --config_file $GPU_DYAMOND_WS_CONFIG_PATH/gpu_amip_dyamond_ws.yml - # artifact_paths: "gpu_amip_dyamond_ws/*" - # agents: - # slurm_gpus_per_task: 1 - # slurm_cpus_per_task: 4 - # slurm_ntasks: 1 - # slurm_mem: 32G + - label: "GPU AMIP DYAMOND - weak scaling - 1 GPU" + key: "gpu_amip_dyamond_ws" + command: + - > + julia --threads=3 --color=yes --project=experiments/AMIP experiments/AMIP/coupler_driver.jl + --config_file $GPU_DYAMOND_WS_CONFIG_PATH/gpu_amip_dyamond_ws.yml + artifact_paths: "gpu_amip_dyamond_ws/*" + agents: + slurm_gpus_per_task: 1 + slurm_cpus_per_task: 4 + slurm_ntasks: 1 + slurm_mem: 32G - label: "GPU AMIP DYAMOND - weak scaling - 2 GPUs" key: "gpu_amip_dyamond_ws_2process" From 8e68e7f6a8c8350a2d435c9ab5675cc45ad3ed2c Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Mon, 11 Mar 2024 23:48:31 -0700 Subject: [PATCH 27/28] dyamond ws helem 30,42,60 [skip ci] --- config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml | 6 +++--- .../gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml | 6 +++--- .../gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml index 226d4081dc..eb0709ba99 100644 --- a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml @@ -1,14 +1,14 @@ anim: false apply_limiter: false atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond.yml" -dt: "50secs" -dt_cpl: 50 +dt: "100secs" +dt_cpl: 100 dt_rad: "1hours" dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false -h_elem: 42 +h_elem: 30 hourly_checkpoint: false job_id: "gpu_amip_dyamond" land_albedo_type: "map_static" diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml index 2eb6629f6b..123af4d8be 100644 --- a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml @@ -1,14 +1,14 @@ anim: false apply_limiter: false atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond_2process.yml" -dt: "50secs" -dt_cpl: 50 +dt: "100secs" +dt_cpl: 100 dt_rad: "1hours" dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false -h_elem: 60 +h_elem: 42 hourly_checkpoint: false job_id: "gpu_amip_dyamond" land_albedo_type: "map_static" diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml index aa99ec5f7b..1cad3532b9 100644 --- a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml @@ -1,14 +1,14 @@ anim: false apply_limiter: false atmos_config_file: "config/gpu_configs/gpu_aquaplanet_dyamond_4process.yml" -dt: "50secs" -dt_cpl: 50 +dt: "100secs" +dt_cpl: 100 dt_rad: "1hours" dt_save_state_to_disk: "Inf" dt_save_to_sol: "Inf" energy_check: false evolving_ocean: false -h_elem: 84 +h_elem: 60 hourly_checkpoint: false job_id: "gpu_amip_dyamond" land_albedo_type: "map_static" From 10ac98e410b27bd3c60c3ec8c0e009fa7951dbc3 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Tue, 12 Mar 2024 08:59:08 -0700 Subject: [PATCH 28/28] run for 1 day [skip ci] --- config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml | 2 +- .../gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml | 2 +- .../gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml index eb0709ba99..3bcb7448d8 100644 --- a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws.yml @@ -17,5 +17,5 @@ mono_surface: false run_name: "gpu_amip_dyamond" start_date: "19790301" surface_setup: "PrescribedSurface" -t_end: "12hours" +t_end: "1days" turb_flux_partition: "CombinedStateFluxes" diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml index 123af4d8be..6a50e294f8 100644 --- a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_2process.yml @@ -17,5 +17,5 @@ mono_surface: false run_name: "gpu_amip_dyamond" start_date: "19790301" surface_setup: "PrescribedSurface" -t_end: "12hours" +t_end: "1days" turb_flux_partition: "CombinedStateFluxes" diff --git a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml index 1cad3532b9..17a367f21b 100644 --- a/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml +++ b/config/gpu_configs/gpu_dyamond_ws/gpu_amip_dyamond_ws_4process.yml @@ -17,5 +17,5 @@ mono_surface: false run_name: "gpu_amip_dyamond" start_date: "19790301" surface_setup: "PrescribedSurface" -t_end: "12hours" +t_end: "1days" turb_flux_partition: "CombinedStateFluxes"