Skip to content

Commit

Permalink
Merge pull request #2419 from CliMA/sb/gc2
Browse files — browse the repository at this point in the history
Fix GPU scaling issues
  • Loading branch information
simonbyrne authored Dec 14, 2023
2 parents b26b78b + 9ff4062 commit 5bd29f7
Show file tree
Hide file tree
Showing 12 changed files with 245 additions and 188 deletions.
36 changes: 28 additions & 8 deletions .buildkite/gpu_pipeline/pipeline.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
agents:
queue: clima
slurm_mem: 8G
modules: julia/1.9.4 cuda/julia-pref openmpi/4.1.5-mpitrampoline nsight-systems/2023.3.1
modules: julia/1.9.4 cuda/julia-pref openmpi/4.1.5-mpitrampoline nsight-systems/2023.4.1

env:
JULIA_CUDA_MEMORY_POOL: none
Expand All @@ -16,6 +16,7 @@ env:
GPU_CONFIG_PATH: "config/gpu_configs/"
PERF_CONFIG_PATH: "config/perf_configs"
MPI_CONFIG_PATH: "config/mpi_configs"
CLIMAATMOS_GC_NSTEPS: 10

steps:
- label: "init :GPU:"
Expand All @@ -32,6 +33,7 @@ steps:

agents:
slurm_gpus: 1
slurm_cpus_per_task: 8
env:
JULIA_NUM_PRECOMPILE_TASKS: 8
JULIA_MAX_NUM_PRECOMPILE_FILES: 50
Expand All @@ -46,47 +48,65 @@ steps:
command:
- mkdir -p target_gpu_implicit_baroclinic_wave
- >
nsys profile --trace=nvtx,cuda --output=target_gpu_implicit_baroclinic_wave/report
nsys profile --trace=nvtx,mpi,cuda,osrt --output=target_gpu_implicit_baroclinic_wave/report
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave.yml
artifact_paths: "target_gpu_implicit_baroclinic_wave/*"
agents:
slurm_gpus: 1
slurm_time: 23:00:00
slurm_cpus_per_task: 4

- label: "gpu_aquaplanet_dyamond"
command:
- mkdir -p gpu_aquaplanet_dyamond
- >
nsys profile --trace=nvtx,cuda --output=gpu_aquaplanet_dyamond/report
- >
nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_aquaplanet_dyamond/report
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_aquaplanet_dyamond.yml
artifact_paths: "gpu_aquaplanet_dyamond/*"
agents:
slurm_gpus: 1
slurm_cpus_per_task: 4

- label: "moist Held-Suarez"
key: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
command:
- mkdir -p gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km
- >
nsys profile --trace=nvtx,cuda --output=gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/report
nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/report
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml
artifact_paths: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/*"
agents:
slurm_gpus: 1
slurm_cpus_per_task: 4

- label: "moist Held-Suarez - 4 gpus"
key: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process"
command:
- mkdir -p gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process
- >
srun --cpu-bind=cores
nsys profile --trace=nvtx,mpi,cuda,osrt --output=gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process/report-%q{PMI_RANK}
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process.yml
artifact_paths: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km_4process/*"
agents:
slurm_gpus_per_task: 1
slurm_cpus_per_task: 4
slurm_ntasks: 4

- label: "dry baroclinic wave - 4 gpus"
key: "target_gpu_implicit_baroclinic_wave_4process"
command:
- mkdir -p target_gpu_implicit_baroclinic_wave_4process
- >
srun
nsys profile --trace=nvtx,cuda,mpi --output=target_gpu_implicit_baroclinic_wave_4process/report-%q{PMI_RANK}
srun --cpu-bind=cores
nsys profile --trace=osrt,nvtx,cuda,mpi,ucx --output=target_gpu_implicit_baroclinic_wave_4process/report-%q{PMI_RANK}
julia --color=yes --project=examples examples/hybrid/driver.jl
--config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave_4process.yml
artifact_paths: "target_gpu_implicit_baroclinic_wave_4process/*"
agents:
slurm_gpus_per_task: 1
slurm_cpus_per_task: 4
slurm_ntasks: 4
4 changes: 2 additions & 2 deletions .dev/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ version = "0.21.4"

[[deps.JuliaFormatter]]
deps = ["CSTParser", "CommonMark", "DataStructures", "Glob", "Pkg", "PrecompileTools", "Tokenize"]
git-tree-sha1 = "2d2f630931dcf9cc4f753777a9c9ffb48d504116"
git-tree-sha1 = "8f5295e46f594ad2d8652f1098488a77460080cd"
uuid = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
version = "1.0.43"
version = "1.0.45"

[[deps.LibCURL]]
deps = ["LibCURL_jll", "MozillaCACerts_jll"]
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ Artifacts = "1"
AtmosphericProfilesLibrary = "0.1"
CLIMAParameters = "0.7.25"
ClimaComms = "0.5.6"
ClimaCore = "0.11.1"
ClimaCore = "0.11.5"
ClimaTimeSteppers = "0.7.14"
CloudMicrophysics = "0.15.0"
Colors = "0.12"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
dt_save_to_disk: "10days"
dt: "100secs"
t_end: "1days"
h_elem: 30
z_elem: 63
dz_bottom: 30.0
dz_top: 3000.0
z_max: 55000.0
kappa_4: 1.0e15
vert_diff: "true"
moist: "equil"
precip_model: "0M"
rayleigh_sponge: true
forcing: "held_suarez"
job_id: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km"
toml: [toml/longrun_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.toml]
76 changes: 43 additions & 33 deletions docs/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ version = "0.1.33"

[[deps.Adapt]]
deps = ["LinearAlgebra", "Requires"]
git-tree-sha1 = "02f731463748db57cc2ebfbd9fbc9ce8280d3433"
git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608"
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
version = "3.7.1"
version = "3.7.2"
weakdeps = ["StaticArrays"]

[deps.Adapt.extensions]
Expand Down Expand Up @@ -142,9 +142,9 @@ version = "0.4.2"

[[deps.BandedMatrices]]
deps = ["ArrayLayouts", "FillArrays", "LinearAlgebra", "PrecompileTools"]
git-tree-sha1 = "06a2a94d5a4979c36cc7a3c28d70800f448ae5bb"
git-tree-sha1 = "b7d2ca2461eeee5828ee9dcf6186e41fe6b6f78b"
uuid = "aae01518-5342-5314-be14-df237901396f"
version = "1.3.0"
version = "1.3.1"
weakdeps = ["SparseArrays"]

[deps.BandedMatrices.extensions]
Expand Down Expand Up @@ -178,9 +178,9 @@ version = "0.1.5"

[[deps.BlockArrays]]
deps = ["ArrayLayouts", "FillArrays", "LinearAlgebra"]
git-tree-sha1 = "54cd829dd26330c42e1cf9df68470dd4df602c61"
git-tree-sha1 = "fc69cbdb4277042f72c6e59cbc7024fbe3034b89"
uuid = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
version = "0.16.38"
version = "0.16.39"

[[deps.Bzip2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
Expand Down Expand Up @@ -269,10 +269,10 @@ uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d"
version = "0.5.6"

[[deps.ClimaCore]]
deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LinearAlgebra", "Memoize", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "WeakValueDicts"]
git-tree-sha1 = "ca5a3fe9269205c2b4f0ffe3b7897b1506eda494"
deps = ["Adapt", "BandedMatrices", "BlockArrays", "CUDA", "ClimaComms", "CubedSphere", "DataStructures", "DocStringExtensions", "ForwardDiff", "GaussQuadrature", "GilbertCurves", "HDF5", "InteractiveUtils", "IntervalSets", "KrylovKit", "LinearAlgebra", "Memoize", "PkgVersion", "RecursiveArrayTools", "RootSolvers", "SparseArrays", "Static", "StaticArrays", "Statistics", "Unrolled", "WeakValueDicts"]
git-tree-sha1 = "de400c8d487ca34c7a70a92b45e861408a0e7eee"
uuid = "d414da3d-4745-48bb-8d80-42e94e092884"
version = "0.11.2"
version = "0.11.6"
weakdeps = ["Krylov"]

[deps.ClimaCore.extensions]
Expand Down Expand Up @@ -448,9 +448,9 @@ version = "0.1.0+0"

[[deps.DiffEqBase]]
deps = ["ArrayInterface", "DataStructures", "DocStringExtensions", "EnumX", "EnzymeCore", "FastBroadcast", "ForwardDiff", "FunctionWrappers", "FunctionWrappersWrappers", "LinearAlgebra", "Logging", "Markdown", "MuladdMacro", "Parameters", "PreallocationTools", "PrecompileTools", "Printf", "RecursiveArrayTools", "Reexport", "SciMLBase", "SciMLOperators", "Setfield", "SparseArrays", "Static", "StaticArraysCore", "Statistics", "Tricks", "TruncatedStacktraces"]
git-tree-sha1 = "309efb205c30d43b595466283bbecf2769283e22"
git-tree-sha1 = "09ce9525b590bcdd9a807142dc493692aee85ef9"
uuid = "2b5f629d-d688-5b77-993f-72d75c75574e"
version = "6.141.0"
version = "6.143.0"

[deps.DiffEqBase.extensions]
DiffEqBaseChainRulesCoreExt = "ChainRulesCore"
Expand Down Expand Up @@ -577,9 +577,9 @@ version = "0.3.2"

[[deps.FFTW]]
deps = ["AbstractFFTs", "FFTW_jll", "LinearAlgebra", "MKL_jll", "Preferences", "Reexport"]
git-tree-sha1 = "b4fbdd20c889804969571cc589900803edda16b7"
git-tree-sha1 = "ec22cbbcd01cba8f41eecd7d44aac1f23ee985e3"
uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
version = "1.7.1"
version = "1.7.2"

[[deps.FFTW_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
Expand All @@ -600,9 +600,9 @@ version = "0.3.2"

[[deps.FastGaussQuadrature]]
deps = ["LinearAlgebra", "SpecialFunctions", "StaticArrays"]
git-tree-sha1 = "93ff6a4d5e7bfe27732259bfabbdd19940d8af1f"
git-tree-sha1 = "8ef0363cffeedb7e73339b664ce6d33cc278a3c4"
uuid = "442a2c76-b920-505d-bb47-c5924d526838"
version = "1.0.0"
version = "1.0.1"

[[deps.FileIO]]
deps = ["Pkg", "Requires", "UUIDs"]
Expand All @@ -615,9 +615,9 @@ uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"

[[deps.FillArrays]]
deps = ["LinearAlgebra", "Random"]
git-tree-sha1 = "01dba5dbad6b2766e2ddd7b9d64af0e6d68d95cd"
git-tree-sha1 = "5b93957f6dcd33fc343044af3d48c215be2562f1"
uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
version = "1.9.1"
version = "1.9.3"

[deps.FillArrays.extensions]
FillArraysPDMatsExt = "PDMats"
Expand Down Expand Up @@ -880,9 +880,9 @@ weakdeps = ["EnzymeCore"]

[[deps.Krylov]]
deps = ["LinearAlgebra", "Printf", "SparseArrays"]
git-tree-sha1 = "17e462054b42dcdda73e9a9ba0c67754170c88ae"
git-tree-sha1 = "8a6837ec02fe5fb3def1abc907bb802ef11a0729"
uuid = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
version = "0.9.4"
version = "0.9.5"

[[deps.KrylovKit]]
deps = ["ChainRulesCore", "GPUArraysCore", "LinearAlgebra", "Printf"]
Expand Down Expand Up @@ -994,10 +994,14 @@ deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

[[deps.LinearOperators]]
deps = ["FastClosures", "LDLFactorizations", "LinearAlgebra", "Printf", "SparseArrays", "TimerOutputs"]
git-tree-sha1 = "a58ab1d18efa0bcf9f0868c6d387e4126dad3e72"
deps = ["FastClosures", "LDLFactorizations", "LinearAlgebra", "Printf", "Requires", "SparseArrays", "TimerOutputs"]
git-tree-sha1 = "58e2ca62646a62e18f86253b9c2a2d821c2d934b"
uuid = "5c8ed15e-5a4c-59e4-a42b-c7e8811fb125"
version = "2.5.2"
version = "2.6.0"
weakdeps = ["ChainRulesCore"]

[deps.LinearOperators.extensions]
LinearOperatorsChainRulesCoreExt = "ChainRulesCore"

[[deps.LogExpFunctions]]
deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
Expand All @@ -1019,10 +1023,10 @@ version = "0.3.26"
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

[[deps.MKL_jll]]
deps = ["Artifacts", "IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"]
git-tree-sha1 = "eb006abbd7041c28e0d16260e50a24f8f9104913"
deps = ["Artifacts", "IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl"]
git-tree-sha1 = "72dc3cf284559eb8f53aa593fe62cb33f83ed0c0"
uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
version = "2023.2.0+0"
version = "2024.0.0+0"

[[deps.MPI]]
deps = ["Distributed", "DocStringExtensions", "Libdl", "MPICH_jll", "MPIPreferences", "MPItrampoline_jll", "MicrosoftMPI_jll", "OpenMPI_jll", "PkgVersion", "PrecompileTools", "Requires", "Serialization", "Sockets"]
Expand All @@ -1040,9 +1044,9 @@ version = "0.20.19"

[[deps.MPICH_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "MPIPreferences", "TOML"]
git-tree-sha1 = "8a5b4d2220377d1ece13f49438d71ad20cf1ba83"
git-tree-sha1 = "2ee75365ca243c1a39d467e35ffd3d4d32eef11e"
uuid = "7cb0a576-ebde-5e09-9194-50597f1243b4"
version = "4.1.2+0"
version = "4.1.2+1"

[[deps.MPIPreferences]]
deps = ["Libdl", "Preferences"]
Expand Down Expand Up @@ -1418,9 +1422,9 @@ version = "0.1.0"

[[deps.SciMLBase]]
deps = ["ADTypes", "ArrayInterface", "CommonSolve", "ConstructionBase", "Distributed", "DocStringExtensions", "EnumX", "FillArrays", "FunctionWrappersWrappers", "IteratorInterfaceExtensions", "LinearAlgebra", "Logging", "Markdown", "PrecompileTools", "Preferences", "Printf", "QuasiMonteCarlo", "RecipesBase", "RecursiveArrayTools", "Reexport", "RuntimeGeneratedFunctions", "SciMLOperators", "StaticArraysCore", "Statistics", "SymbolicIndexingInterface", "Tables", "TruncatedStacktraces"]
git-tree-sha1 = "d432b4c4cc922fb7b21b555c138aa87f9fb7beb8"
git-tree-sha1 = "32ea825941f7b58a6f48268f4b76971ae8eb9eec"
uuid = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
version = "2.9.1"
version = "2.10.0"

[deps.SciMLBase.extensions]
SciMLBaseChainRulesCoreExt = "ChainRulesCore"
Expand Down Expand Up @@ -1513,9 +1517,9 @@ version = "0.8.8"

[[deps.StaticArrayInterface]]
deps = ["ArrayInterface", "Compat", "IfElse", "LinearAlgebra", "PrecompileTools", "Requires", "SparseArrays", "Static", "SuiteSparse"]
git-tree-sha1 = "03fec6800a986d191f64f5c0996b59ed526eda25"
git-tree-sha1 = "5d66818a39bb04bf328e92bc933ec5b4ee88e436"
uuid = "0d7ed370-da01-4f52-bd93-41d350b8b718"
version = "1.4.1"
version = "1.5.0"
weakdeps = ["OffsetArrays", "StaticArrays"]

[deps.StaticArrayInterface.extensions]
Expand Down Expand Up @@ -1700,6 +1704,12 @@ version = "1.0.2"
[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[[deps.Unrolled]]
deps = ["MacroTools"]
git-tree-sha1 = "6cc9d682755680e0f0be87c56392b7651efc2c7b"
uuid = "9602ed7d-8fef-5bc8-8597-8f21381861e8"
version = "0.1.5"

[[deps.UnsafeAtomics]]
git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278"
uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f"
Expand All @@ -1724,9 +1734,9 @@ version = "0.5.6"

[[deps.XML2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"]
git-tree-sha1 = "da69178aacc095066bad1f69d2f59a60a1dd8ad1"
git-tree-sha1 = "801cbe47eae69adc50f36c3caec4758d2650741b"
uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
version = "2.12.0+0"
version = "2.12.2+0"

[[deps.YAML]]
deps = ["Base64", "Dates", "Printf", "StringEncodings"]
Expand Down
Loading

0 comments on commit 5bd29f7

Please sign in to comment.