diff --git a/.buildkite/gpu_pipeline/pipeline.yml b/.buildkite/gpu_pipeline/pipeline.yml index acf5b4fdbc0..ed42a193981 100644 --- a/.buildkite/gpu_pipeline/pipeline.yml +++ b/.buildkite/gpu_pipeline/pipeline.yml @@ -8,6 +8,7 @@ env: JULIA_NVTX_CALLBACKS: gc JULIA_MAX_NUM_PRECOMPILE_FILES: 100 GPU_CONFIG_PATH: "config/gpu_configs/" + SLURM_GPU_BIND: none # https://github.com/open-mpi/ompi/issues/11949#issuecomment-1737712291 steps: - label: "init :GPU:" @@ -47,7 +48,7 @@ steps: command: > nsys profile --trace=nvtx,cuda --output=target_gpu_implicit_baroclinic_wave/report julia --color=yes --project=examples examples/hybrid/driver.jl - --config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave.yml + --config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave.yml artifact_paths: "target_gpu_implicit_baroclinic_wave/*" agents: slurm_gpus: 1 @@ -58,7 +59,7 @@ steps: command: > nsys profile --trace=nvtx,cuda --output=gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/report julia --color=yes --project=examples examples/hybrid/driver.jl - --config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml + --config_file ${GPU_CONFIG_PATH}gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km.yml artifact_paths: "gpu_hs_rhoe_equilmoist_nz63_0M_55km_rs35km/*" agents: slurm_gpus: 1 @@ -67,10 +68,10 @@ steps: - label: "dry baroclinic wave - 4 gpus" key: "target_gpu_implicit_baroclinic_wave_4process" command: > - mpiexec - nsys profile --trace=nvtx,cuda --output=target_gpu_implicit_baroclinic_wave_4process/report-%q{OMPI_COMM_WORLD_RANK} + srun + nsys profile --trace=nvtx,cuda,mpi --output=target_gpu_implicit_baroclinic_wave_4process/report-%q{PMI_RANK} julia --color=yes --project=examples examples/hybrid/driver.jl - --config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave_4process.yml + --config_file ${GPU_CONFIG_PATH}target_gpu_implicit_baroclinic_wave_4process.yml artifact_paths: "target_gpu_implicit_baroclinic_wave_4process/*" agents: slurm_gpus_per_task: 1