From 3a4f1d99ff84b8554b7964ffbef2ee7fa45443b8 Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Sun, 7 Apr 2024 14:46:59 -0400
Subject: [PATCH] More informative error messages for failing to load devices

---
 .buildkite/pipeline.yml                         |  1 -
 .gitignore                                      |  1 +
 ext/LuxMPIExt.jl                                | 15 ++++++++++-----
 ext/LuxMPINCCLExt.jl                            |  3 ++-
 test/distributed/common_distributedtest.jl      |  1 +
 test/distributed/data_distributedtest.jl        |  1 +
 test/distributed/optimizer_distributedtest.jl   |  1 +
 test/distributed/synchronize_distributedtest.jl |  1 +
 8 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 0f0f7f1e6..519d0ecfb 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -17,7 +17,6 @@ steps:
           cuda: "*"
         env:
           GROUP: "CUDA"
-          JULIA_MPI_TEST_NPROCS: 2 # Needs to be same as number of GPUs for NCCL
         if: build.message !~ /\[skip tests\]/
         timeout_in_minutes: 240
         matrix:
diff --git a/.gitignore b/.gitignore
index b249f6420..df70afccc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,4 @@ docs/src/tutorials/advanced
 
 *.log
 bench/benchmark_results.json
+*.cov
diff --git a/ext/LuxMPIExt.jl b/ext/LuxMPIExt.jl
index 73f09fe52..a45f915ed 100644
--- a/ext/LuxMPIExt.jl
+++ b/ext/LuxMPIExt.jl
@@ -7,7 +7,8 @@ using LuxDeviceUtils: AbstractLuxDevice, LuxCUDADevice, LuxAMDGPUDevice, cpu_dev
 using MPI: MPI
 
 function DistributedUtils.__initialize(
-        ::Type{MPIBackend}; cuda_devices=nothing, amdgpu_devices=nothing)
+        ::Type{MPIBackend}; cuda_devices=nothing, amdgpu_devices=nothing,
+        force_cuda::Bool=false, caller::String="", force_amdgpu::Bool=false) # Undocumented internal kwarg
     !MPI.Initialized() && MPI.Init()
     DistributedUtils.MPI_Initialized[] = true
 
@@ -15,18 +16,22 @@ function DistributedUtils.__initialize(
 
     if cuda_devices !== missing && __is_functional(LuxCUDADevice)
         if cuda_devices === nothing
-            set_device!(LuxCUDADevice, nothing, local_rank)
+            set_device!(LuxCUDADevice, nothing, local_rank + 1)
         else
-            set_device!(LuxCUDADevice, cuda_devices[local_rank])
+            set_device!(LuxCUDADevice, cuda_devices[local_rank + 1])
         end
+    elseif force_cuda
+        error(lazy"CUDA devices are not functional (or `LuxCUDA.jl` not loaded) and `force_cuda` is set to `true`. This is caused by backend: $(caller).")
     end
 
     if amdgpu_devices !== missing && __is_functional(LuxAMDGPUDevice)
         if amdgpu_devices === nothing
-            set_device!(LuxAMDGPUDevice, nothing, local_rank)
+            set_device!(LuxAMDGPUDevice, nothing, local_rank + 1)
         else
-            set_device!(LuxAMDGPUDevice, amdgpu_devices[local_rank])
+            set_device!(LuxAMDGPUDevice, amdgpu_devices[local_rank + 1])
         end
+    elseif force_amdgpu
+        error(lazy"AMDGPU devices are not functional (or `LuxAMDGPU.jl` not loaded) and `force_amdgpu` is set to `true`. This is caused by backend: $(caller).")
     end
 
     return
diff --git a/ext/LuxMPINCCLExt.jl b/ext/LuxMPINCCLExt.jl
index 737a30a1b..54765ba87 100644
--- a/ext/LuxMPINCCLExt.jl
+++ b/ext/LuxMPINCCLExt.jl
@@ -9,7 +9,8 @@ using Setfield: @set!
 function DistributedUtils.__initialize(
         ::Type{NCCLBackend}; cuda_devices=nothing, amdgpu_devices=missing)
     @assert amdgpu_devices===missing "`AMDGPU` is not supported by `NCCL`."
-    DistributedUtils.__initialize(MPIBackend; cuda_devices, amdgpu_devices)
+    DistributedUtils.__initialize(
+        MPIBackend; cuda_devices, force_cuda=true, caller="NCCLBackend", amdgpu_devices)
     DistributedUtils.NCCL_Initialized[] = true
     return
 end
diff --git a/test/distributed/common_distributedtest.jl b/test/distributed/common_distributedtest.jl
index e4eb44522..0a6c14d16 100644
--- a/test/distributed/common_distributedtest.jl
+++ b/test/distributed/common_distributedtest.jl
@@ -1,4 +1,5 @@
 using Lux, MPI, NCCL, Test
+using LuxAMDGPU, LuxCUDA
 
 const input_args = length(ARGS) == 2 ? ARGS : ("CPU", "mpi")
 const backend_type = input_args[2] == "nccl" ? NCCLBackend : MPIBackend
diff --git a/test/distributed/data_distributedtest.jl b/test/distributed/data_distributedtest.jl
index fb767d30a..deab7349e 100644
--- a/test/distributed/data_distributedtest.jl
+++ b/test/distributed/data_distributedtest.jl
@@ -1,4 +1,5 @@
 using Lux, MLUtils, MPI, NCCL, Random, Test
+using LuxAMDGPU, LuxCUDA
 
 const input_args = length(ARGS) == 2 ? ARGS : ("CPU", "mpi")
 const backend_type = input_args[2] == "nccl" ? NCCLBackend : MPIBackend
diff --git a/test/distributed/optimizer_distributedtest.jl b/test/distributed/optimizer_distributedtest.jl
index 4a59706aa..cb1ab9024 100644
--- a/test/distributed/optimizer_distributedtest.jl
+++ b/test/distributed/optimizer_distributedtest.jl
@@ -1,4 +1,5 @@
 using Lux, MPI, NCCL, Optimisers, Random, Test
+using LuxAMDGPU, LuxCUDA
 
 const input_args = length(ARGS) == 2 ? ARGS : ("CPU", "mpi")
 const backend_type = input_args[2] == "nccl" ? NCCLBackend : MPIBackend
diff --git a/test/distributed/synchronize_distributedtest.jl b/test/distributed/synchronize_distributedtest.jl
index 4e121f97e..f29130426 100644
--- a/test/distributed/synchronize_distributedtest.jl
+++ b/test/distributed/synchronize_distributedtest.jl
@@ -1,4 +1,5 @@
 using ComponentArrays, Lux, MPI, NCCL, Optimisers, Random, Test
+using LuxAMDGPU, LuxCUDA
 
 const input_args = length(ARGS) == 2 ? ARGS : ("CPU", "mpi")
 const backend_type = input_args[2] == "nccl" ? NCCLBackend : MPIBackend
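
Note (commentary, not part of the patch to apply): below is a minimal sketch of
how the new `force_cuda` path surfaces to a user. It assumes the public entry
point `DistributedUtils.initialize` forwards to the `__initialize` methods
patched above and that the script is launched under `mpiexec`; the message text
comes from the `error(lazy"...")` call added in ext/LuxMPIExt.jl.

    # Launch as: mpiexec -n 2 julia --project=. script.jl
    using Lux, MPI, NCCL            # LuxCUDA deliberately NOT loaded

    # `NCCLBackend` now forwards `force_cuda=true` and `caller="NCCLBackend"`
    # to the MPI initializer, so a missing or non-functional CUDA stack aborts
    # with an informative error instead of silently skipping device selection:
    #
    #   ERROR: CUDA devices are not functional (or `LuxCUDA.jl` not loaded)
    #   and `force_cuda` is set to `true`. This is caused by backend:
    #   NCCLBackend.
    DistributedUtils.initialize(NCCLBackend)

The `local_rank + 1` changes are a related off-by-one fix: `MPI.Comm_rank`
returns a zero-based rank, while the user-supplied `cuda_devices` /
`amdgpu_devices` vectors are indexed one-based in Julia, so rank 0 must select
element 1.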