From 352c6b933a01646bb2d4b0da8981c126559e5ba8 Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Sat, 6 Apr 2024 18:58:00 -0400
Subject: [PATCH] Add documentation

---
 docs/make.jl                          |  6 ++-
 docs/src/.vitepress/config.mts        |  9 +++--
 docs/src/api/Lux/distributed_utils.md | 58 +++++++++++++++++++++++++++
 docs/src/manual/distributed_utils.md  | 55 +++++++++++++++++++++++++
 ext/LuxFluxExt.jl                     |  4 +-
 ext/LuxMLUtilsExt.jl                  |  2 +-
 ext/LuxMPINCCLExt.jl                  |  6 +--
 src/distributed/backend.jl            |  4 +-
 src/distributed/public_api.jl         |  8 ++++
 9 files changed, 139 insertions(+), 13 deletions(-)
 create mode 100644 docs/src/api/Lux/distributed_utils.md
 create mode 100644 docs/src/manual/distributed_utils.md

diff --git a/docs/make.jl b/docs/make.jl
index 6c2d1862b..bae43d1e7 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -38,14 +38,16 @@ pages = [
        "manual/freezing_model_parameters.md",
        "manual/gpu_management.md",
        "manual/migrate_from_flux.md",
-       "manual/weight_initializers.md"
+       "manual/weight_initializers.md",
+       "manual/distributed_utils.md"
    ],
    "API Reference" => [
        "Lux" => [
            "api/Lux/layers.md",
            "api/Lux/utilities.md",
            "api/Lux/contrib.md",
-           "api/Lux/switching_frameworks.md"
+           "api/Lux/switching_frameworks.md",
+           "api/Lux/distributed_utils.md",
        ],
        "Accelerator Support" => [
            "api/Accelerator_Support/LuxAMDGPU.md",
diff --git a/docs/src/.vitepress/config.mts b/docs/src/.vitepress/config.mts
index 45ca04fed..c111c1c44 100644
--- a/docs/src/.vitepress/config.mts
+++ b/docs/src/.vitepress/config.mts
@@ -73,7 +73,8 @@ export default defineConfig({
                { text: 'Built-In Layers', link: '/api/Lux/layers' },
                { text: 'Utilities', link: '/api/Lux/utilities' },
                { text: 'Experimental', link: '/api/Lux/contrib' },
-               { text: 'InterOp', link: '/api/Lux/switching_frameworks' }
+               { text: 'InterOp', link: '/api/Lux/switching_frameworks' },
+               { text: 'DistributedUtils', link: '/api/Lux/distributed_utils' }
            ]
        },
        {
@@ -146,7 +147,8 @@
            { text: 'Freezing Model Parameters', link: '/manual/freezing_model_parameters' },
            { text: 'GPU Management', link: '/manual/gpu_management' },
            { text: 'Migrating from Flux to Lux', link: '/manual/migrate_from_flux' },
-           { text: 'Initializing Weights', link: '/manual/weight_initializers' }]
+           { text: 'Initializing Weights', link: '/manual/weight_initializers' },
+           { text: 'Distributed Data Parallel Training', link: '/manual/distributed_utils' },]
        },
        "/api/": {
            text: 'API Reference', collapsed: false, items: [
@@ -155,7 +157,8 @@
                { text: 'Built-In Layers', link: '/api/Lux/layers' },
                { text: 'Utilities', link: '/api/Lux/utilities' },
                { text: 'Experimental Features', link: '/api/Lux/contrib' },
-               { text: 'Switching between Deep Learning Frameworks', link: '/api/Lux/switching_frameworks' }]
+               { text: 'Switching between Deep Learning Frameworks', link: '/api/Lux/switching_frameworks' },
+               { text: 'DistributedUtils', link: '/api/Lux/distributed_utils' }]
            },
            {
                text: 'Accelerator Support', collapsed: false, items: [
diff --git a/docs/src/api/Lux/distributed_utils.md b/docs/src/api/Lux/distributed_utils.md
new file mode 100644
index 000000000..e69e04106
--- /dev/null
+++ b/docs/src/api/Lux/distributed_utils.md
@@ -0,0 +1,58 @@
+# Distributed Utils

!!! note

    These functionalities are available via the `Lux.DistributedUtils` module.

```@meta
CurrentModule = Lux
```

## Index

```@index
Pages = ["distributed_utils.md"]
```

## [Backends](@id communication-backends)

```@docs
MPIBackend
NCCLBackend
```

## Initialization

```@docs
DistributedUtils.initialize
DistributedUtils.initialized
DistributedUtils.get_distributed_backend
```

## Helper Functions

```@docs
DistributedUtils.local_rank
DistributedUtils.total_workers
```

## Communication Primitives

```@docs
DistributedUtils.allreduce!
DistributedUtils.bcast!
DistributedUtils.reduce!
DistributedUtils.synchronize!!
```

## Optimisers.jl Integration

```@docs
DistributedUtils.DistributedOptimizer
```

## MLUtils.jl Integration

```@docs
DistributedUtils.DistributedDataContainer
```
diff --git a/docs/src/manual/distributed_utils.md b/docs/src/manual/distributed_utils.md
new file mode 100644
index 000000000..a727933e8
--- /dev/null
+++ b/docs/src/manual/distributed_utils.md
@@ -0,0 +1,55 @@
+# Distributed Data Parallel Training

!!! tip

    For a fully functional example, see the
    [ImageNet Training Example](https://github.com/LuxDL/Lux.jl/tree/main/examples/ImageNet).

Distributed data parallel (DDP) training with `Lux.DistributedUtils` is the spiritual
successor to [FluxMPI.jl](https://github.com/avik-pal/FluxMPI.jl), but with some key
differences.

## Guide to Integrating DistributedUtils into Your Code

At a high level, integrating `DistributedUtils` into an existing training script involves:

1. Initializing the backend of your choice with [`DistributedUtils.initialize`](@ref) and
   constructing it with [`DistributedUtils.get_distributed_backend`](@ref).
2. Synchronizing the parameters and states across processes with
   [`DistributedUtils.synchronize!!`](@ref).
3. Wrapping the optimizer in [`DistributedUtils.DistributedOptimizer`](@ref) so that
   gradients are averaged across processes.
4. Wrapping the dataset in [`DistributedUtils.DistributedDataContainer`](@ref) so that
   each process only trains on its own shard of the data.

## [GPU-Aware MPI](@id gpu-aware-mpi)

If you are using a custom MPI build that supports CUDA or ROCm, you can use the following
preferences with [Preferences.jl](https://github.com/JuliaPackaging/Preferences.jl):

1. `LuxDistributedMPICUDAAware` - Set this to `true` if your MPI build is CUDA-aware.
2. `LuxDistributedMPIROCMAware` - Set this to `true` if your MPI build is ROCm-aware.

By default, both of these values are set to `false`.

## Migration Guide from `FluxMPI.jl`

Let's compare the changes needed relative to the
[FluxMPI.jl integration guide](https://avik-pal.github.io/FluxMPI.jl/dev/guide/).

1. `FluxMPI.Init` is now [`DistributedUtils.initialize`](@ref).
2. `FluxMPI.synchronize!(x)` needs to be changed to
   `x_new = DistributedUtils.synchronize!!(backend, x)`.
3. [`DistributedUtils.DistributedDataContainer`](@ref),
   [`DistributedUtils.local_rank`](@ref), and
   [`DistributedUtils.DistributedOptimizer`](@ref) need `backend` as the first input.

And that's pretty much it!

### Removed Functionality

1. `FluxMPI.allreduce_gradients` no longer exists. Previously, this was needed when CUDA
   communication was flaky; with `NCCL.jl` this is no longer an issue.
2. `FluxMPIFluxModel` has been removed. `DistributedUtils` no longer works with `Flux`.

### Key Differences

1. `FluxMPI.synchronize!` is now `DistributedUtils.synchronize!!`, to highlight the fact
   that some of the inputs are not updated in-place.
2. All of the functions now require a [communication backend](@ref communication-backends)
   as input.
3. We don't automatically determine whether the MPI implementation is CUDA- or ROCm-aware.
   See [GPU-aware MPI](@ref gpu-aware-mpi) for more information.
4. Older [`Lux.gpu`](@ref) implementations used to "just work" with `FluxMPI.jl`. We expect
   [`gpu_device`](@ref) to continue working as expected; however, we recommend calling
   [`gpu_device`](@ref) after [`DistributedUtils.initialize`](@ref) to avoid any mismatch
   between the device set via `DistributedUtils` and the device stored in
   [`LuxCUDADevice`](@ref) or [`LuxAMDGPUDevice`](@ref).
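## Putting It Together

The snippet below sketches how the steps from the integration guide above fit together. It
is illustrative rather than a tested training script: the model, array sizes, and
hyperparameters are placeholders, and it assumes `MPI.jl`, `MLUtils.jl`, and `Optimisers.jl`
are installed (swap `MPIBackend` for `NCCLBackend` on CUDA clusters with `NCCL.jl` loaded).
For a complete pipeline, see the ImageNet example linked at the top of this page.

```julia
using Lux, MLUtils, MPI, Optimisers, Random

# Initialize the backend and query the process topology.
DistributedUtils.initialize(MPIBackend)
backend = DistributedUtils.get_distributed_backend(MPIBackend)
rank = DistributedUtils.local_rank(backend)
nworkers = DistributedUtils.total_workers(backend)

# Set up a model and synchronize parameters and states across processes.
model = Chain(Dense(4 => 32, tanh), Dense(32 => 2))
ps, st = Lux.setup(Xoshiro(1234), model)
ps = DistributedUtils.synchronize!!(backend, ps)
st = DistributedUtils.synchronize!!(backend, st)

# Wrap the optimizer so that gradients are averaged across processes (via Allreduce)
# whenever the optimizer is applied.
opt = DistributedUtils.DistributedOptimizer(backend, Adam(0.001f0))

# Partition the dataset; each process only iterates over its own shard.
data = DistributedUtils.DistributedDataContainer(backend, randn(Float32, 4, 1024))
dataloader = DataLoader(data; batchsize=32, shuffle=true)

# From here, run the usual Lux training loop over `dataloader`, computing gradients on
# the local batch and updating `ps` with the wrapped optimizer `opt`.
```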
diff --git a/ext/LuxFluxExt.jl b/ext/LuxFluxExt.jl
index c9a5f8d53..9dc249548 100644
--- a/ext/LuxFluxExt.jl
+++ b/ext/LuxFluxExt.jl
@@ -143,8 +143,8 @@ function __from_flux_adaptor(l::Flux.ConvTranspose; preserve_ps_st::Bool=false,
    if preserve_ps_st
        _bias = l.bias isa Bool ? nothing :
                reshape(copy(l.bias), ntuple(_ -> 1, length(k))..., out_chs, 1)
-        return Lux.ConvTranspose(k, in_chs * groups => out_chs, l.σ; l.stride, pad,
-            l.dilation, groups, use_bias=!(l.bias isa Bool),
+        return Lux.ConvTranspose(k, in_chs * groups => out_chs, l.σ; l.stride,
+            pad, l.dilation, groups, use_bias=!(l.bias isa Bool),
            init_weight=__copy_anonymous_closure(Lux._maybe_flip_conv_weight(l.weight)),
            init_bias=__copy_anonymous_closure(_bias))
    else
diff --git a/ext/LuxMLUtilsExt.jl b/ext/LuxMLUtilsExt.jl
index e27b9ec8a..640041ff4 100644
--- a/ext/LuxMLUtilsExt.jl
+++ b/ext/LuxMLUtilsExt.jl
@@ -19,4 +19,4 @@ function MLUtils.getobs(dc::DistributedUtils.DistributedDataContainer, idx)
    return MLUtils.getobs(dc.data, dc.idxs[idx])
end

-end
\ No newline at end of file
+end
diff --git a/ext/LuxMPINCCLExt.jl b/ext/LuxMPINCCLExt.jl
index da82ce1ed..2bff97cbe 100644
--- a/ext/LuxMPINCCLExt.jl
+++ b/ext/LuxMPINCCLExt.jl
@@ -8,15 +8,15 @@ using Setfield: @set!

function DistributedUtils.__initialize(
        ::Type{NCCLBackend}; cuda_devices=nothing, amdgpu_devices=missing)
-    DistributedUtils.NCCL_Initialized[] = true
    @assert amdgpu_devices===missing "`AMDGPU` is not supported by `NCCL`."
-    DistributedUtils.__initialize(Val(:MPI); cuda_devices, amdgpu_devices)
+    DistributedUtils.__initialize(MPIBackend; cuda_devices, amdgpu_devices)
+    DistributedUtils.NCCL_Initialized[] = true
    return
end

function DistributedUtils.__get_distributed_backend(::Type{NCCLBackend})
    unique_id = NCCL.UniqueID()  # Generate on all ranks to know the type
-    mpi_backend = DistributedUtils.__get_distributed_backend(Val(:MPI))
+    mpi_backend = DistributedUtils.__get_distributed_backend(MPIBackend)
    buf = [unique_id.internal...]
    DistributedUtils.bcast!(mpi_backend, buf; root=0)
    @set! unique_id.internal = Tuple(buf)
diff --git a/src/distributed/backend.jl b/src/distributed/backend.jl
index 5e9bfa209..dbfb81936 100644
--- a/src/distributed/backend.jl
+++ b/src/distributed/backend.jl
@@ -4,7 +4,7 @@ abstract type AbstractLuxDistributedBackend end
    MPIBackend(comm = nothing)

Create an MPI backend for distributed training. Users should not use this function directly.
-Instead use [`DistributedUtils.get_distributed_backend(Val(:NCCL))`](@ref).
+Instead use [`DistributedUtils.get_distributed_backend(MPIBackend)`](@ref).
"""
struct MPIBackend{C} <: AbstractLuxDistributedBackend
    comm::C
@@ -21,7 +21,7 @@ end
    NCCLBackend(comm = nothing, mpi_backend = nothing)

Create an NCCL backend for distributed training. Users should not use this function
-directly. Instead use [`DistributedUtils.get_distributed_backend(Val(:NCCL))`](@ref).
+directly. Instead use [`DistributedUtils.get_distributed_backend(NCCLBackend)`](@ref).
"""
struct NCCLBackend{C, M <: Union{Nothing, MPIBackend}} <: AbstractLuxDistributedBackend
    comm::C
diff --git a/src/distributed/public_api.jl b/src/distributed/public_api.jl
index 4f65f5cc8..2ede31468 100644
--- a/src/distributed/public_api.jl
+++ b/src/distributed/public_api.jl
@@ -220,6 +220,10 @@ end
`data` must be compatible with `MLUtils` interface. The returned container is compatible
with `MLUtils` interface and is used to partition the dataset across the available
processes.
+
+!!! danger
+
+    `MLUtils.jl` must be installed and loaded before using this.
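+
+A minimal usage sketch (assumes `MLUtils.jl` is loaded, `backend` was constructed via
+[`DistributedUtils.get_distributed_backend`](@ref), and the array sizes are illustrative):
+
+```julia
+using MLUtils
+
+x = randn(Float32, 10, 512)  # full dataset; every process holds it but only sees its shard
+dx = DistributedUtils.DistributedDataContainer(backend, x)
+
+# `dx` follows the `MLUtils` data interface, so it can be wrapped in a regular `DataLoader`.
+dataloader = MLUtils.DataLoader(dx; batchsize=32, shuffle=true)
+```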
""" @concrete struct DistributedDataContainer data @@ -250,6 +254,10 @@ averages the gradients across the processes using Allreduce. ## Arguments - `optimizer`: An Optimizer compatible with the Optimisers.jl package + +!!! danger + + `Optimisers.jl` must be installed and loaded before using this. """ function DistributedOptimizer(backend::AbstractLuxDistributedBackend, opt) mod = Base.get_extension(@__MODULE__, :LuxOptimisersExt)