Commit b26ed78: Add documentation

avik-pal committed Apr 6, 2024 (1 parent: 6293e2a)
Showing 13 changed files with 144 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -80,7 +80,7 @@ LuxCore = "0.1.12"
LuxDeviceUtils = "0.1.19"
LuxLib = "0.3.11"
LuxTestUtils = "0.1.15"
MLUtils = "0.4"
MLUtils = "0.4.3"
MacroTools = "0.5.13"
Markdown = "1.10"
NCCL = "0.1.1"
1 change: 1 addition & 0 deletions docs/Project.toml
@@ -14,5 +14,6 @@ LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
WeightInitializers = "d49dbf32-c5c2-4618-8acc-27bb2598ef2d"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
6 changes: 4 additions & 2 deletions docs/make.jl
@@ -38,14 +38,16 @@ pages = [
"manual/freezing_model_parameters.md",
"manual/gpu_management.md",
"manual/migrate_from_flux.md",
"manual/weight_initializers.md"
"manual/weight_initializers.md",
"manual/distributed_utils.md"
],
"API Reference" => [
"Lux" => [
"api/Lux/layers.md",
"api/Lux/utilities.md",
"api/Lux/contrib.md",
"api/Lux/switching_frameworks.md"
"api/Lux/switching_frameworks.md",
"api/Lux/distributed_utils.md",
],
"Accelerator Support" => [
"api/Accelerator_Support/LuxAMDGPU.md",
9 changes: 6 additions & 3 deletions docs/src/.vitepress/config.mts
@@ -73,7 +73,8 @@ export default defineConfig({
{ text: 'Built-In Layers', link: '/api/Lux/layers' },
{ text: 'Utilities', link: '/api/Lux/utilities' },
{ text: 'Experimental', link: '/api/Lux/contrib' },
-{ text: 'InterOp', link: '/api/Lux/switching_frameworks' }
+{ text: 'InterOp', link: '/api/Lux/switching_frameworks' },
+{ text: 'DistributedUtils', link: '/api/Lux/distributed_utils' }
]
},
{
@@ -146,7 +147,8 @@ export default defineConfig({
{ text: 'Freezing Model Parameters', link: '/manual/freezing_model_parameters' },
{ text: 'GPU Management', link: '/manual/gpu_management' },
{ text: 'Migrating from Flux to Lux', link: '/manual/migrate_from_flux' },
-{ text: 'Initializing Weights', link: '/manual/weight_initializers' }]
+{ text: 'Initializing Weights', link: '/manual/weight_initializers' },
+{ text: 'Distributed Data Parallel Training', link: '/manual/distributed_utils' },]
},
"/api/": {
text: 'API Reference', collapsed: false, items: [
@@ -155,7 +157,8 @@
{ text: 'Built-In Layers', link: '/api/Lux/layers' },
{ text: 'Utilities', link: '/api/Lux/utilities' },
{ text: 'Experimental Features', link: '/api/Lux/contrib' },
-{ text: 'Switching between Deep Learning Frameworks', link: '/api/Lux/switching_frameworks' }]
+{ text: 'Switching between Deep Learning Frameworks', link: '/api/Lux/switching_frameworks' },
+{ text: 'DistributedUtils', link: '/api/Lux/distributed_utils' }]
},
{
text: 'Accelerator Support', collapsed: false, items: [
58 changes: 58 additions & 0 deletions docs/src/api/Lux/distributed_utils.md
@@ -0,0 +1,58 @@
# Distributed Utils

!!! note

    These functionalities are available via the `Lux.DistributedUtils` module.

```@meta
CurrentModule = Lux
```

## Index

```@index
Pages = ["distributed_utils.md"]
```

## [Backends](@id communication-backends)

```@docs
MPIBackend
NCCLBackend
```

## Initialization

```@docs
DistributedUtils.initialize
DistributedUtils.initialized
DistributedUtils.get_distributed_backend
```

## Helper Functions

```@docs
DistributedUtils.local_rank
DistributedUtils.total_workers
```

## Communication Primitives

```@docs
DistributedUtils.allreduce!
DistributedUtils.bcast!
DistributedUtils.reduce!
DistributedUtils.synchronize!!
```

## Optimisers.jl Integration

```@docs
DistributedUtils.DistributedOptimizer
```
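
As a rough usage sketch (not part of the original page; `backend` and `ps` are placeholders for a backend obtained via `DistributedUtils.get_distributed_backend` and your model parameters):

```julia
using Optimisers

opt = Optimisers.Adam(0.001f0)
# Wrap the optimizer so that gradients are averaged across workers via Allreduce.
dopt = DistributedUtils.DistributedOptimizer(backend, opt)
opt_state = Optimisers.setup(dopt, ps)
```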

## MLUtils.jl Integration

```@docs
DistributedUtils.DistributedDataContainer
```
3 changes: 2 additions & 1 deletion docs/src/introduction/index.md
@@ -26,6 +26,7 @@ import Pkg; Pkg.add("Lux")
```@example quickstart
using Lux, Random, Optimisers, Zygote
# using LuxCUDA, LuxAMDGPU, Metal # Optional packages for GPU support
+using Printf # For pretty printing
```

We take randomness very seriously
Expand Down Expand Up @@ -117,7 +118,7 @@ for epoch in 1:1000
return sum(abs2, y .- y_data), st_
end
gs = only(pb((one(loss), nothing)))
-epoch % 100 == 1 && println("Epoch: $(epoch) | Loss: $(loss)")
+epoch % 100 == 1 && @printf "Epoch: %04d \t Loss: %10.9g\n" epoch loss
Optimisers.update!(st_opt, ps, gs)
end
```
55 changes: 55 additions & 0 deletions docs/src/manual/distributed_utils.md
@@ -0,0 +1,55 @@
# Distributed Data Parallel Training

!!! tip

    For a fully functional example, see the
    [ImageNet Training Example](https://github.com/LuxDL/Lux.jl/tree/main/examples/ImageNet).

DDP training with `Lux.DistributedUtils` is a spiritual successor to
[FluxMPI.jl](https://github.com/avik-pal/FluxMPI.jl), but with some key differences.

## Guide to Integrating DistributedUtils into your code
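
The typical flow is: initialize the backend, synchronize parameters and states across workers, shard the data, and wrap the optimizer so gradients are averaged across processes. The snippet below is a minimal sketch assembled from the `DistributedUtils` API; the model, data, and hyperparameters are placeholders, and it assumes `MPI.jl`, `NCCL.jl`, and a working CUDA setup are available (use `MPIBackend` on CPU-only clusters).

```julia
using Lux, MLUtils, Optimisers, Random

# 1. Initialize the backend and grab a handle to it.
DistributedUtils.initialize(NCCLBackend)
backend = DistributedUtils.get_distributed_backend(NCCLBackend)

# 2. Placeholder model, parameters, and states.
model = Chain(Dense(4 => 32, tanh), Dense(32 => 2))
rng = Xoshiro(0)
dev = gpu_device()  # query the device only after `initialize`
ps, st = Lux.setup(rng, model) .|> dev

# 3. Every worker must start from identical parameters and states.
ps = DistributedUtils.synchronize!!(backend, ps)
st = DistributedUtils.synchronize!!(backend, st)

# 4. Shard the dataset across workers and wrap it in a regular DataLoader.
x = randn(rng, Float32, 4, 1024)  # placeholder data
data = DistributedUtils.DistributedDataContainer(backend, x)
dataloader = DataLoader(data; batchsize=32, shuffle=true)

# 5. Wrap the optimizer so gradients are averaged across processes via Allreduce.
opt = DistributedUtils.DistributedOptimizer(backend, Adam(3.0f-4))
opt_state = Optimisers.setup(opt, ps)

# ...train as usual. `DistributedUtils.local_rank(backend)` and
# `DistributedUtils.total_workers(backend)` are useful for rank-aware logging.
```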

## [GPU-Aware MPI](@id gpu-aware-mpi)

If you are using a custom MPI build that supports CUDA or ROCm, you can use the following
preferences with [Preferences.jl](https://github.com/JuliaPackaging/Preferences.jl):

1. `LuxDistributedMPICUDAAware` - Set this to `true` if your MPI build is CUDA-aware.
2. `LuxDistributedMPIROCMAware` - Set this to `true` if your MPI build is ROCm-aware.

By default, both of these values are set to `false`.
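
For example, to mark a CUDA-aware build, the flag can be set with `Preferences.set_preferences!` (a sketch; restart Julia afterwards so the preference is picked up):

```julia
using Preferences, Lux

# Assumption: the preference is stored for the Lux package itself.
# Use "LuxDistributedMPIROCMAware" instead for a ROCm-aware build.
set_preferences!(Lux, "LuxDistributedMPICUDAAware" => true)
```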

## Migration Guide from `FluxMPI.jl`

Let's compare the changes needed relative to the
[FluxMPI.jl integration guide](https://avik-pal.github.io/FluxMPI.jl/dev/guide/).

1. `FluxMPI.Init` is now [`DistributedUtils.initialize`](@ref).
2. `FluxMPI.synchronize!(x)` needs to be changed to
`x_new = DistributedUtils.synchronize!!(backend, x)`.
3. [`DistributedUtils.DistributedDataContainer`](@ref),
[`DistributedUtils.local_rank`](@ref), and
[`DistributedUtils.DistributedOptimizer`](@ref) need `backend` as the first input.

And that's pretty much it!
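
In code, the migration looks roughly like this (a sketch; `ps` and `data` stand in for your parameters and dataset, with the old FluxMPI.jl calls shown as comments):

```julia
# FluxMPI.Init()                           # before
DistributedUtils.initialize(NCCLBackend)   # after
backend = DistributedUtils.get_distributed_backend(NCCLBackend)

# FluxMPI.synchronize!(ps)                         # before (in-place)
ps = DistributedUtils.synchronize!!(backend, ps)   # after (returns the synchronized value)

# data = DistributedDataContainer(data)                          # before
data = DistributedUtils.DistributedDataContainer(backend, data)  # after
```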

### Removed Functionality

1. `FluxMPI.allreduce_gradients` no longer exists. It was previously needed when CUDA
communication was flaky; with `NCCL.jl` this is no longer the case.
2. `FluxMPIFluxModel` has been removed. `DistributedUtils` no longer works with `Flux`.

### Key Differences

1. `FluxMPI.synchronize!` is now `DistributedUtils.synchronize!!` to highlight the fact
that some of the inputs are not updated in-place.
2. All of the functions now require a [communication backend](@ref communication-backends)
as input.
3. We don't automatically determine whether the MPI implementation is CUDA- or ROCm-aware. See
[GPU-aware MPI](@ref gpu-aware-mpi) for more information.
4. Older [`Lux.gpu`](@ref) implementations used to "just work" with `FluxMPI.jl`. We expect
[`gpu_device`](@ref) to continue working as expected; however, we recommend calling
[`gpu_device`](@ref) only after [`DistributedUtils.initialize`](@ref) to avoid any
mismatch between the device set via `DistributedUtils` and the device stored in
[`LuxCUDADevice`](@ref) or [`LuxAMDGPUDevice`](@ref), as sketched below.
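
A minimal sketch of the recommended ordering:

```julia
DistributedUtils.initialize(NCCLBackend)
backend = DistributedUtils.get_distributed_backend(NCCLBackend)
dev = gpu_device()  # only query the device after the backend has been initialized
```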
2 changes: 1 addition & 1 deletion ext/LuxComponentArraysExt.jl
@@ -1,7 +1,7 @@
module LuxComponentArraysExt

using ComponentArrays: ComponentArrays, ComponentArray, FlatAxis
-using Lux: Lux
+using Lux: Lux, DistributedUtils

# Empty NamedTuple: Hack to avoid breaking precompilation
function ComponentArrays.ComponentArray(data::Vector{Any}, axes::Tuple{FlatAxis})
4 changes: 2 additions & 2 deletions ext/LuxFluxExt.jl
@@ -143,8 +143,8 @@ function __from_flux_adaptor(l::Flux.ConvTranspose; preserve_ps_st::Bool=false,
if preserve_ps_st
_bias = l.bias isa Bool ? nothing :
reshape(copy(l.bias), ntuple(_ -> 1, length(k))..., out_chs, 1)
-return Lux.ConvTranspose(k, in_chs * groups => out_chs, l.σ; l.stride, pad,
-l.dilation, groups, use_bias=!(l.bias isa Bool),
+return Lux.ConvTranspose(k, in_chs * groups => out_chs, l.σ; l.stride,
+pad, l.dilation, groups, use_bias=!(l.bias isa Bool),
init_weight=__copy_anonymous_closure(Lux._maybe_flip_conv_weight(l.weight)),
init_bias=__copy_anonymous_closure(_bias))
else
2 changes: 1 addition & 1 deletion ext/LuxMLUtilsExt.jl
@@ -19,4 +19,4 @@ function MLUtils.getobs(dc::DistributedUtils.DistributedDataContainer, idx)
return MLUtils.getobs(dc.data, dc.idxs[idx])
end

-end
+end
6 changes: 3 additions & 3 deletions ext/LuxMPINCCLExt.jl
@@ -8,15 +8,15 @@ using Setfield: @set!

function DistributedUtils.__initialize(
::Type{NCCLBackend}; cuda_devices=nothing, amdgpu_devices=missing)
-DistributedUtils.NCCL_Initialized[] = true
@assert amdgpu_devices===missing "`AMDGPU` is not supported by `NCCL`."
-DistributedUtils.__initialize(Val(:MPI); cuda_devices, amdgpu_devices)
+DistributedUtils.__initialize(MPIBackend; cuda_devices, amdgpu_devices)
+DistributedUtils.NCCL_Initialized[] = true
return
end

function DistributedUtils.__get_distributed_backend(::Type{NCCLBackend})
unique_id = NCCL.UniqueID() # Generate on all ranks to know the type
-mpi_backend = DistributedUtils.__get_distributed_backend(Val(:MPI))
+mpi_backend = DistributedUtils.__get_distributed_backend(MPIBackend)
buf = [unique_id.internal...]
DistributedUtils.bcast!(mpi_backend, buf; root=0)
@set! unique_id.internal = Tuple(buf)
4 changes: 2 additions & 2 deletions src/distributed/backend.jl
@@ -4,7 +4,7 @@ abstract type AbstractLuxDistributedBackend end
MPIBackend(comm = nothing)
Create an MPI backend for distributed training. Users should not use this function directly.
-Instead use [`DistributedUtils.get_distributed_backend(Val(:NCCL))`](@ref).
+Instead use [`DistributedUtils.get_distributed_backend(MPIBackend)`](@ref).
"""
struct MPIBackend{C} <: AbstractLuxDistributedBackend
comm::C
@@ -21,7 +21,7 @@ end
NCCLBackend(comm = nothing, mpi_backend = nothing)
Create an NCCL backend for distributed training. Users should not use this function
-directly. Instead use [`DistributedUtils.get_distributed_backend(Val(:NCCL))`](@ref).
+directly. Instead use [`DistributedUtils.get_distributed_backend(NCCLBackend)`](@ref).
"""
struct NCCLBackend{C, M <: Union{Nothing, MPIBackend}} <: AbstractLuxDistributedBackend
comm::C
8 changes: 8 additions & 0 deletions src/distributed/public_api.jl
@@ -220,6 +220,10 @@ end
`data` must be compatible with `MLUtils` interface. The returned container is compatible
with `MLUtils` interface and is used to partition the dataset across the available
processes.
+!!! danger
+    `MLUtils.jl` must be installed and loaded before using this.
"""
@concrete struct DistributedDataContainer
data
@@ -250,6 +254,10 @@ averages the gradients across the processes using Allreduce.
## Arguments
- `optimizer`: An Optimizer compatible with the Optimisers.jl package
+!!! danger
+    `Optimisers.jl` must be installed and loaded before using this.
"""
function DistributedOptimizer(backend::AbstractLuxDistributedBackend, opt)
mod = Base.get_extension(@__MODULE__, :LuxOptimisersExt)
Expand Down

0 comments on commit b26ed78

Please sign in to comment.