diff --git a/README.md b/README.md index ebc19db..90aa951 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Boltz ⚡ [![Join the chat at https://julialang.zulipchat.com #machine-learning](https://img.shields.io/static/v1?label=Zulip&message=chat&color=9558b2&labelColor=389826)](https://julialang.zulipchat.com/#narrow/stream/machine-learning) -[![Latest Docs](https://img.shields.io/badge/docs-latest-blue.svg)](https://luxdl.github.io/Boltz.jl/dev) -[![Stable Docs](https://img.shields.io/badge/docs-stable-blue.svg)](https://luxdl.github.io/Boltz.jl/stable) +[![Latest Docs](https://img.shields.io/badge/docs-latest-blue.svg)](https://lux.csail.mit.edu/dev/api/) +[![Stable Docs](https://img.shields.io/badge/docs-stable-blue.svg)](https://lux.csail.mit.edu/stable/api/) [![CI](https://github.com/LuxDL/Boltz.jl/actions/workflows/CI.yml/badge.svg)](https://github.com/LuxDL/Boltz.jl/actions/workflows/CI.yml) [![codecov](https://codecov.io/gh/LuxDL/Boltz.jl/branch/main/graph/badge.svg?token=YBImUxz5qO)](https://codecov.io/gh/LuxDL/Boltz.jl) @@ -25,7 +25,7 @@ Pkg.add("Boltz") ```julia using Boltz, Lux, Metalhead -model, ps, st = resnet(:resnet18; pretrained=true) +model, ps, st = alexnet(:alexnet; pretrained=true) ``` ## Changelog diff --git a/src/utils.jl b/src/utils.jl index 9c9d133..cb6b02c 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -32,8 +32,6 @@ function assert_name_present_in(name, possibilities) @assert name in possibilities "`name` must be one of $(possibilities)" end -# TODO(@avik-pal): Starting v0.2 we should be storing only the parameters and some of the -# states. Fields like rng don't need to be stored explicitly. 
_get_pretrained_weights_path(name::Symbol) = _get_pretrained_weights_path(string(name)) function _get_pretrained_weights_path(name::String) try diff --git a/src/vision/vgg.jl b/src/vision/vgg.jl index 05dbbe2..2e80a09 100644 --- a/src/vision/vgg.jl +++ b/src/vision/vgg.jl @@ -30,7 +30,7 @@ Create VGG convolution layers ([reference](https://arxiv.org/abs/1409.1556v6)). # Arguments - `config`: vector of tuples `(output_channels, num_convolutions)` for each block - (see [`Metalhead._vgg_block`](#)) + (see `Metalhead._vgg_block`) - `batchnorm`: set to `true` to include batch normalization after each convolution - `inchannels`: number of input channels """ @@ -54,7 +54,7 @@ Create VGG classifier (fully connected) layers # Arguments - `imsize`: tuple `(width, height, channels)` indicating the size after the convolution - layers (see [`Metalhead._vgg_convolutional_layers`](#)) + layers (see `Metalhead._vgg_convolutional_layers`) - `nclasses`: number of output classes - `fcsize`: input and output size of the intermediate fully connected layer - `dropout`: the dropout level between each fully connected layer @@ -77,7 +77,7 @@ Create a VGG model ([reference](https://arxiv.org/abs/1409.1556v6)). - `batchnorm`: set to `true` to use batch normalization after each convolution - `nclasses`: number of output classes - `fcsize`: intermediate fully connected layer size - (see [`Metalhead._vgg_classifier_layers`](#)) + (see `Metalhead._vgg_classifier_layers`) - `dropout`: dropout level between fully connected layers """ function vgg(imsize; config, inchannels, batchnorm=false, nclasses, fcsize, dropout) diff --git a/src/vision/vit.jl b/src/vision/vit.jl index 2e8fd18..4521f74 100644 --- a/src/vision/vit.jl +++ b/src/vision/vit.jl @@ -90,9 +90,7 @@ struct ViPosEmbedding{I} <: Lux.AbstractExplicitLayer init::I end -function ViPosEmbedding(embedding_size::Int, - number_patches::Int; - init=(rng, dims...) 
-> randn(rng, Float32, dims...)) +function ViPosEmbedding(embedding_size::Int, number_patches::Int; init=randn32) return ViPosEmbedding(embedding_size, number_patches, init) end @@ -133,12 +131,9 @@ function transformer_encoder(in_planes, depth, number_heads; mlp_ratio=4.0f0, return Chain(layers...; disable_optimizations=true) end -function patch_embedding(imsize::Tuple{<:Int, <:Int}=(224, 224); - in_channels=3, - patch_size::Tuple{<:Int, <:Int}=(16, 16), - embed_planes=768, - norm_layer=in_planes -> NoOpLayer(), - flatten=true) +function patch_embedding(imsize::Tuple{<:Int, <:Int}=(224, 224); in_channels=3, + patch_size::Tuple{<:Int, <:Int}=(16, 16), embed_planes=768, + norm_layer=in_planes -> NoOpLayer(), flatten=true) im_width, im_height = imsize patch_width, patch_height = patch_size