diff --git a/docs/Project.toml b/docs/Project.toml
index 61d676d..595c9a9 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,6 +1,7 @@
 [deps]
 Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244"
 DocumenterVitepress = "4710194d-e776-4893-9690-8d956a29c365"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
diff --git a/docs/make.jl b/docs/make.jl
index 86415e6..9cc8848 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,4 +1,4 @@
-using Documenter, DocumenterVitepress, Boltz
+using Documenter, DocumenterVitepress, DocumenterCitations, Boltz
 
 #! format: off
 pages = [
@@ -16,6 +16,11 @@ pages = [
 ]
 #! format: on
 
+bib = CitationBibliography(
+    joinpath(@__DIR__, "ref.bib");
+    style=:authoryear
+)
+
 deploy_config = Documenter.auto_detect_deploy_system()
 deploy_decision = Documenter.deploy_folder(deploy_config; repo="github.com/LuxDL/Boltz.jl",
     devbranch="main", devurl="dev", push_preview=true)
@@ -30,6 +35,7 @@ makedocs(;
     sitename="Boltz.jl Docs",
     format=DocumenterVitepress.MarkdownVitepress(; repo="github.com/LuxDL/Boltz.jl", devbranch="main", devurl="dev", deploy_decision),
     draft=false,
+    plugins=[bib],
     pages)
 
 deploydocs(; repo="github.com/LuxDL/Boltz.jl.git",
diff --git a/docs/ref.bib b/docs/ref.bib
new file mode 100644
index 0000000..da6e339
--- /dev/null
+++ b/docs/ref.bib
@@ -0,0 +1,91 @@
+@article{dosovitskiy2020image,
+    title = {An image is worth 16x16 words: Transformers for image recognition at scale},
+    author = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and others},
+    journal = {arXiv preprint arXiv:2010.11929},
+    year = {2020}
+}
+
+@article{simonyan2014very,
+    title = {Very deep convolutional networks for large-scale image recognition},
+    author = {Simonyan, Karen and Zisserman, Andrew},
+    journal = {arXiv preprint arXiv:1409.1556},
+    year = {2014}
+}
+
+@article{greydanus2019hamiltonian,
+    title = {Hamiltonian neural networks},
+    author = {Greydanus, Samuel and Dzamba, Misko and Yosinski, Jason},
+    journal = {Advances in neural information processing systems},
+    volume = {32},
+    year = {2019}
+}
+
+@article{krizhevsky2012imagenet,
+    title = {Imagenet classification with deep convolutional neural networks},
+    author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
+    journal = {Advances in neural information processing systems},
+    volume = {25},
+    year = {2012}
+}
+
+@inproceedings{he2016deep,
+    title = {Deep residual learning for image recognition},
+    author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
+    booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
+    pages = {770--778},
+    year = {2016}
+}
+
+@inproceedings{xie2017aggregated,
+    title = {Aggregated residual transformations for deep neural networks},
+    author = {Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
+    booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
+    pages = {1492--1500},
+    year = {2017}
+}
+
+@inproceedings{szegedy2015going,
+    title = {Going deeper with convolutions},
+    author = {Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew},
+    booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
+    pages = {1--9},
+    year = {2015}
+}
+
+@inproceedings{huang2017densely,
+    title = {Densely connected convolutional networks},
+    author = {Huang, Gao and Liu, Zhuang and Van Der Maaten, Laurens and Weinberger, Kilian Q},
+    booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
+    pages = {4700--4708},
+    year = {2017}
+}
+
+@article{trockman2022patches,
+    title = {Patches are all you need?},
+    author = {Trockman, Asher and Kolter, J Zico},
+    journal = {arXiv preprint arXiv:2201.09792},
+    year = {2022}
+}
+
+@article{howard2017mobilenets,
+    title = {MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications},
+    author = {Howard, Andrew G and others},
+    journal = {arXiv preprint arXiv:1704.04861},
+    year = {2017}
+}
+
+@inproceedings{sandler2018mobilenetv2,
+    title = {Mobilenetv2: Inverted residuals and linear bottlenecks},
+    author = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
+    booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
+    pages = {4510--4520},
+    year = {2018}
+}
+
+@inproceedings{howard2019searching,
+    title = {Searching for mobilenetv3},
+    author = {Howard, Andrew and Sandler, Mark and Chu, Grace and Chen, Liang-Chieh and Chen, Bo and Tan, Mingxing and Wang, Weijun and Zhu, Yukun and Pang, Ruoming and Vasudevan, Vijay and others},
+    booktitle = {Proceedings of the IEEE/CVF international conference on computer vision},
+    pages = {1314--1324},
+    year = {2019}
+}
diff --git a/docs/src/api/layers.md b/docs/src/api/layers.md
index 0dd6038..66a1692 100644
--- a/docs/src/api/layers.md
+++ b/docs/src/api/layers.md
@@ -1,5 +1,12 @@
 # `Boltz.Layers` API Reference
 
+---
+
 ```@autodocs
 Modules = [Boltz.Layers]
 ```
+
+```@bibliography
+Pages = [@__FILE__]
+Style = :authoryear
+```
diff --git a/docs/src/api/vision.md b/docs/src/api/vision.md
index dcd7672..f64789a 100644
--- a/docs/src/api/vision.md
+++ b/docs/src/api/vision.md
@@ -46,3 +46,8 @@ Vision.ResNeXt
 
 All the pretrained models require that the images be normalized with the parameters
 `mean = [0.485f0, 0.456f0, 0.406f0]` and `std = [0.229f0, 0.224f0, 0.225f0]`.
+
+```@bibliography
+Pages = [@__FILE__]
+Style = :authoryear
+```
diff --git a/src/layers/encoder.jl b/src/layers/encoder.jl
index 7a5742e..086a626 100644
--- a/src/layers/encoder.jl
+++ b/src/layers/encoder.jl
@@ -2,7 +2,7 @@
     VisionTransformerEncoder(in_planes, depth, number_heads; mlp_ratio = 4.0f0,
                              dropout = 0.0f0)
 
-Transformer as used in the base ViT architecture.
+Transformer as used in the base ViT architecture [dosovitskiy2020image](@citep).
 
 ## Arguments
 
@@ -14,11 +14,6 @@ Transformer as used in the base ViT architecture.
 
   - `mlp_ratio`: ratio of MLP layers to the number of input channels
   - `dropout_rate`: dropout rate
-
-## References
-
-[1] Dosovitskiy, Alexey, et al. "An image is worth 16x16 words: Transformers for image
-recognition at scale." arXiv preprint arXiv:2010.11929 (2020).
 """
 function VisionTransformerEncoder(
     in_planes, depth, number_heads; mlp_ratio=4.0f0, dropout_rate=0.0f0)
diff --git a/src/layers/hamiltonian.jl b/src/layers/hamiltonian.jl
index f3209cd..5cad99d 100644
--- a/src/layers/hamiltonian.jl
+++ b/src/layers/hamiltonian.jl
@@ -1,11 +1,11 @@
 """
     HamiltonianNN{FST}(model; autodiff=nothing) where {FST}
 
-Constructs a Hamiltonian Neural Network [1]. This neural network is useful for learning
-symmetries and conservation laws by supervision on the gradients of the trajectories. It
-takes as input a concatenated vector of length `2n` containing the position (of size `n`)
-and momentum (of size `n`) of the particles. It then returns the time derivatives for
-position and momentum.
+Constructs a Hamiltonian Neural Network [greydanus2019hamiltonian](@citep). This neural
+network is useful for learning symmetries and conservation laws by supervision on the
+gradients of the trajectories. It takes as input a concatenated vector of length `2n`
+containing the position (of size `n`) and momentum (of size `n`) of the particles. It then
+returns the time derivatives for position and momentum.
 
 ## Arguments
 
@@ -35,11 +35,6 @@ position and momentum.
     This layer uses nested autodiff. Please refer to the manual entry on
     [Nested Autodiff](https://lux.csail.mit.edu/stable/manual/nested_autodiff) for more
    information and known limitations.
-
-## References
-
-[1] Greydanus, Samuel, Misko Dzamba, and Jason Yosinski. "Hamiltonian Neural Networks."
-Advances in Neural Information Processing Systems 32 (2019): 15379-15389.
 """
 @concrete struct HamiltonianNN{FST} <: AbstractExplicitContainerLayer{(:model,)}
     model
diff --git a/src/vision/extensions.jl b/src/vision/extensions.jl
index ae93950..62b541e 100644
--- a/src/vision/extensions.jl
+++ b/src/vision/extensions.jl
@@ -1,24 +1,18 @@
 """
     AlexNet(; kwargs...)
 
-Create an AlexNet model [1]
+Create an AlexNet model [krizhevsky2012imagenet](@citep).
 
 ## Keyword Arguments
 
 $(INITIALIZE_KWARGS)
-
-## References
-
-[1] Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton. "Imagenet classification with
-deep convolutional neural networks." Advances in neural information processing systems 25
-(2012): 1097-1105.
 """
 function AlexNet end
 
 """
     ResNet(depth::Int; kwargs...)
 
-Create a ResNet model [1].
+Create a ResNet model [he2016deep](@citep).
 
 ## Arguments
 
@@ -27,18 +21,13 @@ Create a ResNet model [1].
 ## Keyword Arguments
 
 $(INITIALIZE_KWARGS)
-
-## References
-
-[1] He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the
-    IEEE conference on computer vision and pattern recognition. 2016.
 """
 function ResNet end
 
 """
     ResNeXt(depth::Int; kwargs...)
 
-Create a ResNeXt model [1].
+Create a ResNeXt model [xie2017aggregated](@citep).
 
 ## Arguments
 
@@ -47,37 +36,24 @@ Create a ResNeXt model [1].
 ## Keyword Arguments
 
 $(INITIALIZE_KWARGS)
-
-## References
-
-[1] Saining Xie, Ross Girshick, Piotr Dollár, Zhuowen Tu, Kaiming He, Ross Gorshick, and
-    Piotr Dollár. "Aggregated residual transformations for deep neural networks."
-    Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
 """
 function ResNeXt end
 
 """
     GoogLeNet(; kwargs...)
 
-Create a GoogLeNet model [1].
+Create a GoogLeNet model [szegedy2015going](@citep).
 
 ## Keyword Arguments
 
 $(INITIALIZE_KWARGS)
-
-## References
-
-[1] Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, Dragomir Anguelov,
-    Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich. "Going deeper with
-    convolutions." Proceedings of the IEEE conference on computer vision and pattern
-    recognition. 2015.
 """
 function GoogLeNet end
 
 """
     DenseNet(depth::Int; kwargs...)
 
-Create a DenseNet model [1].
+Create a DenseNet model [huang2017densely](@citep).
 
 ## Arguments
 
@@ -86,19 +62,14 @@
 ## Keyword Arguments
 
 $(INITIALIZE_KWARGS)
-
-## References
-
-[1] Gao Huang, Zhuang Liu, Laurens van der Maaten, Kilian Q. Weinberger. "Densely connected
-    convolutional networks." Proceedings of the IEEE conference on computer vision and
-    pattern recognition. 2016.
 """
 function DenseNet end
 
 """
     MobileNet(name::Symbol; kwargs...)
 
-Create a MobileNet model [1, 2, 3].
+Create a MobileNet model
+[howard2017mobilenets, sandler2018mobilenetv2, howard2019searching](@citep).
 
 ## Arguments
 
@@ -108,23 +79,13 @@ Create a MobileNet model [1, 2, 3].
 ## Keyword Arguments
 
 $(INITIALIZE_KWARGS)
-
-## References
-
-[1] Howard, Andrew G., et al. "Mobilenets: Efficient convolutional neural networks for
-    mobile vision applications." arXiv preprint arXiv:1704.04861 (2017).
-[2] Sandler, Mark, et al. "Mobilenetv2: Inverted residuals and linear bottlenecks."
-    Proceedings of the IEEE conference on computer vision and pattern recognition. 2018.
-[3] Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias
-    Weyand, Marco Andreetto, Hartwig Adam. "Searching for MobileNetV3." arXiv preprint
-    arXiv:1905.02244. 2019.
 """
 function MobileNet end
 
 """
     ConvMixer(name::Symbol; kwargs...)
 
-Create a ConvMixer model [1].
+Create a ConvMixer model [trockman2022patches](@citep).
 
 ## Arguments
 
@@ -134,11 +95,6 @@ Create a ConvMixer model [1].
 ## Keyword Arguments
 
 $(INITIALIZE_KWARGS)
-
-## References
-
-[1] Zhu, Zhuoyuan, et al. "ConvMixer: A Convolutional Neural Network with Faster
-    Depth-wise Convolutions for Computer Vision." arXiv preprint arXiv:1911.11907 (2019).
 """
 function ConvMixer end
 
diff --git a/src/vision/vgg.jl b/src/vision/vgg.jl
index cd3380e..6b962bd 100644
--- a/src/vision/vgg.jl
+++ b/src/vision/vgg.jl
@@ -20,7 +20,7 @@ end
 """
     VGG(imsize; config, inchannels, batchnorm = false, nclasses, fcsize, dropout)
 
-Create a VGG model [1].
+Create a VGG model [simonyan2014very](@citep).
 
 ## Arguments
 
@@ -31,11 +31,6 @@ Create a VGG model [1].
   - `nclasses`: number of output classes
   - `fcsize`: intermediate fully connected layer size
   - `dropout`: dropout level between fully connected layers
-
-## References
-
-[1] Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for large-scale
-image recognition." arXiv preprint arXiv:1409.1556 (2014).
 """
 function VGG(imsize; config, inchannels, batchnorm=false, nclasses, fcsize, dropout)
     feature_extractor = vgg_convolutional_layers(config, batchnorm, inchannels)
@@ -63,7 +58,7 @@ const VGG_CONFIG = Dict(
 """
     VGG(depth::Int; batchnorm=false, kwargs...)
 
-Create a VGG model [1] with ImageNet Configuration.
+Create a VGG model [simonyan2014very](@citep) with ImageNet Configuration.
 
 ## Arguments
 
@@ -73,11 +68,6 @@ Create a VGG model [1] with ImageNet Configuration.
   * `batchnorm = false`: set to `true` to use batch normalization after each convolution.
 
 $(INITIALIZE_KWARGS)
-
-## References
-
-[1] Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for large-scale
-    image recognition." arXiv preprint arXiv:1409.1556 (2014).
 """
 function VGG(depth::Int; batchnorm::Bool=false, kwargs...)
     name = Symbol(:vgg, depth, ifelse(batchnorm, "_bn", ""))
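
The `docs/src/api/vision.md` hunk above keeps the note that pretrained weights expect inputs normalized with `mean = [0.485f0, 0.456f0, 0.406f0]` and `std = [0.229f0, 0.224f0, 0.225f0]`. A minimal sketch of that preprocessing step, not part of the diff; the WHCN layout, the `[0, 1]` input range, and the helper name `normalize_for_pretrained` are illustrative assumptions:

```julia
# Hypothetical helper: apply the normalization statistics quoted in docs/src/api/vision.md
# to a WHCN Float32 batch before feeding it to a pretrained Boltz.Vision model.
function normalize_for_pretrained(x::AbstractArray{Float32,4})
    mean = reshape(Float32[0.485, 0.456, 0.406], 1, 1, 3, 1)  # per-channel mean
    std = reshape(Float32[0.229, 0.224, 0.225], 1, 1, 3, 1)   # per-channel std
    return (x .- mean) ./ std
end

x = rand(Float32, 224, 224, 3, 4)      # assumed: 4 RGB images with values in [0, 1]
x_norm = normalize_for_pretrained(x)
```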
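Similarly, the reworded `HamiltonianNN` docstring still documents the `2n`-vector input convention (positions followed by momenta) and the `autodiff` keyword from the `HamiltonianNN{FST}(model; autodiff=nothing)` signature. A hedged usage sketch under those documented conventions; the `AutoZygote()` backend choice, the toy sizes, and the MLP architecture are assumptions rather than part of the diff:

```julia
using Boltz, Lux, Random
using ADTypes: AutoZygote
using Zygote  # assumed to be needed for the AutoZygote backend

n = 2                                                 # degrees of freedom per state half
mlp = Chain(Dense(2n => 16, tanh), Dense(16 => 1))    # scalar Hamiltonian H([q; p])
hnn = Boltz.Layers.HamiltonianNN{true}(mlp; autodiff=AutoZygote())

ps, st = Lux.setup(Random.default_rng(), hnn)
x = randn(Float32, 2n, 8)     # batch of 8 concatenated (position, momentum) vectors
ẋ, _ = hnn(x, ps, st)         # time derivatives for position and momentum, same shape as x
```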