From 8c1ce65961c60df8c58817cae6f1cb7b5e5d407a Mon Sep 17 00:00:00 2001
From: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com>
Date: Wed, 1 May 2024 15:52:36 -0700
Subject: [PATCH 1/2] Fix docs errors and most warnings (#9006)

* add various docs fixes

Signed-off-by: Elena Rastorgueva

* make conf.py changes clearer

Signed-off-by: Elena Rastorgueva

* fix Duplicate explicit target name error for links

Signed-off-by: Elena Rastorgueva

* more fixes, mainly citations

Signed-off-by: Elena Rastorgueva

* fix some code formatting

Signed-off-by: Elena Rastorgueva

* update hf space iframe link

Signed-off-by: Elena Rastorgueva

* fix new ERRORs

Signed-off-by: Elena Rastorgueva

* Update docs

Signed-off-by: yaoyu-33

---------

Signed-off-by: Elena Rastorgueva
Signed-off-by: yaoyu-33
Co-authored-by: yaoyu-33
Co-authored-by: Eric Harper
---
 docs/source/asr/datasets.rst                  | 53 ++++++------
 docs/source/asr/intro.rst                     |  4 +-
 docs/source/asr/models.rst                    |  4 +-
 docs/source/asr/speech_intent_slot/api.rst    |  2 +
 docs/source/asr/ssl/api.rst                   |  2 +
 docs/source/ckpt_converters/dev_guide.rst     |  4 +-
 docs/source/ckpt_converters/user_guide.rst    | 84 +++++++++----------
 docs/source/conf.py                           |  3 +-
 docs/source/core/adapters/api.rst             |  7 ++
 docs/source/core/adapters/components.rst      | 12 ++-
 docs/source/core/adapters/intro.rst           |  1 +
 docs/source/core/core.rst                     | 11 +--
 docs/source/core/exp_manager.rst              |  1 +
 docs/source/core/export.rst                   |  3 +-
 docs/source/core/neural_types.rst             |  3 +
 docs/source/features/memory_optimizations.rst | 13 +--
 docs/source/multimodal/api.rst                |  9 +-
 docs/source/multimodal/mllm/checkpoint.rst    | 10 +--
 docs/source/multimodal/nerf/dreamfusion.rst   |  6 +-
 .../source/multimodal/text2img/controlnet.rst |  8 +-
 .../source/multimodal/text2img/dreambooth.rst |  8 +-
 docs/source/multimodal/text2img/imagen.rst    | 10 +--
 docs/source/multimodal/text2img/insp2p.rst    |  6 +-
 docs/source/multimodal/text2img/intro.rst     |  1 +
 .../multimodal/text2img/sdxl_quantization.rst | 10 ++-
 docs/source/multimodal/vlm/clip.rst           |  6 +-
 docs/source/nlp/api.rst                       | 19 ++---
 docs/source/nlp/information_retrieval.rst     |  2 +-
 .../machine_translation.rst                   |  8 +-
 .../nlp/nemo_megatron/gpt/gpt_training.rst    |  2 +-
 .../nemo_megatron/positional_embeddings.rst   | 28 +++----
 ...ation_and_capitalization_lexical_audio.rst |  6 +-
 .../text_normalization_as_tagging.rst         |  8 +-
 docs/source/starthere/best-practices.rst      |  2 +-
 docs/source/starthere/migration-guide.rst     | 20 ++---
 docs/source/tools/nemo_forced_aligner.rst     |  8 +-
 docs/source/vision/checkpoint.rst             |  2 +-
 docs/source/vision/vit.rst                    |  6 +-
 nemo/collections/asr/models/asr_model.py      |  4 +-
 nemo/collections/asr/models/msdd_models.py    | 13 ++-
 nemo/collections/asr/modules/rnnt.py          | 23 +++--
 .../tokenizers/huggingface/auto_tokenizer.py  | 11 ++-
 .../language_modeling/megatron/t5_dataset.py  |  3 +-
 .../megatron/t5_prompt_learning_dataset.py    |  4 +-
 .../language_modeling/megatron/ul2_dataset.py |  4 +-
 .../megatron_bert_embedding_model.py          |  8 +-
 .../language_modeling/megatron_bert_model.py  |  8 +-
 .../language_modeling/megatron_gpt_model.py   |  8 +-
 .../megatron_lm_encoder_decoder_model.py      | 12 ++-
 .../common/transformer/text_generation.py     | 57 ++++++-------
 .../megatron_vit_classification_models.py     |  8 +-
 nemo/core/classes/dataset.py                  | 15 ++--
 nemo/utils/exp_manager.py                     |  4 +-
 53 files changed, 306 insertions(+), 268 deletions(-)

diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst
index b4656eec3f3f..a6e9cbe96c63 100644
--- a/docs/source/asr/datasets.rst
+++ b/docs/source/asr/datasets.rst
@@ -261,11 +261,6 @@ Semi Sorted Batching
 Sorting samples by duration and splitting them into batches speeds up training, but can degrade the quality of the model. To avoid quality degradation and maintain some randomness in the partitioning process, we add pseudo noise to the sample length when sorting.
 
-.. image:: images/ssb.png
-    :align: center
-    :alt: semi sorted batching
-    :scale: 50%
-
 It may result in a training speedup of more than 40 percent with the same quality. To enable and use semi sorted batching, add some lines to the config.
 
 .. code::
@@ -772,30 +767,30 @@ To enable multimodal dataloading, we provide several configuration options:
 
 Example 3. Combine an ASR (audio-text) dataset with an MT (text-only) dataset so that mini-batches have some examples from both datasets. Provide a custom prompt field for both datasets (to be leveraged by a relevant dataset class):
 
-```yaml
-use_multimodal_sampling: true
-batch_tokens: 1024
-token_equivalent_duration: 0.08 # 0.01 frame shift * 8 subsampling factor
-quadratic_factor: 50
-num_buckets: 30
-use_bucketing: true
-input_cfg:
-  - type: nemo_tarred
-    manifest_filepath: /path/to/manifest__OP_0..512_CL_.json
-    tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar
-    weight: 0.5
-    tags:
-      lang: en
-      prompt: "Given the following recording, transcribe what the person is saying:"
-  - type: txt_pair
-    source_path: /path/to/en__OP_0..512_CL_.txt
-    target_path: /path/to/pl__OP_0..512_CL_.txt
-    source_language: en
-    target_language: pl
-    weight: 0.5
-    tags:
-      prompt: "Translate the following text to Polish:"
-```
+.. code-block:: yaml
+
+    use_multimodal_sampling: true
+    batch_tokens: 1024
+    token_equivalent_duration: 0.08 # 0.01 frame shift * 8 subsampling factor
+    quadratic_factor: 50
+    num_buckets: 30
+    use_bucketing: true
+    input_cfg:
+      - type: nemo_tarred
+        manifest_filepath: /path/to/manifest__OP_0..512_CL_.json
+        tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar
+        weight: 0.5
+        tags:
+          lang: en
+          prompt: "Given the following recording, transcribe what the person is saying:"
+      - type: txt_pair
+        source_path: /path/to/en__OP_0..512_CL_.txt
+        target_path: /path/to/pl__OP_0..512_CL_.txt
+        source_language: en
+        target_language: pl
+        weight: 0.5
+        tags:
+          prompt: "Translate the following text to Polish:"
 
 .. caution:: We strongly recommend using multiple shards for text files as well so that different nodes and dataloading workers are able to randomize the order of text iteration. Otherwise, multi-GPU training has a high risk of duplication of text examples.
diff --git a/docs/source/asr/intro.rst b/docs/source/asr/intro.rst
index 7d1270af1267..d353b4d983dd 100644
--- a/docs/source/asr/intro.rst
+++ b/docs/source/asr/intro.rst
@@ -156,11 +156,11 @@ Canary-1B is a multi-lingual, multi-task model, supporting automatic speech-to-t
 
 .. raw:: html
 
-
diff --git a/docs/source/asr/models.rst b/docs/source/asr/models.rst
index 97dafcb2bf6d..f002137beb0f 100644
--- a/docs/source/asr/models.rst
+++ b/docs/source/asr/models.rst
@@ -46,12 +46,14 @@ HuggingFace Spaces to try out Parakeet models in your browser:
 * `Parakeet-TDT-1.1B `__ space
 
 .. _Conformer_model:
+
 Conformer
 ---------
+
 .. _Conformer-CTC_model:
+
 Conformer-CTC
 ~~~~~~~~~~~~~
--------------
 
 Conformer-CTC is a CTC-based variant of the Conformer model introduced in :cite:`asr-models-gulati2020conformer`. Conformer-CTC has a similar encoder as the original Conformer but uses CTC loss and decoding instead of RNNT/Transducer loss, which makes it a non-autoregressive model.
diff --git a/docs/source/asr/speech_intent_slot/api.rst b/docs/source/asr/speech_intent_slot/api.rst
index 735c583f9115..d45f24f807f6 100644
--- a/docs/source/asr/speech_intent_slot/api.rst
+++ b/docs/source/asr/speech_intent_slot/api.rst
@@ -15,8 +15,10 @@ Mixins
 .. autoclass:: nemo.collections.asr.parts.mixins.ASRModuleMixin
     :show-inheritance:
     :members:
+    :no-index:
 
 .. autoclass:: nemo.collections.asr.parts.mixins.ASRBPEMixin
     :show-inheritance:
     :members:
+    :no-index:
diff --git a/docs/source/asr/ssl/api.rst b/docs/source/asr/ssl/api.rst
index 7103243a4b20..8e6f83986032 100644
--- a/docs/source/asr/ssl/api.rst
+++ b/docs/source/asr/ssl/api.rst
@@ -15,10 +15,12 @@ Mixins
 .. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin
     :show-inheritance:
     :members:
+    :no-index:
 
 .. autoclass:: nemo.core.classes.mixins.access_mixins.AccessMixin
     :show-inheritance:
     :members:
+    :no-index:
diff --git a/docs/source/ckpt_converters/dev_guide.rst b/docs/source/ckpt_converters/dev_guide.rst
index 9faa752df2e1..601e69749b64 100644
--- a/docs/source/ckpt_converters/dev_guide.rst
+++ b/docs/source/ckpt_converters/dev_guide.rst
@@ -48,7 +48,7 @@ Script Placement and Naming Conventions
 
 Code Template
 -------------
 
-Below template tries to address the 11 steps in the guideline part. Please also use `Gemma Huggingface to NeMo converter `_ as an full example for development.
+Below template tries to address the 11 steps in the guideline part. Please also use `Gemma Huggingface to NeMo converter `__ as a full example for development.
 
 .. code-block:: python
 
@@ -210,7 +210,7 @@ A Simple Guide for Model Mapping and Conversion
 
 2. **Common issues when converting: results not matching between Community model and NeMo model**:
 
-    a. Megatron Core uses a special QKV layout, which needs careful handling and reshaping from community models, especially when GQA or MQA is used. Refer to the `Gemma Huggingface to NeMo converter `_ for guidance.
+    a. Megatron Core uses a special QKV layout, which needs careful handling and reshaping from community models, especially when GQA or MQA is used. Refer to the `Gemma Huggingface to NeMo converter `__ for guidance.
 
     b. GLU Variants weights could also be a common source of error. In Megatron Core, the regular feedforward projection weights and gated forward weights are fused together, requiring careful attention to the order of these two. Refer to the `Gemma Huggingface to NeMo converter `_ for more details.
diff --git a/docs/source/ckpt_converters/user_guide.rst b/docs/source/ckpt_converters/user_guide.rst
index 9de22f4b5994..451679a7e3ae 100644
--- a/docs/source/ckpt_converters/user_guide.rst
+++ b/docs/source/ckpt_converters/user_guide.rst
@@ -6,45 +6,45 @@ This guide provides instructions on how to use the conversion scripts to convert
 Support Matrix
 --------------
 
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Conversion | From | To | Github Link |
-+======================+==================+=====================+====================================================================================================================+
-| Baichuan | Hugging Face | NeMo | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Baichuan | NeMo | Hugging Face | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| BERT | Hugging Face | NeMo | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| BERT | NeMo | Hugging Face | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Falcon | Hugging Face | NeMo | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Falcon | NeMo | Hugging Face | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Gemma | Hugging Face | NeMo | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Gemma | JAX | NeMo | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Gemma | PyTorch | NeMo | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| GPT/LLaMA | NeMo (Legacy) | NeMo (Megatron-Core)| `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| LLaMA | Hugging Face | NeMo | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| LLaMA | NeMo | Hugging Face | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Mistral 7B | Hugging Face | NeMo | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Mistral 7B | NeMo | Hugging Face | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Mixtral | Hugging Face | NeMo | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Mixtral | NeMo | Hugging Face | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| MPT | Hugging Face | NeMo | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
-| Starcoder | Hugging Face | NeMo | `Link `_ |
-+----------------------+------------------+---------------------+--------------------------------------------------------------------------------------------------------------------+
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Conversion | From | To | Github Link |
++======================+==================+=====================+=====================================================================================================================+
+| Baichuan | Hugging Face | NeMo | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Baichuan | NeMo | Hugging Face | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| BERT | Hugging Face | NeMo | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| BERT | NeMo | Hugging Face | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Falcon | Hugging Face | NeMo | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Falcon | NeMo | Hugging Face | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Gemma | Hugging Face | NeMo | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Gemma | JAX | NeMo | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Gemma | PyTorch | NeMo | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| GPT/LLaMA | NeMo (Legacy) | NeMo (Megatron-Core)| `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| LLaMA | Hugging Face | NeMo | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| LLaMA | NeMo | Hugging Face | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Mistral 7B | Hugging Face | NeMo | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Mistral 7B | NeMo | Hugging Face | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Mixtral | Hugging Face | NeMo | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Mixtral | NeMo | Hugging Face | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| MPT | Hugging Face | NeMo | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
+| Starcoder | Hugging Face | NeMo | `Link `__ |
++----------------------+------------------+---------------------+---------------------------------------------------------------------------------------------------------------------+
 
 
 Convert Hugging Face LLaMA Checkpoints to NeMo
@@ -54,7 +54,7 @@ To convert a Hugging Face LLaMA checkpoint into a NeMo checkpoint, use the follo
 
 .. code-block:: bash
 
-    python convert_llama_hf_to_nemo.py>`_ \
+    python convert_llama_hf_to_nemo.py \
     --input_name_or_path  \
     --output_path 
@@ -67,7 +67,7 @@ To convert a NeMo checkpoint into a Hugging Face LLaMA checkpoint, you have two
 
 .. code-block:: bash
 
-    python convert__nemo_to_hf.py>`_ \
+    python convert__nemo_to_hf.py \
     --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \
     --output_path /path/to/pytorch_model.bin
 
@@ -75,7 +75,7 @@ To convert a NeMo checkpoint into a Hugging Face LLaMA checkpoint, you have two
 
 .. code-block:: bash
 
-    python convert__nemo_to_hf.py>`_ \
+    python convert__nemo_to_hf.py \
     --input_name_or_path /path/to/file.nemo or /path/to/extracted_folder \
     --output_path /path/to/model_folder \
     --hf_input_path /path/to/input_hf_folder \
diff --git a/docs/source/conf.py b/docs/source/conf.py
index e8fba7457605..c599f630d7f7 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -113,10 +113,9 @@
     "sphinx.ext.viewcode",
     "sphinx.ext.napoleon",
     "sphinx.ext.githubpages",
-    "sphinxcontrib.bibtex",
     "sphinx.ext.inheritance_diagram",
     "sphinx.ext.intersphinx",
-    "sphinx.ext.autosectionlabel",
+    # "sphinx.ext.autosectionlabel",
     "sphinxcontrib.bibtex",
     "sphinx_copybutton",
     "sphinxext.opengraph",
diff --git a/docs/source/core/adapters/api.rst b/docs/source/core/adapters/api.rst
index b0f2a8e13610..8922c72d63eb 100644
--- a/docs/source/core/adapters/api.rst
+++ b/docs/source/core/adapters/api.rst
@@ -9,6 +9,7 @@ Core
     :members:
     :member-order: bysource
     :undoc-members: adapter_module_names
+    :no-index:
 
 -----
 
@@ -17,6 +18,7 @@ Core
     :members:
     :member-order: bysource
     :undoc-members: adapter_module_names
+    :no-index:
 
 -----
 
@@ -28,6 +30,7 @@ Adapter Networks
     :show-inheritance:
     :members:
     :member-order: bysource
+    :no-index:
 
 -----
 
@@ -35,6 +38,7 @@ Adapter Networks
     :show-inheritance:
     :members:
     :member-order: bysource
+    :no-index:
 
 -----
 
@@ -47,6 +51,7 @@ Adapter Strategies
     :members:
     :member-order: bysource
     :undoc-members: adapter_module_names
+    :no-index:
 
 -----
 
@@ -55,6 +60,7 @@ Adapter Strategies
     :members:
     :member-order: bysource
     :undoc-members: adapter_module_names
+    :no-index:
 
 -----
 
@@ -63,3 +69,4 @@ Adapter Strategies
     :members:
     :member-order: bysource
     :undoc-members: adapter_module_names
+    :no-index:
diff --git a/docs/source/core/adapters/components.rst b/docs/source/core/adapters/components.rst
index cc2ea0b525df..d8bed1b23a75 100644
--- a/docs/source/core/adapters/components.rst
+++ b/docs/source/core/adapters/components.rst
@@ -8,7 +8,7 @@ An adapter module can be any pytorch module, but it must follow certain straight
 1) The model accepts an input of some input dimension, and its output must match this dimension.
 2) Ideally, the module is initialized such that the output of the adapter when initialized is such that it does not modify the original input. This allows the model to produce the same output results, even when additional parameters have been added.
 
-According to Junxian et al :cite:`adapters-Junxian2021unified`, we can consider an adapter being represented as three components -
+According to Junxian et al :cite:`adapters-components-Junxian2021unified`, we can consider an adapter being represented as three components -
 
 1) Functional form - the trainable parameters that will modify the input
 2) Insertion form - Where the adapter outputs are integrated with the original input. The input to the adapters can be the last output of the layer, the input to some attention layer, or even the original input to the module itself (before even the modules forward pass).
@@ -17,7 +17,7 @@ According to Junxian et al :cite:`adapters-Junxian2021unified`, we can consider
 
 Functional Form - Adapter Networks
 ==================================
 
-Adapter modules represent the functional form of the adapter. We discuss an example of a most commonly used adapter module found in literature, titled the ``LinearAdapter`` (or Houlsby Adapter) :cite:`adapters-houlsby2019adapter`.
+Adapter modules represent the functional form of the adapter. We discuss an example of a most commonly used adapter module found in literature, titled the ``LinearAdapter`` (or Houlsby Adapter) :cite:`adapters-components-houlsby2019adapter`.
 
 .. note::
 
@@ -28,6 +28,7 @@ Adapter modules represent the functional form of the adapter. We discuss an exam
     :show-inheritance:
     :members:
     :member-order: bysource
+    :no-index:
 
 -----
 
@@ -35,12 +36,13 @@ Adapter modules represent the functional form of the adapter. We discuss an exam
     :show-inheritance:
     :members:
     :member-order: bysource
+    :no-index:
 
 
 Insertion Form - Module Adapters
 --------------------------------
 
-Adapter modules can be integrated into many different locations of a given module. For example, it is possible to have an adapter that affects only the outputs of the final layer in each module. We can also have a ``Parallel Adapter`` :cite:`adapters-Junxian2021unified` that operates at the input of the module itself, in parallel to the forward pass of the module. Yet another insertion location is inside the Multi Head Attention Layers.
+Adapter modules can be integrated into many different locations of a given module. For example, it is possible to have an adapter that affects only the outputs of the final layer in each module. We can also have a ``Parallel Adapter`` :cite:`adapters-components-Junxian2021unified` that operates at the input of the module itself, in parallel to the forward pass of the module. Yet another insertion location is inside the Multi Head Attention Layers.
 
 On top of this, while adapters are commonly used only in the layers containing the most parameters (say the Encoder of a network), some models can support adapters in multiple locations (Encoder-Decoder architecture for Language Models, Machine Translation, or even Encoder-Decoder-Joint for ASR with Transducer Loss). As such, NeMo utilizes the concept of ``Module Adapters``.
 
@@ -70,6 +72,7 @@ We discuss a simple residual additional connection strategy below - that accepts
     :members:
     :member-order: bysource
     :undoc-members: adapter_module_names
+    :no-index:
 
 -----
 
@@ -78,6 +81,7 @@ We discuss a simple residual additional connection strategy below - that accepts
     :members:
     :member-order: bysource
     :undoc-members: adapter_module_names
+    :no-index:
 
 -----
 
@@ -87,4 +91,4 @@ References
 
 .. bibliography:: ./adapter_bib.bib
     :style: plain
-    :keyprefix: adapters-
+    :keyprefix: adapters-components-
diff --git a/docs/source/core/adapters/intro.rst b/docs/source/core/adapters/intro.rst
index fd94c8d23446..8c5e9cbc8895 100644
--- a/docs/source/core/adapters/intro.rst
+++ b/docs/source/core/adapters/intro.rst
@@ -144,4 +144,5 @@ References
 
 .. bibliography:: ./adapter_bib.bib
     :style: plain
+    :labelprefix: adapters
     :keyprefix: adapters-
diff --git a/docs/source/core/core.rst b/docs/source/core/core.rst
index 6e5efa56d5f0..1c9325cf0a96 100644
--- a/docs/source/core/core.rst
+++ b/docs/source/core/core.rst
@@ -16,9 +16,10 @@ NeMo models contain everything needed to train and reproduce Conversational AI m
 
 NeMo uses `Hydra `_ for configuring both NeMo models and the PyTorch Lightning Trainer.
 
-.. note:: Every NeMo model has an example configuration file and training script that can be found `here `_.
+.. note::
+    Every NeMo model has an example configuration file and training script that can be found `here `__.
 
-The end result of using NeMo, `Pytorch Lightning `_, and Hydra is that NeMo models all have the same look and feel and are also fully compatible with the PyTorch ecosystem.
+The end result of using NeMo, `Pytorch Lightning `__, and Hydra is that NeMo models all have the same look and feel and are also fully compatible with the PyTorch ecosystem.
 
 Pretrained
 ----------
@@ -42,14 +43,14 @@ To see all available pretrained models for a specific NeMo model, use the ``list
 
 For detailed information on the available pretrained models, refer to the collections documentation:
 
-- :ref:`Automatic Speech Recognition (ASR)`
+- :doc:`Automatic Speech Recognition (ASR) <../asr/intro>`
 - :doc:`Natural Language Processing (NLP) <../nlp/models>`
 - :doc:`Text-to-Speech Synthesis (TTS) <../tts/intro>`
 
 Training
 --------
 
-NeMo leverages `PyTorch Lightning `_ for model training. PyTorch Lightning lets NeMo decouple the
+NeMo leverages `PyTorch Lightning `__ for model training. PyTorch Lightning lets NeMo decouple the
 conversational AI code from the PyTorch training code. This means that NeMo users can focus on their domain (ASR, NLP, TTS) and
 build complex AI applications without having to rewrite boiler plate code for PyTorch training.
 
@@ -298,7 +299,7 @@ With NeMo and Hydra, every aspect of model training can be modified from the com
 of experiments on compute clusters or for quickly testing parameters while developing.
 
 All NeMo `examples `_ come with instructions on how to
-run the training/inference script from the command-line (see `here `_
+run the training/inference script from the command-line (see `here `__
 for an example).
 
 With Hydra, arguments are set using the ``=`` operator:
diff --git a/docs/source/core/exp_manager.rst b/docs/source/core/exp_manager.rst
index b44d27c38b4b..efb55b0feabb 100644
--- a/docs/source/core/exp_manager.rst
+++ b/docs/source/core/exp_manager.rst
@@ -379,3 +379,4 @@ ExpManagerConfig
     :show-inheritance:
     :members:
     :member-order: bysource
+    :no-index:
diff --git a/docs/source/core/export.rst b/docs/source/core/export.rst
index 990769452a5c..c53dd8159a60 100644
--- a/docs/source/core/export.rst
+++ b/docs/source/core/export.rst
@@ -194,7 +194,7 @@ To facilitate that, the hooks below are provided. To export, for example, 'encod
         First goes the one receiving input (input_example)
     """
 
-Some nertworks may be exported differently according to user-settable options (like ragged batch support for TTS or cache support for ASR). To facilitate that - `set_export_config()` method is provided by Exportable to set key/value pairs to predefined model.export_config dictionary, to be used during the export:
+Some networks may be exported differently according to user-settable options (like ragged batch support for TTS or cache support for ASR). To facilitate that - `set_export_config()` method is provided by Exportable to set key/value pairs to predefined model.export_config dictionary, to be used during the export:
 
 .. code-block:: Python
 
@@ -202,6 +202,7 @@ Some nertworks may be exported differently according to user-settable options (l
         """
         Sets/updates export_config dictionary
         """
+
 Also, if an action hook on setting config is desired, this method may be overloaded by `Exportable` descendants to include one.
 An example can be found in ``/nemo/collections/asr/models/rnnt_models.py``.
diff --git a/docs/source/core/neural_types.rst b/docs/source/core/neural_types.rst
index 9003b9ca5203..ec7d94336c05 100644
--- a/docs/source/core/neural_types.rst
+++ b/docs/source/core/neural_types.rst
@@ -24,6 +24,7 @@ Types are implemented in ``nemo.core.neural_types.NeuralType`` class. When you i
 are expected to include both *axes* information and *element type* information.
 
 .. autoclass:: nemo.core.neural_types.NeuralType
+    :no-index:
 
 Type Comparison Results
 -----------------------
@@ -31,6 +32,7 @@ Type Comparison Results
 When comparing two neural types, the following comparison results are generated.
 
 .. autoclass:: nemo.core.neural_types.NeuralTypeComparisonResult
+    :no-index:
 
 Examples
 --------
@@ -113,6 +115,7 @@ Custom element types
 It is possible to create user-defined element types to express the semantics of elements in your tensors. To do so, the user will need to inherit and implement abstract methods of the ``nemo.core.neural_types.elements.ElementType`` class
 
 .. autoclass:: nemo.core.neural_types.elements.ElementType
+    :no-index:
 
 Note that element types can be parametrized. Consider this example where it distinguishes between audio sampled at 8Khz and 16Khz.
diff --git a/docs/source/features/memory_optimizations.rst b/docs/source/features/memory_optimizations.rst
index 0e0b3ad84402..d72d54ab7c2c 100644
--- a/docs/source/features/memory_optimizations.rst
+++ b/docs/source/features/memory_optimizations.rst
@@ -3,7 +3,7 @@ Memory Optimizations
 
 Parallelism
 -----------
-Refer to :doc:`Parallelism <./parallelism>`.
+Refer to :doc:`Parallelism <./parallelisms>`.
 
 Flash Attention
 ---------------
@@ -20,10 +20,8 @@ In the NeMo Framework, Flash Attention is supported through the Transformer Engi
 
 For more details on the supported Dot Attention backend, please refer to the Transformer Engine source code available at `Transformer Engine's Attention Mechanism `_.
 
-.. bibliography:: ./nlp_all.bib
-    :style: plain
-    :labelprefix: nlp-megatron
-    :keyprefix: nlp-megatron-
+Activation Recomputation
+------------------------
 
 Overview
 ^^^^^^^^
@@ -41,8 +39,3 @@ Selective Activation Recomputation
 This method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost.
 
 Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198
-
-.. bibliography:: ./nlp_all.bib
-    :style: plain
-    :labelprefix: nlp-megatron
-    :keyprefix: nlp-megatron-
\ No newline at end of file
diff --git a/docs/source/multimodal/api.rst b/docs/source/multimodal/api.rst
index d6f96e6c6ea4..3228cd76d4ad 100644
--- a/docs/source/multimodal/api.rst
+++ b/docs/source/multimodal/api.rst
@@ -8,6 +8,7 @@ Model Classes
     :show-inheritance:
     :no-members:
     :members: __init__, configure_optimizers
+    :no-index:
 
 
 .. autoclass:: nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion
     :show-inheritance:
     :no-members:
     :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets
 
 
-.. autoclass:: nemo.collections.multimodal.models.dreambooth.dreambooth.MegatronDreamBooth
+.. autoclass:: nemo.collections.multimodal.models.text_to_image.dreambooth.dreambooth.MegatronDreamBooth
     :show-inheritance:
     :no-members:
     :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets
 
 
-.. autoclass:: nemo.collections.multimodal.models.controlnet.controlnet.MegatronControlNet
+.. autoclass:: nemo.collections.multimodal.models.text_to_image.controlnet.controlnet.MegatronControlNet
     :show-inheritance:
     :no-members:
     :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets
 
 
-.. autoclass:: nemo.collections.multimodal.models.imagen.imagen.MegatronImagen
+.. autoclass:: nemo.collections.multimodal.models.text_to_image.imagen.imagen.MegatronImagen
     :show-inheritance:
     :no-members:
     :members: __init__, training_step, validation_step, setup, build_train_valid_test_datasets
@@ -65,7 +66,7 @@ Modules
     :members: __init__, encode
 
 
-.. autoclass:: nemo.collections.multimodal.models.controlnet.controlnet.ControlledUnetModel
+.. autoclass:: nemo.collections.multimodal.models.text_to_image.controlnet.controlnet.ControlledUnetModel
     :show-inheritance:
     :no-members:
     :members: forward
diff --git a/docs/source/multimodal/mllm/checkpoint.rst b/docs/source/multimodal/mllm/checkpoint.rst
index 46c6da631ba2..d1fe7b651e66 100644
--- a/docs/source/multimodal/mllm/checkpoint.rst
+++ b/docs/source/multimodal/mllm/checkpoint.rst
@@ -41,7 +41,7 @@ Converting Local Checkpoints
 
 The training script only auto-converts the final checkpoint into the ``.nemo`` format. To evaluate intermediate training checkpoints, conversion to ``.nemo`` might be needed. For this:
 
-.. code-block:: python
+.. code-block:: bash
 
     python -m torch.distributed.launch --nproc_per_node= * \
         examples/multimodal/convert_ckpt_to_nemo.py \
@@ -59,12 +59,12 @@ NeVA Checkpoints
 
 Currently, the conversion mainly supports LLaVA checkpoints based on "llama-2 chat" checkpoints. As a reference, we'll consider the checkpoint `llava-llama-2-13b-chat-lightning-preview `_.
 
-After downloading this checkpoint and saving it at `/path/to/llava-llama-2-13b-chat-lightning-preview`, undertake the following procedures:
+After downloading this checkpoint and saving it at ``/path/to/llava-llama-2-13b-chat-lightning-preview``, undertake the following procedures:
 
 Modifying the Tokenizer
 """""""""""""""""""""""
 
-NeMo mandates adding specific tokens to the tokenizer model for peak performance. To modify an existing tokenizer located in `/path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer`, execute the following in the NeMo container:
+NeMo mandates adding specific tokens to the tokenizer model for peak performance. To modify an existing tokenizer located in ``/path/to/llava-llama-2-13b-chat-lightning-preview/tokenizer``, execute the following in the NeMo container:
 
 .. code-block:: bash
 
@@ -82,7 +82,7 @@ Checkpoint Conversion
 
 For conversion:
 
-.. code-block:: python
+.. code-block:: bash
 
     python examples/multimodal/mllm/neva/convert_hf_llava_to_neva.py \
         --in-file /path/to/llava-llama-2-13b-chat-lightning-preview \
@@ -99,7 +99,7 @@ NeVA Checkpoints
 
 Adjust model parallelism with:
 
-.. code-block:: python
+.. code-block:: bash
 
     python examples/nlp/language_modeling/megatron_change_num_partitions.py \
         --model_file=/path/to/source.nemo \
diff --git a/docs/source/multimodal/nerf/dreamfusion.rst b/docs/source/multimodal/nerf/dreamfusion.rst
index a9f2f630bcdd..d6c926392556 100644
--- a/docs/source/multimodal/nerf/dreamfusion.rst
+++ b/docs/source/multimodal/nerf/dreamfusion.rst
@@ -3,7 +3,7 @@ DreamFusion Model
 Introduction
 ------------
 
-DreamFusion :cite:`mm-models-poole2022dreamfusion` uses a pretrained text-to-image diffusion model to perform
+DreamFusion :cite:`mm-models-df-poole2022dreamfusion` uses a pretrained text-to-image diffusion model to perform
 text-to-3D synthesis. The model uses a loss based on probability density distillation that enables
 the use of a 2D diffusion model as a prior for optimization of a parametric image generator.
 
@@ -306,5 +306,5 @@ References
 
 .. bibliography:: ../mm_all.bib
     :style: plain
     :filter: docname in docnames
-    :labelprefix: MM-MODELS
-    :keyprefix: mm-models-
+    :labelprefix: MM-MODELS-DF
+    :keyprefix: mm-models-df-
diff --git a/docs/source/multimodal/text2img/controlnet.rst b/docs/source/multimodal/text2img/controlnet.rst
index 6eae36dd017a..b9f55031b79d 100644
--- a/docs/source/multimodal/text2img/controlnet.rst
+++ b/docs/source/multimodal/text2img/controlnet.rst
@@ -4,12 +4,12 @@ ControlNet
 Model Introduction
 ------------------
 
-ControlNet :cite:`mm-models-controlnetgithub` is a neural network structure to control diffusion models by adding extra conditions.
+ControlNet :cite:`mm-models-cn-controlnetgithub` is a neural network structure to control diffusion models by adding extra conditions.
 It copies the weights of neural network blocks into a "locked" copy and a "trainable" copy. The "trainable" one learns your condition. The "locked" one preserves your model. In this way, the ControlNet can reuse the SD encoder as a deep, strong, robust, and powerful backbone to learn diverse controls. NeMo Multimodal provides a training pipeline and example implementation for generating images based on segmentation maps. Users have the flexibility to explore other implementations using their own control input dataset and recipe.
 
 .. image:: ./images/controlnet-structure.png
-    :alt: ControlNet structure on stable diffusion (See :cite:`mm-models-controlnetgithub`)
+    :alt: ControlNet structure on stable diffusion (See :cite:`mm-models-cn-controlnetgithub`)
 
 
 ControlNet Dataset
@@ -102,5 +102,5 @@ Reference
 
 .. bibliography:: ../mm_all.bib
     :style: plain
     :filter: docname in docnames
-    :labelprefix: MM-MODELS
-    :keyprefix: mm-models-
+    :labelprefix: MM-MODELS-CN
+    :keyprefix: mm-models-cn-
diff --git a/docs/source/multimodal/text2img/dreambooth.rst b/docs/source/multimodal/text2img/dreambooth.rst
index fa7e52a7ccbb..1c6a420d49f2 100644
--- a/docs/source/multimodal/text2img/dreambooth.rst
+++ b/docs/source/multimodal/text2img/dreambooth.rst
@@ -5,7 +5,7 @@ DreamBooth
 Model Introduction
 ------------------
 
-DreamBooth :cite:`mm-models-dreamboothpaper` is a fine-tuning technique and a solution to personalize large diffusion models like Stable Diffusion, which are powerful but lack the
+DreamBooth :cite:`mm-models-db-dreamboothpaper` is a fine-tuning technique and a solution to personalize large diffusion models like Stable Diffusion, which are powerful but lack the
 ability to mimic subjects of a given reference set. With DreamBooth, you only need a few images of a specific subject to fine-tune a pretrained text-to-image model, so that it learns to bind a unique identifier with a special subject. This unique identifier can then be used to synthesize fully-novel photorealistic images of the subject contextualized in
 different scenes.
 
@@ -28,7 +28,7 @@ NeMo's Dreambooth is built upon the Stable Diffusion framework. While its archit
 
 - Training Dataset
 
-    NeMo's Dreambooth model dataset is different from other NeMo multimodal models in that it doesn't necessitate data stored in the webdataset format. You can find a sample dataset at :cite:`mm-models-dreamboothdataset`. For each object you aim to integrate into the model, just place its images (typically 3-5) in a folder and specify its path in ``model.data.instance_dir``. When training with the prior preservation loss, store images produced by the original model in a distinct folder and reference its path in ``model.data.regularization_dir``. This process is automated in NeMo's DreamBooth implementation.
+    NeMo's Dreambooth model dataset is different from other NeMo multimodal models in that it doesn't necessitate data stored in the webdataset format. You can find a sample dataset at :cite:`mm-models-db-dreamboothdataset`. For each object you aim to integrate into the model, just place its images (typically 3-5) in a folder and specify its path in ``model.data.instance_dir``. When training with the prior preservation loss, store images produced by the original model in a distinct folder and reference its path in ``model.data.regularization_dir``. This process is automated in NeMo's DreamBooth implementation.
 
 Model Configuration
 -------------------
@@ -130,5 +130,5 @@ Reference
 
 .. bibliography:: ../mm_all.bib
     :style: plain
     :filter: docname in docnames
-    :labelprefix: MM-MODELS
-    :keyprefix: mm-models-
+    :labelprefix: MM-MODELS-DB
+    :keyprefix: mm-models-db-
diff --git a/docs/source/multimodal/text2img/imagen.rst b/docs/source/multimodal/text2img/imagen.rst
index 9aeff2f2a061..844f68df747f 100644
--- a/docs/source/multimodal/text2img/imagen.rst
+++ b/docs/source/multimodal/text2img/imagen.rst
@@ -4,7 +4,7 @@ Imagen
 Model Introduction
 ------------------
 
-Imagen :cite:`mm-models-saharia2022photorealistic` is a multi-stage text-to-image diffusion model with an unprecedented
+Imagen :cite:`mm-models-imagen-saharia2022photorealistic` is a multi-stage text-to-image diffusion model with an unprecedented
 degree of photorealism and a deep level of language understanding. Given a text prompt,
 Imagen first generates an image at a 64x64 resolution and then upsamples the generated image to 256x256 and 1024x1024 resolutions, all using diffusion models.
 
@@ -75,9 +75,9 @@ Recommended Efficient UNet size for SR256 and SR1024 models are listed below:
 Noise Scheduling / Sampler
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-NeMo Imagen supports two types of noise scheduling: Continous DDPM :cite:`mm-models-nichol2021improved` and EDM :cite:`mm-models-karras2022elucidating`.
+NeMo Imagen supports two types of noise scheduling: Continuous DDPM :cite:`mm-models-imagen-nichol2021improved` and EDM :cite:`mm-models-imagen-karras2022elucidating`.
 
-Denoising diffusion probabilistic models (DDPM) :cite:`mm-models-ho2020denoising`
+Denoising diffusion probabilistic models (DDPM) :cite:`mm-models-imagen-ho2020denoising`
 represents the most widely adopted noise scheduling approach among all diffusion models. Continuous DDPM introduces several modifications to the standard DDPM framework, with the most significant change being the transition from a discrete noise space to a continuous space.
 
@@ -285,5 +285,5 @@ Reference
 
 .. bibliography:: ../mm_all.bib
     :style: plain
     :filter: docname in docnames
-    :labelprefix: MM-MODELS
-    :keyprefix: mm-models-
+    :labelprefix: MM-MODELS-IMAGEN
+    :keyprefix: mm-models-imagen-
diff --git a/docs/source/multimodal/text2img/insp2p.rst b/docs/source/multimodal/text2img/insp2p.rst
index 177734584bc7..282874444738 100644
--- a/docs/source/multimodal/text2img/insp2p.rst
+++ b/docs/source/multimodal/text2img/insp2p.rst
@@ -4,7 +4,7 @@ InstructPix2Pix
 Model Introduction
 ------------------
 
-InstructPix2Pix [InstructPix2Pix]_ :cite:`mm-models-insp2p` offers a unique approach to image editing using human-written instructions. Given an input image and a textual directive, the model adjusts the image according to the provided instructions. NeMo Multimodal presents a training pipeline for this conditional diffusion model, utilizing a dataset generated by harnessing the strengths of two prominent pretrained models: a language model (GPT-3) and a text-to-image model (Stable Diffusion). The InstructPix2Pix model operates swiftly, editing images within seconds, eliminating the need for per-example fine-tuning or inversion. It has demonstrated remarkable results across a wide variety of input images and written instructions.
+InstructPix2Pix [InstructPix2Pix]_ :cite:`mm-models-insp2p-insp2p` offers a unique approach to image editing using human-written instructions. Given an input image and a textual directive, the model adjusts the image according to the provided instructions. NeMo Multimodal presents a training pipeline for this conditional diffusion model, utilizing a dataset generated by harnessing the strengths of two prominent pretrained models: a language model (GPT-3) and a text-to-image model (Stable Diffusion). The InstructPix2Pix model operates swiftly, editing images within seconds, eliminating the need for per-example fine-tuning or inversion. It has demonstrated remarkable results across a wide variety of input images and written instructions.
 
 Built upon the Stable Diffusion framework, NeMo's InstructPix2Pix shares a similar architecture with Stable Diffusion (refer to :doc:`Stable Diffusion <./sd>`). What sets it apart is its unique training dataset and the combined guidance from both image and text prompts. Specifically, InstructPix2pix ::class::``nemo.collections.multimodal.models.instruct_pix2pix.ldm.ddpm_edit.MegatronLatentDiffusionEdit`` is derived directly from Stable Diffusion's ::class::``nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.ddpm.MegatronLatentDiffusion``, with alterations to accommodate the dataset and provide support for dual guidance.
 
@@ -79,7 +79,7 @@ References
 
 .. bibliography:: ../mm_all.bib
     :style: plain
     :filter: docname in docnames
-    :labelprefix: MM-MODELS
-    :keyprefix: mm-models-
+    :labelprefix: MM-MODELS-INSP2P
+    :keyprefix: mm-models-insp2p-
diff --git a/docs/source/multimodal/text2img/intro.rst b/docs/source/multimodal/text2img/intro.rst
index 3c3c17768679..599c9bae5e15 100644
--- a/docs/source/multimodal/text2img/intro.rst
+++ b/docs/source/multimodal/text2img/intro.rst
@@ -13,4 +13,5 @@ NeMo multimodal provides implementations of multiple image-to-text models, inclu
    imagen
    dreambooth
    controlnet
+   insp2p
    sdxl_quantization
diff --git a/docs/source/multimodal/text2img/sdxl_quantization.rst b/docs/source/multimodal/text2img/sdxl_quantization.rst
index 78403e9c402c..68bb7ff8d511 100644
--- a/docs/source/multimodal/text2img/sdxl_quantization.rst
+++ b/docs/source/multimodal/text2img/sdxl_quantization.rst
@@ -7,16 +7,17 @@ This example shows how to use Ammo to calibrate and quantize the UNet part of th
 We also provide instructions on deploying and running E2E SDXL pipeline
 with Ammo quantized int8 UNet to generate images and measure latency on target GPUs.
 
-To get started, it is required to have a pretrained SDXL checkpoint in `nemo` format. The example training configs are provided in NeMo,
-which is located in `NeMo/examples/multimodal/text2img/stable_diffusion`.
+To get started, it is required to have a pretrained SDXL checkpoint in ``nemo`` format. The example training configs are provided in NeMo,
+which is located in ``NeMo/examples/multimodal/text2img/stable_diffusion``.
 
 Calibration
 -----------
 
 The first step is to run quantization script with default config, and finally the script will export the quantized unet to onnx file.
-Here is the default config for `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_quantize.py`.
+Here is the default config for ``NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_quantize.py``.
 
 .. code-block:: yaml
+
     quantize
         exp_name: nemo
         n_steps: 20 # number of inference steps
@@ -41,6 +42,7 @@ Build the TRT engine for the Quantized ONNX UNet
 ------------------------------------------------------------
 
 .. code-block:: bash
+
     trtexec --onnx=./nemo_onnx/unet.onnx --shapes=x:8x4x128x128,timesteps:8,context:8x80x2048,y:8x2816 --fp16 --int8 --builderOptimizationLevel=4 --saveEngine=nemo_unet_xl.plan
 
@@ -57,6 +59,7 @@ Build End-to-end Stable Diffusion XL Pipeline with NeMo
 We provide a script to build end to end TRT inference pipeline with NeMo backend, which is located at `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_export.py`.
 
 .. code-block:: yaml
+
     infer:
         out_path: sdxl_export
         width: 1024
@@ -82,6 +85,7 @@ Run End-to-end Stable Diffusion XL TRT Pipeline
 The inference script can be found at `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_trt_inference.py`.
 
 .. code-block:: yaml
+
     unet_xl: sdxl_export/plan/unet_xl.plan
     vae: sdxl_export/plan/vae.plan
     clip1: sdxl_export/plan/clip1.plan
diff --git a/docs/source/multimodal/vlm/clip.rst b/docs/source/multimodal/vlm/clip.rst
index e28fb836ff4a..976baadb5a83 100644
--- a/docs/source/multimodal/vlm/clip.rst
+++ b/docs/source/multimodal/vlm/clip.rst
@@ -4,7 +4,7 @@ CLIP
 Model Introduction
 ------------------
 
-Contrastive Language-Image Pre-training (CLIP) :cite:`mm-models-radford2021learning` offers an efficient method for learning image representations using natural language supervision. The essence of CLIP is to train both an image encoder and a text encoder from scratch. The model aims to predict the correct pairings of a batch of (image, text) training examples by jointly training these encoders. During pre-training, CLIP is designed to predict which images and texts form a semantically coherent pair by maximizing the similarity between the correct (image, text) pairs while minimizing the similarity between incorrect pairs. This contrastive learning approach ensures that CLIP learns meaningful and contextually rich representations of both visual and textual data.
+Contrastive Language-Image Pre-training (CLIP) :cite:`mm-models-clip-radford2021learning` offers an efficient method for learning image representations using natural language supervision. The essence of CLIP is to train both an image encoder and a text encoder from scratch. The model aims to predict the correct pairings of a batch of (image, text) training examples by jointly training these encoders. During pre-training, CLIP is designed to predict which images and texts form a semantically coherent pair by maximizing the similarity between the correct (image, text) pairs while minimizing the similarity between incorrect pairs. This contrastive learning approach ensures that CLIP learns meaningful and contextually rich representations of both visual and textual data.
 
 NeMo's implementation of the CLIP model leverages its parallel transformer implementation, specifically the `nemo.collections.nlp.modules.common.megatron.transformer.ParallelTransformer`, to enable model parallelism support in both the text encoder and vision model. This design choice ensures efficient scaling and utilization of resources during training. Additionally, some of the model design and loss implementations in NeMo's CLIP are inspired by the open-source [open_clip](https://github.com/mlfoundations/open_clip) repository.
 
@@ -153,5 +153,5 @@ References
 
 .. bibliography:: ../mm_all.bib
     :style: plain
     :filter: docname in docnames
-    :labelprefix: MM-MODELS
-    :keyprefix: mm-models-
+    :labelprefix: MM-MODELS-CLIP
+    :keyprefix: mm-models-clip-
diff --git a/docs/source/nlp/api.rst b/docs/source/nlp/api.rst
index b9b4d529ba46..52c1b537b0bf 100755
--- a/docs/source/nlp/api.rst
+++ b/docs/source/nlp/api.rst
@@ -22,7 +22,7 @@ Pretraining Model Classes
 .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_bart_model.MegatronBARTModel
     :show-inheritance:
     :no-members:
-    :members: training_step, validation_step, build_train_valid_test_datasets, setup, on_save_checkpoint, on_load_checkpoint
+    :members: training_step, validation_step, build_train_valid_test_datasets, setup
 
 .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_retrieval_model.MegatronRetrievalModel
     :show-inheritance:
@@ -45,32 +45,27 @@ Customization Model Classes
 .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model.MegatronGPTAdapterLearningModel
     :show-inheritance:
     :no-members:
-    :members: __init__, state_dict, generate, training_step, validation_step, build_train_valid_test_datasets, setup
+    :members: __init__, state_dict, generate, training_step, validation_step, setup
 
 .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_adapter_model.MegatronGPTInfusedAdapterModel
     :show-inheritance:
     :no-members:
-    :members: __init__, state_dict, generate, training_step, validation_step, build_train_valid_test_datasets, setup
+    :members: __init__, state_dict, generate, training_step, validation_step, setup
 
 .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model.MegatronGPTPromptLearningModel
     :show-inheritance:
     :no-members:
-    :members: built_virtual_prompt_dataset, generate, training_step, validation_step, build_train_valid_test_datasets, setup
+    :members: build_virtual_prompt_dataset, generate, training_step, validation_step, setup
 
 .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5AdapterLearningModel
     :show-inheritance:
     :no-members:
-    :members: __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup
-
-.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5AdapterLearningModel
-    :show-inheritance:
-    :no-members:
-    :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup
+    :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, setup
 
 .. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_t5_adapter_model.MegatronT5InfusedAdapterModel
     :show-inheritance:
     :no-members:
-    :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, build_train_valid_test_datasets, setup
+    :members: _add_adapters_to_component, __init__, state_dict, training_step, validation_step, setup
 
 Modules
 -------
@@ -86,7 +81,7 @@ Modules
     :no-members:
     :members: forward
 
-.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron.bert_model.BertModel
+.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron.bert.bert_model.NeMoBertModel
     :show-inheritance:
     :no-members:
     :members: forward
diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst
index fa9157e45b59..26732283e8f4 100644
--- a/docs/source/nlp/information_retrieval.rst
+++ b/docs/source/nlp/information_retrieval.rst
@@ -53,7 +53,7 @@ BERT checkpoint to NeMo (mcore) using the following:
 
 Then you can fine-tune the sentence-BERT model using the following script:
 
-.. code-block:: python
+.. code-block:: bash
 
     #!/bin/bash
 
diff --git a/docs/source/nlp/machine_translation/machine_translation.rst b/docs/source/nlp/machine_translation/machine_translation.rst
index 190ac5b07da9..f58c67551abe 100644
--- a/docs/source/nlp/machine_translation/machine_translation.rst
+++ b/docs/source/nlp/machine_translation/machine_translation.rst
@@ -470,12 +470,12 @@ NMT with bottleneck encoder architecture is also supported (i.e., fixed size bot
 1. Supported learning frameworks (**model.model_type**):
     * NLL - Conditional cross entropy (the usual NMT loss)
-    * VAE - Variational Auto-Encoder (`paper `_)
-    * MIM - Mutual Information Machine (`paper `_)
+    * VAE - Variational Auto-Encoder (`paper `__)
+    * MIM - Mutual Information Machine (`paper `__)
 2. Supported encoder architectures (**model.encoder.arch**):
     * seq2seq - the usual transformer encoder without a bottleneck
-    * bridge - attention bridge bottleneck (`paper `_)
-    * perceiver - Perceiver bottleneck (`paper `_)
+    * bridge - attention bridge bottleneck (`paper `__)
+    * perceiver - Perceiver bottleneck (`paper `__)
 
 
 +----------------------------------------+----------------+--------------+-------------------------------------------------------------------------------------------------------+
diff --git a/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst b/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst
index 2e94cc45b40f..efc2ac3f8439 100644
--- a/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst
+++ b/docs/source/nlp/nemo_megatron/gpt/gpt_training.rst
@@ -70,7 +70,7 @@ Note that training tokenizer model will also take some time.
         --pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 \
         --split_digits true
 
-After this is done (will take a while), you'll have two files: ```spm_32k_wiki.model``` and ```spm_32k_wiki.vocab``corresponding to the model and vocabulary.
+After this is done (will take a while), you'll have two files: ``spm_32k_wiki.model`` and ``spm_32k_wiki.vocab`` corresponding to the model and vocabulary.
 
 **Step 4: Convert training data into memory map format**
 
diff --git a/docs/source/nlp/nemo_megatron/positional_embeddings.rst b/docs/source/nlp/nemo_megatron/positional_embeddings.rst
index 332ce304049d..cac0bb452f58 100644
--- a/docs/source/nlp/nemo_megatron/positional_embeddings.rst
+++ b/docs/source/nlp/nemo_megatron/positional_embeddings.rst
@@ -18,38 +18,38 @@ GPT
     - .. code::
 
         model.position_embedding_type='learned_absolute'
-    - Absolute Position Encodings :cite:`nlp-megatron-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression.
+ - Absolute Position Encodings :cite:`pos-emb-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. * - **rope** - .. code:: model.position_embedding_type='rope' model.rotary_percentage=1.0 - - Rotary Position Embedding (RoPE) :cite:`nlp-megatron-su2022roformer` incorporates positional information by utilizing a rotation matrix to encode the absolute positions of tokens while maintaining relative positional relationships in self-attention formulations. It achieves this by leveraging the geometric properties of vectors and complex numbers and applying a rotation based on a preset non-zero constant and the relative positions of the tokens to the word embeddings. - + - Rotary Position Embedding (RoPE) :cite:`pos-emb-su2022roformer` incorporates positional information by utilizing a rotation matrix to encode the absolute positions of tokens while maintaining relative positional relationships in self-attention formulations by leveraging the geometric properties of vectors and complex numbers, applying a rotation based on a preset non-zero constant and the relative positions of the tokens to the word embeddings. + * - **alibi** - .. code:: model.position_embedding_type='alibi' - - Attention with Linear Biases (ALiBi) :cite:`nlp-megatron-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. + - Attention with Linear Biases (ALiBi) :cite:`pos-emb-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. * - **kerple** - .. code:: model.position_embedding_type='kerple' - - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`nlp-megatron-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using Conditionally Positive Definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. 
This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. + - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`pos-emb-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using conditionally positive definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. * - **xpos** - .. code:: model.position_embedding_type='xpos' - - Extrapolatable Position Embedding (xPos) :cite:`nlp-megatron-sun2022lengthextrapolatable` + - Extrapolatable Position Embedding (xPos) :cite:`pos-emb-sun2022lengthextrapolatable` * - **sandwich** - .. code:: model.position_embedding_type='sandwich' - - Sandwich :cite:`nlp-megatron-chi2023dissecting` + - Sandwich :cite:`pos-emb-chi2023dissecting` T5 ^^ @@ -67,32 +67,32 @@ T5 model.encoder.position_embedding_type='learned_absolute' model.decoder.position_embedding_type='learned_absolute' - - Absolute Position Encodings :cite:`nlp-megatron-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. + - Absolute Position Encodings :cite:`pos-emb-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. * - **relative** - .. code:: model.encoder.position_embedding_type='relative' model.decoder.position_embedding_type='relative' - - Relative Position Representations :cite:`nlp-megatron-shaw2018selfattention` + - Relative Position Representations :cite:`pos-emb-shaw2018selfattention` * - **alibi** - .. code:: model.encoder.position_embedding_type='alibi' model.decoder.position_embedding_type='alibi' - - Attention with Linear Biases (ALiBi) :cite:`nlp-megatron-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. + - Attention with Linear Biases (ALiBi) :cite:`pos-emb-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. 
This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. * - **kerple** - .. code:: model.encoder.position_embedding_type='kerple' model.decoder.position_embedding_type='kerple' - - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`nlp-megatron-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using Conditionally Positive Definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. + - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`pos-emb-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using conditionally positive definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. Positional interpolation ------------------------ -Position Interpolation (PI) :cite:`nlp-megatron-chen2023extending` is a method introduced to extend the context window sizes of Rotary Position Embedding (RoPE)-based pretrained large language models (LLMs). The central principle of PI is to reduce the position indices so that they align with the initial context window size through interpolation. +Position Interpolation (PI) :cite:`pos-emb-chen2023extending` is a method introduced to extend the context window sizes of Rotary Position Embedding (RoPE)-based pretrained large language models (LLMs). The central principle of PI is to reduce the position indices so that they align with the initial context window size through interpolation. Positional Interpolation is supported in Megatron GPT SFT models. Set RoPE Interpolation factor for sequence length :code:`seq_len_interpolation_factor` to enable it. @@ -107,5 +107,5 @@ References .. bibliography:: ../nlp_all.bib :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- \ No newline at end of file + :labelprefix: pos-emb + :keyprefix: pos-emb- \ No newline at end of file diff --git a/docs/source/nlp/punctuation_and_capitalization_lexical_audio.rst b/docs/source/nlp/punctuation_and_capitalization_lexical_audio.rst index 8314676e5c4c..4cd13abd2264 100644 --- a/docs/source/nlp/punctuation_and_capitalization_lexical_audio.rst +++ b/docs/source/nlp/punctuation_and_capitalization_lexical_audio.rst @@ -36,7 +36,7 @@ Quick Start Guide Model Description ----------------- In addition to :doc:`Punctuation And Capitalization model <./punctuation_and_capitalization>` we add audio encoder (e.g. Conformer's encoder) and attention based fusion of lexical and audio features. 
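For intuition, the attention-based fusion just described can be sketched as cross-attention from the lexical token features into the audio encoder outputs. The snippet below is an illustrative sketch only; the class name, dimensions, and residual wiring are assumptions for exposition, not the NeMo implementation:

.. code-block:: python

    import torch
    import torch.nn as nn

    class LexicalAudioFusion(nn.Module):
        """Illustrative sketch (not NeMo code): fuse lexical token features
        with audio encoder outputs via cross-attention."""

        def __init__(self, d_model: int = 768, n_heads: int = 8):
            super().__init__()
            self.cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
            self.norm = nn.LayerNorm(d_model)

        def forward(self, lexical: torch.Tensor, audio: torch.Tensor) -> torch.Tensor:
            # lexical: [B, T_text, D] token features; audio: [B, T_audio, D] acoustic features
            fused, _ = self.cross_attn(query=lexical, key=audio, value=audio)
            # Residual connection keeps the lexical stream primary while mixing in audio cues.
            return self.norm(lexical + fused)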
-This model architecture is based on `Multimodal Semi-supervised Learning Framework for Punctuation Prediction in Conversational Speech `__ :cite:`nlp-punct-sunkara20_interspeech`. +This model architecture is based on `Multimodal Semi-supervised Learning Framework for Punctuation Prediction in Conversational Speech `__ :cite:`nlp-punct-lex-sunkara20_interspeech`. .. note:: @@ -386,6 +386,6 @@ References .. bibliography:: nlp_all.bib :style: plain - :labelprefix: NLP-PUNCT - :keyprefix: nlp-punct- + :labelprefix: NLP-PUNCT-LEX + :keyprefix: nlp-punct-lex- diff --git a/docs/source/nlp/text_normalization/text_normalization_as_tagging.rst b/docs/source/nlp/text_normalization/text_normalization_as_tagging.rst index 672226622357..702fb9425026 100644 --- a/docs/source/nlp/text_normalization/text_normalization_as_tagging.rst +++ b/docs/source/nlp/text_normalization/text_normalization_as_tagging.rst @@ -59,7 +59,7 @@ In the example, ```` denotes that the spoken form is the same as the writt -More information about the Google Text Normalization Dataset can be found in the paper `RNN Approaches to Text Normalization: A Challenge `__ :cite:`nlp-textnorm-sproat2016rnn`. +More information about the Google Text Normalization Dataset can be found in the paper `RNN Approaches to Text Normalization: A Challenge `__ :cite:`nlp-textnorm-tag-sproat2016rnn`. Data preprocessing @@ -146,7 +146,7 @@ contextualized representation for each input token. It then uses a classificatio to predict the tag for each token. Another classification head is used to predict a "semiotic" class label for each token. Overall, our design is partly inspired by the LaserTagger approach proposed in the paper -`Encode, tag, realize: High-precision text editing `__ :cite:`nlp-textnorm-malmi2019encode`. +`Encode, tag, realize: High-precision text editing `__ :cite:`nlp-textnorm-tag-malmi2019encode`. The LaserTagger method is not directly applicable to ITN because it can only regard the whole non-common fragment as a single replacement tag, whereas spoken-to-written conversion, e.g. a date, needs to be aligned on a more granular level. Otherwise, @@ -161,5 +161,5 @@ References .. bibliography:: tn_itn_all.bib :style: plain - :labelprefix: NLP-TEXTNORM - :keyprefix: nlp-textnorm- + :labelprefix: NLP-TEXTNORM-TAG + :keyprefix: nlp-textnorm-tag- diff --git a/docs/source/starthere/best-practices.rst b/docs/source/starthere/best-practices.rst index ec0fea1985cc..759ee108ed7b 100644 --- a/docs/source/starthere/best-practices.rst +++ b/docs/source/starthere/best-practices.rst @@ -23,7 +23,7 @@ NeMo excels in training large-scale LLM & MM, utilizing optimizations from Megat - Advanced checkpointing through the Distributed Checkpoint Format. Speech AI --------- +--------- Data Augmentation ~~~~~~~~~~~~~~~~~ diff --git a/docs/source/starthere/migration-guide.rst b/docs/source/starthere/migration-guide.rst index 1d9816493a5b..7005873e5343 100644 --- a/docs/source/starthere/migration-guide.rst +++ b/docs/source/starthere/migration-guide.rst @@ -8,39 +8,39 @@ Upgrade guide to use lightning 2.0 .. _dummy_header: -* Replace ``trainer.strategy=null`` with ``trainer.strategy=auto`` as `lightning 2.0 doesn't have None strategy `_. +* Replace ``trainer.strategy=null`` with ``trainer.strategy=auto`` as `lightning 2.0 doesn't have None strategy `__. -* Remove ``resume_from_checkpoint`` if being used as a trainer flag and pass the path to `Trainer.fit(ckpt_path="...") method `_. 
+* Remove ``resume_from_checkpoint`` if being used as a trainer flag and pass the path to `Trainer.fit(ckpt_path="...") method `__. * Set ``trainer.strategy = "ddp_find_unused_parameters_true"`` if there are unused parameters in your model as lightning 2.0 has find_unused_parameters as False by default. - Reference: `NeMo PR 6433 `_. More details about this change: `lightning PR 16611 `_. + Reference: `NeMo PR 6433 `__. More details about this change: `lightning PR 16611 `__. -* If used Trainer's flag ``replace_sampler_ddp`` replace it with `use_distributed_sampler `_. +* If used Trainer's flag ``replace_sampler_ddp`` replace it with `use_distributed_sampler `__. -* If using ``CheckpointConnector`` replace it with `_CheckpointConnector `_. +* If using ``CheckpointConnector`` replace it with `_CheckpointConnector `__. * To set or get ``ckpt_path`` use ``trainer.ckpt_path`` directly instead of calling protected API via ``trainer._checkpoint_connector._ckpt_path`` or using ``trainer._checkpoint_connector.resume_from_checkpoint_fit_path``. * Change ``import load`` from pytorch_lightning.utilities.cloud_io to ``import _load``. -* If used ``from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin`` from replace it with `from pytorch_lightning.plugins.precision import MixedPrecisionPlugin `_. +* If used ``from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin``, replace it with `from pytorch_lightning.plugins.precision import MixedPrecisionPlugin `__. * Lightning 2.0 adds ``'16-mixed'``, ``'bf16-mixed'`` as the precision values for fp16 mixed precision and bf16 mixed precision respectively. - For backward compatbility ``16`` or ``'16'`` and ``'bf16'`` also perform mixed precision and is equivalent to ``'16-mixed'`` and ``'bf16-mixed'`` respectively. However, lightning recommends to use ``'16-mixed'`` and ``'bf16-mixed'`` to make it less ambiguous. Due to this, ``MegatronHalfPrecisionPlugin's`` parent class from lightning ``MixedPrecisionPlugin`` class, expects the precision arg to be ``'16-mixed'`` and ``'bf16-mixed'``. As a result it's required to pass ``'16-mixed'`` or ``'bf16-mixed'`` to ``MixedPrecisionPLugin`` whenever the precision passed is any of ``[16, '16', '16-mixed']`` or ``['bf16', 'bf16-mixed']``. This can be taken care as shown here: `NeMo upgrade to lightning 2.0 PR `_ and here: `MixedPrecisionPlugin `_. Also, ``'32-true'`` is added as a precsion value for pure fp32 along with ``32``, ``'32'`` that existed. This can be taken into account as shown here in the `NeMo upgrade to lightning 2.0 PR `_. + For backward compatibility ``16`` or ``'16'`` and ``'bf16'`` also perform mixed precision and are equivalent to ``'16-mixed'`` and ``'bf16-mixed'`` respectively. However, lightning recommends using ``'16-mixed'`` and ``'bf16-mixed'`` to make it less ambiguous. Due to this, ``MegatronHalfPrecisionPlugin's`` parent class from lightning ``MixedPrecisionPlugin`` class, expects the precision arg to be ``'16-mixed'`` and ``'bf16-mixed'``. As a result it's required to pass ``'16-mixed'`` or ``'bf16-mixed'`` to ``MixedPrecisionPlugin`` whenever the precision passed is any of ``[16, '16', '16-mixed']`` or ``['bf16', 'bf16-mixed']``. This can be taken care of as shown here: `NeMo upgrade to lightning 2.0 PR `__ and here: `MixedPrecisionPlugin `__. Also, ``'32-true'`` is added as a precision value for pure fp32 along with ``32``, ``'32'`` that existed. 
This can be taken into account as shown here in the `NeMo upgrade to lightning 2.0 PR `__. -* Lightning 2.0 renames epoch end hooks from ``training_epoch_end``, ``validation_epoch_end``, ``test_epoch_end`` to ``on_train_epoch_end``, ``on_validation_epoch_end``, ``on_test_epoch_end``. The renamed hooks do not accept the outputs arg but instead outputs needs to be defined as an instance variable of the model class to which the outputs of the step needs to be manually appended. More detailed examples implementing this can be found under migration guide of `lightning's PR 16520 `_. Example from NeMo can be found `here `_. +* Lightning 2.0 renames epoch end hooks from ``training_epoch_end``, ``validation_epoch_end``, ``test_epoch_end`` to ``on_train_epoch_end``, ``on_validation_epoch_end``, ``on_test_epoch_end``. The renamed hooks do not accept the outputs arg but instead outputs need to be defined as an instance variable of the model class to which the outputs of the step need to be manually appended. More detailed examples implementing this can be found under migration guide of `lightning's PR 16520 `__. Example from NeMo can be found `here `__. * Lightning 2.0 is not currently supporting multiple dataloaders for validation and testing in case of ``dataloader_iter``. The support for this will be added back soon in an upcoming release. If ``dataloader_iter`` is being used and your config passes multiple files to ``validation_ds.file_names`` or ``test_ds.file_names``, please use just one file until this issue is fixed with pytorch lightning. * With lightning 2.0 it's required to set ``limit_val_batches`` and ``num_sanity_val_steps`` to be a multiple of number of microbatches while using ``dataloader_iter`` (applies only to Megatron files that use dataloader_iter) for all pretraining files (not downstream tasks like finetuning). This is being taken care of internally in NeMo and does not require anything to be done by the user. However, if you are a developer of NeMo and are building a new model for pretraining that uses ``dataloader_iter`` instead of batch in ``validation_step`` methods please make sure to call ``self._reconfigure_val_batches()`` in ``build_train_valid_test_datasets method`` of your model. * If model is being wrapped with ``LightningDistributedModule`` in ``configure_ddp`` method please replace it with ``_LightningModuleWrapperBase`` - as being done here: `NeMo upgrade to lightning 2.0 PR `_. + as being done here: `NeMo upgrade to lightning 2.0 PR `__. -* If using ``pre_configure_ddp()`` in your DDP, remove it as it's not required anymore. `NeMo upgrade to lightning 2.0 PR `_. +* If using ``pre_configure_ddp()`` in your DDP, remove it as it's not required anymore. `NeMo upgrade to lightning 2.0 PR `__. * If any of the tests use CPU as the device, ensure to explicitly pass it in the trainer as ``trainer = pl.Trainer(max_epochs=1, accelerator='cpu')`` since default val in PTL >= 2.0 is auto and it picks cuda. diff --git a/docs/source/tools/nemo_forced_aligner.rst b/docs/source/tools/nemo_forced_aligner.rst index aa8d2139653f..df872e7d2195 100644 --- a/docs/source/tools/nemo_forced_aligner.rst +++ b/docs/source/tools/nemo_forced_aligner.rst @@ -12,14 +12,14 @@ NFA can be used on long audio files of 1+ hours duration (subject to your hardwa Demos & Tutorials ----------------- -* HuggingFace Space `demo `_ to quickly try out NFA in various languages. -* NFA "how-to" notebook `tutorial `_. -* "How forced alignment works" NeMo blog `tutorial `_. 
+* HuggingFace Space `demo `__ to quickly try out NFA in various languages. +* NFA "how-to" notebook `tutorial `__. +* "How forced alignment works" NeMo blog `tutorial `__. Quickstart ---------- -1. Install `NeMo `_. +1. Install `NeMo `__. 2. Prepare a NeMo-style manifest containing the paths of audio files you would like to process, and (optionally) their text. 3. Run NFA's ``align.py`` script with the desired config, e.g.: diff --git a/docs/source/vision/checkpoint.rst b/docs/source/vision/checkpoint.rst index 7e3e197a1169..49848b90d51a 100644 --- a/docs/source/vision/checkpoint.rst +++ b/docs/source/vision/checkpoint.rst @@ -63,7 +63,7 @@ ViT Checkpoints To adjust model parallelism from original model parallelism size to a new model parallelism size (Note: NeMo ViT currently only supports `pipeline_model_parallel_size=1`): -.. code-block:: python +.. code-block:: bash python examples/nlp/language_modeling/megatron_change_num_partitions.py \ --model_file=/path/to/source.nemo \ diff --git a/docs/source/vision/vit.rst b/docs/source/vision/vit.rst index 679313bcbd66..a7b4e2546f22 100644 --- a/docs/source/vision/vit.rst +++ b/docs/source/vision/vit.rst @@ -4,7 +4,7 @@ ViT Model Introduction ------------------- -The Vision Transformer, commonly referred to as ViT :cite:`vision-models-vit`, serves as a foundational model +The Vision Transformer, commonly referred to as ViT :cite:`vision-models-vit-vit`, serves as a foundational model for image classification tasks in NeMo. Unlike conventional convolutional neural networks, ViT adopts a transformer-like architecture to process image data. In this approach, an image is divided into fixed-size patches, typically 14x14 or 16x16. These patches are linearly embedded and augmented with position embeddings. The resulting @@ -136,5 +136,5 @@ Reference .. bibliography:: ./vision_all.bib :style: plain :filter: docname in docnames - :labelprefix: VISION-MODELS - :keyprefix: vision-models- + :labelprefix: VISION-MODELS-VIT + :keyprefix: vision-models-vit- diff --git a/nemo/collections/asr/models/asr_model.py b/nemo/collections/asr/models/asr_model.py index 4420318dd416..e14424cec5c1 100644 --- a/nemo/collections/asr/models/asr_model.py +++ b/nemo/collections/asr/models/asr_model.py @@ -203,9 +203,9 @@ def forward_for_export( """ This forward is used when we need to export the model to ONNX format. Inputs cache_last_channel and cache_last_time are needed to be passed for exporting streaming models. + Args: - input: Tensor that represents a batch of raw audio signals, - of shape [B, T]. T here represents timesteps. + input: Tensor that represents a batch of raw audio signals of shape [B, T]. T here represents timesteps. length: Vector of length B, that contains the individual lengths of the audio sequences. cache_last_channel: Tensor of shape [N, B, T, H] which contains the cache for last channel layers cache_last_time: Tensor of shape [N, B, H, T] which contains the cache for last time layers diff --git a/nemo/collections/asr/models/msdd_models.py b/nemo/collections/asr/models/msdd_models.py index d96bafd5af9b..01926eb4ae79 100644 --- a/nemo/collections/asr/models/msdd_models.py +++ b/nemo/collections/asr/models/msdd_models.py @@ -400,10 +400,15 @@ def get_cluster_avg_embs_model( multi-scale input tensors during forward propagating. Example: `batch_size=3, scale_n=6, emb_dim=192` - ms_seg_counts = - [[8, 9, 12, 16, 25, 51], - [11, 13, 14, 17, 25, 51], - [ 9, 9, 11, 16, 23, 50]] + .. 
code:: python + + ms_seg_counts = + [ + [ 8, 9, 12, 16, 25, 51], + [11, 13, 14, 17, 25, 51], + [ 9, 9, 11, 16, 23, 50] + ] + Counts of merged segments: (121, 131, 118) embs has shape of (370, 192) clus_label_index has shape of (3, 131) diff --git a/nemo/collections/asr/modules/rnnt.py b/nemo/collections/asr/modules/rnnt.py index 5a7457f6379d..055066c00660 100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -1559,13 +1559,13 @@ def joint_after_projection(self, f: torch.Tensor, g: torch.Tensor) -> torch.Tens NOTE: The implementation of this model is slightly modified from the original paper. The original paper proposes the following steps : - (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- *1 - *1 -> Forward through joint final [B, T, U, V + 1]. + (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- \*1 + \*1 -> Forward through joint final [B, T, U, V + 1]. We instead split the joint hidden into joint_hidden_enc and joint_hidden_dec and act as follows: - enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- *1 - dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- *2 - (*1, *2) -> Sum [B, T, U, H] -> Forward through joint final [B, T, U, V + 1]. + enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- \*1 + dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- \*2 + (\*1, \*2) -> Sum [B, T, U, H] -> Forward through joint final [B, T, U, V + 1]. Args: f: Output of the Encoder model. A torch.Tensor of shape [B, T, H1] @@ -2050,8 +2050,7 @@ def sampled_joint( """ Compute the sampled joint step of the network. - # Reference - - [Memory-Efficient Training of RNN-Transducer with Sampled Softmax](https://arxiv.org/abs/2203.16868) + Reference: `Memory-Efficient Training of RNN-Transducer with Sampled Softmax `__. Here, B = Batch size @@ -2065,13 +2064,13 @@ def sampled_joint( NOTE: The implementation of this joint model is slightly modified from the original paper. The original paper proposes the following steps : - (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- *1 - *1 -> Forward through joint final [B, T, U, V + 1]. + (enc, dec) -> Expand + Concat + Sum [B, T, U, H1+H2] -> Forward through joint hidden [B, T, U, H] -- \*1 + \*1 -> Forward through joint final [B, T, U, V + 1]. 
We instead split the joint hidden into joint_hidden_enc and joint_hidden_dec and act as follows: - enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- *1 - dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- *2 - (*1, *2) -> Sum [B, T, U, H] -> Sample Vocab V_Pos (for target tokens) and V_Neg -> + enc -> Forward through joint_hidden_enc -> Expand [B, T, 1, H] -- \*1 + dec -> Forward through joint_hidden_dec -> Expand [B, 1, U, H] -- \*2 + (\*1, \*2) -> Sum [B, T, U, H] -> Sample Vocab V_Pos (for target tokens) and V_Neg -> (V_Neg is sampled not uniformly but as a rand permutation of all vocab tokens, then eliminate all Intersection(V_Pos, V_Neg) common tokens to avoid duplication of loss) -> Concat new Vocab V_Sampled = Union(V_Pos, V_Neg) diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py index b264890ce48d..dc0cef692ee2 100644 --- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py +++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py @@ -26,9 +26,10 @@ class AutoTokenizer(TokenizerSpec): - ''' + """ Wrapper of HuggingFace AutoTokenizer https://huggingface.co/transformers/model_doc/auto.html#autotokenizer. - ''' + + """ def __init__( self, @@ -52,7 +53,7 @@ def __init__( For more details please refer to https://huggingface.co/transformers/_modules/transformers/tokenization_auto.html#AutoTokenizer.from_pretrained. The list of all supported models can be found here: ALL_PRETRAINED_CONFIG_ARCHIVE_MAP vocab_file: path to file with vocabulary which consists - of characters separated by '\n'. + of characters separated by newlines. mask_token: mask token bos_token: the beginning of sequence token eos_token: the end of sequence token. Usually equal to sep_token @@ -167,11 +168,13 @@ def add_special_tokens(self, special_tokens_dict: dict) -> int: """ Adds a dictionary of special tokens (eos, pad, cls...). If special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the current vocabulary). + Args: special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``]. - Tokens are only added if they are not already in the vocabulary. + Tokens are only added if they are not already in the vocabulary. + Returns: Number of tokens added to the vocabulary. """ diff --git a/nemo/collections/nlp/data/language_modeling/megatron/t5_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/t5_dataset.py index 72f4fd0e12a1..f0efaf5cd1aa 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/t5_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/t5_dataset.py @@ -252,7 +252,8 @@ def build_training_sample( skip_masking_id=None, ): """Build training sample. - Arguments: + + Args: sample: A list of sentences in which each sentence is a list of token ids. target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to diff --git a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py index 5ed0da009cf2..fb8ec9554a95 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/t5_prompt_learning_dataset.py @@ -72,10 +72,10 @@ def load_data(self, dataset): """ Loads a dataset by filling in the task templates specified in the config file with the information from each training/inference example. Converts all input - text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in + text into token ids. Also replaces the ``<|VIRTUAL_PROMPT_#|>`` placeholders in the task templates with the actual virtual prompt token ids. - params: + Args: dataset: A list of json objects or dictionary objects each containing the information needed for a training example """ diff --git a/nemo/collections/nlp/data/language_modeling/megatron/ul2_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/ul2_dataset.py index c2d19305cf03..485388d84343 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/ul2_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/ul2_dataset.py @@ -25,6 +25,7 @@ class UL2Dataset(T5Dataset): """ UL2 Dataset from https://arxiv.org/abs/2205.05131. Consists of three different objectives: + 1. Short span masking with small probabilities (ex: T5). Typically max ngram size of 5 with 0.15 mask prob. 2. Extreme span masking with either large probabilities or large ngram sizes or both. 3. Prefix-LM as in the T5 or LM-adapted T5 (prompt-tuning paper). @@ -312,7 +313,8 @@ def build_extreme_masking_training_sample( skip_masking_id=None, ): """Build training sample. - Arguments: + + Args: sample: A list of sentences in which each sentence is a list of token ids. target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py index d974c8182234..102ab5ec0f84 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py @@ -182,9 +182,11 @@ def build_train_valid_test_datasets(self): return self._train_ds, self._validation_ds, self._test_ds def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + We setup datasets here as megatron datasets require DDP to instantiate. + See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. 
""" diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index dc6d81649122..0f1fa76f9b01 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -760,9 +760,11 @@ def _append_sequence_parallel_module_grads(self, module, grads): grads.append(grad.data) def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + We setup datasets here as megatron datasets require DDP to instantiate. + See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 7a2f3459470c..d7f489abf158 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1475,9 +1475,11 @@ def build_pretraining_data_loader( ) def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + We setup datasets here as megatron datasets require DDP to instantiate. + See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 459bf5b71c7e..4c39bd877b4a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -911,9 +911,11 @@ def build_pretraining_data_loader(self, dataset, consumed_samples, num_workers): ) def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + We setup datasets here as megatron datasets require DDP to instantiate. + See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ @@ -1413,11 +1415,13 @@ def dummy(): def complete(self, request: Dict): """ - Autoregressively invokes language model in the inference mode + Autoregressively invokes language model in the inference mode + Args: request: Dictionary with the following fields * prompt: a string which text the model should complete. 
* tokens_to_generate: how many tokens to generate while doing prompt completion. + Returns: response: A python dictionary with the following fields * prompt: original text of the prompt diff --git a/nemo/collections/nlp/modules/common/transformer/text_generation.py b/nemo/collections/nlp/modules/common/transformer/text_generation.py index a4e37935adc9..5f0275ff4553 100644 --- a/nemo/collections/nlp/modules/common/transformer/text_generation.py +++ b/nemo/collections/nlp/modules/common/transformer/text_generation.py @@ -67,47 +67,48 @@ def generate( inputs (Union[List[str], Tensor, List[dict]]): Can be one of the 3 types: - 1. List of strings. Each element of the list provides input prompt. The model will apply tokenizer on it. - E.g [‘sentence’, ‘sentence2’ … ] + 1. List of strings. Each element of the list provides an input prompt. The model will apply tokenizer on it. + E.g. [‘sentence’, ‘sentence2’ … ] - 2. Tuple of Pytorch Tensors (context_tokens, context_lengths). The `context_tokens` has shape (batch_size, seq_length), it's the batched sequences of tokens used as a prompst for the generation or as model inputs to the encoder. - The generative model will skip the tokenization and padding step. The `context_lengths` has shape (batch_size,), it indicates the length of the context tokens for each of the input sequences. - E.g. ( torch.tensor([[23,5234,23,35,…], [223,323,23,23232,232,...] …]), torch.tensor([20, 30, …])) + 2. Tuple of Pytorch Tensors (context_tokens, context_lengths). The `context_tokens` has shape (batch_size, seq_length), it's the batched sequences of tokens used as prompts for the generation or as model inputs to the encoder. + The generative model will skip the tokenization and padding step. The `context_lengths` has shape (batch_size,), it indicates the length of the context tokens for each of the input sequences. + E.g. ( torch.tensor([[23,5234,23,35,…], [223,323,23,23232,232,...] …]), torch.tensor([20, 30, …])) - 3. List of python dict objects. Used for prompt/p-tuning inputs where a set of key-value pairs are converted into input token embeddings for the model. - E.g. [{"prompt-tag": "sentiment", "sentence": "this is a good movie"}, - {"prompt-tag": "qa", "context": "some context text", "question": "a simple question"} ... ] - where 'prompt-tag' is used to identify the type of NLP task to solve. + 3. List of python dict objects. Used for prompt/p-tuning inputs where a set of key-value pairs are converted into input token embeddings for the model. + E.g. [{"prompt-tag": "sentiment", "sentence": "this is a good movie"}, + {"prompt-tag": "qa", "context": "some context text", "question": "a simple question"} ... ] + where 'prompt-tag' is used to identify the type of NLP task to solve. length_params (LengthParam): a dictionary type which controls the sampling length. - max_length: int, The maximum length of the sequence to be generated. - - min_length: int, The minimum length of the sequence to be generated. + * max_length: int, The maximum length of the sequence to be generated. + * min_length: int, The minimum length of the sequence to be generated. If None, max_length is set to 30, and min_length is set to None + sampling_params (SamplingParam): a dictionary type which contains the parameters for text sampling. It has the following keys - use_greedy: bool, Whether or not to use sampling ; use greedy decoding otherwise - top_k: int, The number of highest probability vocabulary tokens to keep for top-k-filtering. 
- top_p: float, If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. - repetition_penalty: float, The parameter for repetition penalty. 1.0 means no penalty. - add_BOS: bool, Whether add the bos token at the begining of the prompt - all_probs: bool # whether return the log prob for all the tokens in vocab - compute_logprob: bool # a flag used to compute logprob of all the input text, a very special case of running inference, default False - end_strings: List[str] # generation will stop when one of these tokens is generated + * use_greedy: bool, Whether or not to use sampling; use greedy decoding otherwise + * top_k: int, The number of highest probability vocabulary tokens to keep for top-k-filtering. + * top_p: float, If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + * repetition_penalty: float, The parameter for repetition penalty. 1.0 means no penalty. + * add_BOS: bool, Whether to add the bos token at the beginning of the prompt + * all_probs: bool # whether return the log prob for all the tokens in vocab + * compute_logprob: bool # a flag used to compute logprob of all the input text, a very special case of running inference, default False + * end_strings: List[str] # generation will stop when one of these tokens is generated + Default None. If it is None, use_greedy will be "True". Returns: - OutputType: It generates the output in a dictionary type. It has the following keys: - - sentences: List[str], output sentences - tokens: List[List[str]], output sentences borken into tokens - logprob: List[List[float]], log prob of generated tokens - full_logprob: List[List[float]], log prob of all the tokens in the vocab - token_ids: List[List[int]], output sentence token ids - offsets: List[List[int]] # list of tokens start positions in text + It generates the output in a dictionary type. It has the following keys: + + * sentences: List[str], output sentences + * tokens: List[List[str]], output sentences broken into tokens + * logprob: List[List[float]], log prob of generated tokens + * full_logprob: List[List[float]], log prob of all the tokens in the vocab + * token_ids: List[List[int]], output sentence token ids + * offsets: List[List[int]] # list of tokens start positions in text """ raise NotImplementedError("please implement this method") diff --git a/nemo/collections/vision/models/megatron_vit_classification_models.py b/nemo/collections/vision/models/megatron_vit_classification_models.py index c27c37c2b917..ea6d3578c540 100644 --- a/nemo/collections/vision/models/megatron_vit_classification_models.py +++ b/nemo/collections/vision/models/megatron_vit_classification_models.py @@ -621,9 +621,11 @@ def build_pretraining_data_loader(self, dataset, consumed_samples, drop_last=Tru ) def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. - We setup datasets here as megatron datasets require DDP to instantiate. - See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + """ + PTL hook that is executed after DDP spawns. + We setup datasets here as megatron datasets require DDP to instantiate. + See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. + Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. 
""" diff --git a/nemo/core/classes/dataset.py b/nemo/core/classes/dataset.py index 738ae22f5416..789fc0b863d7 100644 --- a/nemo/core/classes/dataset.py +++ b/nemo/core/classes/dataset.py @@ -42,12 +42,15 @@ def collate_fn(self, batch): Please note, subclasses of Dataset should not implement `input_types`. - # Usage: - dataloader = torch.utils.data.DataLoader( - ...., - collate_fn=dataset.collate_fn, - .... - ) + Usage: + + .. code-block:: python + + dataloader = torch.utils.data.DataLoader( + ...., + collate_fn=dataset.collate_fn, + .... + ) Returns: Collated batch, with or without types. diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index be9a6e8cfbb3..5c7cac5a9a55 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -304,9 +304,9 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo recent checkpoint under ``*last.ckpt``, and the final checkpoint after training completes under ``*end.ckpt``. Defaults to True. - create_early_stopping_callback (bool): Flag to decide if early stopping should be used to stop training. Default is False. - See EarlyStoppingParams dataclass above. + See EarlyStoppingParams dataclass above. - create_preemption_callback (bool): Flag to decide whether to enable preemption callback to save checkpoints and exit training - immediately upon preemption. Default is True. + immediately upon preemption. Default is True. - files_to_copy (list): A list of files to copy to the experiment logging directory. Defaults to None which copies no files. - log_local_rank_0_only (bool): Whether to only create log files for local rank 0. Defaults to False. From 9e2325d18b4a0e6576ffabe8003c3cad26eb3954 Mon Sep 17 00:00:00 2001 From: Valerie Sarge Date: Wed, 1 May 2024 16:34:21 -0700 Subject: [PATCH 2/2] Handle case where num_query_groups is set to null for LoRA config setup (#9075) Signed-off-by: Valerie Sarge --- nemo/collections/nlp/parts/peft_config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index 47d5167d630e..820e2ad63f24 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -123,6 +123,9 @@ def __init__(self, cfg): kv_channels = self._calculate_kv_channels(cfg) projection_size = kv_channels * cfg.num_attention_heads num_query_groups = cfg.get("num_query_groups", cfg.num_attention_heads) + if num_query_groups is None: + # Cover the case where num_query_groups is explicitly set to null + num_query_groups = cfg.num_attention_heads qkv_projection_size = projection_size + (2 * kv_channels * num_query_groups)