From b2debd8d83ab88dbfbd6a1d9e7949320c4a7e8e2 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 9 Apr 2024 16:03:12 -0700 Subject: [PATCH 01/39] Akoumparouli/fix get params for weight decay optimization (#8841) * fix get_params_for_weight_decay_optimization Signed-off-by: Alexandros Koumparoulis * filter returned values by presence of parameters Signed-off-by: Alexandros Koumparoulis * use module_._parameters.items instead of .named_parameters to avoid duplicate params Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis --- .../nlp/modules/common/megatron/utils.py | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/utils.py b/nemo/collections/nlp/modules/common/megatron/utils.py index 42d14592c363..97022ab5e459 100644 --- a/nemo/collections/nlp/modules/common/megatron/utils.py +++ b/nemo/collections/nlp/modules/common/megatron/utils.py @@ -366,17 +366,25 @@ def get_params_for_weight_decay_optimization( is_expert = lambda param: not getattr(param, 'allreduce', True) # Do the actual param classification for module in modules: - for name, param in module.named_parameters(): - if param is None: - continue - if name.endswith('.bias'): - no_weight_decay_params['params'].extend([param]) + for module_ in module.modules(): + if isinstance(module_, (FusedLayerNorm, FastLayerNorm, MixedFusedRMSNorm)): + no_weight_decay_params['params'].extend( + list(filter(lambda p: p is not None, module_._parameters.values())) + ) else: - if is_expert(param): - weight_decay_expert_params['params'].extend([param]) - else: - weight_decay_params['params'].extend([param]) - return weight_decay_params, weight_decay_expert_params, no_weight_decay_params + for name, param in module_._parameters.items(): + if param is None: + continue + if name.endswith('bias'): + no_weight_decay_params['params'].extend([param]) + else: + if is_expert(param): + weight_decay_expert_params['params'].extend([param]) + else: + weight_decay_params['params'].extend([param]) + + param_groups = [weight_decay_params, weight_decay_expert_params, no_weight_decay_params] + return tuple(filter(lambda g: len(g['params']) > 0, param_groups)) def get_all_params_for_weight_decay_optimization( @@ -394,7 +402,8 @@ def get_all_params_for_weight_decay_optimization( weight_decay_params['params'] += list(filter(lambda x: not is_expert(x), module.parameters())) weight_decay_expert_params['params'] += list(filter(is_expert, module.parameters())) - return weight_decay_params, weight_decay_expert_params + param_groups = [weight_decay_params, weight_decay_expert_params] + return tuple(filter(lambda g: len(g['params']) > 0, param_groups)) def get_iterator_k_split(batch: List[torch.Tensor], num_microbatches: int) -> Iterator: From 3f3df1c34aa73cfc890b93af678e3c3a63760fe5 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:07:27 -0700 Subject: [PATCH 02/39] Akoumparouli/peft fix (#8823) * Move precision restoration inside megtron_trainer_builder Signed-off-by: Alexandros Koumparoulis * Don't enforce O1 in eval Signed-off-by: Alexandros Koumparoulis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * safer prefix replacer Signed-off-by: Alexandros Koumparoulis * comment Signed-off-by: Alexandros Koumparoulis * drop conf resolve Signed-off-by: Alexandros Koumparoulis * typo 
Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../tuning/megatron_gpt_finetuning.py | 5 ----- .../nlp/parts/megatron_trainer_builder.py | 7 ++++++- .../nlp/parts/mixins/nlp_adapter_mixins.py | 18 +++++++++++++++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py index 1e6f680fad7e..aaa087a46623 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py @@ -56,12 +56,7 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - # cfg.trainer.precision becomes None in TrainerBuilder if precision_plugins exist since both precision plugins and precision - # can't exist in PTL >= 2.1, hence storing precision value from cfg.trainer.precision as its used for future steps like in merge_cfg_with func. - precision = cfg.trainer.precision trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - # Restore the precision value after Trainer is built. - cfg.trainer.precision = precision exp_manager(trainer, cfg.exp_manager) model_cfg = MegatronGPTSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 77d306c17da0..b25ce249d09d 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -129,12 +129,17 @@ def _plugins(self) -> list: return plugins def create_trainer(self, callbacks=None) -> Trainer: + # cfg.trainer.precision becomes None in Trainer if precision_plugins exist since both precision plugins and precision + precision = self.cfg.trainer.precision strategy = self._training_strategy() plugins = self._plugins() # enable_progress_bar is True by default. If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks if 'enable_progress_bar' not in self.cfg.trainer or self.cfg.trainer.enable_progress_bar: callbacks = [CustomProgressBar()] - return Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer, callbacks=callbacks) + trainer = Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer, callbacks=callbacks) + # Restore the precision value after Trainer is built. + self.cfg.trainer.precision = precision + return trainer class MegatronBertTrainerBuilder(MegatronTrainerBuilder): diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 3797ec909737..123f0f06a33d 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -47,6 +47,14 @@ HAVE_MEGATRON_CORE = False +def replace_prefix(name, old_prefix, new_prefix): + if name.startswith(new_prefix): + return name + if not name.startswith(old_prefix): + return name + return name.replace(old_prefix, new_prefix, 1) + + class NLPAdapterModelMixin: """ NLP Adapter Mixin that can augment any transformer-based model with Adapter module support. 
This mixin class should be used only with a top level ModelPT subclass, that includes either a `model` or an `enc_dec_model` submodule. @@ -268,7 +276,7 @@ def load_adapters( """ Utility method that restores only the adapter module(s), and not the entire model itself. This allows the sharing of adapters which are often just a fraction of the size of the full model, - enabling easier deliver. + enabling easier delivery. .. note:: @@ -299,6 +307,8 @@ def load_adapters( '.nemo' ), "Inferring peft scheme is only supported for .nemo checkpoints. Please supply the `peft_cfgs` argument." peft_cfgs = [PEFT_CONFIG_MAP[conf.peft.peft_scheme](conf)] + if self.cfg.megatron_amp_O2: + state_dict = {replace_prefix(k, 'model.', 'model.module.'): v for k, v in state_dict.items()} self.add_adapter(peft_cfgs) if not self.ptuning_only_and_non_first_stage: assert set(state_dict.keys()) == self.adapter_keys.union(self.tunable_base_param_keys) @@ -506,17 +516,19 @@ def merge_inference_cfg(cls, path: str, cfg: DictConfig) -> DictConfig: with open_dict(peft_cfg): # update the model config of the trained model with params we want to set at inference time. - peft_cfg.precision = cfg.trainer.precision for key, val in cfg.model.items(): if key != 'data': peft_cfg[key] = val + if cfg.get("trainer", None) and cfg.trainer.get("precision"): + peft_cfg.precision = cfg.trainer.precision peft_cfg.data.test_ds = cfg.model.data.test_ds with open_dict(cfg): cfg.inference.add_BOS = peft_cfg.data.test_ds.add_bos cfg.inference.tokens_to_generate = peft_cfg.data.test_ds.get("tokens_to_generate", 1) - peft_cfg.megatron_amp_O2 = False # always evaluate with O1 + if cfg.model.get('megatron_amp_O2', None) is not None: + peft_cfg.megatron_amp_O2 = cfg.model.megatron_amp_O2 return peft_cfg def freeze(self, training: bool = False) -> None: From 4d0ae36dc8b6b720a66a43c5bb9eb6ef6e27fec1 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Tue, 9 Apr 2024 21:28:04 -0400 Subject: [PATCH 03/39] Add deploy triton and query scripts (#8852) * Add deploy triton and query scripts Signed-off-by: Onur Yilmaz * Update scripts based on reviews Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz --- scripts/deploy/nlp/deploy_triton.py | 274 ++++++++++++++++++++++++++++ scripts/deploy/nlp/query.py | 247 +++++++++++++++++++++++++ 2 files changed, 521 insertions(+) create mode 100755 scripts/deploy/nlp/deploy_triton.py create mode 100644 scripts/deploy/nlp/query.py diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py new file mode 100755 index 000000000000..aa896e924584 --- /dev/null +++ b/scripts/deploy/nlp/deploy_triton.py @@ -0,0 +1,274 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
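+
+# Example usage (illustrative only; the checkpoint path, model type, and Triton model
+# name below are placeholders, not values from this PR; see get_args() for all flags):
+#
+#   python scripts/deploy/nlp/deploy_triton.py \
+#       --nemo_checkpoint /path/to/model.nemo \
+#       --model_type llama \
+#       --triton_model_name my_model \
+#       --num_gpus 1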
+ +import argparse +import logging +import os +import sys +from pathlib import Path + +from nemo.deploy import DeployPyTriton +from nemo.export import TensorRTLLM + + +LOGGER = logging.getLogger("NeMo") + + +def get_args(argv): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=f"Deploy nemo models to Triton", + ) + parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file") + parser.add_argument( + "-ptnc", + "--ptuning_nemo_checkpoint", + nargs='+', + type=str, + required=False, + help="Source .nemo file for prompt embeddings table", + ) + parser.add_argument( + '-ti', '--task_ids', nargs='+', type=str, required=False, help='Unique task names for the prompt embedding.' + ) + parser.add_argument( + "-mt", + "--model_type", + type=str, + required=False, + choices=["gptnext", "gpt", "llama", "falcon", "starcoder", "mixtral", "gemma"], + help="Type of the model. gptnext, gpt, llama, falcon, and starcoder are only supported." + " gptnext and gpt are the same and keeping it for backward compatibility", + ) + parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service") + parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service") + parser.add_argument( + "-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests" + ) + parser.add_argument( + "-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server" + ) + parser.add_argument( + "-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the trt-llm conversion" + ) + parser.add_argument("-ng", "--num_gpus", default=1, type=int, help="Number of GPUs for the deployment") + parser.add_argument( + "-dt", + "--dtype", + choices=["bfloat16", "float16", "fp8", "int8"], + default="bfloat16", + type=str, + help="dtype of the model on TensorRT-LLM", + ) + parser.add_argument("-mil", "--max_input_len", default=256, type=int, help="Max input length of the model") + parser.add_argument("-mol", "--max_output_len", default=256, type=int, help="Max output length of the model") + parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model") + parser.add_argument( + "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" + ) + parser.add_argument( + "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." + ) + parser.add_argument( + "-dcf", + "--disable_context_fmha", + default=False, + action='store_true', + help="Disable fused Context MultiHeadedAttention (required for V100 support).", + ) + parser.add_argument( + "-mbm", + '--multi_block_mode', + default=False, + action='store_true', + help='Split long kv sequence into multiple blocks (applied to generation MHA kernels). \ + It is beneifical when batchxnum_heads cannot fully utilize GPU.', + ) + parser.add_argument( + "-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences." 
+ ) + parser.add_argument( + '--use_lora_plugin', + nargs='?', + const=None, + default=False, + choices=['float16', 'float32', 'bfloat16'], + help="Activates the lora plugin which enables embedding sharing.", + ) + parser.add_argument( + '--lora_target_modules', + nargs='+', + default=None, + choices=["attn_qkv", "attn_q", "attn_k", "attn_v", "attn_dense", "mlp_h_to_4h", "mlp_gate", "mlp_4h_to_h",], + help="Add lora in which modules. Only be activated when use_lora_plugin is enabled.", + ) + parser.add_argument( + '--max_lora_rank', + type=int, + default=64, + help='maximum lora rank for different lora modules. ' + 'It is used to compute the workspace size of lora plugin.', + ) + parser.add_argument( + "-lc", "--lora_ckpt", default=None, type=str, nargs="+", help="The checkpoint list of LoRA weights" + ) + parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") + + args = parser.parse_args(argv) + return args + + +def nemo_deploy(argv): + args = get_args(argv) + + if args.debug_mode: + loglevel = logging.DEBUG + else: + loglevel = logging.INFO + + LOGGER.setLevel(loglevel) + LOGGER.info("Logging level set to {}".format(loglevel)) + LOGGER.info(args) + + if args.triton_model_repository is None: + trt_llm_path = "/tmp/trt_llm_model_dir/" + LOGGER.info( + "/tmp/trt_llm_model_dir/ path will be used as the TensorRT LLM folder. " + "Please set this parameter if you'd like to use a path that has already " + "included the TensorRT LLM model files." + ) + Path(trt_llm_path).mkdir(parents=True, exist_ok=True) + else: + trt_llm_path = args.triton_model_repository + + if args.nemo_checkpoint is None and args.triton_model_repository is None: + LOGGER.error( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint." + ) + return + + if args.nemo_checkpoint is None and not os.path.isdir(args.triton_model_repository): + LOGGER.error( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint." + ) + return + + if args.nemo_checkpoint is not None and args.model_type is None: + LOGGER.error("Model type is required to be defined if a nemo checkpoint is provided.") + return + + ptuning_tables_files = [] + if not args.ptuning_nemo_checkpoint is None: + if args.max_prompt_embedding_table_size is None: + LOGGER.error("max_prompt_embedding_table_size parameter is needed for the prompt tuning table(s).") + return + + for pt_checkpoint in args.ptuning_nemo_checkpoint: + ptuning_nemo_checkpoint_path = Path(pt_checkpoint) + if ptuning_nemo_checkpoint_path.exists(): + if ptuning_nemo_checkpoint_path.is_file(): + ptuning_tables_files.append(pt_checkpoint) + else: + LOGGER.error("Could not read the prompt tuning tables from {0}".format(pt_checkpoint)) + return + else: + LOGGER.error("File or directory {0} does not exist.".format(pt_checkpoint)) + return + + if args.task_ids is not None: + if len(ptuning_tables_files) != len(args.task_ids): + LOGGER.error( + "Number of task ids and prompt embedding tables have to match. 
" + "There are {0} tables and {1} task ids.".format(len(ptuning_tables_files), len(args.task_ids)) + ) + return + + trt_llm_exporter = TensorRTLLM(model_dir=trt_llm_path, lora_ckpt_list=args.lora_ckpt) + + if args.nemo_checkpoint is not None: + try: + LOGGER.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") + trt_llm_exporter.export( + nemo_checkpoint_path=args.nemo_checkpoint, + model_type=args.model_type, + n_gpus=args.num_gpus, + tensor_parallel_size=args.num_gpus, + pipeline_parallel_size=1, + max_input_token=args.max_input_len, + max_output_token=args.max_output_len, + max_batch_size=args.max_batch_size, + max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, + paged_kv_cache=args.use_paged_kv_cache, + enable_context_fmha=not args.disable_context_fmha, + dtype=args.dtype, + enable_multi_block_mode=args.multi_block_mode, + use_lora_plugin=args.use_lora_plugin, + lora_target_modules=args.lora_target_modules, + max_lora_rank=args.max_lora_rank, + save_nemo_model_config=True, + ) + except Exception as error: + LOGGER.error("An error has occurred during the model export. Error message: " + str(error)) + return + + try: + for i, prompt_embeddings_checkpoint_path in enumerate(ptuning_tables_files): + if args.task_ids is not None: + task_id = args.task_ids[i] + else: + task_id = i + + LOGGER.info( + "Adding prompt embedding table: {0} with task id: {1}.".format( + prompt_embeddings_checkpoint_path, task_id + ) + ) + trt_llm_exporter.add_prompt_table( + task_name=str(task_id), prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, + ) + except Exception as error: + LOGGER.error("An error has occurred during adding the prompt embedding table(s). Error message: " + str(error)) + return + + try: + nm = DeployPyTriton( + model=trt_llm_exporter, + triton_model_name=args.triton_model_name, + triton_model_version=args.triton_model_version, + max_batch_size=args.max_batch_size, + port=args.triton_port, + address=args.triton_http_address, + streaming=args.enable_streaming, + ) + + LOGGER.info("Triton deploy function will be called.") + nm.deploy() + except Exception as error: + LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) + return + + try: + LOGGER.info("Model serving on Triton is will be started.") + nm.serve() + except Exception as error: + LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) + return + + LOGGER.info("Model serving will be stopped.") + nm.stop() + + +if __name__ == '__main__': + nemo_deploy(sys.argv[1:]) diff --git a/scripts/deploy/nlp/query.py b/scripts/deploy/nlp/query.py new file mode 100644 index 000000000000..20f3d587a1cc --- /dev/null +++ b/scripts/deploy/nlp/query.py @@ -0,0 +1,247 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import sys +import typing + +import numpy as np +from pytriton.client import DecoupledModelClient, ModelClient + + +def get_args(argv): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Exports nemo models stored in nemo checkpoints to TensorRT-LLM", + ) + parser.add_argument("-u", "--url", default="0.0.0.0", type=str, help="url for the triton server") + parser.add_argument("-mn", "--model_name", required=True, type=str, help="Name of the triton model") + prompt_group = parser.add_mutually_exclusive_group(required=True) + prompt_group.add_argument("-p", "--prompt", required=False, type=str, help="Prompt") + prompt_group.add_argument("-pf", "--prompt_file", required=False, type=str, help="File to read the prompt from") + parser.add_argument("-swl", "--stop_words_list", type=str, help="Stop words list") + parser.add_argument("-bwl", "--bad_words_list", type=str, help="Bad words list") + parser.add_argument("-nrns", "--no_repeat_ngram_size", type=int, help="No repeat ngram size") + parser.add_argument("-mot", "--max_output_token", default=128, type=int, help="Max output token length") + parser.add_argument("-tk", "--top_k", default=1, type=int, help="top_k") + parser.add_argument("-tpp", "--top_p", default=0.0, type=float, help="top_p") + parser.add_argument("-t", "--temperature", default=1.0, type=float, help="temperature") + parser.add_argument("-ti", "--task_id", type=str, help="Task id for the prompt embedding tables") + parser.add_argument( + "-lt", + "--lora_task_uids", + default=None, + type=str, + nargs="+", + help="The list of LoRA task uids; use -1 to disable the LoRA module", + ) + parser.add_argument( + "-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences." 
+ ) + parser.add_argument("-it", "--init_timeout", default=60.0, type=float, help="init timeout for the triton server") + + args = parser.parse_args(argv) + return args + + +def str_list2numpy(str_list: typing.List[str]) -> np.ndarray: + str_ndarray = np.array(str_list)[..., np.newaxis] + return np.char.encode(str_ndarray, "utf-8") + + +def query_llm( + url, + model_name, + prompts, + stop_words_list=None, + bad_words_list=None, + no_repeat_ngram_size=None, + max_output_token=128, + top_k=1, + top_p=0.0, + temperature=1.0, + random_seed=None, + task_id=None, + lora_uids=None, + init_timeout=60.0, +): + prompts = str_list2numpy(prompts) + inputs = {"prompts": prompts} + + if max_output_token is not None: + inputs["max_output_token"] = np.full(prompts.shape, max_output_token, dtype=np.int_) + + if top_k is not None: + inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_) + + if top_p is not None: + inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single) + + if temperature is not None: + inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single) + + if random_seed is not None: + inputs["random_seed"] = np.full(prompts.shape, random_seed, dtype=np.single) + + if stop_words_list is not None: + stop_words_list = np.char.encode(stop_words_list, "utf-8") + inputs["stop_words_list"] = np.full((prompts.shape[0], len(stop_words_list)), stop_words_list) + + if bad_words_list is not None: + bad_words_list = np.char.encode(bad_words_list, "utf-8") + inputs["bad_words_list"] = np.full((prompts.shape[0], len(bad_words_list)), bad_words_list) + + if no_repeat_ngram_size is not None: + inputs["no_repeat_ngram_size"] = np.full(prompts.shape, no_repeat_ngram_size, dtype=np.single) + + if task_id is not None: + task_id = np.char.encode(task_id, "utf-8") + inputs["task_id"] = np.full((prompts.shape[0], len([task_id])), task_id) + + if lora_uids is not None: + lora_uids = np.char.encode(lora_uids, "utf-8") + inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids) + + with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: + result_dict = client.infer_batch(**inputs) + output_type = client.model_config.outputs[0].dtype + + if output_type == np.bytes_: + sentences = np.char.decode(result_dict["outputs"].astype("bytes"), "utf-8") + return sentences + else: + return result_dict["outputs"] + + +def query_llm_streaming( + url, + model_name, + prompts, + stop_words_list=None, + bad_words_list=None, + no_repeat_ngram_size=None, + max_output_token=512, + top_k=1, + top_p=0.0, + temperature=1.0, + random_seed=None, + task_id=None, + lora_uids=None, + init_timeout=60.0, +): + prompts = str_list2numpy(prompts) + inputs = {"prompts": prompts} + + if max_output_token is not None: + inputs["max_output_token"] = np.full(prompts.shape, max_output_token, dtype=np.int_) + + if top_k is not None: + inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_) + + if top_p is not None: + inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single) + + if temperature is not None: + inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single) + + if random_seed is not None: + inputs["random_seed"] = np.full(prompts.shape, random_seed, dtype=np.int_) + + if stop_words_list is not None: + stop_words_list = np.char.encode(stop_words_list, "utf-8") + inputs["stop_words_list"] = np.full((prompts.shape[0], len(stop_words_list)), stop_words_list) + + if bad_words_list is not None: + bad_words_list = np.char.encode(bad_words_list, "utf-8") + 
inputs["bad_words_list"] = np.full((prompts.shape[0], len(bad_words_list)), bad_words_list) + + if no_repeat_ngram_size is not None: + inputs["no_repeat_ngram_size"] = np.full(prompts.shape, no_repeat_ngram_size, dtype=np.single) + + if task_id is not None: + task_id = np.char.encode(task_id, "utf-8") + inputs["task_id"] = np.full((prompts.shape[0], len([task_id])), task_id) + + if lora_uids is not None: + lora_uids = np.char.encode(lora_uids, "utf-8") + inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids) + + with DecoupledModelClient(url, model_name, init_timeout_s=init_timeout) as client: + for partial_result_dict in client.infer_batch(**inputs): + output_type = client.model_config.outputs[0].dtype + if output_type == np.bytes_: + sentences = np.char.decode(partial_result_dict["outputs"].astype("bytes"), "utf-8") + yield sentences + else: + yield partial_result_dict["outputs"] + + +def query(argv): + args = get_args(argv) + + if args.prompt_file is not None: + with open(args.prompt_file, "r") as f: + args.prompt = f.read() + + if args.enable_streaming: + output_generator = query_llm_streaming( + url=args.url, + model_name=args.model_name, + prompts=[args.prompt], + stop_words_list=None if args.stop_words_list is None else [args.stop_words_list], + bad_words_list=None if args.bad_words_list is None else [args.bad_words_list], + no_repeat_ngram_size=args.no_repeat_ngram_size, + max_output_token=args.max_output_token, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + task_id=args.task_id, + lora_uids=args.lora_task_uids, + init_timeout=args.init_timeout, + ) + # The query returns a generator that yields one array per model step, + # with the partial generated text in the last dimension. Print that partial text + # incrementally and compare it with all the text generated so far. 
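+        # Note: this script sends a single prompt, so output[0][0] below selects the
+        # partial text for that one prompt at each streaming step.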
+ prev_output = '' + for output in output_generator: + cur_output = output[0][0] + if prev_output == '' or cur_output.startswith(prev_output): + print(cur_output[len(prev_output) :], end='', flush=True) + else: + print("WARN: Partial output mismatch, restarting output...") + print(cur_output, end='', flush=True) + prev_output = cur_output + print() + + else: + outputs = query_llm( + url=args.url, + model_name=args.model_name, + prompts=[args.prompt], + stop_words_list=None if args.stop_words_list is None else [args.stop_words_list], + bad_words_list=None if args.bad_words_list is None else [args.bad_words_list], + no_repeat_ngram_size=args.no_repeat_ngram_size, + max_output_token=args.max_output_token, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + task_id=args.task_id, + lora_uids=args.lora_task_uids, + init_timeout=args.init_timeout, + ) + print(outputs[0][0]) + + +if __name__ == '__main__': + query(sys.argv[1:]) From a6db8dbabec50ee151d94e1352df8c078874fbfb Mon Sep 17 00:00:00 2001 From: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Date: Tue, 9 Apr 2024 21:30:10 -0400 Subject: [PATCH 04/39] add check if pos embed (#8857) Signed-off-by: jiemingz Co-authored-by: jiemingz Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> --- .../nlp/models/language_modeling/megatron_base_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index baa6e30af81d..854c5ee02e31 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -775,7 +775,8 @@ def get_config_arg(key: str, default_value: Optional[Any] = None) -> Any: if parallel_state.is_pipeline_first_stage(ignore_virtual=True): if self.mcore_gpt: fp32_params.append(modules[0].shared_embedding_or_output_weight()) - fp32_params.append(modules[0].embedding.position_embeddings.weight) + if modules[0].embedding.add_position_embedding: + fp32_params.append(modules[0].embedding.position_embeddings.weight) else: fp32_params.append(modules[0].word_embeddings_weight()) fp32_params.append(modules[0].position_embeddings_weight()) From 0ea94f78b39eec76cfa9bf9df3126328a93337c4 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Tue, 9 Apr 2024 18:36:22 -0700 Subject: [PATCH 05/39] Enable DGRAD RS overlap (#8840) * Enable DGRAD RS overlap Signed-off-by: Jaemin Choi * Support cases where TE version is new but NeMo/MCore is not Signed-off-by: Jaemin Choi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clean up syntax Signed-off-by: Jaemin Choi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jaemin Choi Co-authored-by: Jaemin Choi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../gpt_full_te_layer_autocast_spec.py | 30 ++++++++++++++++--- .../modules/common/megatron/transformer.py | 30 ++++++++++++++++--- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index f89cbedf9f5d..a6d422a3f2d4 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -123,8 +123,18 @@ def __init__( } te_version = packaging.version.Version(version("transformer-engine")) if te_version > packaging.version.Version("1.5.0"): - transformer_layer_args["ub_overlap_ag"] = kwargs.get("ub_overlap_ag", True) - transformer_layer_args["ub_overlap_rs"] = kwargs.get("ub_overlap_rs", True) + for comm in ["ag", "rs"]: + ub_overlap_flag = "ub_overlap_" + comm + split_gemm_flag = "ub_split_" + comm + atomic_gemm_flag = "ub_atomic_gemm_" + comm + # Use old overlap flags if they were supplied instead + if ub_overlap_flag in kwargs: + transformer_layer_args[ub_overlap_flag] = kwargs[ub_overlap_flag] + else: + transformer_layer_args[ub_overlap_flag] = kwargs.get(split_gemm_flag, True) or kwargs.get( + atomic_gemm_flag, False + ) + transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) else: transformer_layer_args["ub_split_ag"] = kwargs.get("ub_split_ag", True) transformer_layer_args["ub_split_rs"] = kwargs.get("ub_split_rs", True) @@ -204,8 +214,20 @@ def __init__(self, config, layer_number=1, hidden_dropout=None): } te_version = packaging.version.Version(version("transformer-engine")) if te_version > packaging.version.Version("1.5.0"): - transformer_layer_args["ub_overlap_ag"] = config.tp_comm_overlap_ag - transformer_layer_args["ub_overlap_rs"] = config.tp_comm_overlap_rs + # Use old overlap flags if they were supplied instead + transformer_layer_args["ub_overlap_ag"] = ( + config.tp_comm_overlap_ag + if hasattr(config, "tp_comm_overlap_ag") + else config.tp_comm_split_ag or config.tp_comm_atomic_ag + ) + transformer_layer_args["ub_overlap_rs"] = ( + config.tp_comm_overlap_rs + if hasattr(config, "tp_comm_overlap_rs") + else config.tp_comm_split_rs or config.tp_comm_atomic_rs + ) + transformer_layer_args["ub_overlap_rs_dgrad"] = ( + config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False + ) else: transformer_layer_args["ub_split_ag"] = config.tp_comm_split_ag transformer_layer_args["ub_split_rs"] = config.tp_comm_split_rs diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index d37c1e75d341..b33a996b7987 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -839,8 +839,18 @@ def __init__( } te_version = packaging.version.Version(version("transformer-engine")) if te_version > packaging.version.Version("1.5.0"): - transformer_layer_args["ub_overlap_ag"] = kwargs.get("ub_overlap_ag", True) - transformer_layer_args["ub_overlap_rs"] = kwargs.get("ub_overlap_rs", True) + for comm in ["ag", "rs"]: + ub_overlap_flag = "ub_overlap_" + comm + split_gemm_flag = "ub_split_" + comm + atomic_gemm_flag = "ub_atomic_gemm_" + comm + # Use old overlap flags if they were supplied instead + if ub_overlap_flag in kwargs: + transformer_layer_args[ub_overlap_flag] = kwargs[ub_overlap_flag] + else: + transformer_layer_args[ub_overlap_flag] = kwargs.get(split_gemm_flag, True) or kwargs.get( + atomic_gemm_flag, False + ) + transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) else: transformer_layer_args["ub_split_ag"] = kwargs.get("ub_split_ag", True) transformer_layer_args["ub_split_rs"] = kwargs.get("ub_split_rs", True) @@ -1099,8 
+1109,20 @@ def build_layer(layer_number): } te_version = packaging.version.Version(version("transformer-engine")) if te_version > packaging.version.Version("1.5.0"): - transformer_layer_args["ub_overlap_ag"] = config.tp_comm_overlap_ag - transformer_layer_args["ub_overlap_rs"] = config.tp_comm_overlap_rs + # Use old overlap flags if they were supplied instead + transformer_layer_args["ub_overlap_ag"] = ( + config.tp_comm_overlap_ag + if hasattr(config, "tp_comm_overlap_ag") + else config.tp_comm_split_ag or config.tp_comm_atomic_ag + ) + transformer_layer_args["ub_overlap_rs"] = ( + config.tp_comm_overlap_rs + if hasattr(config, "tp_comm_overlap_rs") + else config.tp_comm_split_rs or config.tp_comm_atomic_rs + ) + transformer_layer_args["ub_overlap_rs_dgrad"] = ( + config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False + ) else: transformer_layer_args["ub_split_ag"] = config.tp_comm_split_ag transformer_layer_args["ub_split_rs"] = config.tp_comm_split_rs From 6b660e74439f96f987034ae1ecbf9e837dbff02f Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Wed, 10 Apr 2024 08:02:38 -0400 Subject: [PATCH 06/39] fix precision of output model in conversion scripts (#8855) Signed-off-by: Chen Cui --- scripts/checkpoint_converters/convert_bert_hf_to_nemo.py | 3 +++ scripts/checkpoint_converters/convert_gemma_hf_to_nemo.py | 3 +++ scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py | 3 +++ scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py | 3 +++ 4 files changed, 12 insertions(+) diff --git a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py index 24294cfdfb85..278f7b879b28 100644 --- a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py @@ -31,6 +31,7 @@ from transformers import AutoModel from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import logging @@ -238,6 +239,8 @@ def convert(args): nemo_state_dict['model.language_model.embedding.word_embeddings.weight'] = padded_embedding model.load_state_dict(nemo_state_dict, strict=True) + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) model.save_to(args.output_path) logging.info(f'NeMo model saved to: {args.output_path}') diff --git a/scripts/checkpoint_converters/convert_gemma_hf_to_nemo.py b/scripts/checkpoint_converters/convert_gemma_hf_to_nemo.py index 9e2eb5e3a797..de12aefd1844 100644 --- a/scripts/checkpoint_converters/convert_gemma_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_gemma_hf_to_nemo.py @@ -30,6 +30,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import logging @@ -259,6 +260,8 @@ def convert(args): ), f'prediction mismatch: {hf_tokenizer.decode(hf_next_token)} != {hf_tokenizer.decode(next_token)}' logging.info(f'=' * 100) + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) model.save_to(args.output_path) logging.info(f'NeMo model saved 
to: {args.output_path}') diff --git a/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py b/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py index eeefbd215a1a..c35906dc78c1 100644 --- a/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py +++ b/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py @@ -33,6 +33,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import logging @@ -224,6 +225,8 @@ def convert(args): nemo_state_dict = adjust_tensor_shapes(model, new_state_dict) model.load_state_dict(nemo_state_dict, strict=False) + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) model.save_to(args.output_path) logging.info(f'NeMo model saved to: {args.output_path}') diff --git a/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py index b0dddcc60233..583ee7893c0f 100644 --- a/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py +++ b/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py @@ -33,6 +33,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import logging PAD_TOKEN_ID = -1 @@ -303,6 +304,8 @@ def convert(args): ) assert torch.argmax(nemo_outputs[0, -1], dim=-1) == pyt_outputs, "Predicted next token not match." + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) model.save_to(args.output_path) logging.info(f'NeMo model saved to: {args.output_path}') From 95a0a3e2b6b72f8fd6941a7de176029821d5cc3e Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Wed, 10 Apr 2024 19:01:38 +0300 Subject: [PATCH 07/39] NeMo upgrade to ToT mcore & ToT TE (#8755) * add mcore dataset updates Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix mcore import Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * update mcore installation Signed-off-by: dimapihtar * update mcore installation Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update apex, TE & PyT Signed-off-by: dimapihtar * setup pythonpath for mcore Signed-off-by: dimapihtar * add mcore to python path Signed-off-by: dimapihtar * add mcore to pythonpath Signed-off-by: dimapihtar * update pythonpath for mcore Signed-off-by: dimapihtar * change pythonpath for mcore Signed-off-by: dimapihtar * update mcore pythonpath Signed-off-by: dimapihtar * update mcore pythonpath Signed-off-by: dimapihtar * revert mcore ds changes Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * add qk_layernorm support for Falcon self attn submodule Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * code style changes Signed-off-by: dimapihtar * add nemo implementation for get_gpt_layer_ammo_spec Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci * fix typo Signed-off-by: dimapihtar * skip Llama2 - INT8 SQ test Signed-off-by: dimapihtar * skip Llama2 - INT8 SQ test Signed-off-by: dimapihtar * comment out NeMo PTQ test Signed-off-by: dimapihtar * bert mcore updates Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add qk_layernorm support for bert's self attention submodule Signed-off-by: dimapihtar * add qk_layernorm support for bert's self attn submodule Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change mcore commit Signed-off-by: dimapihtar * switch back to mcore original Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * bugfix Signed-off-by: dimapihtar * update TE Signed-off-by: dimapihtar * change legacy model to mcore based model for lora Signed-off-by: dimapihtar * remove unnecessary files Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * uncomment PTQ tests Signed-off-by: dimapihtar * remove sbert Signed-off-by: dimapihtar * switch back to mcore main Signed-off-by: dimapihtar * remove unused variable Signed-off-by: dimapihtar * comment out CUDA Graph test Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: Pablo Garay --- Jenkinsfile | 111 ++++++++---------- .../megatron/gpt_fim_dataset.py | 5 - .../megatron_bert_embedding_model.py | 4 +- .../megatron/bert/bert_model.py | 19 +-- .../megatron/bert/bert_spec.py | 9 +- .../megatron/falcon/falcon_spec.py | 5 +- .../gpt_full_te_layer_autocast_spec.py | 2 +- .../megatron/gpt_layer_ammo_spec.py | 77 ++++++++++++ .../language_modeling/megatron_bert_model.py | 2 +- .../language_modeling/megatron_gpt_model.py | 28 ++--- 10 files changed, 168 insertions(+), 94 deletions(-) create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_ammo_spec.py diff --git a/Jenkinsfile b/Jenkinsfile index 14f9a38a9c17..431bc24907ed 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,7 +1,7 @@ pipeline { agent { docker { - image 'nvcr.io/nvidia/pytorch:24.01-py3' + image 'nvcr.io/nvidia/pytorch:24.02-py3' args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1' } } @@ -63,44 +63,35 @@ pipeline { } } - // Transformer Engine 1.2.0 stage('Transformer Engine installation') { steps { sh 'git clone https://github.com/NVIDIA/TransformerEngine.git && \ cd TransformerEngine && \ - git fetch origin 8c9abbb80dba196f086b8b602a7cf1bce0040a6a && \ + git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \ git checkout FETCH_HEAD && \ git submodule init && git submodule update && \ NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .' 
} } - // Apex bugfix for PyTorch 23.11 container: https://github.com/NVIDIA/apex/pull/1760 stage('Apex installation') { steps { sh 'git clone https://github.com/NVIDIA/apex.git && \ cd apex && \ - git checkout c07a4cf67102b9cd3f97d1ba36690f985bae4227 && \ + git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \ cp -R apex /usr/local/lib/python3.10/dist-packages' } } - stage('Pytorch lightning installation') { - steps { - sh 'git clone -b bug_fix https://github.com/athitten/pytorch-lightning.git && \ - cd pytorch-lightning && \ - PACKAGE_NAME=pytorch pip install -e .' - } - } - - // pip package should be working with main, if not we can update the commit here - // until the pip package is updated stage('Megatron Core installation') { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout a5415fcfacef2a37416259bd38b7c4b673583675 && \ - pip install .' + git checkout 7fe863f3d94f7b64a927b04b85f5c9339d3fb784 && \ + pip install . && \ + cd megatron/core/datasets && \ + make' + sh 'export PYTHONPATH="${PYTHONPATH}:/mnt/D3/JenkinsWorkDir/workspace/NeMo-multibranch_${GIT_BRANCH}/Megatron-LM"' } } @@ -217,48 +208,48 @@ pipeline { sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" } } - stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ - trainer.precision=16 \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \ - trainer.max_steps=20 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.synthetic_data=True \ - model.first_stage_key=images_moments \ - model.cond_stage_key=clip_encoded \ - model.optim.name=megatron_fused_adam \ - +model.optim.capturable=True \ - exp_manager.ema.enable=False \ - model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ - ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ - ++model.cond_stage_config.max_length=77 \ - model.inductor=False \ - ~model.cond_stage_config.restore_from_path \ - ~model.cond_stage_config.freeze \ - ~model.cond_stage_config.layer \ - model.first_stage_config.from_pretrained=null \ - model.ddp_overlap=False \ - model.capture_cudagraph_iters=15 \ - model.unet_config.use_flash_attention=False \ - model.unet_config.attention_resolutions=[1] \ - model.unet_config.channel_mult=[1] \ - " - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - } - } + //stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') { + // when { + // anyOf { + // branch 'main' + // changeRequest target: 'main' + // } + // } + // failFast true + // steps { + // sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" + // sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ + // trainer.precision=16 \ + // trainer.num_nodes=1 \ + // trainer.devices=1 \ + // ++exp_manager.max_time_per_run=00:00:03:00 \ + // exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \ + // trainer.max_steps=20 \ + // model.micro_batch_size=1 \ + // model.global_batch_size=1 \ + // model.data.synthetic_data=True \ + // 
model.first_stage_key=images_moments \ + // model.cond_stage_key=clip_encoded \ + // model.optim.name=megatron_fused_adam \ + // +model.optim.capturable=True \ + // exp_manager.ema.enable=False \ + // model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ + // ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ + // ++model.cond_stage_config.max_length=77 \ + // model.inductor=False \ + // ~model.cond_stage_config.restore_from_path \ + // ~model.cond_stage_config.freeze \ + // ~model.cond_stage_config.layer \ + // model.first_stage_config.from_pretrained=null \ + // model.ddp_overlap=False \ + // model.capture_cudagraph_iters=15 \ + // model.unet_config.use_flash_attention=False \ + // model.unet_config.attention_resolutions=[1] \ + // model.unet_config.channel_mult=[1] \ + // " + // sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" + // } + //} // stage('L2: Multimodal ControlNet Train') { // when { // anyOf { @@ -4654,7 +4645,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ model.sequence_parallel=true \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ model.peft.peft_scheme='lora' \ model.answer_only_loss=True \ model.micro_batch_size=1 \ diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py index 8862b52ee84b..474761c41d67 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py @@ -33,11 +33,6 @@ IMPORT_ERROR = e -# is_dataset_built_on_rank function is needed for mcore GPTDatasetConfig -def is_dataset_built_on_rank(): - return True - - class GPTFIMDatasetConfig(GPTDatasetConfig): """Configuration object for Megatron Core GPT FIM datasets diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py index 5d8ff1d305bd..849438d408a5 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py @@ -227,7 +227,7 @@ def setup(self, stage=None): for i, module in enumerate(self.model): parallel_state.set_virtual_pipeline_model_parallel_rank(i) sync_embeddings = ( - module.initialize_last_stage_with_word_embeddings + module.setup_embeddings_and_output_layer if self.mcore_bert else module.sync_initial_word_embeddings ) @@ -235,7 +235,7 @@ def setup(self, stage=None): parallel_state.set_virtual_pipeline_model_parallel_rank(0) else: sync_embeddings = ( - self.model.initialize_last_stage_with_word_embeddings + self.model.setup_embeddings_and_output_layer if self.mcore_bert else self.model.sync_initial_word_embeddings ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py index 0fed19dd7718..749d960b9729 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py @@ -347,17 +347,19 @@ def __init__(self, 
transformer_block_type='pre-ln', add_pooler=True, *args, **kw # Output if self.post_process: # TODO: Make sure you are passing in the mpu_vocab_size properly - self.lm_head = MCoreBertLMHead( + self.lm_head = MCoreBertLMHead(self.config.hidden_size, self.config,) + + self.output_layer = tensor_parallel.ColumnParallelLinear( self.config.hidden_size, - self.config, - self.parallel_output, self.vocab_size, - self.pre_process, - self.share_embeddings_and_output_weights, + config=self.config, + init_method=self.config.init_method, + bias=True, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights, ) - self.output_layer = self.lm_head.output_layer - self.binary_head = None if self.add_binary_head: # TODO: Shoudl switch this to TE ? @@ -412,7 +414,8 @@ def forward( if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() - logits = self.lm_head(hidden_states=hidden_states, word_embeddings_weight=output_weight) + hidden_states_after_lm_head = self.lm_head(hidden_states=hidden_states) + logits, _ = self.output_layer(hidden_states_after_lm_head, weight=output_weight) binary_logits = None if self.binary_head is not None and self.add_pooler: diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py index 31fd62126c15..58ea9c26fbcf 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py @@ -26,6 +26,7 @@ ) from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec @@ -51,6 +52,8 @@ linear_qkv=TEColumnParallelLinear, core_attention=TEDotProductAttention, linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, ), ), self_attn_bda=get_bias_dropout_add, @@ -71,7 +74,11 @@ module=SelfAttention, params={"attn_mask_type": AttnMaskType.padding}, submodules=SelfAttentionSubmodules( - linear_qkv=ColumnParallelLinear, core_attention=DotProductAttention, linear_proj=RowParallelLinear, + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, ), ), self_attn_bda=get_bias_dropout_add, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 924e5f4321e6..cf0c4c4d99ef 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -24,9 +24,9 @@ TERowParallelLinear, ) from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec - from megatron.core.transformer.transformer_layer import TransformerLayerSubmodules HAVE_MEGATRON_CORE = True @@ -39,6 +39,7 @@ from .falcon_decoder_layer import FalconTransformerLayer + # Use this spec for an implementation using modules in TE def 
get_falcon_layer_spec() -> ModuleSpec: if not HAVE_MEGATRON_CORE: @@ -54,6 +55,8 @@ def get_falcon_layer_spec() -> ModuleSpec: linear_qkv=TEColumnParallelLinear, core_attention=TEDotProductAttention, linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, ), ), self_attn_bda=get_bias_dropout_add, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index a6d422a3f2d4..19766e4a34ca 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -287,7 +287,7 @@ def _get_layer_offset(self): return offset - def sharded_state_dict(self, prefix: str = '', sharded_offsets: tuple = ()): + def sharded_state_dict(self, prefix: str = '', sharded_offsets: tuple = (), metadata=None): TENSOR_PARALLEL_LAYERS_AXIS_MAP = { 'self_attention.layernorm_qkv.weight': 0, 'self_attention.layernorm_qkv.bias': 0, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_ammo_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_ammo_spec.py new file mode 100644 index 000000000000..e51ecaba463a --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_ammo_spec.py @@ -0,0 +1,77 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add + from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear + from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules + from megatron.core.transformer.custom_layers.transformer_engine import TENorm + from megatron.core.transformer.dot_product_attention import DotProductAttention + from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.identity_op import IdentityOp + from megatron.core.transformer.mlp import MLP, MLPSubmodules + from megatron.core.transformer.spec_utils import ModuleSpec + from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError) as e: + + TransformerLayer = TransformerLayerSubmodules = ApexGuardDefaults + MLP = MLPSubmodules = ModuleSpec = IdentityOp = ApexGuardDefaults + AttnMaskType = DotProductAttention = TENorm = ApexGuardDefaults + ColumnParallelLinear = RowParallelLinear = SelfAttention = SelfAttentionSubmodules = ApexGuardDefaults + + HAVE_MEGATRON_CORE = False + IMPORT_ERROR = e + +# Use this spec for AMMO PTQ and TensorRT-LLM export +def get_gpt_layer_ammo_spec() -> ModuleSpec: + """Mix the native spec with TENorm. + + This is essentially the native local spec except for the layernorm implementation + is using TENorm from Transformer-Engine. 
This TENorm supports both FusedLayerNorm and RMSNorm and + prevents the apex dependency. + """ + if not HAVE_MEGATRON_CORE: + raise Exception(IMPORT_ERROR) + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, + # Map TE-layernorm-fusion keys back + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index fb02223112d6..82b2b1a96ff4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -806,7 +806,7 @@ def setup(self, stage=None): if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: parallel_state.set_virtual_pipeline_model_parallel_rank(index) sync_embeddings = ( - module.initialize_last_stage_with_word_embeddings + module.setup_embeddings_and_output_layer if self.mcore_bert else module.sync_initial_word_embeddings ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index ae33cc6761e9..6648abac8ee0 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -36,15 +36,12 @@ MegatronPretrainingSampler, ) from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import build_train_valid_test_datasets -from nemo.collections.nlp.data.language_modeling.megatron.gpt_fim_dataset import ( - GPTFIMDataset, - GPTFIMDatasetConfig, - is_dataset_built_on_rank, -) +from nemo.collections.nlp.data.language_modeling.megatron.gpt_fim_dataset import GPTFIMDataset, GPTFIMDatasetConfig from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_full_te_layer_autocast_spec import ( get_gpt_full_te_layer_autocast_spec, ) +from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_ammo_spec import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.modules.common.megatron.build_model import build_model @@ -92,7 +89,9 @@ from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset - from megatron.core.deploy.gpt.model_specs import get_gpt_layer_ammo_spec + + # NeMo's implementation of the get_gpt_layer_ammo_spec function is temporarily 
used + # from megatron.core.inference.gpt.model_specs import get_gpt_layer_ammo_spec from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.models.gpt.gpt_layer_specs import ( get_gpt_layer_local_spec, @@ -1375,9 +1374,11 @@ def build_train_valid_test_datasets(self): tokenizer=self.tokenizer, ) else: + # Function needed for mcore GPTDataset + is_dataset_built_on_rank = lambda: True + mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False kwargs = { - "is_built_on_rank": is_dataset_built_on_rank, "random_seed": self.cfg.seed, "sequence_length": self.cfg.data.seq_length, "path_to_cache": self.cfg.data.index_mapping_dir, @@ -1399,17 +1400,14 @@ def build_train_valid_test_datasets(self): if self.cfg.data.get('add_fim', False): dataset_config = GPTFIMDatasetConfig(self.cfg.data.fim, **kwargs) - - self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( - GPTFIMDataset, train_valid_test_num_samples, dataset_config, - ).build() + dataset_type = GPTFIMDataset else: dataset_config = GPTDatasetConfig(**kwargs) dataset_type = MockGPTDataset if mock_dataset else GPTDataset - self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( - dataset_type, train_valid_test_num_samples, dataset_config, - ).build() + self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( + dataset_type, train_valid_test_num_samples, is_dataset_built_on_rank, dataset_config, + ).build() if self._train_ds is not None: logging.info(f'Length of train dataset: {len(self._train_ds)}') @@ -1746,7 +1744,7 @@ def initialize_last_rank_embeddings(self): if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: parallel_state.set_virtual_pipeline_model_parallel_rank(index) sync_embeddings = ( - module.initialize_last_stage_with_word_embeddings + module.setup_embeddings_and_output_layer if self.mcore_gpt else module.sync_initial_word_embeddings ) From b33af25bdd1425eb42b96dc3aa06211c830b5278 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 10 Apr 2024 21:03:16 +0400 Subject: [PATCH 08/39] Use Label-Looping algorithm for RNN-T decoding by default (#8831) * Use Label-Looping algorithm for RNN-T decoding by default * Fix loop labels + stateless decoding --------- Signed-off-by: Vladimir Bataev --- nemo/collections/asr/modules/rnnt.py | 4 +- .../asr/parts/submodules/rnnt_decoding.py | 2 +- .../parts/submodules/rnnt_greedy_decoding.py | 8 +- .../submodules/rnnt_loop_labels_computer.py | 15 +- .../submodules/tdt_loop_labels_computer.py | 15 +- .../test_asr_hybrid_rnnt_ctc_model_char.py | 63 ++++++-- .../asr/test_asr_rnnt_encdec_model.py | 139 ++++++++++++++---- 7 files changed, 173 insertions(+), 73 deletions(-) diff --git a/nemo/collections/asr/modules/rnnt.py b/nemo/collections/asr/modules/rnnt.py index 948760e68b30..5a7457f6379d 100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -310,7 +310,9 @@ def score_hypothesis( def initialize_state(self, y: torch.Tensor) -> List[torch.Tensor]: batch = y.size(0) - state = [torch.ones([batch, self.context_size], dtype=torch.long, device=y.device) * self.blank_idx] + # state contains context_size - 1 elements for each utterance in batch, + # consistent with the state returned from StatelessNet.forward + state = [torch.ones([batch, self.context_size - 1], dtype=torch.long, device=y.device) * self.blank_idx] return state def batch_initialize_states(self, batch_states: List[torch.Tensor], 
decoder_states: List[List[torch.Tensor]]): diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index ad71e5371f01..7a260f3c6c89 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -319,7 +319,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): preserve_alignments=self.preserve_alignments, preserve_frame_confidence=self.preserve_frame_confidence, confidence_method_cfg=self.confidence_method_cfg, - loop_labels=self.cfg.greedy.get('loop_labels', False), + loop_labels=self.cfg.greedy.get('loop_labels', True), use_cuda_graph_decoder=self.cfg.greedy.get('use_cuda_graph_decoder', False), ) else: diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index d69ed1c41049..464dc46e358c 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -568,9 +568,9 @@ class GreedyBatchedRNNTInfer(_GreedyRNNTInfer): - 'lin' for using the linear mapping. - 'exp' for using exponential mapping with linear shift. loop_labels: Switching between decoding algorithms. Both algorithms produce equivalent results. - loop_labels=True algorithm is faster (especially for large batches) but can use a bit more memory + loop_labels=True (default) algorithm is faster (especially for large batches) but can use a bit more memory (negligible overhead compared to the amount of memory used by the encoder). - loop_labels=False (default) is an implementation of a traditional decoding algorithm, which iterates over + loop_labels=False is an implementation of a traditional decoding algorithm, which iterates over frames (encoder output vectors), and in the inner loop, decodes labels for the current frame one by one, stopping when is found. 
loop_labels=True iterates over labels, on each step finding the next non-blank label @@ -588,7 +588,7 @@ def __init__( preserve_alignments: bool = False, preserve_frame_confidence: bool = False, confidence_method_cfg: Optional[DictConfig] = None, - loop_labels: bool = False, + loop_labels: bool = True, use_cuda_graph_decoder: bool = False, ): super().__init__( @@ -2299,7 +2299,7 @@ class GreedyBatchedRNNTInferConfig: preserve_alignments: bool = False preserve_frame_confidence: bool = False confidence_method_cfg: Optional[ConfidenceMethodConfig] = field(default_factory=lambda: ConfidenceMethodConfig()) - loop_labels: bool = False + loop_labels: bool = True use_cuda_graph_decoder: bool = False def __post_init__(self): diff --git a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py index 89b474e0f8ba..92cb8a36aeb5 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py @@ -283,21 +283,12 @@ def loop_labels_torch( became_inactive_mask = torch.empty_like(active_mask) # loop while there are active utterances - first_step = True while active_mask.any(): active_mask_prev.copy_(active_mask, non_blocking=True) # stage 1: get decoder (prediction network) output - if first_step: - # start of the loop, SOS symbol is passed into prediction network, state is None - # we need to separate this for torch.jit - decoder_output, state, *_ = self.decoder.predict( - labels.unsqueeze(1), None, add_sos=False, batch_size=batch_size - ) - first_step = False - else: - decoder_output, state, *_ = self.decoder.predict( - labels.unsqueeze(1), state, add_sos=False, batch_size=batch_size - ) + decoder_output, state, *_ = self.decoder.predict( + labels.unsqueeze(1), state, add_sos=False, batch_size=batch_size + ) decoder_output = self.joint.project_prednet(decoder_output) # do not recalculate joint projection # stage 2: get joint output, iteratively seeking for non-blank labels diff --git a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py index e95ea48d15fe..c289ce06cdfa 100644 --- a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py @@ -294,21 +294,12 @@ def loop_labels_torch( became_inactive_mask = torch.empty_like(active_mask) # loop while there are active utterances - first_step = True while active_mask.any(): active_mask_prev.copy_(active_mask, non_blocking=True) # stage 1: get decoder (prediction network) output - if first_step: - # start of the loop, SOS symbol is passed into prediction network, state is None - # we need to separate this for torch.jit - decoder_output, state, *_ = self.decoder.predict( - labels.unsqueeze(1), None, add_sos=False, batch_size=batch_size - ) - first_step = False - else: - decoder_output, state, *_ = self.decoder.predict( - labels.unsqueeze(1), state, add_sos=False, batch_size=batch_size - ) + decoder_output, state, *_ = self.decoder.predict( + labels.unsqueeze(1), state, add_sos=False, batch_size=batch_size + ) decoder_output = self.joint.project_prednet(decoder_output) # do not recalculate joint projection # stage 2: get joint output, iteratively seeking for non-blank labels diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py index 
60f807dc7b3e..85156bf9e2c5 100644 --- a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py +++ b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +from typing import Optional import pytest import torch @@ -309,9 +310,14 @@ def test_BeamRNNTInferConfig(self): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding(self, greedy_class): + def test_greedy_decoding(self, greedy_class, loop_labels: Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -330,7 +336,10 @@ def test_greedy_decoding(self, greedy_class): decoder = RNNTDecoder(prednet_cfg, vocab_size) joint_net = RNNTJoint(jointnet_cfg, vocab_size, vocabulary=token_list) - greedy = greedy_class(decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} + greedy = greedy_class( + decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5, **additional_decoding_kwargs + ) # (B, D, T) enc_out = torch.randn(1, encoder_output_size, 30) @@ -381,9 +390,15 @@ def test_greedy_multi_decoding(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_stateless_decoder(self, greedy_class): + @pytest.mark.parametrize("context_size", [1, 2]) + def test_greedy_decoding_stateless_decoder(self, greedy_class, loop_labels: Optional[bool], context_size: int): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -391,7 +406,7 @@ def test_greedy_decoding_stateless_decoder(self, greedy_class): decoder_output_size = 4 joint_output_shape = 4 - prednet_cfg = {'pred_hidden': decoder_output_size, 'pred_rnn_layers': 1} + prednet_cfg = {'pred_hidden': decoder_output_size, 'pred_rnn_layers': 1, 'context_size': context_size} jointnet_cfg = { 'encoder_hidden': encoder_output_size, 'pred_hidden': decoder_output_size, @@ -402,7 +417,10 @@ def test_greedy_decoding_stateless_decoder(self, greedy_class): decoder = StatelessTransducerDecoder(prednet_cfg, vocab_size) joint_net = RNNTJoint(jointnet_cfg, vocab_size, vocabulary=token_list) - greedy = greedy_class(decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} + greedy = greedy_class( + decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5, **additional_decoding_kwargs + ) # (B, D, T) enc_out = torch.randn(1, encoder_output_size, 30) @@ -453,9 +471,14 @@ def test_greedy_multi_decoding_stateless_decoder(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + 
(greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_preserve_alignment(self, greedy_class): + def test_greedy_decoding_preserve_alignment(self, greedy_class, loop_labels: Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -474,8 +497,14 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class): decoder = RNNTDecoder(prednet_cfg, vocab_size) joint_net = RNNTJoint(jointnet_cfg, vocab_size, vocabulary=token_list) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} greedy = greedy_class( - decoder, joint_net, blank_index=len(token_list) - 1, preserve_alignments=True, max_symbols_per_step=5 + decoder, + joint_net, + blank_index=len(token_list) - 1, + preserve_alignments=True, + max_symbols_per_step=5, + **additional_decoding_kwargs, ) # (B, D, T) @@ -591,9 +620,14 @@ def test_beam_decoding_preserve_alignments(self, beam_config): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_SampledRNNTJoint(self, greedy_class): + def test_greedy_decoding_SampledRNNTJoint(self, greedy_class, loop_labels: Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -612,7 +646,10 @@ def test_greedy_decoding_SampledRNNTJoint(self, greedy_class): decoder = RNNTDecoder(prednet_cfg, vocab_size) joint_net = SampledRNNTJoint(jointnet_cfg, vocab_size, n_samples=2, vocabulary=token_list) - greedy = greedy_class(decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} + greedy = greedy_class( + decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5, **additional_decoding_kwargs + ) # (B, D, T) enc_out = torch.randn(1, encoder_output_size, 30) diff --git a/tests/collections/asr/test_asr_rnnt_encdec_model.py b/tests/collections/asr/test_asr_rnnt_encdec_model.py index d7c47adce1ad..d5ab0054ff87 100644 --- a/tests/collections/asr/test_asr_rnnt_encdec_model.py +++ b/tests/collections/asr/test_asr_rnnt_encdec_model.py @@ -73,7 +73,7 @@ def predict( return ( output, [ - torch.tensor([0] * self.vocab_size + [1], dtype=torch.float32)[None, None, :].exand( + torch.tensor([0] * self.vocab_size + [1], dtype=torch.float32)[None, None, :].expand( [1, batch_size, -1] ) ], @@ -90,22 +90,25 @@ def predict( ], ) - def initialize_state(self, y: torch.Tensor) -> Optional[List[torch.Tensor]]: - return None + def initialize_state(self, y: torch.Tensor) -> List[torch.Tensor]: + batch_size = y.shape[0] + # NB: .clone is necessary after .expand, since the decoding algorithm manipulates the state + # (replacing elements), and this requires the state to be a real full tensor + # (not an expanded view, in which different elements can refer to the same memory location) + return [ + torch.tensor([0] * self.vocab_size + [1], dtype=torch.float32)[None, None, :] + .expand([1, batch_size, -1]) + .clone() + ] def score_hypothesis( self, hypothesis: Hypothesis, cache: Dict[Tuple[int], Any] ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: return torch.tensor(), [torch.tensor()], torch.tensor() - def batch_select_state( - self, batch_states: Optional[List[torch.Tensor]], idx: int - ) -> 
Optional[List[List[torch.Tensor]]]: - if batch_states is not None: - states = [batch_states[0][:, idx]] - return [states] - else: - return None + def batch_select_state(self, batch_states: List[torch.Tensor], idx: int) -> Optional[List[List[torch.Tensor]]]: + states = [batch_states[0][:, idx]] + return [states] def batch_copy_states( self, @@ -126,6 +129,22 @@ def mask_select_states( return None return [states[0][:, mask]] + @classmethod + def batch_replace_states_mask( + cls, src_states: list[torch.Tensor], dst_states: list[torch.Tensor], mask: torch.Tensor, + ): + """Replace states in dst_states with states from src_states using the mask""" + for src_substate, dst_substate in zip(src_states, dst_states): + torch.where(mask.unsqueeze(0).unsqueeze(-1), src_substate, dst_substate, out=dst_substate) + + @classmethod + def batch_split_states(cls, batch_states: list[torch.Tensor]) -> list[list[torch.Tensor]]: + """ + Split states into a list of states. + Useful for splitting the final state for converting results of the decoding algorithm to Hypothesis class. + """ + return [sub_state.split(1, dim=1) for sub_state in batch_states] + class DummyRNNTJoint(AbstractRNNTJoint): def __init__(self, num_outputs: int): super().__init__() @@ -621,9 +640,15 @@ def test_greedy_multi_decoding(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_stateless_decoder(self, greedy_class): + @pytest.mark.parametrize("context_size", [1, 2]) + def test_greedy_decoding_stateless_decoder(self, greedy_class, loop_labels: Optional[bool], context_size: int): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -631,7 +656,7 @@ def test_greedy_decoding_stateless_decoder(self, greedy_class): decoder_output_size = 4 joint_output_shape = 4 - prednet_cfg = {'pred_hidden': decoder_output_size, 'pred_rnn_layers': 1} + prednet_cfg = {'pred_hidden': decoder_output_size, 'pred_rnn_layers': 1, 'context_size': context_size} jointnet_cfg = { 'encoder_hidden': encoder_output_size, 'pred_hidden': decoder_output_size, @@ -642,8 +667,14 @@ def test_greedy_decoding_stateless_decoder(self, greedy_class): decoder = StatelessTransducerDecoder(prednet_cfg, vocab_size) for joint_type in [RNNTJoint, HATJoint]: joint_net = joint_type(jointnet_cfg, vocab_size, vocabulary=token_list) - - greedy = greedy_class(decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} + greedy = greedy_class( + decoder, + joint_net, + blank_index=len(token_list) - 1, + max_symbols_per_step=5, + **additional_decoding_kwargs, + ) # (B, D, T) enc_out = torch.randn(1, encoder_output_size, 30) @@ -696,9 +727,14 @@ def test_greedy_multi_decoding_stateless_decoder(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_preserve_alignment(self, greedy_class): + def test_greedy_decoding_preserve_alignment(self, greedy_class, loop_labels: 
Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -719,13 +755,14 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class): max_symbols_per_step = 5 for joint_type in [RNNTJoint, HATJoint]: joint_net = joint_type(jointnet_cfg, vocab_size, vocabulary=token_list) - + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} greedy = greedy_class( decoder, joint_net, blank_index=len(token_list), preserve_alignments=True, max_symbols_per_step=max_symbols_per_step, + **additional_decoding_kwargs, ) # (B, D, T) @@ -760,9 +797,14 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_preserve_frame_confidence(self, greedy_class): + def test_greedy_decoding_preserve_frame_confidence(self, greedy_class, loop_labels: Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -784,12 +826,14 @@ def test_greedy_decoding_preserve_frame_confidence(self, greedy_class): for joint_type in [RNNTJoint, HATJoint]: joint_net = joint_type(jointnet_cfg, vocab_size, vocabulary=token_list) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} greedy = greedy_class( decoder, joint_net, blank_index=len(token_list), preserve_frame_confidence=True, max_symbols_per_step=max_symbols_per_step, + **additional_decoding_kwargs, ) # (B, D, T) @@ -827,10 +871,17 @@ def test_greedy_decoding_preserve_frame_confidence(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) @pytest.mark.parametrize("max_symbols_per_step", [1, 5]) - def test_greedy_decoding_max_symbols_alignment(self, max_symbols_setup, greedy_class, max_symbols_per_step): + def test_greedy_decoding_max_symbols_alignment( + self, max_symbols_setup, greedy_class, max_symbols_per_step: int, loop_labels: Optional[bool] + ): decoders = [max_symbols_setup["decoder"]] if greedy_class is greedy_decode.GreedyBatchedRNNTInfer: decoders.append(max_symbols_setup["decoder_masked"]) @@ -839,12 +890,14 @@ def test_greedy_decoding_max_symbols_alignment(self, max_symbols_setup, greedy_c encoded_lengths = max_symbols_setup["encoded_lengths"] for decoder in decoders: + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} greedy = greedy_class( decoder_model=decoder, joint_model=joint, blank_index=decoder.blank_idx, max_symbols_per_step=max_symbols_per_step, preserve_alignments=True, + **additional_decoding_kwargs, ) with torch.no_grad(): @@ -869,10 +922,17 @@ def test_greedy_decoding_max_symbols_alignment(self, max_symbols_setup, greedy_c ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) 
@pytest.mark.parametrize("max_symbols_per_step", [-1, 0]) - def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_class, max_symbols_per_step): + def test_greedy_decoding_max_symbols_confidence_incorrect_max_symbols( + self, max_symbols_setup, greedy_class, max_symbols_per_step: int, loop_labels: Optional[bool] + ): """Test ValueError for max_symbols_per_step <= 0""" decoders = [max_symbols_setup["decoder"]] if greedy_class is greedy_decode.GreedyBatchedRNNTInfer: @@ -880,6 +940,7 @@ def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_ joint = max_symbols_setup["joint"] for decoder in decoders: + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} with pytest.raises(ValueError): _ = greedy_class( decoder_model=decoder, @@ -887,6 +948,7 @@ def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_ blank_index=decoder.blank_idx, max_symbols_per_step=max_symbols_per_step, preserve_frame_confidence=True, + **additional_decoding_kwargs, ) @pytest.mark.skipif( @@ -894,10 +956,17 @@ def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_ ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) @pytest.mark.parametrize("max_symbols_per_step", [1, 5]) - def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_class, max_symbols_per_step): + def test_greedy_decoding_max_symbols_confidence( + self, max_symbols_setup, greedy_class, max_symbols_per_step: int, loop_labels: Optional[bool] + ): decoders = [max_symbols_setup["decoder"]] if greedy_class is greedy_decode.GreedyBatchedRNNTInfer: decoders.append(max_symbols_setup["decoder_masked"]) @@ -906,12 +975,14 @@ def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_ encoded_lengths = max_symbols_setup["encoded_lengths"] for decoder in decoders: + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} greedy = greedy_class( decoder_model=decoder, joint_model=joint, blank_index=decoder.blank_idx, max_symbols_per_step=max_symbols_per_step, preserve_frame_confidence=True, + **additional_decoding_kwargs, ) with torch.no_grad(): @@ -1035,9 +1106,14 @@ def test_beam_decoding_preserve_alignments(self, beam_config): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_SampledRNNTJoint(self, greedy_class): + def test_greedy_decoding_SampledRNNTJoint(self, greedy_class, loop_labels: Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -1056,7 +1132,10 @@ def test_greedy_decoding_SampledRNNTJoint(self, greedy_class): decoder = RNNTDecoder(prednet_cfg, vocab_size) joint_net = SampledRNNTJoint(jointnet_cfg, vocab_size, n_samples=2, vocabulary=token_list) - greedy = greedy_class(decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} + greedy = greedy_class( + 
decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5, **additional_decoding_kwargs + ) # (B, D, T) enc_out = torch.randn(1, encoder_output_size, 30) From dd75285b295c5d5e71ea27bf9ddd74dbbd99c87a Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 10 Apr 2024 11:19:20 -0700 Subject: [PATCH 09/39] Cancel old runs for PR commit update (#8874) --- .github/workflows/cicd-main.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 309e7936ee3b..550defff7814 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -15,9 +15,12 @@ name: "CICD NeMo" on: pull_request: - types: [opened, reopened, ready_for_review] branches: [ "main" ] +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: gpu-test: runs-on: self-hosted-azure From 7c07a8de3743c4c03fb6727567cf1c3de3e7d193 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Wed, 10 Apr 2024 16:36:02 -0400 Subject: [PATCH 10/39] Fix packed seq doc math rendering issue (#8832) * Fix packed seq doc math rendering issue Signed-off-by: Chen Cui * Fix packed seq doc math rendering issue Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui --- docs/source/nlp/nemo_megatron/packed_sequence.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/nlp/nemo_megatron/packed_sequence.rst b/docs/source/nlp/nemo_megatron/packed_sequence.rst index 23c8976d4f5e..e31444fe1e60 100644 --- a/docs/source/nlp/nemo_megatron/packed_sequence.rst +++ b/docs/source/nlp/nemo_megatron/packed_sequence.rst @@ -123,7 +123,7 @@ To train with packed sequences, you need to change four items in the SFT/PEFT co preprocessing step. You can increase the ``pack_size`` to achieve the same purpose of increasing micro batch size. - Global batch size has to be adjusted so that the training recipe is maintained. Because each pack contains multiple sequences now, global batch size needs to be reduced by the average number of sequences per pack ``n``, - where :math:`n = \frac{# sequences in dataset}{# packs}`. This ensures that each gradient iteration sees (on + where ``n = num_sequences_in_dataset / num_packs``. This ensures that each gradient iteration sees (on average) the same number of tokens. The value of ``n`` is printed out when the script is run. .. 
code-block:: bash From f7941cbd41697291cbee714c2182a18b70b85755 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Wed, 10 Apr 2024 16:06:37 -0500 Subject: [PATCH 11/39] Move logic for distopt FP32 grads to models (#8867) * Move logic for FP32 embedding grads to models Signed-off-by: Tim Moon * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Tim Moon Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- .../language_modeling/megatron_base_model.py | 27 ------- .../language_modeling/megatron_gpt_model.py | 72 ++++++++++--------- .../megatron_lm_encoder_decoder_model.py | 5 ++ 3 files changed, 44 insertions(+), 60 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 854c5ee02e31..980ea8f9f76d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -768,33 +768,6 @@ def get_config_arg(key: str, default_value: Optional[Any] = None) -> Any: optim_dtype = str_to_dtype(get_config_arg('dtype', torch.float32)) optim_kwargs['dtype'] = optim_dtype - # Make sure embedding grad reductions are in FP32 - if optim_dtype == torch.float32: - fp32_params = [] - modules = self.get_model_module_list() - if parallel_state.is_pipeline_first_stage(ignore_virtual=True): - if self.mcore_gpt: - fp32_params.append(modules[0].shared_embedding_or_output_weight()) - if modules[0].embedding.add_position_embedding: - fp32_params.append(modules[0].embedding.position_embeddings.weight) - else: - fp32_params.append(modules[0].word_embeddings_weight()) - fp32_params.append(modules[0].position_embeddings_weight()) - if parallel_state.is_pipeline_last_stage(ignore_virtual=True): - share_embeddings_and_output_weights = ( - modules[-1].share_embeddings_and_output_weights - if self.mcore_gpt - else modules[-1].share_token_embeddings - ) - if share_embeddings_and_output_weights: - if self.mcore_gpt: - fp32_params.append(modules[-1].shared_embedding_or_output_weight()) - else: - fp32_params.append(modules[-1].word_embeddings_weight()) - for param in fp32_params: - if param is not None: - param._with_fp32_optimizer = True - # Match param allgather with model dtype model_dtype = torch.float32 if self.megatron_amp_O2 and hasattr(self, 'autocast_dtype'): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 6648abac8ee0..8d1d428a9989 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -102,9 +102,6 @@ from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import drain_embedding_wgrad_compute, init_method_normal, scaled_init_method_normal - # TODO @tmoon: Use once available in Megatron-LM - # from megatron.core.pipeline_parallel.schedules import DataIteratorList - HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): @@ -494,36 +491,45 @@ def configure_optimizers(self): if self.with_distributed_adam: - # Disable overlapped grad sync for embedding grad when - # pipeline parallelism is enabled - if 
parallel_state.get_pipeline_model_parallel_world_size() > 1: - modules = self.get_model_module_list() - if parallel_state.is_pipeline_first_stage(ignore_virtual=True): - if len(modules) > 1: - module = modules[0] # only the first virtual rank has the embeddings - else: - module = modules[0] - if self.cfg.get('share_embeddings_and_output_weights', True): - param = ( - module.shared_embedding_or_output_weight() - if self.mcore_gpt - else module.word_embeddings_weight() - ) - param._disable_greedy_grad_copy = not self.megatron_amp_O2 - param._disable_overlap_grad_sync = True - if parallel_state.is_pipeline_last_stage(ignore_virtual=True): - if len(modules) > 1: - module = modules[-1] # only the last virtual rank has the embeddings - else: - module = modules[0] - if self.cfg.get('share_embeddings_and_output_weights', True): - param = ( - module.shared_embedding_or_output_weight() - if self.mcore_gpt - else module.word_embeddings_weight() - ) - param._disable_greedy_grad_copy = not self.megatron_amp_O2 - param._disable_overlap_grad_sync = True + # Special handling for embedding grads + modules = self.get_model_module_list() + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + module = modules[0] # first virtual rank has the embeddings + + # Word embeddings: use FP32 grads and disable + # overlapped grad sync with pipeline parallelism + word_embeddings = ( + module.shared_embedding_or_output_weight() if self.mcore_gpt else module.word_embeddings_weight() + ) + word_embeddings._with_fp32_optimizer = True + if parallel_state.get_pipeline_model_parallel_world_size() > 1 and self.cfg.get( + 'share_embeddings_and_output_weights', True + ): + word_embeddings._disable_greedy_grad_copy = not self.megatron_amp_O2 + word_embeddings._disable_overlap_grad_sync = True + + # Position embeddings: use FP32 grads + position_embeddings = None + if self.mcore_gpt: + if module.embedding.add_position_embedding: + position_embeddings = module.embedding.position_embeddings.weight + else: + position_embeddings = module.position_embeddings_weight() + if position_embeddings is not None: + position_embeddings._with_fp32_optimizer = True + + # Handle case where embeddings are used in output layer + if parallel_state.is_pipeline_last_stage(ignore_virtual=True) and self.cfg.get( + 'share_embeddings_and_output_weights', True + ): + module = modules[-1] # last virtual rank has the embeddings + word_embeddings = ( + module.shared_embedding_or_output_weight() if self.mcore_gpt else module.word_embeddings_weight() + ) + word_embeddings._with_fp32_optimizer = True + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + word_embeddings._disable_greedy_grad_copy = not self.megatron_amp_O2 + word_embeddings._disable_overlap_grad_sync = True # Disable overlapped grad sync for layer norm grads when # sequence parallelism is enabled diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 651034c91520..3a7ad3d6714c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -189,6 +189,11 @@ def configure_optimizers(self): param._disable_greedy_grad_copy = not self.megatron_amp_O2 param._disable_overlap_grad_sync = True + # Make sure embedding grads are reduced in FP32 + for name, param in self.named_parameters(): + if 'word_embedding' in name or 
'position_embedding' in name or 'output_layer' in name: + param._with_fp32_optimizer = True + return super().configure_optimizers() def _handle_bias_activation_fusion_args(self, cfg): From 6e2398a896313c8806766129832423b334c8d876 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Wed, 10 Apr 2024 14:35:47 -0700 Subject: [PATCH 12/39] Fix transcription utils function for duration check (#8862) * add none check Signed-off-by: Nithin Rao Koluguri * add for restore func Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- nemo/collections/asr/parts/utils/transcribe_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 980500e9ef00..8465406224e7 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -298,7 +298,7 @@ def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]: def read_and_maybe_sort_manifest(path: str, try_sort: bool = False) -> List[dict]: """Sorts the manifest if duration key is available for every utterance.""" items = manifest_utils.read_manifest(path) - if try_sort and all("duration" in item for item in items): + if try_sort and all("duration" in item and item["duration"] is not None for item in items): items = sorted(items, reverse=True, key=lambda item: item["duration"]) return items @@ -306,7 +306,7 @@ def read_and_maybe_sort_manifest(path: str, try_sort: bool = False) -> List[dict def restore_transcription_order(manifest_path: str, transcriptions: list) -> list: with open(manifest_path) as f: items = [(idx, json.loads(l)) for idx, l in enumerate(f)] - if not all("duration" in item[1] for item in items): + if not all("duration" in item[1] and item[1]["duration"] is not None for item in items): return transcriptions new2old = [item[0] for item in sorted(items, reverse=True, key=lambda it: it[1]["duration"])] del items # free up some memory From 2c6e65e7dd42751f74fdaa47e8c2bc060e8f29f1 Mon Sep 17 00:00:00 2001 From: Danial Mohseni Taheri <49656670+DanialTaheri@users.noreply.github.com> Date: Wed, 10 Apr 2024 16:29:15 -0700 Subject: [PATCH 13/39] Add clip conv layer (#8838) * Replace einops with ConvLayer Signed-off-by: Danial * Modify the layers Signed-off-by: Danial * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Danial * Fix version and arch Signed-off-by: Danial * Fix a bug in openclip conversion Signed-off-by: Danial --------- Signed-off-by: Danial Co-authored-by: Danial Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../clip/convert_external_clip_to_nemo.py | 17 +++++------------ .../vision/modules/vit/vit_backbone.py | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py b/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py index 4ac99a951f0d..631b3faa2f47 100644 --- a/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py +++ b/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py @@ -55,8 +55,8 @@ def get_args(): parser = ArgumentParser() - parser.add_argument("--arch", type=str, default="ViT-H-14") - parser.add_argument("--version", type=str, 
default="laion2b_s32b_b79k") + parser.add_argument("--arch", type=str, default="openai/clip-vit-base-patch32") + parser.add_argument("--version", type=str, default="huggingface") parser.add_argument( "--hparams_file", @@ -112,7 +112,6 @@ def mapping_openclip_state_dict(open_model): ".positional_embedding": ".position_embeddings", ".backbone.proj": ".head.weight", ".class_embedding": ".cls_token", - ".backbone.conv1.weight": ".backbone.linear_encoder.weight", } nemo_state_dict = {} @@ -139,9 +138,6 @@ def mapping_openclip_state_dict(open_model): nemo_state_dict["vision_encoder.backbone.cls_token"] = nemo_state_dict[ "vision_encoder.backbone.cls_token" ].reshape(1, 1, -1) - w = nemo_state_dict["vision_encoder.backbone.linear_encoder.weight"] - nemo_state_dict["vision_encoder.backbone.linear_encoder.weight"] = einops.rearrange(w, "b c p1 p2 -> b (p1 p2 c)",) - nemo_state_dict["vision_encoder.backbone.linear_encoder.bias"] = torch.zeros(w.shape[0]) return nemo_state_dict @@ -168,10 +164,10 @@ def mapping_hf_state_dict(hf_model): ".pre_layrnorm.bias": ".preprocess_layernorm.bias", ".post_layernorm.weight": ".transformer.final_layernorm.weight", ".post_layernorm.bias": ".transformer.final_layernorm.bias", - ".backbone.embeddings.position_embedding.weight": ".backbone.position_embeddings", - ".language_model.embeddings.position_embedding.weight": ".language_model.embedding.position_embeddings", + ".backbone.embeddings.position_embedding.weight": ".backbone.position_embeddings.weight", + ".language_model.embeddings.position_embedding.weight": ".language_model.embedding.position_embeddings.weight", ".embeddings.class_embedding": ".cls_token", - ".backbone.embeddings.patch_embedding.weight": ".backbone.linear_encoder.weight", + ".backbone.embeddings.patch_embedding.weight": ".backbone.conv1.weight", ".final_layer_norm.weight": ".encoder.final_layernorm.weight", ".final_layer_norm.bias": ".encoder.final_layernorm.bias", ".embeddings.token_embedding.weight": ".embedding.word_embeddings.weight", @@ -208,9 +204,6 @@ def mapping_hf_state_dict(hf_model): nemo_state_dict["vision_encoder.backbone.cls_token"] = nemo_state_dict[ "vision_encoder.backbone.cls_token" ].reshape(1, 1, -1) - w = nemo_state_dict["vision_encoder.backbone.linear_encoder.weight"] - nemo_state_dict["vision_encoder.backbone.linear_encoder.weight"] = einops.rearrange(w, "b c p1 p2 -> b (p1 p2 c)",) - nemo_state_dict["vision_encoder.backbone.linear_encoder.bias"] = torch.zeros(w.shape[0]) return nemo_state_dict diff --git a/nemo/collections/vision/modules/vit/vit_backbone.py b/nemo/collections/vision/modules/vit/vit_backbone.py index ebd7e0da3e5c..67989f0f5496 100644 --- a/nemo/collections/vision/modules/vit/vit_backbone.py +++ b/nemo/collections/vision/modules/vit/vit_backbone.py @@ -227,8 +227,14 @@ def __init__( torch.nn.init.zeros_(self.cls_token) self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() - # Linear encoder - self.linear_encoder = torch.nn.Linear(self.flatten_dim, self.hidden_size) + # Convolution layer + self.conv1 = torch.nn.Conv2d( + in_channels=model_cfg.num_channels, # Number of input channels + out_channels=self.hidden_size, # Number of output channels + kernel_size=(self.patch_dim, self.patch_dim), # Kernel size (height, width) + stride=(self.patch_dim, self.patch_dim), # Stride (height, width) + bias=False, + ) # Disable bias # embedding self.position_embedding_type = model_cfg.get("position_embedding_type", "learned_absolute") @@ -332,12 +338,9 @@ def interpolate_pos_encoding( def forward(self, 
input): if self.pre_process: - rearranged_input = einops.rearrange( - input, "b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1=self.patch_dim, p2=self.patch_dim, - ) - - # [b num_patch patch_dim*patch_dim*c] -> [b, s, h]; s:=num_patch, h:=hidden - encoder_output = self.linear_encoder(rearranged_input) + rearranged_input = self.conv1(input) + rearranged_input = rearranged_input.reshape(rearranged_input.shape[0], rearranged_input.shape[1], -1) + encoder_output = rearranged_input.permute(0, 2, 1) concatenated_tokens = encoder_output if self.class_token: From 1809b61efa95e0440ca7e35c62148c8b4fcc2e9d Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 10 Apr 2024 16:47:53 -0700 Subject: [PATCH 14/39] [Nemo CICD] Update dependencies for container build (#8878) * Cancel old runs for PR commit update * update dependencies for container build * temp for test * update back * Revert "temp for test" This reverts commit 9f9221155412393d05b2c862880f9128a93b26a4. --- .github/workflows/cicd-main.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 550defff7814..5cc990902953 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -73,7 +73,7 @@ jobs: - name: Container setup run: | # Pull base PyTorch container - docker pull nvcr.io/nvidia/pytorch:24.01-py3 + docker pull nvcr.io/nvidia/pytorch:24.02-py3 docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.01-py3 /bin/bash -c ' set -x @@ -93,21 +93,19 @@ jobs: # NeMo Installation ./reinstall.sh release - # Transformer Engine 1.2.0 # Transformer Engine installation git clone https://github.com/NVIDIA/TransformerEngine.git && \ pushd TransformerEngine && \ - git fetch origin 9b2fed514ea419141146f843ab2c84b22b86bfd7 && \ + git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \ git checkout FETCH_HEAD && \ git submodule init && git submodule update && \ NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . && \ popd - # Apex bugfix for PyTorch 23.11 container: https://github.com/NVIDIA/apex/pull/1760 # Apex installation git clone https://github.com/NVIDIA/apex.git && \ pushd apex && \ - git checkout b496d85fb88a801d8e680872a12822de310951fd && \ + git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \ cp -R apex /usr/local/lib/python3.10/dist-packages && \ popd @@ -116,12 +114,13 @@ jobs: # Megatron Core installation git clone https://github.com/NVIDIA/Megatron-LM.git && \ pushd Megatron-LM && \ - git checkout 43792028f003ed25a3ee8c5a0d4cad82317d81b5 && \ + git checkout 7fe863f3d94f7b64a927b04b85f5c9339d3fb784 && \ pip install . 
&& \ pushd megatron/core/datasets && \ make && \ popd && \ popd + export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" # Install only for test: L2: Segmentation Tool pushd tools/ctc_segmentation && \ From 2890b3338f18c972246b26487d0d4a18795248fd Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Wed, 10 Apr 2024 20:17:11 -0700 Subject: [PATCH 15/39] Akoumparouli/fix sd train (#8876) * hardcode autocast Signed-off-by: Alexandros Koumparoulis * uncomment sd_train Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis --- Jenkinsfile | 86 +++++++++---------- .../stable_diffusion/sd_train.py | 3 +- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 431bc24907ed..6471fa3d011f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -208,48 +208,48 @@ pipeline { sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" } } - //stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - // sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ - // trainer.precision=16 \ - // trainer.num_nodes=1 \ - // trainer.devices=1 \ - // ++exp_manager.max_time_per_run=00:00:03:00 \ - // exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \ - // trainer.max_steps=20 \ - // model.micro_batch_size=1 \ - // model.global_batch_size=1 \ - // model.data.synthetic_data=True \ - // model.first_stage_key=images_moments \ - // model.cond_stage_key=clip_encoded \ - // model.optim.name=megatron_fused_adam \ - // +model.optim.capturable=True \ - // exp_manager.ema.enable=False \ - // model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ - // ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ - // ++model.cond_stage_config.max_length=77 \ - // model.inductor=False \ - // ~model.cond_stage_config.restore_from_path \ - // ~model.cond_stage_config.freeze \ - // ~model.cond_stage_config.layer \ - // model.first_stage_config.from_pretrained=null \ - // model.ddp_overlap=False \ - // model.capture_cudagraph_iters=15 \ - // model.unet_config.use_flash_attention=False \ - // model.unet_config.attention_resolutions=[1] \ - // model.unet_config.channel_mult=[1] \ - // " - // sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - // } - //} + stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" + sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ + trainer.precision=16 \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + ++exp_manager.max_time_per_run=00:00:03:00 \ + exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \ + trainer.max_steps=20 \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + model.data.synthetic_data=True \ + model.first_stage_key=images_moments \ + model.cond_stage_key=clip_encoded \ + model.optim.name=megatron_fused_adam \ + +model.optim.capturable=True \ + exp_manager.ema.enable=False \ + 
model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ + ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ + ++model.cond_stage_config.max_length=77 \ + model.inductor=False \ + ~model.cond_stage_config.restore_from_path \ + ~model.cond_stage_config.freeze \ + ~model.cond_stage_config.layer \ + model.first_stage_config.from_pretrained=null \ + model.ddp_overlap=False \ + model.capture_cudagraph_iters=15 \ + model.unet_config.use_flash_attention=False \ + model.unet_config.attention_resolutions=[1] \ + model.unet_config.channel_mult=[1] \ + " + sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" + } + } // stage('L2: Multimodal ControlNet Train') { // when { // anyOf { @@ -5849,4 +5849,4 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' cleanWs() } } -} \ No newline at end of file +} diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_train.py b/examples/multimodal/text_to_image/stable_diffusion/sd_train.py index 968d9bec2884..b10eda550e9a 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_train.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_train.py @@ -83,7 +83,8 @@ def main(cfg) -> None: else: autocast_enabled = True dgrad_dtype = torch.float16 - + # akoumparouli: temp fix. + autocast_enabled = True model = model.cuda() for _ in range(5): with torch.autocast(device_type="cuda", enabled=autocast_enabled, dtype=torch.float16): From 57444ae9b63afc5419a16948f7d032a1a9f7dc7f Mon Sep 17 00:00:00 2001 From: fedorovgv <50668534+fedorovgv@users.noreply.github.com> Date: Thu, 11 Apr 2024 17:10:47 +0300 Subject: [PATCH 16/39] Add Semi Sorted Batching. (#8584) --- docs/source/asr/datasets.rst | 119 +++++---- nemo/collections/asr/data/audio_to_text.py | 7 + nemo/collections/asr/models/ctc_bpe_models.py | 17 ++ nemo/collections/asr/models/ctc_models.py | 17 ++ .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 17 ++ .../collections/asr/models/rnnt_bpe_models.py | 17 ++ nemo/collections/asr/models/rnnt_models.py | 17 ++ .../asr/parts/utils/asr_batching.py | 237 ++++++++++++++++++ nemo/core/optim/lr_scheduler.py | 8 + tests/collections/asr/test_asr_samplers.py | 157 ++++++++++++ 10 files changed, 566 insertions(+), 47 deletions(-) create mode 100644 nemo/collections/asr/parts/utils/asr_batching.py create mode 100644 tests/collections/asr/test_asr_samplers.py diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst index 7b6873de0ed7..7612c6a3f630 100644 --- a/docs/source/asr/datasets.rst +++ b/docs/source/asr/datasets.rst @@ -250,62 +250,49 @@ To enable sharded manifest filename expansion, set the ``shard_manifests`` field ``defer_setup`` flag needs to be true as well, so that the dataloader will be initialized after the DDP and its length can be collected from the distributed workers. +Batching strategies +--------------------- -Conversion to Tarred Datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For training ASR models, audios with different lengths may be grouped into a batch. It would make it necessary to use paddings to make all the same length. +These extra paddings is a significant source of computation waste. -You can easily convert your existing NeMo-compatible ASR datasets using the -`conversion script here `_. +Semi Sorted Batching +--------------------- -.. code:: bash +Sorting samples by duration and spliting them into batches speeds up training, but can degrade the quality of the model. 
To avoid quality degradation and maintain some randomness in the partitioning process, we add pseudo noise to the sample length when sorting.
-    python convert_to_tarred_audio_dataset.py \
-        --manifest_path= \
-        --target_dir= \
-        --num_shards=
-        --max_duration= \
-        --min_duration= \
-        --force_codec=flac \
-        --shuffle --shuffle_seed=0
+    .. image:: images/ssb.png
+        :align: center
+        :alt: semi sorted batching
+        :scale: 50%
-.. note:: For extra reduction of storage space at the cost of lossy (but high-quality) compression, you may use ``--force_codec=opus`` instead.
+It may result in a training speedup of more than 40 percent with the same quality. To enable semi sorted batching, add the following lines to the config.
-This script shuffles the entries in the given manifest (if ``--shuffle`` is set, which we recommend), filter
-audio files according to ``min_duration`` and ``max_duration``, and tar the remaining audio files to the directory
-``--target_dir`` in ``n`` shards, along with separate manifest and metadata files.
+    .. code::
-The files in the target directory should look similar to the following:
+        ++model.train_ds.use_semi_sorted_batching=true
+        ++model.train_ds.randomization_factor=0.1
-.. code:: none
+Semi sorted batching is supported by the following models:
-    target_dir/
-    ├── audio_1.tar
-    ├── audio_2.tar
-    ├── ...
-    ├── metadata.yaml
-    ├── tarred_audio_manifest.json
-    ├── sharded_manifests/
-      ├── manifest_1.json
-      ├── ...
-      └── manifest_N.json
+    .. code::
+        nemo.collections.asr.models.EncDecCTCModel
+        nemo.collections.asr.models.EncDecCTCModelBPE
+        nemo.collections.asr.models.EncDecRNNTModel
+        nemo.collections.asr.models.EncDecRNNTBPEModel
+        nemo.collections.asr.models.EncDecHybridRNNTCTCModel
+        nemo.collections.asr.models.EncDecHybridRNNTCTCBPEModel
-Note that file structures are flattened such that all audio files are at the top level in each tarball. This ensures that
-filenames are unique in the tarred dataset and the filepaths do not contain "-sub" and forward slashes in each ``audio_filepath`` are
-simply converted to underscores. For example, a manifest entry for ``/data/directory1/file.wav`` would be ``_data_directory1_file.wav``
-in the tarred dataset manifest, and ``/data/directory2/file.wav`` would be converted to ``_data_directory2_file.wav``.
-
-Sharded manifests are generated by default; this behavior can be toggled via the ``no_shard_manifests`` flag.
+For more details about this algorithm, see the `paper `_ .
 Bucketing Datasets
-------------------
+---------------------
-For training ASR models, audios with different lengths may be grouped into a batch. It would make it necessary to use paddings to make all the same length.
-These extra paddings is a significant source of computation waste. Splitting the training samples into buckets with different lengths and sampling from the same bucket for each batch would increase the computation efficicncy.
+Splitting the training samples into buckets with different lengths and sampling from the same bucket for each batch would increase the computation efficiency.
 It may result into training speeedup of more than 2X.
 To enable and use the bucketing feature, you need to create the bucketing version of the dataset by using `conversion script here `_.
 You may use --buckets_num to specify the number of buckets (Recommend to use 4 to 8 buckets).
 It creates multiple tarred datasets, one per bucket, based on the audio durations.
 The range of [min_duration, max_duration) is split into equal sized buckets.
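The snippet below is a small, self-contained illustration of the semi sorted batching idea described above (add pseudo noise to each duration, sort, then cut the sorted order into batches). It is a sketch for intuition only, not the actual sampler added in ``nemo/collections/asr/parts/utils/asr_batching.py``; the ``semi_sorted_batches`` helper, its arguments, and the example durations are made up for this illustration.

.. code-block:: python

    import random

    def semi_sorted_batches(durations, batch_size, randomization_factor=0.1, seed=0):
        # Perturb each duration by a relative amount controlled by randomization_factor:
        # 0.0 gives fully sorted batches, larger values give more randomness.
        rng = random.Random(seed)
        noisy = [
            (dur * (1.0 + rng.uniform(-randomization_factor, randomization_factor)), idx)
            for idx, dur in enumerate(durations)
        ]
        # Sort by the noisy durations and split the resulting order into batches,
        # so utterances of similar length tend to share a batch (less padding).
        order = [idx for _, idx in sorted(noisy)]
        return [order[i : i + batch_size] for i in range(0, len(order), batch_size)]

    durations = [2.1, 14.0, 2.3, 13.5, 7.0, 6.8, 2.2, 13.9]
    print(semi_sorted_batches(durations, batch_size=2))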
- To enable the bucketing feature in the dataset section of the config files, you need to pass the multiple tarred datasets as a list of lists. If user passes just a list of strings, then the datasets would simply get concatenated which would be different from bucketing. Here is an example for 4 buckets and 512 shards: @@ -352,6 +339,50 @@ The fully_randomized strategy would have lower speedup than synced_randomized bu Bucketing may improve the training speed more than 2x but may affect the final accuracy of the model slightly. Training for more epochs and using 'synced_randomized' strategy help to fill this gap. Currently bucketing feature is just supported for tarred datasets. + +Conversion to Tarred Datasets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can easily convert your existing NeMo-compatible ASR datasets using the +`conversion script here `_. + +.. code:: bash + + python convert_to_tarred_audio_dataset.py \ + --manifest_path= \ + --target_dir= \ + --num_shards= + --max_duration= \ + --min_duration= \ + --shuffle --shuffle_seed=0 + +This script shuffles the entries in the given manifest (if ``--shuffle`` is set, which we recommend), filter +audio files according to ``min_duration`` and ``max_duration``, and tar the remaining audio files to the directory +``--target_dir`` in ``n`` shards, along with separate manifest and metadata files. + +The files in the target directory should look similar to the following: + +.. code:: + + target_dir/ + ├── audio_1.tar + ├── audio_2.tar + ├── ... + ├── metadata.yaml + ├── tarred_audio_manifest.json + ├── sharded_manifests/ + ├── manifest_1.json + ├── ... + └── manifest_N.json + + +Note that file structures are flattened such that all audio files are at the top level in each tarball. This ensures that +filenames are unique in the tarred dataset and the filepaths do not contain "-sub" and forward slashes in each ``audio_filepath`` are +simply converted to underscores. For example, a manifest entry for ``/data/directory1/file.wav`` would be ``_data_directory1_file.wav`` +in the tarred dataset manifest, and ``/data/directory2/file.wav`` would be converted to ``_data_directory2_file.wav``. + +Sharded manifests are generated by default; this behavior can be toggled via the ``no_shard_manifests`` flag. + Upsampling Datasets ------------------- @@ -437,7 +468,7 @@ For tarred datasets, shards from the AIS cluster are used by piping ``ais get`` Tarred Dataset from AIS ^^^^^^^^^^^^^^^^^^^^^^^ -A tarred dataset can be easily used as described in the `Tarred Datasets`_ section by providing paths to manifests on an AIS cluster. +A tarred dataset can be easily used as described in the :ref:`Tarred Datasets` section by providing paths to manifests on an AIS cluster. For example, a tarred dataset from an AIS cluster can be configured as .. code:: @@ -445,7 +476,7 @@ For example, a tarred dataset from an AIS cluster can be configured as manifest_filepath='ais://bucket/tarred_audio_manifest.json' tarred_audio_filepaths='ais://bucket/shard_{1..64}.tar' -`Bucketing Datasets`_ are configured in a similar way by providing paths on an AIS cluster. +:ref:`Bucketing Datasets` are configured in a similar way by providing paths on an AIS cluster. Non-tarred Dataset from AIS ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -640,20 +671,14 @@ We recommend to pre-compute the bucket duration bins in order to accelerate the The following script may be used: .. 
code-block:: bash - $ python scripts/speech_recognition/estimate_duration_bins.py -b 30 manifest.json - Use the following options in your config: num_buckets=30 bucket_duration_bins=[1.78,2.34,2.69,... - For multi-dataset setups, one may provide multiple manifests and even their weights: - .. code-block:: bash - $ python scripts/speech_recognition/estimate_duration_bins.py -b 30 [[manifest.json,0.7],[other.json,0.3]] - Use the following options in your config: num_buckets=30 bucket_duration_bins=[1.91,3.02,3.56,... diff --git a/nemo/collections/asr/data/audio_to_text.py b/nemo/collections/asr/data/audio_to_text.py index a689450c94ba..00c15109b64f 100644 --- a/nemo/collections/asr/data/audio_to_text.py +++ b/nemo/collections/asr/data/audio_to_text.py @@ -16,6 +16,7 @@ import math import multiprocessing import os +from collections.abc import Iterable as IterableABC from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union import braceexpand @@ -472,6 +473,12 @@ def get_manifest_sample(self, sample_id): return self.manifest_processor.collection[sample_id] def __getitem__(self, index): + if isinstance(index, IterableABC): + return [self._process_sample(_index) for _index in index] + else: + return self._process_sample(index) + + def _process_sample(self, index): sample = self.manifest_processor.collection[index] offset = sample.offset diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index 9f3b6b4cf83b..f861a971f5ea 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -20,6 +20,7 @@ from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.losses.ctc import CTCLoss @@ -27,6 +28,7 @@ from nemo.collections.asr.models.ctc_models import EncDecCTCModel from nemo.collections.asr.parts.mixins import ASRBPEMixin from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging, model_utils @@ -129,9 +131,24 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + return torch.utils.data.DataLoader( dataset=dataset, batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, collate_fn=collate_fn, drop_last=config.get('drop_last', False), shuffle=shuffle, diff --git a/nemo/collections/asr/models/ctc_models.py 
b/nemo/collections/asr/models/ctc_models.py index 5f380619db68..4df02b1177cd 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -25,6 +25,7 @@ from tqdm.auto import tqdm from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToCharDALIDataset, DALIOutputs from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.losses.ctc import CTCLoss @@ -33,6 +34,7 @@ from nemo.collections.asr.parts.mixins import ASRModuleMixin, ASRTranscriptionMixin, InterCTCMixin, TranscribeConfig from nemo.collections.asr.parts.mixins.transcription import GenericTranscriptionType, TranscriptionReturnType from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.parts.preprocessing.parsers import make_parser @@ -319,9 +321,24 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + return torch.utils.data.DataLoader( dataset=dataset, batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, collate_fn=collate_fn, drop_last=config.get('drop_last', False), shuffle=shuffle, diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index 182acf3904db..39375f08e139 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -21,6 +21,7 @@ from pytorch_lightning import Trainer from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.losses.ctc import CTCLoss @@ -30,6 +31,7 @@ from nemo.collections.asr.parts.mixins import ASRBPEMixin from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTBPEDecoding, RNNTBPEDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging, model_utils @@ -169,9 +171,24 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists 
collate_fn = dataset.datasets[0].datasets[0].collate_fn + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + return torch.utils.data.DataLoader( dataset=dataset, batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, collate_fn=collate_fn, drop_last=config.get('drop_last', False), shuffle=shuffle, diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 6fba163c65e1..bb4e7f718a8e 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -21,6 +21,7 @@ from pytorch_lightning import Trainer from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.losses.rnnt import RNNTLoss @@ -28,6 +29,7 @@ from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel from nemo.collections.asr.parts.mixins import ASRBPEMixin from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTBPEDecoding, RNNTBPEDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging, model_utils @@ -527,9 +529,24 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + return torch.utils.data.DataLoader( dataset=dataset, batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, collate_fn=collate_fn, drop_last=config.get('drop_last', False), shuffle=shuffle, diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index 047e25b8dd5d..386f2a915142 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -25,6 +25,7 @@ from tqdm.auto import tqdm from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToCharDALIDataset, DALIOutputs from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.losses.rnnt import RNNTLoss, 
resolve_rnnt_default_loss_name @@ -38,6 +39,7 @@ TranscriptionReturnType, ) from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecoding, RNNTDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.parts.preprocessing.parsers import make_parser @@ -467,9 +469,24 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + return torch.utils.data.DataLoader( dataset=dataset, batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, collate_fn=collate_fn, drop_last=config.get('drop_last', False), shuffle=shuffle, diff --git a/nemo/collections/asr/parts/utils/asr_batching.py b/nemo/collections/asr/parts/utils/asr_batching.py new file mode 100644 index 000000000000..dcbebdc0f949 --- /dev/null +++ b/nemo/collections/asr/parts/utils/asr_batching.py @@ -0,0 +1,237 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Iterator, List, Optional, Union + +import numpy as np +import torch +from torch.utils.data.distributed import DistributedSampler + +from nemo.collections.asr.data.audio_to_text import AudioToBPEDataset, AudioToCharDataset +from nemo.collections.asr.models.asr_model import ASRModel +from nemo.utils import logging + + +class SemiSortBatchSampler(DistributedSampler): + def __init__( + self, + global_rank: int, + world_size: int, + durations: List[int], + batch_size: int, + batch_shuffle: bool = True, + drop_last: bool = False, + randomization_factor: Optional[float] = None, + seed: int = 42, + ) -> None: + """ + Semi Sorted Batching, as proposed in _SSB ("Speed up training with variable + length inputs by efficient batching strategies.", Zhenhao Ge et al. (2021).). + + The Semi Sorted Batch Sampler (SSB) samples the indices by their duration + with the addition of pseudo noise that is sampled from the uniform + distribution \mathbb{U}\left[ -delta * r, delta * r \right], where delta is + defined as the difference between the maximum and minimum duration and r is + the randomization factor that controls the strength of the noise (when r = 0, + there will be a strong sorting). 
The heuristic value of the r according to + the experiments from paper is 0.2. + + The torch calls the set_epoch method from the distributed data loader sampler + at the end of each epoch to shuffle the samples according to the seed and + epoch number. So the SSB is passed to the dataloader as a sampler with the + dataloader's batch size options and the batch_sampler option set to None to + disable automatical batching. In this case, the sampler has become an iterator + that returns a list of batch indices. + + Args: + global_rank: Rank among all GPUs. + world_size: The number of GPUs used. + durations: Sample durations parsed from `dataset.manifest_processor`. + batch_size: Micro batch size or batch size per singe gpu. + batch_shuffle: Batch sort before each epoch. + drop_last: Drop the last batch if the number of samples is less than batch + size. Defaults to False. + randomization_factor: The strength of noise that will be added to the sample + duration. If no value is passed, the value 0.2 will be used. + seed: Seed for batch shuffleling. Defaults to 42. + + Raises: + ValueError: Wrong randomization factor value. + RuntimeError: Unexpected behavior. + + .. SSB_: + https://www.isca-speech.org/archive/pdfs/interspeech_2021/ge21_interspeech.pdf + """ + if randomization_factor is None: + randomization_factor = 0.1 + logging.info("Randomization factor not found in config, default value 0.1 will be set.") + else: + logging.info(f"A randomization factor {randomization_factor} will be used.") + + if randomization_factor < 0.0: + raise ValueError(f'Randomization factor must be non-negative but found {randomization_factor}.') + + self.rank: List = global_rank + self.num_replicas: int = world_size + + self.durations: np.array = np.array(durations, dtype=np.float32) + + self.shuffle: bool = batch_shuffle + self.micro_batch_size: int = batch_size + self.drop_last: bool = drop_last + self.epoch: int = 0 + self.seed: int = seed + self.randomization_factor: float = randomization_factor + + self.local_num_batches: int = self._calculate_local_num_batches() + + logging.info(f"Semi Sorted Batch Sampler will be used") + + def _calculate_local_num_batches(self) -> int: + init_num_samples = len(self.durations) + + # delete batches with a non-integer number of samples + if self.drop_last: + init_num_samples -= init_num_samples % self.micro_batch_size + + # calculate the number of batches according to the counted number of samples + global_num_batches = math.ceil(init_num_samples / self.micro_batch_size) + + # add extra batches to make it divisible by world size (num replicas) + num_batches_pad = (self.num_replicas - global_num_batches % self.num_replicas) % self.num_replicas + global_num_batches += num_batches_pad + + # calculate the number of batches per rank + local_num_batches = global_num_batches // self.num_replicas + + return local_num_batches + + def _make_batches(self) -> List[np.array]: + max_duration: float = np.max(self.durations) + min_duration: float = np.min(self.durations) + bound: float = (max_duration - min_duration) * self.randomization_factor / 2 + + # generate pseudo noise + noise: np.array = np.random.uniform(low=-bound, high=bound, size=len(self.durations)) + + # sort indices accroding to pseudo noise + sorted_indices: np.array = np.argsort(self.durations + noise) + + # delete batches with a non-integer number of samples + tail = 0 + if self.drop_last: + tail: int = len(sorted_indices) % self.micro_batch_size + exclude = np.random.choice(len(sorted_indices), tail, replace=False) + 
sorted_indices = np.delete(sorted_indices, exclude) + logging.warning(f"Drop last is set to True, so {len(exclude)} samples will be dropped.") + + global_num_batches: int = math.ceil(len(sorted_indices) / self.micro_batch_size) + + # if the global_num_batches is zero than return empty list + if global_num_batches == 0: + logging.warning( + f"The number of all batches is {global_num_batches}, than dataloader will " + "be empty. To avoid this try to decrease batch size or world size or set " + "drop_last to False." + ) + return [] + + # add extra batches to make it divisible by world size (num replicas) + pad_batches_num: int = (self.num_replicas - global_num_batches % self.num_replicas) % self.num_replicas + if global_num_batches < self.num_replicas: + logging.warning( + f"The number of all batches is {global_num_batches}, which is less than the " + f"world size of {self.num_replicas}. SSB Sampler will add {pad_batches_num} " + "batches. To avoid this try to decrease batch size or world size." + ) + + if pad_batches_num != 0: + # randomly select batch indeces to pad and concatenate them + batch_indeces_pad: np.array = np.random.randint( + low=0, high=len(sorted_indices), size=pad_batches_num * self.micro_batch_size, + ) + sorted_indices: np.array = np.concatenate( + (sorted_indices, sorted_indices[batch_indeces_pad]), axis=0, + ) + + # local indeces are selected by world size and local rank + local_indices: np.array = sorted_indices[self.rank :: self.num_replicas] + + # split local batches + size_mask = range(self.micro_batch_size, len(local_indices), self.micro_batch_size) + local_batches = np.split(local_indices, size_mask, axis=0) + + if len(local_batches) != self.local_num_batches: + raise RuntimeError( + f'Number of calculated indices {len(local_batches)} is not equal to calculated ' + f'number of local batches {self.local_num_batches}.' + ) + + return local_batches + + def __iter__(self) -> Iterator[List[int]]: + local_batches = self._make_batches() + + if self.shuffle: + g = torch.Generator() + g.manual_seed(self.seed + self.epoch + 1) + indices = torch.randperm(self.local_num_batches, generator=g) + else: + indices = torch.arange(0, self.local_num_batches) + + for _, index in enumerate(indices): + yield local_batches[index] + + def __len__(self) -> int: + return self.local_num_batches + + +def get_semi_sorted_batch_sampler( + model: ASRModel, dataset: Union[AudioToCharDataset, AudioToBPEDataset], config: dict +) -> SemiSortBatchSampler: + """ + Instantiates a Semi Sorted (Batch) Sampler. + + Args: + model: ASR Model. + dataset: Dataset which allow iterate over all object and parse durations. + config: Train, Vaidation or Test dataset config. + + Raises: + ValueError: Wrong dataset type. + + Returns: + SemiSortBatchSampler: Semi Sorted Batch Sampler class. + """ + if not (isinstance(dataset, AudioToCharDataset) or isinstance(dataset, AudioToBPEDataset)): + raise ValueError( + "Only AudioToCharDataset or AudioToBPEDataset supported with semi sorted batching, " + f"but found {type(dataset)}." 
+ ) + + durations = [sample.duration for sample in dataset.manifest_processor.collection.data] + + sampler = SemiSortBatchSampler( + global_rank=model.global_rank, + world_size=model.world_size, + durations=durations, + batch_size=config['batch_size'], + batch_shuffle=config.get('shuffle', True), + drop_last=config.get('drop_last', False), + randomization_factor=config.get('randomization_factor', None), + seed=config.get('semi_sort_sampler_seed', 42), + ) + + return sampler diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index 38ad372f3e51..473ca0f5c416 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -877,6 +877,14 @@ def prepare_lr_scheduler( batch_size = train_dataloader.batch_sampler.micro_batch_size else: raise ValueError(f'Could not find batch_size from batch_sampler: {train_dataloader.batch_sampler}') + elif hasattr(train_dataloader, 'sampler') and train_dataloader.sampler is not None: + if ( + hasattr(train_dataloader.sampler, 'micro_batch_size') + and train_dataloader.sampler.micro_batch_size is not None + ): + batch_size = train_dataloader.sampler.micro_batch_size + else: + raise ValueError(f'Could not find batch_size from sampler: {train_dataloader.sampler}') else: raise ValueError(f'Could not find batch_size from train_dataloader: {train_dataloader}') drop_last = train_dataloader.drop_last diff --git a/tests/collections/asr/test_asr_samplers.py b/tests/collections/asr/test_asr_samplers.py new file mode 100644 index 000000000000..0b4d11fe2946 --- /dev/null +++ b/tests/collections/asr/test_asr_samplers.py @@ -0,0 +1,157 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
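To make the integration above concrete, the following sketch shows how a ``SemiSortBatchSampler`` might be handed to a PyTorch ``DataLoader``, mirroring the ``_setup_dataloader_from_config`` changes: the sampler yields a full list of indices per batch, so automatic batching is disabled by passing ``batch_size=None``. Here ``my_dataset`` and ``my_collate_fn`` are placeholders for an ``AudioToCharDataset``/``AudioToBPEDataset`` instance and its collate function; the rank and world size are set for a single-GPU run.

.. code-block:: python

    import torch

    from nemo.collections.asr.parts.utils.asr_batching import SemiSortBatchSampler

    # Placeholders: my_dataset is an AudioToCharDataset or AudioToBPEDataset,
    # my_collate_fn is its collate function.
    durations = [sample.duration for sample in my_dataset.manifest_processor.collection.data]

    sampler = SemiSortBatchSampler(
        global_rank=0,            # single-GPU example
        world_size=1,
        durations=durations,
        batch_size=16,
        batch_shuffle=True,
        drop_last=False,
        randomization_factor=0.1,
        seed=42,
    )

    # batch_size=None disables automatic batching; each item yielded by the
    # sampler is already a complete list of sample indices for one batch.
    loader = torch.utils.data.DataLoader(
        dataset=my_dataset,
        batch_size=None,
        sampler=sampler,
        batch_sampler=None,
        collate_fn=my_collate_fn,
    )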
+import os +import tempfile + +import numpy as np +import pytest +import soundfile as sf +import torch + +from nemo.collections.asr.data import audio_to_text +from nemo.collections.asr.parts.utils.asr_batching import SemiSortBatchSampler +from nemo.collections.asr.parts.utils.manifest_utils import write_manifest + + +class TestASRSamplers: + labels = [ + " ", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "'", + ] + + @pytest.mark.unit + def test_ssb_sampler(self): + # Generate random signals + data_min_duration = 0.1 + data_max_duration = 16.7 + + random_seed = 42 + sample_rate = 16000 + + _rng = np.random.default_rng(seed=random_seed) + + def generate_samples(num_examples: int) -> list: + data_duration = np.round(_rng.uniform(low=data_min_duration, high=data_max_duration, size=num_examples), 3) + data_duration_samples = np.floor(data_duration * sample_rate).astype(int) + samples = [] + for data_duration_sample in data_duration_samples: + samples.append(_rng.uniform(low=-0.5, high=0.5, size=(data_duration_sample))) + return samples + + with tempfile.TemporaryDirectory() as test_dir: + # Build metadata for manifest + metadata = [] + + # Test size of dataloader with and without ssb + for num_samples in np.concatenate([np.array([1, 2]), _rng.integers(3, 10, 2), _rng.integers(10, 1000, 2)]): + samples = generate_samples(num_samples) + + for n, sample in enumerate(samples): + meta = dict() + signal_filename = f'{n:04d}.wav' + # write audio files + sf.write(os.path.join(test_dir, signal_filename), sample, sample_rate) + # update metadata + meta['audio_filepath'] = os.path.join(test_dir, signal_filename) + meta['duration'] = len(sample) / sample_rate + meta['text'] = 'non empty' + metadata.append(meta) + + # Save manifest + manifest_filepath = os.path.join(test_dir, 'manifest.json') + write_manifest(manifest_filepath, metadata) + + # Make dataset + dataset = audio_to_text.AudioToCharDataset( + manifest_filepath=manifest_filepath, + labels=self.labels, + sample_rate=sample_rate, + max_duration=data_max_duration, + min_duration=data_min_duration, + ) + durations = [sample.duration for sample in dataset.manifest_processor.collection.data] + + # Compare two dataloader + for batch_size in _rng.integers(1, n + 20, 5): + batch_size = int(batch_size) + drop_last = True if _rng.integers(0, 2) else False + sampler = SemiSortBatchSampler( + global_rank=0, + world_size=1, + durations=durations, + batch_size=batch_size, + batch_shuffle=True, + drop_last=drop_last, + randomization_factor=0.1, + seed=random_seed, + ) + dataloader_with_ssb = torch.utils.data.DataLoader( + dataset=dataset, + batch_size=None, + sampler=sampler, + batch_sampler=None, + collate_fn=lambda x: audio_to_text._speech_collate_fn(x, pad_id=0), + ) + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + batch_size=batch_size, + collate_fn=lambda x: audio_to_text._speech_collate_fn(x, pad_id=0), + drop_last=drop_last, + shuffle=True, + ) + + assert abs(len(dataloader) - len(dataloader_with_ssb)) == 0, ( + "Different num of batches with batch! Num of batches with ssb is " + f"{len(dataloader_with_ssb)} and without ssb is {len(dataloader)}!" 
+ ) + + dataloader_with_ssb_exception, dataloader_exception = False, False + + try: + list(dataloader_with_ssb) + except: + dataloader_with_ssb_exception = True + + try: + list(dataloader) + except: + dataloader_exception = True + + assert dataloader_with_ssb_exception == dataloader_exception From 9c80bdd9671ff32d6472fd7e8726220a5e349241 Mon Sep 17 00:00:00 2001 From: anteju <108555623+anteju@users.noreply.github.com> Date: Thu, 11 Apr 2024 23:13:11 +0200 Subject: [PATCH 17/39] Added codec checkpoint to docs (#8860) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ante Jukić --- docs/source/tts/checkpoints.rst | 7 +++++++ docs/source/tts/data/ngc_models_codec.csv | 2 ++ 2 files changed, 9 insertions(+) create mode 100644 docs/source/tts/data/ngc_models_codec.csv diff --git a/docs/source/tts/checkpoints.rst b/docs/source/tts/checkpoints.rst index 9c3de1ab4926..7d5daedd0559 100644 --- a/docs/source/tts/checkpoints.rst +++ b/docs/source/tts/checkpoints.rst @@ -152,3 +152,10 @@ End2End models :file: data/ngc_models_e2e.csv :align: left :header-rows: 1 + +Codec models +^^^^^^^^^^^^ +.. csv-table:: + :file: data/ngc_models_codec.csv + :align: left + :header-rows: 1 diff --git a/docs/source/tts/data/ngc_models_codec.csv b/docs/source/tts/data/ngc_models_codec.csv new file mode 100644 index 000000000000..d46567012600 --- /dev/null +++ b/docs/source/tts/data/ngc_models_codec.csv @@ -0,0 +1,2 @@ +Model Name,Dataset,Sampling Rate,Model Class,Overview,Checkpoint +audio_codec_16khz_small,Libri-Light,16000Hz,nemo.collections.tts.models.AudioCodecModel,`audio_codec_16khz_small `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/audio_codec_16khz_small/versions/v1/files/audio_codec_16khz_small.nemo`` From 83a5cad63c91d55b6c2a63a32beecd15ea12580a Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Thu, 11 Apr 2024 16:38:54 -0600 Subject: [PATCH 18/39] fix header (#8892) Signed-off-by: eharper --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 1beef67832f0..f7374641d66d 100644 --- a/README.rst +++ b/README.rst @@ -36,7 +36,7 @@ .. _main-readme: **NVIDIA NeMo Framework** -=============== +========================= Latest News ----------- From b63cbe0e072a6531e3276e1474cb7019240959e9 Mon Sep 17 00:00:00 2001 From: anmolgupt <14880251+anmolgupt@users.noreply.github.com> Date: Thu, 11 Apr 2024 16:35:08 -0700 Subject: [PATCH 19/39] disable bprop reduce for cpl when SP is enabled (#8889) * disable bprop reduce for cpl when SP is enabled Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * updated megatron commit in jenkinsfile Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> --------- Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> Signed-off-by: Chen Cui Co-authored-by: Chen Cui --- Jenkinsfile | 2 +- .../nlp/modules/common/megatron/adapters/parallel_adapters.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6471fa3d011f..c98d13fbed38 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -87,7 +87,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout 7fe863f3d94f7b64a927b04b85f5c9339d3fb784 && \ + git checkout f3a3020031f384ddafd9b7e9f3a587798c0aea21 && \ pip install . 
&& \ cd megatron/core/datasets && \ make' diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 70ed4d695b3c..419126bd3f18 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -187,6 +187,7 @@ def __init__( bias=False, gather_output=True, init_method=self._get_init_fn(column_init_method), + disable_grad_reduce=self._sequence_parallel, ) if gather_output: self.linear_out = RowParallelLinear( From 5da310109f9b09ac950b4bf83e226e955d376728 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Thu, 11 Apr 2024 17:10:24 -0700 Subject: [PATCH 20/39] [Nemo CICD] Add further runners for cpu-intensive-only (non-gpu using) jobs (#8894) * Cancel old runs for PR commit update * update dependencies for container build * temp for test * update back * Revert "temp for test" This reverts commit 9f9221155412393d05b2c862880f9128a93b26a4. * Add further runners for cpu-intensive-only (non-gpu using) jobs --- .github/workflows/cicd-main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 5cc990902953..a9509fda51e9 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -31,7 +31,7 @@ jobs: nvidia-smi cicd-cluster-clean: - runs-on: self-hosted-azure + runs-on: self-hosted-azure-cpu steps: - name: Clean server from old files run: | @@ -53,7 +53,7 @@ jobs: cicd-test-container-setup: needs: [cicd-cluster-clean] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-cpu # uses: actions/cache@v2 #container: # image: nvcr.io/nvidia/pytorch:24.01-py3 @@ -196,7 +196,7 @@ jobs: L0_Unit_Tests_CPU: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-cpu container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: From acf9a7eebfeb80ba7c7f540331c8d772839ff158 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 11 Apr 2024 18:30:24 -0700 Subject: [PATCH 21/39] Add TE guards for DGRAD RS overlap (#8879) * Add TE guards for DGRAD RS overlap Signed-off-by: Jaemin Choi * Fix TE guard Signed-off-by: Jaemin Choi --------- Signed-off-by: Jaemin Choi Co-authored-by: Jaemin Choi --- .../megatron/gpt_full_te_layer_autocast_spec.py | 10 ++++++---- .../nlp/modules/common/megatron/transformer.py | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index 19766e4a34ca..02858b119bfa 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -134,7 +134,8 @@ def __init__( transformer_layer_args[ub_overlap_flag] = kwargs.get(split_gemm_flag, True) or kwargs.get( atomic_gemm_flag, False ) - transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) + if te_version > packaging.version.Version("1.6.0.dev0"): + transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) else: transformer_layer_args["ub_split_ag"] = kwargs.get("ub_split_ag", True) transformer_layer_args["ub_split_rs"] = kwargs.get("ub_split_rs", True) @@ -225,9 
+226,10 @@ def __init__(self, config, layer_number=1, hidden_dropout=None): if hasattr(config, "tp_comm_overlap_rs") else config.tp_comm_split_rs or config.tp_comm_atomic_rs ) - transformer_layer_args["ub_overlap_rs_dgrad"] = ( - config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False - ) + if te_version > packaging.version.Version("1.6.0.dev0"): + transformer_layer_args["ub_overlap_rs_dgrad"] = ( + config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False + ) else: transformer_layer_args["ub_split_ag"] = config.tp_comm_split_ag transformer_layer_args["ub_split_rs"] = config.tp_comm_split_rs diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index b33a996b7987..cb23c4a6b1fd 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -850,7 +850,8 @@ def __init__( transformer_layer_args[ub_overlap_flag] = kwargs.get(split_gemm_flag, True) or kwargs.get( atomic_gemm_flag, False ) - transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) + if te_version > packaging.version.Version("1.6.0.dev0"): + transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) else: transformer_layer_args["ub_split_ag"] = kwargs.get("ub_split_ag", True) transformer_layer_args["ub_split_rs"] = kwargs.get("ub_split_rs", True) @@ -1120,9 +1121,10 @@ def build_layer(layer_number): if hasattr(config, "tp_comm_overlap_rs") else config.tp_comm_split_rs or config.tp_comm_atomic_rs ) - transformer_layer_args["ub_overlap_rs_dgrad"] = ( - config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False - ) + if te_version > packaging.version.Version("1.6.0.dev0"): + transformer_layer_args["ub_overlap_rs_dgrad"] = ( + config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False + ) else: transformer_layer_args["ub_split_ag"] = config.tp_comm_split_ag transformer_layer_args["ub_split_rs"] = config.tp_comm_split_rs From 752ed8a822a1135e73b94307099804e26f20b703 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Fri, 12 Apr 2024 06:45:06 -0700 Subject: [PATCH 22/39] Skip validation model gradient zeroing (#8890) * Skip validation model gradient zeroing Signed-off-by: Sangkug Lym * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Sangkug Lym Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../conf/megatron_gpt_config.yaml | 1 + .../language_modeling/megatron_gpt_model.py | 21 +++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 7be891a156c8..ea37237f2eac 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -142,6 +142,7 @@ model: gradient_as_bucket_view: True # PyTorch DDP argument. 
Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages nccl_communicator_config_path: null # Path to the yaml file with NCCL communicator options (min_ctas, max_ctas, and cga_cluster_size) + validation_param_sync_overlap: False # Overlap parameter AllGather with validation step. # FSDP fsdp: False # Enable training with torch FSDP. diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8d1d428a9989..a651ada5c38a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -365,6 +365,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.log_train_loss = bool(int(os.getenv("NEMO_LOG_TRAIN_LOSS", 1))) self.log_memory_usage = bool(int(os.getenv("NEMO_LOG_MEMORY_USAGE", 0))) self.loss_broadcast_src_rank = None + self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) self.inference_params = None @@ -585,10 +586,14 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): no_sync_func = None grad_sync_func = None param_sync_func = None - if not forward_only and self.with_distributed_adam: - no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) - grad_sync_func = self.reduce_overlap_gradients - param_sync_func = self.sync_overlap_parameters + if self.with_distributed_adam: + if forward_only: + if self.validation_param_sync_overlap: + param_sync_func = self.sync_overlap_parameters + else: + no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + grad_sync_func = self.reduce_overlap_gradients + param_sync_func = self.sync_overlap_parameters # pipeline schedules will get these from self.model.config for module in self.get_model_module_list(): @@ -1703,6 +1708,14 @@ def on_load_checkpoint(self, checkpoint) -> None: self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) parallel_state.set_virtual_pipeline_model_parallel_rank(0) + def on_validation_model_zero_grad(self) -> None: + """ + Skip gradient zeroing at the beginning of validation routine. + This is needed when overlapping the AllGather of the updated parameters with the following valdation step. + """ + if not self.validation_param_sync_overlap: + super().on_validation_model_zero_grad() + def sharded_state_dict(self, prefix: str = '') -> Dict[str, Any]: """ Creates the sharded state dict which is used by dist_checkpoint to save the sharded tensors to disk. From 8bf2bc0a435ef18dd43830f4727d7824ae7e60e4 Mon Sep 17 00:00:00 2001 From: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> Date: Fri, 12 Apr 2024 10:54:51 -0500 Subject: [PATCH 23/39] Correcting bullets and notes within NeMo Forced Aligner (#8903) * Update nemo_forced_aligner.rst Correcting bullets and note tags Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Update nemo_forced_aligner.rst Note that the bottom is not showing up correctly. 
Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> --------- Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> --- docs/source/tools/nemo_forced_aligner.rst | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/source/tools/nemo_forced_aligner.rst b/docs/source/tools/nemo_forced_aligner.rst index a4ed90fa7f9f..aa8d2139653f 100644 --- a/docs/source/tools/nemo_forced_aligner.rst +++ b/docs/source/tools/nemo_forced_aligner.rst @@ -45,7 +45,7 @@ Call the ``align.py`` script, specifying the parameters as follows: * ``model_path``: string specifying the local filepath to a CTC NeMo ASR model which will be used to generate the log-probs which we will use to do alignment. If ``pretrained_name`` is specified, ``model_path`` must not be specified. - Note: Currently NFA can only use CTC models, or Hybrid CTC-Transducer models (in CTC mode). Pure Transducer models cannot be used. + .. note:: Currently NFA can only use CTC models, or Hybrid CTC-Transducer models (in CTC mode). Pure Transducer models cannot be used. * ``manifest_filepath``: The path to the manifest of the data you want to align, containing ``'audio_filepath'`` and ``'text'`` fields. The audio filepaths need to be absolute paths. @@ -66,7 +66,7 @@ Optional parameters: * ``additional_segment_grouping_separator``: an optional string used to separate the text into smaller segments. If this is not specified, then the whole text will be treated as a single segment. (Default: ``None``. Cannot be empty string or space (" "), as NFA will automatically produce word-level timestamps for substrings separated by spaces). - Note: the ``additional_segment_grouping_separator`` will be removed from the reference text and all the output files, ie it is treated as a marker which is not part of the reference text. The separator will essentially be treated as a space, and any additional spaces around it will be amalgamated into one, i.e. if ``additional_segment_grouping_separator="|"``, the following texts will be treated equivalently: ``“abc|def”``, ``“abc |def”``, ``“abc| def”``, ``“abc | def"``. + .. note:: the ``additional_segment_grouping_separator`` will be removed from the reference text and all the output files, ie it is treated as a marker which is not part of the reference text. The separator will essentially be treated as a space, and any additional spaces around it will be amalgamated into one, i.e. if ``additional_segment_grouping_separator="|"``, the following texts will be treated equivalently: ``“abc|def”``, ``“abc |def”``, ``“abc| def”``, ``“abc | def"``. * ``remove_blank_tokens_from_ctm``: a boolean denoting whether to remove tokens from token-level output CTMs. (Default: False). @@ -98,13 +98,14 @@ By default, NFA needs to be provided with a 'manifest' file where each line spec You can omit the ``"text"`` field from the manifest if you specify ``align_using_pred_text=true``. In that case, any ``"text"`` fields in the manifest will be ignored: the ASR model at ``pretrained_name`` or ``model_path`` will be used to transcribe the audio and obtain ``"pred_text"``, which will be used as the reference text for the forced alignment process. The ``"pred_text"`` will also be saved in the output manifest JSON file at ``/_with_output_file_paths.json``. To remove the possibility of overwriting ``"pred_text"``, NFA will raise an error if ``align_using_pred_text=true`` and there are existing ``"pred_text"`` fields in the original manifest. 
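As a reference for the manifest format discussed above, a minimal NFA input manifest can be produced with a few lines of Python: each line is one JSON object with ``"audio_filepath"`` and ``"text"`` fields. The paths and transcripts below are placeholders; the audio paths must be absolute.

.. code-block:: python

    import json

    # Placeholder entries: substitute your own absolute audio paths and reference texts.
    entries = [
        {"audio_filepath": "/data/audio/utt_0001.wav", "text": "hello world"},
        {"audio_filepath": "/data/audio/utt_0002.wav", "text": "forced alignment example"},
    ]

    with open("manifest.json", "w") as f:
        for entry in entries:
            f.write(json.dumps(entry) + "\n")  # one JSON object per line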
- ..note:: NFA does not require ``"duration"`` fields in the manifest, and can align long audio files without running out of memory. The duration of audio file you can align will depend on the amount of memory on your machine. NFA will also produce better alignments the more accurate the reference text in ``"text"`` is. + .. note:: NFA does not require ``"duration"`` fields in the manifest, and can align long audio files without running out of memory. The duration of audio file you can align will depend on the amount of memory on your machine. NFA will also produce better alignments the more accurate the reference text in ``"text"`` is. Output CTM file format ---------------------- For each utterance specified in a line of ``manifest_filepath``, several CTM files will be generated: + * a CTM file containing token-level alignments at ``/ctm/tokens/.ctm``, * a CTM file containing word-level alignments at ``/ctm/words/.ctm``, * a CTM file containing segment-level alignments at ``/ctm/segments/.ctm``. If ``additional_segment_grouping_separator`` is specified, the segments will be parts of the text separated by ``additonal_segment_grouping_separator``. If it is not specified, the entire text will be treated as a single segment. @@ -117,6 +118,7 @@ Note the second item in the line (the 'channel ID', which is required by the CTM ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``CTMFileConfig`` (which is passed into the main NFA config) has the following parameters: + * ``remove_blank_tokens``: bool (default ``False``) to specify if the token-level CTM files should have the timestamps of the blank tokens removed. * ``minimum_timestamp_duration``: float (default ``0``) to specify the minimum duration that will be applied to all timestamps. If any line in the CTM has a duration lower than this, it will be enlarged from the middle outwards until it meets the ``minimum_timestamp_duration``, or reaches the beginning or end of the audio file. Note that using a non-zero value may cause timestamps to overlap. @@ -124,14 +126,17 @@ Output ASS file format ---------------------- NFA will produce the following ASS files, which you can use to generate subtitle videos: + * ASS files with token-level highlighting will be at ``/ass/tokens/.ass,`` * ASS files with word-level highlighting will be at ``/ass/words/.ass``. + All words belonging to the same segment 'segments' will appear at the same time in the subtitles generated with the ASS files. If you find that your segments are not the right size, you can use set ``ass_file_config.resegment_text_to_fill_space=true`` and specify some number of ``ass_file_config.max_lines_per_segment``. ``ASSFileConfig`` parameters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``ASSFileConfig`` (which is passed into the main NFA config) has the following parameters: + * ``fontsize``: int (default value ``20``) which will be the fontsize of the text * ``vertical_alignment``: string (default value ``center``) to specify the vertical alignment of the text. Can be one of ``center``, ``top``, ``bottom``. * ``resegment_text_to_fill_space``: bool (default value ``False``). If ``True``, the text will be resegmented such that each segment will not take up more than (approximately) ``max_lines_per_segment`` when the ASS file is applied to a video. @@ -144,6 +149,7 @@ Output JSON manifest file format -------------------------------- A new manifest file will be saved at ``/_with_output_file_paths.json``. 
It will contain the same fields as the original manifest, and additionally: + * ``"token_level_ctm_filepath"`` (if ``save_output_file_formats`` contains ``ctm``) * ``"word_level_ctm_filepath"`` (if ``save_output_file_formats`` contains ``ctm``) * ``"segment_level_ctm_filepath"`` (if ``save_output_file_formats`` contains ``ctm``) @@ -159,8 +165,9 @@ Ideally you would have some 'true' CTM files to compare with your generated CTM Alternatively (or additionally), you can visualize the quality of alignments using tools such as Gecko, which can play your audio file and display the predicted alignments at the same time. The Gecko tool requires you to upload an audio file and at least one CTM file. The Gecko tool can be accessed here: https://gong-io.github.io/gecko/. More information about the Gecko tool can be found on its Github page here: https://github.com/gong-io/gecko. -**Note**: the following may help improve your experience viewing the CTMs in Gecko: +.. note:: + The following may help improve your experience viewing the CTMs in Gecko: -* setting ``minimum_timestamp_duration`` to a larger number, as Gecko may not display some tokens/words/segments properly if their timestamps are too short. -* setting ``remove_blank_tokens_from_ctm=true`` if you are analyzing token-level CTMs, as it will make the Gecko visualization less cluttered. + * setting ``minimum_timestamp_duration`` to a larger number, as Gecko may not display some tokens/words/segments properly if their timestamps are too short. + * setting ``remove_blank_tokens_from_ctm=true`` if you are analyzing token-level CTMs, as it will make the Gecko visualization less cluttered. From f05ecb601556ccc72cf487603ee8774916b88698 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Date: Fri, 12 Apr 2024 11:27:08 -0700 Subject: [PATCH 24/39] Adding distributed checkpointing for bert (#8650) * Adding distributed checkpointing for bert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update megatron_bert_model.py Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Adding dist checkpointing to bert * Simple bug fix * Fixing parallel state * Simple bug fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixing bug * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Simple bug fix * Simple bug fix --------- Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay --- .../megatron_lm_ckpt_to_nemo.py | 2 +- .../language_modeling/megatron_retro_eval.py | 2 +- .../multimodal/data/common/webdataset.py | 2 +- .../models/multimodal_llm/neva/neva_model.py | 8 +-- .../megatron_bert_embedding_model.py | 4 +- .../megatron/bert/bert_model.py | 1 + .../language_modeling/megatron_base_model.py | 9 ++- .../language_modeling/megatron_bert_model.py | 69 +++++++++++++++---- .../language_modeling/megatron_gpt_model.py | 6 +- .../megatron_gpt_prompt_learning_model.py | 2 +- .../megatron_lm_encoder_decoder_model.py | 2 +- .../megatron_retrieval_model.py | 2 +- nemo/collections/nlp/models/nlp_model.py | 2 +- 
nemo/collections/nlp/parts/nlp_overrides.py | 4 +- nemo/export/quantize/quantizer.py | 2 +- nemo/utils/distributed.py | 2 +- .../convert_prompt_learning_ckpt_to_nemo.py | 2 +- .../start_retro_model_service.py | 2 +- 18 files changed, 80 insertions(+), 43 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py index d09c79f7a051..03d6fd94e4e2 100644 --- a/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py @@ -529,7 +529,7 @@ def convert(local_rank, rank, world_size, args): if args.nemo_file_path: if args.model_type == 'gpt': - if mcore_output and parallel_state.is_unitialized(): + if mcore_output and not parallel_state.is_initialized(): parallel_state.initialize_model_parallel( tensor_model_parallel_size=args.tensor_model_parallel_size, pipeline_model_parallel_size=args.pipeline_model_parallel_size, diff --git a/examples/nlp/language_modeling/megatron_retro_eval.py b/examples/nlp/language_modeling/megatron_retro_eval.py index 79b1e2debdfa..9978bab78bfc 100644 --- a/examples/nlp/language_modeling/megatron_retro_eval.py +++ b/examples/nlp/language_modeling/megatron_retro_eval.py @@ -108,7 +108,7 @@ def main(cfg) -> None: } # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/multimodal/data/common/webdataset.py b/nemo/collections/multimodal/data/common/webdataset.py index 8d70a03fa911..79d22f34f77c 100644 --- a/nemo/collections/multimodal/data/common/webdataset.py +++ b/nemo/collections/multimodal/data/common/webdataset.py @@ -302,7 +302,7 @@ def run(self, src): epoch = self.epoch rng = random.Random() # This seed to be deterministic AND the same across all nodes/workers in each epoch - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): seed = self.seed + epoch else: seed = self.seed + epoch + (100 * parallel_state.get_data_parallel_rank()) diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 78a46ce3b0db..4556ba1b3e72 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -461,7 +461,7 @@ def model_provider_func(self, pre_process, post_process): media_end_id = self.tokenizer.token_to_id(DEFAULT_IM_END_TOKEN) if self.mcore_gpt: - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return @@ -795,9 +795,7 @@ def setup(self, stage=None): Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. 
""" - num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert( - self.model - ) + num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert() logging.info( f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' @@ -998,7 +996,7 @@ def generate( ) -> OutputType: # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py index 849438d408a5..d974c8182234 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py @@ -189,9 +189,7 @@ def setup(self, stage=None): stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ - num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert( - self.model - ) + num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert() logging.info( f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py index 749d960b9729..e7ae529fe4e2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py @@ -347,6 +347,7 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw # Output if self.post_process: # TODO: Make sure you are passing in the mpu_vocab_size properly + self.lm_head = MCoreBertLMHead(self.config.hidden_size, self.config,) self.output_layer = tensor_parallel.ColumnParallelLinear( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 980ea8f9f76d..035d194de09f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -996,10 +996,11 @@ def is_data_parallel_rank_zero(self): else: return False - def _get_total_params_across_model_parallel_groups_gpt_bert(self, model): + def _get_total_params_across_model_parallel_groups_gpt_bert(self): """Returns the total number of parameters across all model parallel groups.""" is_mcore_model = self.__dict__.get('mcore_gpt', False) or self.__dict__.get('mcore_bert', False) # log number of parameters + model = self.get_model_module_list() if isinstance(model, list): num_parameters_on_device = sum( [sum([p.nelement() for p in model_module.parameters()]) for model_module in model] @@ -1010,7 +1011,7 @@ def _get_total_params_across_model_parallel_groups_gpt_bert(self, model): and self.cfg.get('share_embeddings_and_output_weights', True) ): word_embeddings_weight = ( - model[-1].module.shared_embedding_or_output_weight() + model[-1].shared_embedding_or_output_weight() if is_mcore_model else model[-1].word_embeddings_weight() ) @@ -1025,9 +1026,7 @@ def _get_total_params_across_model_parallel_groups_gpt_bert(self, model): and 
self.cfg.get('share_embeddings_and_output_weights', True) ): word_embeddings_weight = ( - model.module.shared_embedding_or_output_weight() - if is_mcore_model - else model.word_embeddings_weight() + model.shared_embedding_or_output_weight() if is_mcore_model else model.word_embeddings_weight() ) # subtract the embedding weights on the last stage num_word_embedding_parameters = sum([p.nelement() for p in word_embeddings_weight]) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index 82b2b1a96ff4..dc6d81649122 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -767,9 +767,7 @@ def setup(self, stage=None): stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ - num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert( - self.model - ) + num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert() logging.info( f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' @@ -1084,25 +1082,70 @@ def input_example(self, max_batch=1, max_dim=256): input_dict = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} return tuple([input_dict]) + def sharded_state_dict(self, prefix: str = '') -> Dict[str, Any]: + """ + Creates the sharded state dict which is used by dist_checkpoint to save the sharded tensors to disk. + When given the sharded_state_dict, dist_checkpoint.load will load the tensors corresponding to + self.state_dict(). + The sharded tensor mapping is defined in the GPTModel class from mcore. + """ + if self.mcore_bert: + module_prefix = f'{prefix}model.'
+ sharded_state_dict = {} + for index, module in enumerate(self.get_model_module_list()): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + # virtual pipeline rank must be set so that GPTModel returns the correct sharded state dict + parallel_state.set_virtual_pipeline_model_parallel_rank(index) + module_sharded_state_dict = module.sharded_state_dict(prefix=module_prefix) + sharded_state_dict[f'model_{index}'] = module_sharded_state_dict + else: + module_sharded_state_dict = module.sharded_state_dict(prefix=module_prefix) + sharded_state_dict.update(module_sharded_state_dict) + + # reset vp rank + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + + return sharded_state_dict + def on_save_checkpoint(self, checkpoint) -> None: """LightningModule hook: https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-save-checkpoint """ - if isinstance(self.model, list): - for i in range(len(self.model)): - parallel_state.set_virtual_pipeline_model_parallel_rank(i) - checkpoint[f'model{i}'] = self.model[i].module.state_dict_for_save_checkpoint() - parallel_state.set_virtual_pipeline_model_parallel_rank(0) + if self.mcore_bert: + checkpoint['sharded_state_dict'] = self.sharded_state_dict() + else: + if isinstance(self.model, list): + for i in range(len(self.model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + checkpoint[f'model{i}'] = self.model[i].module.state_dict_for_save_checkpoint() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) def on_load_checkpoint(self, checkpoint) -> None: """LightningModule hook: https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-load-checkpoint """ - if isinstance(self.model, list): - for i in range(len(self.model)): - parallel_state.set_virtual_pipeline_model_parallel_rank(i) - self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) - parallel_state.set_virtual_pipeline_model_parallel_rank(0) + if self.mcore_bert: + if 'state_dict' in checkpoint and checkpoint['state_dict']: + for index, module in enumerate(self.get_model_module_list()): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + checkpoint_state_dict = checkpoint['state_dict'][f'model_{index}'] + else: + checkpoint_state_dict = checkpoint['state_dict'] + # checkpoint_state_dict has "model." but module does not so we need to remove it when loading + checkpoint_state_dict = { + key.replace('model.', ''): checkpoint_state_dict.pop(key) + for key in list(checkpoint_state_dict.keys()) + } + module.load_state_dict(checkpoint_state_dict, strict=True) + else: + checkpoint['state_dict'] = {} + else: + if isinstance(self.model, list): + for i in range(len(self.model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(0) def build_transformer_config(self) -> TransformerConfig: """ Builds the megatron core gpt transformer config for the model.
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a651ada5c38a..f5b1667be27f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1479,9 +1479,7 @@ def setup(self, stage=None): Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ - num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert( - self.model - ) + num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert() logging.info( f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' @@ -1570,7 +1568,7 @@ def generate( ) -> OutputType: # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 617a585ef3a9..5ee7a3fcf480 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -729,7 +729,7 @@ def generate( ): # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 3a7ad3d6714c..459bf5b71c7e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -1006,7 +1006,7 @@ def encode(self, tokens_enc, enc_mask, encoder_input=None, batch_data=None, reco Format is not defined and should match the expected format of the used hiddens modules. """ # Check whether the DDP is initialized. This is needed when running inference outside of training loop. 
- if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index ebe936a8178a..acd85261f7e5 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -482,7 +482,7 @@ def generate( ) -> OutputType: # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/models/nlp_model.py b/nemo/collections/nlp/models/nlp_model.py index 04bbb2ca17fe..65d8645688fd 100644 --- a/nemo/collections/nlp/models/nlp_model.py +++ b/nemo/collections/nlp/models/nlp_model.py @@ -385,7 +385,7 @@ def load_from_checkpoint( sharded_state_dict = model.sharded_state_dict() checkpoint['state_dict'] = sharded_state_dict # dist checkpointing needs torch.distributed to load the checkpoint - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 91f1fab348da..d4a75e3353c7 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -869,7 +869,7 @@ def save_to(self, model, save_path: str): sharded_state_dict = model.sharded_state_dict() # dist checkpoint needs torch.distributed to save the checkpoint - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return @@ -1110,7 +1110,7 @@ def restore_from( # if we're using dist checkpointing then state_dict will be None if state_dict is None: # dist checkpointing needs torch.distributed to load the checkpoint - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 435ca6a496b1..d60ede29e22e 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -133,7 +133,7 @@ def _load_model( return model def _check_ddp_initialized(self, model): - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/utils/distributed.py b/nemo/utils/distributed.py index 9649089e40af..443c0216785e 100644 --- a/nemo/utils/distributed.py +++ b/nemo/utils/distributed.py @@ -81,7 +81,7 @@ def gather_objects(partial_results_list, main_rank=None): pickle.dump(predictions, open(output_fname, "wb")) """ # do not fail when DDP is not initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): return partial_results_list rank = parallel_state.get_data_parallel_rank() diff --git a/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py b/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py index 61cbbc1ae682..334b3415a93b 100644 --- a/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py @@ -104,7 +104,7 @@ def main(cfg) -> None: raise ValueError("need at least a nemo file or checkpoint dir") # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git 
a/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py b/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py index 4a373dcaf278..ee32f69bf734 100644 --- a/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py +++ b/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py @@ -80,7 +80,7 @@ def main(cfg) -> None: ) # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return From f3d45fd64482b15a6b0f63e7079d6db1be4f46e6 Mon Sep 17 00:00:00 2001 From: Rachit Garg Date: Fri, 12 Apr 2024 11:30:45 -0700 Subject: [PATCH 25/39] remove fp8 checkpoints for Attention (#8875) * remove fp8 checkpoints for Attention Signed-off-by: rachitg * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: rachitg * set default value and support mha Signed-off-by: rachitg --------- Signed-off-by: rachitg Co-authored-by: rachitg Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../models/language_modeling/megatron_gpt_model.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index f5b1667be27f..d3f5a7afd631 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -89,6 +89,8 @@ from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset + from megatron.core.dist_checkpointing.dict_utils import dict_list_map_inplace + from megatron.core.dist_checkpointing.mapping import LocalNonpersitentObject, ShardedObject # NeMo's implementation of the get_gpt_layer_ammo_spec function is temporarily used # from megatron.core.inference.gpt.model_specs import get_gpt_layer_ammo_spec @@ -1739,6 +1741,15 @@ def sharded_state_dict(self, prefix: str = '') -> Dict[str, Any]: if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: parallel_state.set_virtual_pipeline_model_parallel_rank(0) + # WAR: This is a temporary fix to skip loading FP8 parameters for Dot Product Attention + def skip_fp8_load(x): + if isinstance(x, ShardedObject) and 'fused_attention' in x.key and '_extra_state' in x.key: + x = LocalNonpersitentObject(x.data) # use the FP8 state from initialization, not from ckpt + return x + + if self.cfg.get('fp8_dot_product_attention', False) or self.cfg.get('fp8_multi_head_attention', False): + dict_list_map_inplace(skip_fp8_load, sharded_state_dict) + return sharded_state_dict def parameters(self): From ac95b5c8cb49a8b3023f39d1eff75226b0478d1c Mon Sep 17 00:00:00 2001 From: anmolgupt <14880251+anmolgupt@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:49:22 -0700 Subject: [PATCH 26/39] lora a2a after linear out when sp is enabled and parallel input (#8882) * disable reduce for lora CPL bprop Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * a2a for linear out lora Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci * moved a2a tp to init Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * add lora config option for enable a2a Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * added custom all2all * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * code cleanup Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../megatron/adapters/parallel_adapters.py | 51 ++++++++++++++++++- nemo/collections/nlp/parts/peft_config.py | 1 + 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 419126bd3f18..5037bb1b3634 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -47,6 +47,7 @@ try: from megatron.core import ModelParallelConfig + from megatron.core.parallel_state import get_tensor_model_parallel_group, get_tensor_model_parallel_world_size from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear from megatron.core.tensor_parallel.mappings import ( gather_from_sequence_parallel_region, @@ -146,6 +147,7 @@ def __init__( model_parallel_config: Optional[ModelParallelConfig] = None, alpha: float | None = None, dropout_position: str = 'post', + a2a_experimental: bool = False, # TODO: should rename this or make it a default feature **kwargs, ): super().__init__() @@ -161,6 +163,9 @@ def __init__( self.alpha = alpha if alpha is not None else self.dim self.input_is_parallel = input_is_parallel self.dropout_position = dropout_position + self.tp_world_size = None + self.tp_group = None + self.use_a2a = a2a_experimental # megatron_gpt_peft_models will provide this arg, but deprecated ones do not. # in case this arg is not provided, use the dummy default config. @@ -202,12 +207,17 @@ def __init__( else: # (@adithyare) we use this option to mirror the behavior a column parallel layer with two low-rank column parallel layers # if the original column parallel layer uses gather_output=False, then we will use the self.liner_out layer defined below. 
+ lin_out_gather_output = True if input_is_parallel else False + if self.use_a2a and input_is_parallel and self._sequence_parallel: + lin_out_gather_output = False + self.tp_world_size = get_tensor_model_parallel_world_size() + self.tp_group = get_tensor_model_parallel_group() self.linear_out = ColumnParallelLinear( dim, out_features, config=model_parallel_config, bias=False, - gather_output=True if input_is_parallel else False, + gather_output=lin_out_gather_output, init_method=self._get_init_fn(row_init_method), ) @@ -291,7 +301,11 @@ def forward(self, x): # layernorm after lora is impacted by sequence parallel, # hence seq dim need to be scattered right after lora linear layers # this function also handles the backward pass correctly - x = scatter_to_sequence_parallel_region(x) + if self.use_a2a: + # all2all hidden_size / TP to seq_len / TP + x = all2all_hp2sp(x, self.tp_world_size, self.tp_group) + else: + x = scatter_to_sequence_parallel_region(x) if self.norm_position == 'post': x = self.layer_norm(x) @@ -305,6 +319,38 @@ def forward(self, x): return x +class _All2AllHp2Sp(torch.autograd.Function): + """ + All-2-All from Hidden Parallel to Sequence Parallel + This is a temporary workaround and can be updated in the future + TODO: Move the functionality to MCore + """ + + @staticmethod + def forward(ctx, input_, world_size, group): + ctx.world_size = world_size + ctx.group = group + send_list = list(input_.chunk(world_size, dim=0)) + send_list = [tensor.contiguous() for tensor in send_list] + receive_list = [torch.empty_like(send_list[0]) for _ in range(world_size)] + torch.distributed.all_to_all(receive_list, send_list, group=group) + x = torch.cat(receive_list, dim=-1) + return x + + @staticmethod + def backward(ctx, grad_output): + send_list = list(grad_output.chunk(ctx.world_size, dim=-1)) + send_list = [tensor.contiguous() for tensor in send_list] + receive_list = [torch.empty_like(send_list[0]) for _ in range(ctx.world_size)] + torch.distributed.all_to_all(receive_list, send_list, group=ctx.group) + x = torch.cat(receive_list, dim=0) + return x, None, None + + +def all2all_hp2sp(input_, world_size, group): + return _All2AllHp2Sp.apply(input_, world_size, group) + + @dataclass class ParallelLinearAdapterConfig(AdapterConfig): in_features: int @@ -321,6 +367,7 @@ class ParallelLinearAdapterConfig(AdapterConfig): dropout_position: str = 'post' alpha: float | None = None network_alpha: int | None = None + a2a_experimental: bool = False _target_: str = "{0}.{1}".format(ParallelLinearAdapter.__module__, ParallelLinearAdapter.__name__) diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index e6f0fe267d18..63caa409b218 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -184,6 +184,7 @@ def _create_lora_config(self, cfg, lora_cfg, in_features, out_features, adapter_ "dropout": lora_cfg.adapter_dropout, "alpha": lora_cfg.get("alpha", lora_cfg.adapter_dim), "dropout_position": lora_cfg.get("dropout_position", "post"), + "a2a_experimental": lora_cfg.get("a2a_experimental", False), } if lora_cfg.weight_tying: From 08ea4cb15889d604652115de8e4d8544a2a76776 Mon Sep 17 00:00:00 2001 From: Wil Kong Date: Sat, 13 Apr 2024 06:50:44 +0800 Subject: [PATCH 27/39] Fix Distributed Fused Adam Issues (#8880) * Fix distributed fused adam issue with NHWC layout. * Fix the CUDA graph issue if there's kernel in zero_grad. * Add option to distribute adam states within node. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- nemo/core/optim/distributed_adam.py | 35 +++++++++++++++++++++++++++++ nemo/utils/callbacks/cuda_graph.py | 9 +++++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/nemo/core/optim/distributed_adam.py b/nemo/core/optim/distributed_adam.py index 43a784cd7736..32bd7e6c1154 100644 --- a/nemo/core/optim/distributed_adam.py +++ b/nemo/core/optim/distributed_adam.py @@ -50,6 +50,11 @@ class MegatronDistributedFusedAdam(DistributedFusedAdam): disable_distributed_parameters (bool, optional): use standard data-parallel communication instead of ZeRO. (default: False) + distribute_within_nodes (bool, optional): distribute states + within the same node, e.g. DGX. This can improve performance + but requires larger memory than distributing within all + ranks, especially for pure data parallel models. + (default: False). **kwargs: keyword arguments to pass to Apex DistributedFusedAdam. @@ -59,6 +64,7 @@ def __init__( self, params: Union[Iterable[torch.nn.Parameter], Iterable[dict]], disable_distributed_parameters: bool = False, + distribute_within_nodes: bool = False, **kwargs, ): @@ -71,6 +77,28 @@ def __init__( self_groups = [torch.distributed.new_group(ranks=[i]) for i in range(world_size)] kwargs['distributed_process_group'] = self_groups[rank] kwargs['redundant_process_group'] = kwargs['process_group'] + elif distribute_within_nodes: + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + devices = torch.cuda.device_count() + nodes = world_size // devices + assert nodes * devices == world_size, "Expected all nodes to have the same amount of devices." + node_id = rank // devices + device_id = rank % devices + + distributed_pgs = [] + for i in range(nodes): + ranks = [i * devices + j for j in range(devices)] + pg = torch.distributed.new_group(ranks=ranks) + distributed_pgs.append(pg) + kwargs['distributed_process_group'] = distributed_pgs[node_id] + + redundant_pgs = [] + for i in range(devices): + ranks = [i + j * devices for j in range(nodes)] + pg = torch.distributed.new_group(ranks=ranks) + redundant_pgs.append(pg) + kwargs['redundant_process_group'] = redundant_pgs[device_id] # Make sure dtypes are in right type for keyword in ('dtype', 'grad_sync_dtype', 'param_sync_dtype'): @@ -425,6 +453,13 @@ def _param_copy_fragments(self, fragments: Iterable[DistributedFusedAdam.Paramet buffers_in.append(buffer_in) buffers_out.append(buffer_out) elif torch.is_floating_point(buffer_in) and torch.is_floating_point(param): + # Conv with NHWC layout, i.e. shape (N, C, H, W) and stride + # (HWC, 1, WC, C), can't `.view(-1)`. Here to turn it to + # tensor with shape (N, H, W, C) and stride (HWC, WC, C, 1).
+ # Note: https://github.com/NVIDIA/apex/pull/1794 + if param.is_contiguous(memory_format=torch.channels_last): + param = param.permute(0, 2, 3, 1) + # Cast between floating-point dtypes buffer_out = param.detach().view(-1)[param_start:param_end] buffers_in.append(buffer_in) diff --git a/nemo/utils/callbacks/cuda_graph.py b/nemo/utils/callbacks/cuda_graph.py index 247c67856c7b..77dc33e7b567 100644 --- a/nemo/utils/callbacks/cuda_graph.py +++ b/nemo/utils/callbacks/cuda_graph.py @@ -180,12 +180,19 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure=None,) - torch.cuda.current_stream().wait_stream(state.stream) if state.current_iteration == state.capture_iteration: - optimizer.zero_grad(**zero_grad_kwargs) torch.cuda.synchronize() # Sleep for one second to let environment stable time.sleep(1) rank_zero_info("CUDAGraphCallback: capturing CUDA graph for module %s.", self.__class__.__name__) with torch.cuda.graph(state.graph, stream=state.stream, capture_error_mode="global"): + # PyTorch CUDA graph doc for whole-network capturing mentions: + # + # Sets grads to None before capture, so backward() will create + # .grad attributes with allocations from the graph's private pool + # + # But it's not necessary, and it can lead to CUDA kernels inside + # `zero_grad()` being not captured. + optimizer.zero_grad(**zero_grad_kwargs) self.__orig_optimizer_step__( epoch, batch_idx, optimizer, optimizer_closure=optimizer_closure, ) From 21913a015d28293532a4550f1138df4a6d6e26e5 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 12 Apr 2024 18:16:05 -0700 Subject: [PATCH 28/39] update mcore 24.04.12 (#8910) --- .github/workflows/cicd-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a9509fda51e9..29ea34dba197 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -114,7 +114,7 @@ jobs: # Megatron Core installation git clone https://github.com/NVIDIA/Megatron-LM.git && \ pushd Megatron-LM && \ - git checkout 7fe863f3d94f7b64a927b04b85f5c9339d3fb784 && \ + git checkout f3a3020031f384ddafd9b7e9f3a587798c0aea21 && \ pip install . 
&& \ pushd megatron/core/datasets && \ make && \ From ac53e2296fcd9f699c928b62948a0b673c3817bc Mon Sep 17 00:00:00 2001 From: Jie Xin <932141413@qq.com> Date: Sat, 13 Apr 2024 09:33:30 +0800 Subject: [PATCH 29/39] Support alternative mapping TP->PP->DP (#8909) * support new tp-pp-dp mapping Signed-off-by: jxin * add test Signed-off-by: jxin * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * refine Signed-off-by: jxin * change mcore commit Signed-off-by: jxin --------- Signed-off-by: jxin Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- Jenkinsfile | 2 +- .../stable_diffusion/ldm/ddpm.py | 7 +- .../language_modeling/megatron_base_model.py | 1 + .../language_modeling/megatron_gpt_model.py | 8 +- .../modules/common/megatron/megatron_init.py | 95 ++++++------- nemo/collections/nlp/parts/nlp_overrides.py | 1 + nemo/utils/app_state.py | 9 ++ tests/collections/nlp/test_initialize.py | 134 ++++++++++++++++++ 8 files changed, 189 insertions(+), 68 deletions(-) create mode 100644 tests/collections/nlp/test_initialize.py diff --git a/Jenkinsfile b/Jenkinsfile index c98d13fbed38..55e836eea13a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -87,7 +87,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout f3a3020031f384ddafd9b7e9f3a587798c0aea21 && \ + git checkout fbb375d4b5e88ce52f5f7125053068caff47f93f && \ pip install . && \ cd megatron/core/datasets && \ make' diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 33a194500a69..a96c3c47e44e 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -1770,12 +1770,7 @@ def fwd_bwd_step(self, dataloader_iter, forward_only): # we can avoid this broadcast by updating the PTL log function to accept specific ranks if parallel_state.get_pipeline_model_parallel_world_size() > 1: if self.loss_broadcast_src_rank is None: - dp_size = parallel_state.get_data_parallel_world_size() - tp_size = parallel_state.get_tensor_model_parallel_world_size() - pp_size = parallel_state.get_pipeline_model_parallel_world_size() - rank_in_dp_tp_group = torch.distributed.get_rank() % (dp_size * tp_size) - last_pipeline_stage_offset = (tp_size * dp_size) * (pp_size - 1) - self.loss_broadcast_src_rank = last_pipeline_stage_offset + rank_in_dp_tp_group + self.loss_broadcast_src_rank = parallel_state.get_pipeline_model_parallel_last_rank() torch.distributed.broadcast( loss_mean, self.loss_broadcast_src_rank, group=parallel_state.get_pipeline_model_parallel_group(), ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 035d194de09f..f431d43716b9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -195,6 +195,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): pipeline_model_parallel_size=cfg.get('pipeline_model_parallel_size', 1), virtual_pipeline_model_parallel_size=vp_size, pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0), + use_tp_pp_dp_mapping=cfg.get('use_tp_pp_dp_mapping', False), 
context_parallel_size=cfg.get('context_parallel_size', 1), micro_batch_size=cfg.get('micro_batch_size'), global_batch_size=cfg.get('global_batch_size'), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index d3f5a7afd631..ede72439615e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1310,13 +1310,7 @@ def on_validation_epoch_end(self): # it should be casted to other pipeline stages for logging. if parallel_state.get_pipeline_model_parallel_world_size() > 1: if self.loss_broadcast_src_rank is None: - dp_size = parallel_state.get_data_parallel_world_size() - cp_size = parallel_state.get_context_parallel_world_size() - tp_size = parallel_state.get_tensor_model_parallel_world_size() - pp_size = parallel_state.get_pipeline_model_parallel_world_size() - rank_in_dp_tp_group = torch.distributed.get_rank() % (dp_size * cp_size * tp_size) - last_pipeline_stage_offset = (tp_size * cp_size * dp_size) * (pp_size - 1) - self.loss_broadcast_src_rank = last_pipeline_stage_offset + rank_in_dp_tp_group + self.loss_broadcast_src_rank = parallel_state.get_pipeline_model_parallel_last_rank() torch.distributed.broadcast( averaged_loss, self.loss_broadcast_src_rank, group=parallel_state.get_pipeline_model_parallel_group(), ) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 7ba2e28008ac..5d5b65b360ee 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -32,6 +32,7 @@ try: from megatron.core import tensor_parallel from megatron.core.parallel_state import ( + RankGenerator, get_pipeline_model_parallel_rank, set_expert_model_parallel_rank, set_expert_model_parallel_world_size, @@ -74,6 +75,7 @@ def initialize_model_parallel_for_nemo( init_mpi_proc_group=False, seed=1234, apex_transformer_log_level=30, + use_tp_pp_dp_mapping=False, ): if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED: @@ -84,6 +86,7 @@ def initialize_model_parallel_for_nemo( app_state.global_rank = global_rank app_state.world_size = world_size app_state.local_rank = local_rank + app_state.use_tp_pp_dp_mapping = use_tp_pp_dp_mapping app_state.expert_model_parallel_size = expert_model_parallel_size app_state.tensor_model_parallel_size = tensor_model_parallel_size app_state.pipeline_model_parallel_size = pipeline_model_parallel_size @@ -108,6 +111,7 @@ def initialize_model_parallel_for_nemo( pipeline_model_parallel_split_rank_=pipeline_model_parallel_split_rank, context_parallel_size_=context_parallel_size, expert_model_parallel_size_=expert_model_parallel_size, + use_tp_pp_dp_mapping=use_tp_pp_dp_mapping, ) # update apex.transformer globals @@ -192,6 +196,7 @@ def fake_initialize_model_parallel( virtual_pipeline_model_parallel_size_=None, expert_model_parallel_size_=1, context_parallel_size_=1, + use_tp_pp_dp_mapping=False, ): """ Fake initialize model data parallel groups so that we can instantiate model parallel models before DDP is initialized. 
@@ -241,24 +246,29 @@ def fake_initialize_model_parallel( if virtual_pipeline_model_parallel_size_ is not None: virtual_pipeline_model_parallel_rank = 0 + rank_generator = RankGenerator( + tp=tensor_model_parallel_size, + ep=expert_model_parallel_size_, + dp=data_parallel_size, + pp=pipeline_model_parallel_size, + cp=context_parallel_size, + order='tp-pp-dp' if use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp', + ) + # Build the data-parallel groups. all_data_parallel_group_ranks_with_cp = [] - for i in range(pipeline_model_parallel_size): - start_rank = i * num_pipeline_model_parallel_groups - end_rank = (i + 1) * num_pipeline_model_parallel_groups - for j in range(context_parallel_size * tensor_model_parallel_size): - ranks = range(start_rank + j, end_rank, context_parallel_size * tensor_model_parallel_size) - if rank in ranks: - data_parallel_group = list(ranks) - logging.info(f'Rank {rank} has data parallel group : {data_parallel_group}') - for j in range(tensor_model_parallel_size): - ranks_with_cp = range(start_rank + j, end_rank, tensor_model_parallel_size) - all_data_parallel_group_ranks_with_cp.append(list(ranks_with_cp)) - if rank in ranks_with_cp: - data_parallel_group_with_cp = list(ranks_with_cp) - logging.info( - f'Rank {rank} has combined group of data parallel and context parallel : {data_parallel_group_with_cp}' - ) + for ranks in rank_generator.get_ranks('dp'): + if rank in ranks: + data_parallel_group = list(ranks) + logging.info(f'Rank {rank} has data parallel group : {data_parallel_group}') + + for ranks_with_cp in rank_generator.get_ranks('dp-cp'): + all_data_parallel_group_ranks_with_cp.append(ranks_with_cp) + if rank in ranks_with_cp: + data_parallel_group_with_cp = ranks_with_cp + logging.info( + f'Rank {rank} has combined group of data parallel and context parallel : {data_parallel_group_with_cp}' + ) data_parallel_rank = data_parallel_group.index(rank) logging.info( @@ -268,20 +278,11 @@ def fake_initialize_model_parallel( # Build the context-parallel groups. all_context_parallel_group_ranks = [] - for i in range(pipeline_model_parallel_size): - for j in range(data_parallel_size): - start_rank = ( - i * num_pipeline_model_parallel_groups + j * tensor_model_parallel_size * context_parallel_size - ) - end_rank = ( - i * num_pipeline_model_parallel_groups + (j + 1) * tensor_model_parallel_size * context_parallel_size - ) - for k in range(tensor_model_parallel_size): - ranks = range(start_rank + k, end_rank, tensor_model_parallel_size) - all_context_parallel_group_ranks.append(list(ranks)) - if rank in ranks: - context_parallel_group = list(ranks) - logging.info(f'Rank {rank} has context parallel group: {context_parallel_group}') + for ranks in rank_generator.get_ranks('cp'): + all_context_parallel_group_ranks.append(ranks) + if rank in ranks: + context_parallel_group = ranks + logging.info(f'Rank {rank} has context parallel group: {context_parallel_group}') context_parallel_rank = context_parallel_group.index(rank) logging.info(f'All context parallel group ranks: {all_context_parallel_group_ranks}') @@ -289,11 +290,7 @@ def fake_initialize_model_parallel( # Build the model-parallel groups. 
all_model_parallel_group_ranks = [] - for i in range(data_parallel_size * context_parallel_size): - ranks = [ - data_parallel_group_ranks_with_cp[i] - for data_parallel_group_ranks_with_cp in all_data_parallel_group_ranks_with_cp - ] + for ranks in rank_generator.get_ranks('tp-pp'): all_model_parallel_group_ranks.append(ranks) if rank in ranks: logging.info(f'Rank {rank} has model parallel group: {list(ranks)}') @@ -302,11 +299,10 @@ def fake_initialize_model_parallel( # Build the tensor model-parallel groups. all_tensor_model_parallel_group_ranks = [] tensor_model_parallel_group = None - for i in range(num_tensor_model_parallel_groups): - ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) - all_tensor_model_parallel_group_ranks.append(list(ranks)) + for ranks in rank_generator.get_ranks('tp'): + all_tensor_model_parallel_group_ranks.append(ranks) if rank in ranks: - tensor_model_parallel_group = list(ranks) + tensor_model_parallel_group = ranks logging.info(f'Rank {rank} has tensor model parallel group: {tensor_model_parallel_group}') tensor_model_parallel_rank = tensor_model_parallel_group.index(rank) @@ -317,17 +313,9 @@ def fake_initialize_model_parallel( # EP rank expert_model_parallel_rank = 0 if expert_model_parallel_size_ is not None and expert_model_parallel_size_ > 1: - tensor_and_data_group_size: int = tensor_model_parallel_size * data_parallel_size - num_tensor_and_data_groups: int = world_size // tensor_and_data_group_size - tensor_and_expert_group_size: int = tensor_model_parallel_size * expert_model_parallel_size_ - num_expert_groups: int = data_parallel_size // expert_model_parallel_size_ - for i in range(num_tensor_and_data_groups): - for j in range(num_expert_groups): - start_rank = i * tensor_and_data_group_size + j * tensor_and_expert_group_size - end_rank = i * tensor_and_data_group_size + (j + 1) * tensor_and_expert_group_size - ranks = range(start_rank, end_rank) - if rank in ranks: - expert_model_parallel_rank = list(ranks).index(rank) // tensor_model_parallel_size + for ranks in rank_generator.get_ranks('ep', independent_ep=True): + if rank in ranks: + expert_model_parallel_rank = list(ranks).index(rank) // tensor_model_parallel_size # Build the pipeline model-parallel groups and embedding groups # (first and last rank in each pipeline model-parallel group). 
@@ -336,11 +324,10 @@ def fake_initialize_model_parallel( pipeline_model_parallel_group = None embedding_group = None embedding_rank = None - for i in range(num_pipeline_model_parallel_groups): - ranks = range(i, world_size, num_pipeline_model_parallel_groups) - all_pipeline_model_parallel_group_ranks.append(list(ranks)) + for ranks in rank_generator.get_ranks('pp'): + all_pipeline_model_parallel_group_ranks.append(ranks) if rank in ranks: - pipeline_model_parallel_group = list(ranks) + pipeline_model_parallel_group = ranks logging.info(f'Rank {rank} has pipeline model parallel group: {pipeline_model_parallel_group}') # Setup embedding group (to exchange gradients between diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index d4a75e3353c7..983b76784a66 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -137,6 +137,7 @@ def init_model_parallel(sharp: bool, nccl_communicator_config_path: str = None) nccl_communicator_config_path=nccl_communicator_config_path, use_sharp=sharp, expert_model_parallel_size=app_state.expert_model_parallel_size, + order='tp-pp-dp' if app_state.use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp', ) # assert that fake tp and pp rank match after model parallel init diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 8ba9880219ec..34a03fc28871 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -55,6 +55,7 @@ def __init__(self): self._is_megatron_initialized = False self._data_parallel_size = None self._data_parallel_group = None + self._use_tp_pp_dp_mapping = False self._megatron_checkpoint_version = None self._use_fp8 = False self._context_parallel_size = None @@ -191,6 +192,14 @@ def pipeline_model_parallel_size(self, size): """ self._pipeline_model_parallel_size = size + @property + def use_tp_pp_dp_mapping(self): + return self._use_tp_pp_dp_mapping + + @use_tp_pp_dp_mapping.setter + def use_tp_pp_dp_mapping(self, use_new_mapping): + self._use_tp_pp_dp_mapping = use_new_mapping + @property def virtual_pipeline_model_parallel_size(self): """ Property returns the number of GPUs in each model parallel group. diff --git a/tests/collections/nlp/test_initialize.py b/tests/collections/nlp/test_initialize.py new file mode 100644 index 000000000000..b8e27573ce61 --- /dev/null +++ b/tests/collections/nlp/test_initialize.py @@ -0,0 +1,134 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel + + +def old_fake_initialize_model_parallel( + world_size, + rank, + tensor_model_parallel_size_, + pipeline_model_parallel_size_, + pipeline_model_parallel_split_rank_=None, + virtual_pipeline_model_parallel_size_=None, + expert_model_parallel_size_=1, + context_parallel_size_=1, +): + # Get world size and rank. Ensure some consistencies. 
+ tensor_model_parallel_size = min(tensor_model_parallel_size_, world_size) + pipeline_model_parallel_size = min(pipeline_model_parallel_size_, world_size) + model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size + context_parallel_size = min(context_parallel_size_, world_size) + + assert ( + world_size % (tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size) == 0 + ), f'world_size: {world_size} must be divisible by tensor_model_parallel_size: {tensor_model_parallel_size} times pipeline_model_parallel_size {pipeline_model_parallel_size} times context_parallel_size {context_parallel_size}' + data_parallel_size = world_size // ( + tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size + ) + + num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size + num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size + + virtual_pipeline_model_parallel_rank = None + if virtual_pipeline_model_parallel_size_ is not None: + virtual_pipeline_model_parallel_rank = 0 + + # Build the tensor model-parallel groups. + tensor_model_parallel_group = None + for i in range(num_tensor_model_parallel_groups): + ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) + if rank in ranks: + tensor_model_parallel_group = list(ranks) + + tensor_model_parallel_rank = tensor_model_parallel_group.index(rank) + + # EP rank + expert_model_parallel_rank = 0 + if expert_model_parallel_size_ is not None and expert_model_parallel_size_ > 1: + tensor_and_data_group_size: int = tensor_model_parallel_size * data_parallel_size + num_tensor_and_data_groups: int = world_size // tensor_and_data_group_size + tensor_and_expert_group_size: int = tensor_model_parallel_size * expert_model_parallel_size_ + num_expert_groups: int = data_parallel_size // expert_model_parallel_size_ + for i in range(num_tensor_and_data_groups): + for j in range(num_expert_groups): + start_rank = i * tensor_and_data_group_size + j * tensor_and_expert_group_size + end_rank = i * tensor_and_data_group_size + (j + 1) * tensor_and_expert_group_size + ranks = range(start_rank, end_rank) + if rank in ranks: + expert_model_parallel_rank = list(ranks).index(rank) // tensor_model_parallel_size + + # Build the pipeline model-parallel groups and embedding groups + # (first and last rank in each pipeline model-parallel group). 
+ pipeline_model_parallel_group = None + for i in range(num_pipeline_model_parallel_groups): + ranks = range(i, world_size, num_pipeline_model_parallel_groups) + if rank in ranks: + pipeline_model_parallel_group = list(ranks) + + pipeline_model_parallel_rank = pipeline_model_parallel_group.index(rank) + + return ( + tensor_model_parallel_rank, + pipeline_model_parallel_rank, + expert_model_parallel_rank, + model_parallel_size, + data_parallel_size, + pipeline_model_parallel_split_rank_, + virtual_pipeline_model_parallel_rank, + ) + + +@pytest.mark.parametrize( + 'nodes, num_gpu, tp, pp, cp, ep', + [ + (1, 1, 1, 1, 1, 1), + (4, 8, 2, 4, 1, 1), + (8, 8, 8, 8, 1, 1), + (16, 8, 4, 8, 1, 1), + (16, 8, 4, 8, 4, 1), + (32, 8, 8, 8, 1, 1), + (32, 8, 4, 8, 1, 4), + (32, 8, 8, 8, 4, 1), + ], +) +def test_fake_initialize(nodes, num_gpu, tp, pp, cp, ep): + ( + tensor_model_parallel_rank, + pipeline_model_parallel_rank, + expert_model_parallel_rank, + model_parallel_size, + data_parallel_size, + pipeline_model_parallel_split_rank, + virtual_pipeline_model_parallel_rank, + ) = old_fake_initialize_model_parallel(nodes * num_gpu, 0, tp, pp, None, None, ep, cp) + + ( + m_tensor_model_parallel_rank, + n_pipeline_model_parallel_rank, + n_expert_model_parallel_rank, + n_model_parallel_size, + n_data_parallel_size, + n_pipeline_model_parallel_split_rank, + n_virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel(nodes * num_gpu, 0, tp, pp, None, None, ep, cp) + assert m_tensor_model_parallel_rank == tensor_model_parallel_rank + assert n_pipeline_model_parallel_rank == pipeline_model_parallel_rank + assert n_expert_model_parallel_rank == expert_model_parallel_rank + assert n_model_parallel_size == model_parallel_size + assert n_data_parallel_size == data_parallel_size + assert n_pipeline_model_parallel_split_rank == pipeline_model_parallel_split_rank + assert n_virtual_pipeline_model_parallel_rank == virtual_pipeline_model_parallel_rank From cb22d71e335bc25bfe09947c64a2223550fc65ae Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Fri, 12 Apr 2024 20:03:36 -0600 Subject: [PATCH 30/39] update package info (#8793) Signed-off-by: eharper --- Dockerfile | 2 +- nemo/package_info.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 970c34a690d4..fa825d61f015 100644 --- a/Dockerfile +++ b/Dockerfile @@ -141,7 +141,7 @@ COPY . . # start building the final container FROM nemo-deps as nemo -ARG NEMO_VERSION=1.23.0 +ARG NEMO_VERSION=2.0.0 # Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container # version information as runtime environment variable for introspection purposes diff --git a/nemo/package_info.py b/nemo/package_info.py index e0ff2247e6ad..b253927a6b38 100644 --- a/nemo/package_info.py +++ b/nemo/package_info.py @@ -13,8 +13,8 @@ # limitations under the License. 
-MAJOR = 1 -MINOR = 23 +MAJOR = 2 +MINOR = 0 PATCH = 0 PRE_RELEASE = 'rc0' From 378a9b3d9845a02eacc392e267f2e66dc62f151f Mon Sep 17 00:00:00 2001 From: Rachit Garg Date: Sat, 13 Apr 2024 10:05:18 -0700 Subject: [PATCH 31/39] Rachitg/dpa (#8911) * remove fp8 checkpoints for Attention Signed-off-by: rachitg * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: rachitg * set default value and support mha Signed-off-by: rachitg * skip by default Signed-off-by: rachitg --------- Signed-off-by: rachitg Co-authored-by: rachitg Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index ede72439615e..4493532f88bf 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1741,7 +1741,7 @@ def skip_fp8_load(x): x = LocalNonpersitentObject(x.data) # use the FP8 state from initialization, not from ckpt return x - if self.cfg.get('fp8_dot_product_attention', False) or self.cfg.get('fp8_multi_head_attention', False): + if self.cfg.get('skip_fp8_attention_checkpoint_load', True): dict_list_map_inplace(skip_fp8_load, sharded_state_dict) return sharded_state_dict From de983ff6eb164944197c0e96807c3ee74119057c Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 13 Apr 2024 15:28:26 -0700 Subject: [PATCH 32/39] update mcore (#8917) --- .github/workflows/cicd-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 29ea34dba197..c4350a42f59b 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -114,7 +114,7 @@ jobs: # Megatron Core installation git clone https://github.com/NVIDIA/Megatron-LM.git && \ pushd Megatron-LM && \ - git checkout f3a3020031f384ddafd9b7e9f3a587798c0aea21 && \ + git checkout fbb375d4b5e88ce52f5f7125053068caff47f93f && \ pip install . 
&& \ pushd megatron/core/datasets && \ make && \ From dca6f7427b2c1c19a28d1023dcc5a1d789f523ea Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 15 Apr 2024 09:30:43 -0700 Subject: [PATCH 33/39] Remove precision args in trainer due to PTL update (#8908) * Fix precision args in trainer due to PTL update Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * roll back one change Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Pablo Garay --- .../multimodal_llm/neva/convert_hf_llava_to_neva.py | 2 +- .../megatron_change_num_partitions.py | 10 +++------- .../convert_baichuan2_hf_to_nemo.py | 2 +- .../convert_chatglm_hf_to_nemo.py | 2 +- .../convert_mistral_7b_hf_to_nemo.py | 2 +- .../convert_mixtral_hf_to_nemo.py | 2 +- .../convert_starcoder2_hf_to_nemo.py | 2 +- 7 files changed, 9 insertions(+), 13 deletions(-) diff --git a/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py b/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py index c9263ea85bbf..2cbb4c2b3b82 100644 --- a/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py +++ b/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py @@ -205,7 +205,7 @@ def convert(args): nemo_config.precision = precision print(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = hf_config["hidden_size"] head_num = hf_config["num_attention_heads"] diff --git a/examples/nlp/language_modeling/megatron_change_num_partitions.py b/examples/nlp/language_modeling/megatron_change_num_partitions.py index 436661e01b5d..c035346e3bf1 100644 --- a/examples/nlp/language_modeling/megatron_change_num_partitions.py +++ b/examples/nlp/language_modeling/megatron_change_num_partitions.py @@ -938,7 +938,7 @@ def main(): # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both # precision plugins and precision to exist precision = None - trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision) + trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu") if tp_size < 0 or pp_size < 0: logging.info(f"Loading model config from {args.model_file} to get TP and PP size") @@ -1205,9 +1205,7 @@ def main(): if vp_size > 1: set_virtual_parallel_rank_safely(None) - trainer = Trainer( - plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision - ) + trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu") with open_dict(model.cfg): if args.tokenizer_model_path is not None: @@ -1413,9 +1411,7 @@ def main(): app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size ) - trainer = Trainer( - plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision - ) + trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu") if args.tokenizer_model_path is not None: with open_dict(model.cfg): model.cfg.tokenizer.model = args.tokenizer_model_path diff --git 
a/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py b/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py index 585741de9b9a..b87f7e028cdb 100644 --- a/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py @@ -175,7 +175,7 @@ def convert(args): nemo_config.precision = precision print(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = hf_config["hidden_size"] head_num = hf_config["num_attention_heads"] diff --git a/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py b/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py index c3f210deefac..363e4de09ef7 100644 --- a/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py @@ -142,7 +142,7 @@ def convert(args): nemo_config.precision = precision - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = hf_config["hidden_size"] head_num = hf_config["num_attention_heads"] diff --git a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py index db0fe28cbf73..cb11bb5da564 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py @@ -193,7 +193,7 @@ def convert(args): nemo_config.precision = precision logging.info(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = nemo_config.hidden_size head_num = nemo_config.num_attention_heads diff --git a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py index d8ad9d5030b8..ac323757a2f6 100644 --- a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py @@ -194,7 +194,7 @@ def convert(args): nemo_config.precision = precision print(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = nemo_config.hidden_size head_num = nemo_config.num_attention_heads diff --git a/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py b/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py index eccca3a04621..fc898c797a9e 100644 --- a/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py @@ -194,7 +194,7 @@ def convert(args): nemo_config.precision = precision logging.info(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = nemo_config.hidden_size head_num = nemo_config.num_attention_heads From e9d826657d2f10f7e632f091996908f85e64fa2e Mon Sep 17 00:00:00 2001 From: yaoyu-33 
<54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 15 Apr 2024 09:30:55 -0700 Subject: [PATCH 34/39] Fix module.training for neva in FusedAttn backward (#8877) Signed-off-by: yaoyu-33 Co-authored-by: Pablo Garay --- nemo/collections/multimodal/data/neva/neva_dataset.py | 2 +- .../multimodal/models/multimodal_llm/neva/neva_model.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 15d755a7d59a..71d9bda12de1 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -782,7 +782,7 @@ class DataCollatorForSupervisedDataset(object): def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: max_len = max(instance['tokens'].shape[0] for instance in instances) - max_len = (max_len - 1) // 4 * 4 + 4 + max_len = (max_len - 1) // 64 * 64 + 64 for instance in instances: pad_len = max_len - instance['tokens'].shape[0] instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0) diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 4556ba1b3e72..cff8ab1a7b5f 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -387,9 +387,6 @@ def __init__( def freeze_llm(self, mm_cfg): for param in chain(self.embedding.parameters(), self.decoder.parameters(), self.output_layer.parameters(),): param.requires_grad = False - self.embedding = self.embedding.eval() - self.decoder = self.decoder.eval() - self.output_layer = self.output_layer.eval() def forward( self, *args, **kwargs, From 32e630220d2c24550a869f43ce618949ccf1a1a5 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Tue, 16 Apr 2024 01:17:16 -0400 Subject: [PATCH 35/39] Updates for TRT-LLM 0.9 (#8873) * upgrade to trtllm0.9 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update gpt to config based export Signed-off-by: Onur Yilmaz * fix for lora checkpoint * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix for in flight batching case * Update falcon for trt-llm 0.9 Signed-off-by: Onur Yilmaz * Removed unused import and comment Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz Co-authored-by: abharwani Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/export/trt_llm/decoder/falcon.py | 6 +-- nemo/export/trt_llm/decoder/gpt.py | 46 +++++++++++++---------- nemo/export/trt_llm/decoder/llama.py | 6 +-- nemo/export/trt_llm/tensorrt_llm_build.py | 4 ++ nemo/export/trt_llm/tensorrt_llm_model.py | 18 +++------ nemo/export/trt_llm/tensorrt_llm_run.py | 5 ++- 6 files changed, 44 insertions(+), 41 deletions(-) diff --git a/nemo/export/trt_llm/decoder/falcon.py b/nemo/export/trt_llm/decoder/falcon.py index b0e69d2b99c4..91edc7794607 100644 --- a/nemo/export/trt_llm/decoder/falcon.py +++ b/nemo/export/trt_llm/decoder/falcon.py @@ -17,8 +17,7 @@ from tensorrt_llm.functional import non_gated_version from tensorrt_llm.models.falcon.model import FalconDecoderLayer -from tensorrt_llm.models.modeling_utils import 
PretrainedConfig -from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig from typing_extensions import override from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder @@ -119,8 +118,7 @@ def build_decoder(self, layer): world_size=self.tensor_parallel, tp_size=self.tensor_parallel, pp_size=1, - quant_mode=QuantMode(0), - quant_kwargs=None, + quantization=QuantConfig(), max_lora_rank=layer.max_lora_rank, use_parallel_embedding=False, ) diff --git a/nemo/export/trt_llm/decoder/gpt.py b/nemo/export/trt_llm/decoder/gpt.py index 294ccb737c1f..8af4e4ef01e4 100644 --- a/nemo/export/trt_llm/decoder/gpt.py +++ b/nemo/export/trt_llm/decoder/gpt.py @@ -17,6 +17,7 @@ from tensorrt_llm.layers import AttentionMaskType, PositionEmbeddingType from tensorrt_llm.models.gpt.model import GPTDecoderLayer +from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig from typing_extensions import override from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder @@ -85,13 +86,10 @@ class GPTDecoderLayerBuilder(DecoderLayerBuilder): @override def build_decoder(self, layer): rotary_pct = layer.rotary_pct - position_embedding_type = ( - PositionEmbeddingType.rope_gpt_neox - if layer.position_embedding_type == "rope" - else PositionEmbeddingType.learned_absolute - ) - assert not (position_embedding_type == PositionEmbeddingType.rope_gpt_neox and rotary_pct == 0.0) + position_embedding_type = "rope_gpt_neox" if layer.position_embedding_type == "rope" else "learned_absolute" + + assert not (position_embedding_type == "rope_gpt_neox" and rotary_pct == 0.0) bias_qkv = layer.attention.qkv.bias is not None @@ -99,23 +97,33 @@ def build_decoder(self, layer): if layer.rotary_scaling is not None: rotary_scaling = {"type": "linear", "factor": float(layer.rotary_scaling)} - return GPTDecoderLayer( + config = PretrainedConfig( + architecture=None, + dtype=self.dtype, + logits_dtype=self.dtype, + vocab_size=layer.vocab_size, + max_position_embeddings=self.max_position_embeddings, hidden_size=self.hidden_size, + num_hidden_layers=self.num_layers, num_attention_heads=self.num_attention_heads, - max_position_embeddings=self.max_position_embeddings, - num_layers=self.num_layers, - dtype=self.dtype, - apply_query_key_layer_scaling=False, - attention_mask_type=AttentionMaskType.causal, + num_key_value_heads=self.num_kv_heads, hidden_act=self.hidden_act, + intermediate_size=layer.ffn_hidden_size_local * self.tensor_parallel, + norm_epsilon=layer.norm_epsilon, position_embedding_type=position_embedding_type, - rotary_embedding_percentage=rotary_pct, - rotary_base=layer.rotary_base, - rotary_scaling=rotary_scaling, - inter_size=layer.ffn_hidden_size_local * self.tensor_parallel, - bias=bias_qkv, - num_kv_heads=self.num_kv_heads, - tp_group=self.tp_group, + world_size=self.tensor_parallel, tp_size=self.tensor_parallel, + pp_size=1, max_lora_rank=layer.max_lora_rank, + quantization=QuantConfig(), ) + + config.set_if_not_exist('hidden_act', self.hidden_act) + config.set_if_not_exist('apply_query_key_layer_scaling', False) + config.set_if_not_exist('bias', bias_qkv) + config.set_if_not_exist('rotary_base', layer.rotary_base) + config.set_if_not_exist('rotary_scaling', rotary_scaling) + config.set_if_not_exist('rotary_pct', rotary_pct) + config.set_if_not_exist('moe_num_experts', 0) + + return GPTDecoderLayer(config=config, layer_idx=self.layer_id,) diff --git 
a/nemo/export/trt_llm/decoder/llama.py b/nemo/export/trt_llm/decoder/llama.py index e554e18608f7..873c0306375b 100644 --- a/nemo/export/trt_llm/decoder/llama.py +++ b/nemo/export/trt_llm/decoder/llama.py @@ -18,8 +18,7 @@ from tensorrt_llm.functional import non_gated_version from tensorrt_llm.layers import MoeConfig from tensorrt_llm.models.llama.model import LLaMADecoderLayer -from tensorrt_llm.models.modeling_utils import PretrainedConfig -from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig from typing_extensions import override from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder @@ -118,9 +117,8 @@ def build_decoder(self, layer): world_size=self.tensor_parallel, tp_size=self.tensor_parallel, pp_size=1, - quant_mode=QuantMode(0), - quant_kwargs=None, max_lora_rank=layer.max_lora_rank, + quantization=QuantConfig(), ) config.set_if_not_exist('mlp_bias', False) diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 0941a6d3dbba..3ad27a2eb9a6 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -27,6 +27,7 @@ from tensorrt_llm._utils import np_dtype_to_trt from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger +from tensorrt_llm.models.modeling_utils import add_lora from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -170,6 +171,9 @@ def _build_impl(tensorrt_llm_model, args): timing_cache_file = args.timing_cache if args.timing_cache else args.output_dir / "model.cache" timing_cache = timing_cache_file + if args.use_lora_plugin is not None: + add_lora(tensorrt_llm_model, args.max_lora_rank) + builder = Builder() apply_query_key_layer_scaling = False diff --git a/nemo/export/trt_llm/tensorrt_llm_model.py b/nemo/export/trt_llm/tensorrt_llm_model.py index b2da7855ccdc..52e9c4960fc9 100644 --- a/nemo/export/trt_llm/tensorrt_llm_model.py +++ b/nemo/export/trt_llm/tensorrt_llm_model.py @@ -144,15 +144,7 @@ def forward( if attention_mask is not None: attention_mask = expand_mask(attention_mask, shape(input_ids, -1)) - for layer_idx, (layer, past, pointer, host_pointer, max_attention_window_size) in enumerate( - zip( - self.layers, - kv_cache_params.past_key_value, - kv_cache_params.kv_cache_block_pointers, - kv_cache_params.host_kv_cache_block_pointers, - kv_cache_params.host_max_attention_window_sizes, - ) - ): + for layer_idx, (layer, past) in enumerate(zip(self.layers, kv_cache_params.past_key_value,)): decoder_params = { "hidden_states": hidden_states, @@ -161,8 +153,8 @@ def forward( "kv_cache_params": KeyValueCacheParams( past_key_value=[past], host_past_key_value_lengths=kv_cache_params.host_past_key_value_lengths, - kv_cache_block_pointers=[pointer], - host_max_attention_window_sizes=max_attention_window_size, + kv_cache_block_pointers=kv_cache_params.kv_cache_block_pointers, + host_max_attention_window_sizes=kv_cache_params.host_max_attention_window_sizes, cache_indirection=kv_cache_params.cache_indirection, host_sink_token_length=kv_cache_params.host_sink_token_length, host_kv_cache_block_pointers=kv_cache_params.host_kv_cache_block_pointers, @@ -329,8 +321,8 @@ def prepare_inputs( past_key_value=model_inputs['past_key_value'], host_past_key_value_lengths=model_inputs['host_past_key_value_lengths'], 
host_max_attention_window_sizes=model_inputs['host_max_attention_window_sizes'], - kv_cache_block_pointers=model_inputs['kv_cache_block_pointers_list'], - host_kv_cache_block_pointers=model_inputs['host_kv_cache_block_pointers_list'], + kv_cache_block_pointers=model_inputs['kv_cache_block_pointers'], + host_kv_cache_block_pointers=model_inputs['host_kv_cache_block_pointers'], cache_indirection=model_inputs['cache_indirection'], host_sink_token_length=model_inputs['host_sink_token_length'], ), diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index cdc0b78d6c18..1e24f4f207a4 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -24,12 +24,14 @@ import torch from mpi4py.futures import MPIPoolExecutor from tensorrt_llm.logger import logger +from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import LoraManager, ModelConfig, SamplingConfig +from tensorrt_llm.runtime import ModelConfig, SamplingConfig from transformers import PreTrainedTokenizer from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group from nemo.export.trt_llm.tensorrt_llm_model import LMHeadModelBuilder + from nemo.export.trt_llm.tensorrt_llm_build import get_engine_name, MODEL_NAME, refit_runtime_engine # isort:skip from nemo.export.trt_llm.nemo_utils import to_word_list_format # isort:skip @@ -90,6 +92,7 @@ def _read_config(config_path: Path): model_config = ModelConfig( model_name=config["builder_config"]["name"], max_batch_size=config["builder_config"]["max_batch_size"], + max_beam_width=config["builder_config"]["max_beam_width"], vocab_size=config["builder_config"]["vocab_size"], num_layers=config["builder_config"]["num_layers"], num_heads=num_heads, From c6c45c41ecf4ce7d115ec66d50bb3acc763ff4b0 Mon Sep 17 00:00:00 2001 From: huvunvidia <86480512+huvunvidia@users.noreply.github.com> Date: Tue, 16 Apr 2024 08:26:25 -0400 Subject: [PATCH 36/39] Huvu/mcore retro (#8861) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update branch Signed-off-by: eharper * Add dist ckpt support for regular optimizers (#7749) * Add dist ckpt support for regular optimizers Signed-off-by: Mikołaj Błaż * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * fix imports Signed-off-by: dimapihtar * imports fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ci imports fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert asr notebook Signed-off-by: dimapihtar * revert asr notebook Signed-off-by: dimapihtar --------- Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Pin lhotse=1.19.2 in r1.23.0 (#8303) Signed-off-by: Piotr Żelasko * Cache Aware Streaming tutorial notebook (#8296) * add notebook Signed-off-by: Elena Rastorgueva * rename old notebook to Buffered_Streaming Signed-off-by: Elena Rastorgueva * call setup_streaming_params in set_default_att_context_size method Signed-off-by: Elena Rastorgueva * update links in docs Signed-off-by: Elena Rastorgueva * update links to tutorials in docs Signed-off-by: Elena Rastorgueva * remove hard-coding Signed-off-by: Elena Rastorgueva * rename var Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva * fix path location and branch (#8304) * fix path location and branch Signed-off-by: Nithin Rao Koluguri * change to a floating point number Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri Co-authored-by: Somshubra Majumdar * add deallocate pipeline output optimization (#8279) * add deallocate pipeline output optimization Signed-off-by: Jimmy Zhang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jimmy Zhang Co-authored-by: Jimmy Zhang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fix memory leak caused by context parallelism hanging references by omegaconf (#8299) * save cp_size to self Signed-off-by: Jimmy Zhang * use parallel_state instead of self Signed-off-by: Jimmy Zhang --------- Signed-off-by: Jimmy Zhang Co-authored-by: Jimmy Zhang Co-authored-by: Eric Harper * remove assertion (#8302) Signed-off-by: dimapihtar * Update PEFT Doc (#8262) * update peft doc Signed-off-by: Chen Cui * remove old prompt learning doc and notebook Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * Merge branch 'r1.23.0' into chcui/update_peft_doc Signed-off-by: Chen Cui * revert accidental changes Signed-off-by: Chen Cui * revert accidental changes Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui * Attention encoder-decoder models for multiple speech-to-text tasks (#8242) (#8324) * Rebasing canary changes at current main Signed-off-by: Piotr Żelasko * Move the changes from asr transformer to nlp transformer as originally intended Signed-off-by: Piotr Żelasko * update eval to strip spaces before punctuations Signed-off-by: stevehuang52 * update pc strip Signed-off-by: stevehuang52 * [canary] Refactor: `PromptedAudioToTextLhotseDataset` and `EncDecMultiTaskModel` (#8247) * Create a separate 
CanaryDataset and use it inside `transformer_bpe_models.py`. Ditches `token_sequence_format`. Signed-off-by: Piotr Żelasko * [canary] Refactor: move changes in transformer_bpe_models.py to Canar… (#8252) * [canary] Refactor: move changes in transformer_bpe_models.py to CanaryModel Signed-off-by: Piotr Żelasko * Rename `CanaryModel` to `EncDecMultiTaskModel` and remove inheritance from `EncDecTransfModelBPE`; add a separate config for this model Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * Rename `CanaryDataset` to `PromptedAudioToTextLhotseDataset`; add `prompt_format_fn` argument; clean-up the `_canary_prompt_format` function a bit Signed-off-by: Piotr Żelasko * Move tokenization into `prompt_format_fn`, fix usage, add docs Signed-off-by: Piotr Żelasko * Backward-compatible utterance validation Signed-off-by: Piotr Żelasko * Improve type annotations Signed-off-by: Piotr Żelasko * config and prompt_fn registration changes from review Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * fix transcribe config Signed-off-by: stevehuang52 * Refactor Canary to follow schema of remaining ASR models (#8260) * Initial draft of multi task beam decoding strategy Signed-off-by: smajumdar * Stabilize inference Signed-off-by: smajumdar * Update AED Multi Task model to mostly conform to Archetype-Type format. Update config Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add change decoding strategy Signed-off-by: smajumdar * Remove redundant imports Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Cleanup Signed-off-by: smajumdar * Cleanup Signed-off-by: smajumdar * remove asr transformer dependency on nlp Signed-off-by: stevehuang52 * clean up Signed-off-by: stevehuang52 * copy token_classifier from nlp to asr Signed-off-by: stevehuang52 * Address comments Signed-off-by: smajumdar * Add typing to beam decoding Signed-off-by: smajumdar * Make prompt format configurable Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * drop asr dependency on nlp Signed-off-by: stevehuang52 --------- Signed-off-by: smajumdar Signed-off-by: stevehuang52 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: stevehuang52 * fix transcribe, update asr evaluator Signed-off-by: stevehuang52 * Extend the docs for the canary prompt_fn Signed-off-by: Piotr Żelasko * Incorporate changes from Nithin's code review Signed-off-by: Piotr Żelasko * training bug fix and adding launch script for speech_multitask (#8270) * bug fix and adding launch script for speech_multitask Signed-off-by: Krishna Puvvada * update launch script example in speech_to_text_aed.py Signed-off-by: Krishna Puvvada --------- Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada * Fix: drop_last must be true in validation/test otherwise the training will hang Signed-off-by: Piotr Żelasko * revert to current transcribe API Signed-off-by: stevehuang52 * revert changes to NLP, update docs Signed-off-by: stevehuang52 * update eval utils Signed-off-by: stevehuang52 * update docs Signed-off-by: stevehuang52 * Remove DALI; rename compute_audio_loss to compute_loss Signed-off-by: Piotr Żelasko * set default use_model_transcribe=False Signed-off-by: stevehuang52 * change os.path.dirname to pathlib Signed-off-by: stevehuang52 * [canary] 
Test for CanaryTokenizer + refactoring (#8285) * Test for CanaryTokenizer Signed-off-by: Piotr Żelasko * Attempt at refactor... Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * Update config for AED models (#8294) Signed-off-by: smajumdar * set default calculate_wer=False in transcribe_speech.py Signed-off-by: stevehuang52 * Attention encoder-decoder models for multiple speech-to-text tasks Signed-off-by: Piotr Żelasko * Apply suggestions from code review, part 1 Co-authored-by: Nithin Rao Signed-off-by: Piotr Żelasko * Apply suggestions from code review, part 2 Signed-off-by: Piotr Żelasko * Document compute_loss Signed-off-by: Piotr Żelasko * update transcribe_speech.py Signed-off-by: stevehuang52 * add docstring Signed-off-by: stevehuang52 * Attention encoder-decoder models for multiple speech-to-text tasks Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Signed-off-by: stevehuang52 Signed-off-by: smajumdar Signed-off-by: Krishna Puvvada Signed-off-by: Piotr Żelasko Co-authored-by: stevehuang52 Co-authored-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Co-authored-by: Krishna Puvvada Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Nithin Rao (cherry picked from commit d10726da72f74eb5a95056843d1f9e2562a5051c) Co-authored-by: Piotr Żelasko * add code for calling mcore_retro in NeMo * add code for calling mcore_retro in NeMo * runnable, training curve match retro mcore and nemo * working on retro inference * working on megatron_retro_eval.py and megatron_retro_inference.yaml * refactoring text_generation_utils code and retro inference relevant files * clean PR * resolving quick hacks (reading number of train/valid samples from workdir, discrepancy in total samples and samples with neighbors retrieved, tokenizers) * clean repository * revert changes to inference/eval code to original in main * clean code * runable training code, with already implemented eval code * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * Add Bert HF checkpoint converter (#8088) * Add Bert HF checkpoint converter Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reformat Signed-off-by: yaoyu-33 * Add BERT ONNX export * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add NeMo BERT to HF BERT script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clean code Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update argument names Signed-off-by: yaoyu-33 * Update build_transformer_config in Bert Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen * revert to original eval code files * revert to original eval code files 2 * revert to original eval code files 3 * revert to original eval code files 4 * clean code * clean code * update my code to support changes from lastest main * commit before rebase r1.23.0 * Multimodal r1.23.0 bug fix (#8315) * Rename quick-gelu Signed-off-by: yaoyu-33 * ddpm config guard Signed-off-by: yaoyu-33 * Fix ddpm edit api Signed-off-by: yaoyu-33 * Fix insert_image_token cfg issue Signed-off-by: yaoyu-33 * neva updates Signed-off-by: yaoyu-33 * reformat Signed-off-by: yaoyu-33 * Add back jenkins Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix jenkins Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix bugs Signed-off-by: yaoyu-33 * Update default neva template Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * copy paste files from r1.23.0 * clean PR * Fixes for MoE parameter passing & use of AutoTokenizer/Model for mistral. 
(#8272) Signed-off-by: Alexandros Koumparoulis * Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP>1 (#8334) Signed-off-by: Sangkug Lym Co-authored-by: Eric Harper * Remove asr webapp (#8347) Signed-off-by: smajumdar * remove _target_ at model level in aed config (#8351) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada * revert changes for tts and asr * Add change_vocabulary and save_tokenizers() support to Multitask ASR models (#8357) * Add change_vocabulary and save_tokenizers() support Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update nemo/collections/asr/models/aed_multitask_models.py Co-authored-by: Piotr Żelasko Signed-off-by: Somshubra Majumdar --------- Signed-off-by: smajumdar Signed-off-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko * Change default (#8371) Signed-off-by: smajumdar * implement retro's own fwd_bwd_step() and validation_step() to not have argument first_val_step, which the MLM commit doesn't support * adding megatron compile_helpers(), in future can be fixed with correct MLM commit * bug fix in fast-conformer-aed.yaml and adding jenkins test for speech_to_text_aed model (#8368) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada Co-authored-by: Somshubra Majumdar * Enable megatron core loggers for GPT pretraining (#8354) * Logging changes tested for gpt_pretraining Signed-off-by: Aishwarya Bhandare * Additional args Signed-off-by: Aishwarya Bhandare * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Aishwarya Bhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * mcore ds fix (#8283) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * mcore ds fix Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore Signed-off-by: dimapihtar * revert asr files Signed-off-by: dimapihtar * add comments Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * fix Bert unit tests Signed-off-by: dimapihtar * update bert tests Signed-off-by: dimapihtar * fix bert mcore test Signed-off-by: dimapihtar * fix gpt jenkins tests Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update apex & TE commits Signed-off-by: dimapihtar * revert apex installation Signed-off-by: dimapihtar * turn off the fusion for jenkins Signed-off-by: dimapihtar --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay * addressing Eric's reviews * adding existing implementation RETRO files * adding existing implementation RETRO files * Add Finetuning tutorial with HF Datasets (#8356) * Add Finetuning tutorial with HF Datasets Signed-off-by: Nithin Rao Koluguri * update on Som comments Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * release updates (#8378) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * mcore ds fix Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore Signed-off-by: dimapihtar * revert asr files Signed-off-by: dimapihtar * add comments Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * fix Bert unit tests Signed-off-by: dimapihtar * update bert tests Signed-off-by: dimapihtar * fix bert mcore test Signed-off-by: dimapihtar * fix gpt jenkins tests Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for dict data input type Signed-off-by: dimapihtar * add mock ds test Signed-off-by: dimapihtar * add test for dict data input type Signed-off-by: dimapihtar * mcore ds fix Signed-off-by: dimapihtar * data input fix Signed-off-by: dimapihtar --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay * MCore dataset compatibility for tokenizers (#8390) * Add unique_identifiers for all tokenizers and eod for SentencePieceTokenizer Signed-off-by: Valerie Sarge * Add generalized token aliases to TokenizerSpec to conform with MegatronTokenizer's interface. Remove now-redundant individual fixes from AutoTokenizer and SentencePieceTokenizer. Signed-off-by: Valerie Sarge --------- Signed-off-by: Valerie Sarge Co-authored-by: Pablo Garay * Mcore customization doc (#8298) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * Add Bert HF checkpoint converter (#8088) * Add Bert HF checkpoint converter Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reformat Signed-off-by: yaoyu-33 * Add BERT ONNX export * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add NeMo BERT to HF BERT script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clean code Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update argument names Signed-off-by: yaoyu-33 * Update build_transformer_config in Bert Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen * initial placeholder Signed-off-by: Huiying Li * add to intro/index.rst Signed-off-by: Huiying Li * initial content update Signed-off-by: Huiying Li * add diff images Signed-off-by: Huiying Li size Signed-off-by: Huiying Li * minor fixes * minor language change Signed-off-by: Chen Cui * clean changes --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: yaoyu-33 Signed-off-by: Huiying Li Signed-off-by: Huiying Li Signed-off-by: Chen Cui Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen Co-authored-by: Huiying Li Co-authored-by: Chen Cui * wer fix (#8404) Signed-off-by: Travis Bartley * updated link to pubmed (#8402) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * Update NFA video download link (#8406) * update nfa nasa video link Signed-off-by: Elena Rastorgueva * update link in markdown Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva * revert changes (#8410) Signed-off-by: Chen Cui * Fix dreambooth data sampler issue (#8400) * Turn on drop last Signed-off-by: yaoyu-33 * Some neva fixes Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fixed errors in the CTM gen functions (#8416) Signed-off-by: Taejin Park * add ensemble decoding fix (#8427) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * SDE bugfix log (#8430) Signed-off-by: George * mcore customization doc minor fix (#8421) Signed-off-by: Huiying Li * NeMo-Mistral to HF converter bugfix. 
(#8353) Signed-off-by: Alexandros Koumparoulis * Fixing mcore bert for TP, PP and SP (#8336) * Fixing mcore bert for TP, PP and SP * Fixing mcore bert for TP, PP and SP * Fixing mcore version * Fixing mcore version * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> --------- Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Eric Harper * Add settings to suppress bf16 compile errors in CI on V100 (#8481) * Add settings to suppress bf16 compile errors in CI on V100 Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Abhishree Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * MoE parameter passing (#8255) * MoE parameter passing Signed-off-by: Alexandros Koumparoulis * Pass EP/MoE params in consumer scripts. Signed-off-by: Alexandros Koumparoulis * PR fixes Signed-off-by: Alexandros Koumparoulis * Use latest commit of mcore-0.5 Signed-off-by: Alexandros Koumparoulis * CI fix Signed-off-by: Alexandros Koumparoulis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Alexandros Koumparoulis Co-authored-by: Alexandros Koumparoulis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update k2 version (#8478) (#8492) Signed-off-by: Vladimir Bataev * Add fp8 support for SD/Update notebook paths (#8489) * Add fp8 support for SD/Update notebook paths Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * pin to 0.5.0 (#8465) Signed-off-by: eharper * Update NeMo Multimodal Requirements (#8515) * Update requirements_multimodal.txt Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update github raw content link (#8517) Signed-off-by: Chen Cui * Add dep notice for notebooks (#8522) * add dep notice Signed-off-by: eharper * revert Signed-off-by: eharper --------- Signed-off-by: eharper * Revert FP8 integration (#8520) * Revert FP8 integration Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update data prep notebook (#8532) Signed-off-by: Mingyuan Ma * before update branch with latest r1.23.0 * update to run with MLM ae2817b3dde4efb1515061a5311d01d8f85bd99c (runnable training and saving checkpoint) * remove compile_helpers * reverse changes from main branch to r1.23.0 * adding *_legacy files * update MLM commit in Jenkinsfile to latest * debugging Jenkinstest: test different mcore import in retro_dataset * update Jenkinsfile edit 
megatron_retro_mutransfer_pretrain_legacy.py * removing all mcore RETRO to pass the Jenkinstest * fixing import legacy problem for tests/collections/nlp/test_indexed_retrieval_dataset.py * update Jenkinsfile file to use TE v0.7 * update NeMo to work with latest mcore RETRO (solving TE problems) * update TE commit Jenkinsfile to be the same with r1.23.0's Jenkinsfile * update commit for MLM * jenkinstest debugging * temporary fix RETRO's __init__ for jenkinstest * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * add model.data.dataloader_type=cyclic to jenkinsfile * update code to work with latest megatron-lm main 81dab6067 * update M-LM commit in Jenkinsfile to latest main M-LM 81dab6067 * fix to by pass CI test bf16 problem (following this PR https://github.com/NVIDIA/NeMo/pull/8481/files) * isort and black * adjusting model.micro_batch_size to 1 * fix BRANCH = 'r1.23.0' * replace tutorials dir from main branch to huvu/mcore_retro * fix minor merges conflict * update Jenkinsfile * runnable with a temporary fix from Jacek (unfound -unfinished problem) * runnable with a temporary fix from Jacek (unfound -unfinished problem) * modified nlp_overrides.py back to original * fix checkpoint from Jacek Bieniusiewicz * config Jenkinsfile test * set RETRO Jenkins MBS to 1 * black fix * isort fix * update TE commit * update to latest Jenkinsfile with latest container and commits * remove new RETRO jenkinstest * merge latest main * put RETRO Jenkinstest to the right place * update code for megatron_retro_pretraining_legacy.py * untrack ipa_cmudict-0.7b_nv23.01.txt * untrack ipa_cmudict-0.7b_nv23.01.txt * set config in megatron_retro_pretraining_legacy.py to megatron_retro_config_legacy * update new RETRO jenkinstest to run faster * merging latest main, and edit Jenkinstest * update Jenkinstest for new RETRO to run faster * fix isort * fix whitespace Signed-off-by: eharper --------- Signed-off-by: eharper Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Signed-off-by: Piotr Żelasko Signed-off-by: Elena Rastorgueva Signed-off-by: Nithin Rao Koluguri Signed-off-by: Jimmy Zhang Signed-off-by: Chen Cui Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: yaoyu-33 Signed-off-by: Alexandros Koumparoulis Signed-off-by: Sangkug Lym Signed-off-by: smajumdar Signed-off-by: Krishna Puvvada Signed-off-by: Somshubra Majumdar Signed-off-by: Aishwarya Bhandare Signed-off-by: Dmytro Pykhtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Signed-off-by: Valerie Sarge Signed-off-by: Huiying Li Signed-off-by: Huiying Li Signed-off-by: Travis Bartley Signed-off-by: Taejin Park Signed-off-by: George Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Signed-off-by: Abhishree Signed-off-by: Vladimir Bataev Signed-off-by: Mingyuan Ma Co-authored-by: eharper Co-authored-by: mikolajblaz Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Co-authored-by: Nithin Rao Co-authored-by: Somshubra Majumdar Co-authored-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jimmy Zhang Co-authored-by: Chen Cui Co-authored-by: Huy Vu2 Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Bobby Chen Co-authored-by: akoumpa <153118171+akoumpa@users.noreply.github.com> Co-authored-by: Sangkug Lym Co-authored-by: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Co-authored-by: Krishna Puvvada Co-authored-by: ashbhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: Dmytro Pykhtar Co-authored-by: Pablo Garay Co-authored-by: Valerie Sarge Co-authored-by: Huiying Co-authored-by: Huiying Li Co-authored-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Co-authored-by: Taejin Park Co-authored-by: George <37293288+Jorjeous@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Alexandros Koumparoulis Co-authored-by: Vladimir Bataev Co-authored-by: Ming <111467530+Victor49152@users.noreply.github.com> Co-authored-by: Huy Vu2 --- Jenkinsfile | 75 +- .../conf/megatron_bert_config.yaml | 2 +- .../conf/megatron_retro_config.yaml | 248 +++++-- .../conf/megatron_retro_config_legacy.yaml | 127 ++++ .../megatron_retro_pretraining.py | 76 +- .../megatron_retro_pretraining_legacy.py | 102 +++ .../megatron/retro_dataset.py | 557 ++++----------- .../megatron/retro_dataset_legacy.py | 469 +++++++++++++ .../nlp/models/language_modeling/__init__.py | 1 + .../language_modeling/megatron_gpt_model.py | 15 +- .../megatron_retrieval_model.py | 2 +- .../language_modeling/megatron_retro_model.py | 651 ++++++++++++++++++ .../nlp/modules/common/tokenizer_utils.py | 9 + nemo/utils/callbacks/nemo_model_checkpoint.py | 15 +- .../nlp/test_indexed_retrieval_dataset.py | 2 +- 15 files changed, 1795 insertions(+), 556 deletions(-) mode change 100644 => 100755 examples/nlp/language_modeling/conf/megatron_retro_config.yaml create mode 100644 examples/nlp/language_modeling/conf/megatron_retro_config_legacy.yaml create mode 100644 examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py create mode 100644 nemo/collections/nlp/data/language_modeling/megatron/retro_dataset_legacy.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron_retro_model.py diff --git a/Jenkinsfile b/Jenkinsfile index 55e836eea13a..83e6daa8ccb7 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -125,6 +125,7 @@ pipeline { sh 'python tests/core_ptl/check_imports.py --domain "nlp"' } } + stage('L0: Unit Tests GPU') { steps { sh 'NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads' @@ -3517,6 +3518,64 @@ pipeline { failFast true steps { sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.num_nodes=1 \ + trainer.devices=2 \ + trainer.precision=bf16 \ + trainer.accelerator=gpu \ + model.data.data_prefix=['none'] \ + exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + 
model.optim.name=distributed_fused_adam \ + model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ + model.data.num_workers=4 \ + model.micro_batch_size=1 \ + model.data.shuffle_documents=False \ + trainer.val_check_interval=30 \ + +trainer.num_sanity_val_steps=0 \ + model.init_method_std=0.023 \ + model.optim.lr=6.0e-4 \ + model.megatron_amp_O2=True \ + model.data.splits_string=\'\"98,2,0\"\' \ + model.data.dataloader_type=cyclic \ + trainer.max_steps=10" + sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.num_nodes=1 \ + trainer.devices=2 \ + trainer.precision=bf16 \ + trainer.accelerator=gpu \ + model.data.data_prefix=['none'] \ + exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.optim.name=distributed_fused_adam \ + model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ + model.data.num_workers=4 \ + model.micro_batch_size=1 \ + model.data.shuffle_documents=False \ + trainer.val_check_interval=30 \ + +trainer.num_sanity_val_steps=0 \ + model.init_method_std=0.023 \ + model.optim.lr=6.0e-4 \ + model.megatron_amp_O2=True \ + model.data.splits_string=\'\"98,2,0\"\' \ + model.data.dataloader_type=cyclic \ + trainer.max_steps=20" + sh "rm -rf examples/nlp/language_modeling/mcore_retro_results" + } + } + stage('L2: (Legacy) Megatron RETRO Pretraining and Resume Training') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ trainer.devices=2 \ trainer.num_nodes=1 \ trainer.accelerator=gpu \ @@ -3527,7 +3586,7 @@ pipeline { trainer.precision=16 \ trainer.gradient_clip_val=1.0 \ trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_results \ + exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ model.data.data_prefix='' \ model.data.knn_index='' \ model.data.retrieval_prefix='' \ @@ -3546,7 +3605,7 @@ pipeline { model.enc_cross_attention=[1] \ model.dec_cross_attention=[1] \ +model.data.mock=True" - sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + sh "python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ trainer.devices=2 \ trainer.num_nodes=1 \ trainer.accelerator=gpu \ @@ -3557,7 +3616,7 @@ pipeline { trainer.precision=16 \ trainer.gradient_clip_val=1.0 \ trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_results \ + exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ model.data.data_prefix='' \ model.data.knn_index='' \ model.data.retrieval_prefix='' \ @@ -3576,10 +3635,10 @@ pipeline { model.enc_cross_attention=[1] \ model.dec_cross_attention=[1] \ +model.data.mock=True" - sh "rm -rf examples/nlp/language_modeling/retro_results" + sh "rm -rf examples/nlp/language_modeling/retro_legacy_results" } } - stage('L2: Megatron RETRO muTransfer Pretraining Performance') { + stage('L2: (Legacy) Megatron RETRO muTransfer Pretraining Performance') { when { anyOf { branch 'main' @@ -3600,7 +3659,7 @@ pipeline { trainer.limit_val_batches=0 \ trainer.gradient_clip_val=1.0 \ +trainer.num_sanity_val_steps=0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ + exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results/ \ 
+exp_manager.version=smalltest \ model.data.neighbors=2 \ model.megatron_amp_O2=False \ @@ -3651,7 +3710,7 @@ import torch if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): import sys sys.exit(0) -event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] +event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_legacy_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] ea = EventAccumulator(str(event_file)).Reload() vals = [] for i in ea.Scalars('reduced_train_loss'): @@ -3659,7 +3718,7 @@ for i in ea.Scalars('reduced_train_loss'): training_curve = pd.DataFrame({'loss': vals}) gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' - sh "rm -rf examples/nlp/language_modeling/retro_results" + sh "rm -rf examples/nlp/language_modeling/retro_legacy_results" } } stage('L2: BioMegatron Bert NER Task') { diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index 58e874386c44..bc66ae717ebb 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -158,4 +158,4 @@ model: name: CosineAnnealing warmup_steps: 500 constant_steps: 50000 - min_lr: 2e-5 + min_lr: 2e-5 \ No newline at end of file diff --git a/examples/nlp/language_modeling/conf/megatron_retro_config.yaml b/examples/nlp/language_modeling/conf/megatron_retro_config.yaml old mode 100644 new mode 100755 index dafdcf542f11..159bb163ad0a --- a/examples/nlp/language_modeling/conf/megatron_retro_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_retro_config.yaml @@ -1,127 +1,257 @@ defaults: - - .@model: megatron_model_base_config + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: -name: test_retro +name: megatron_retro restore_from_path: null # used when starting from a .nemo file trainer: - devices: 2 + devices: 1 num_nodes: 1 accelerator: gpu precision: 16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 - limit_val_batches: null - limit_test_batches: null - accumulate_grad_batches: 1 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually exp_manager: explicit_log_dir: null exp_dir: null - name: megatron_retro + name: ${name} create_wandb_logger: False wandb_logger_kwargs: project: null name: null resume_if_exists: True resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} create_checkpoint_callback: True checkpoint_callback_params: monitor: val_loss save_top_k: 10 mode: min always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits filename: 'megatron_retro--{val_loss:.2f}-{step}-{consumed_samples}' model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - model: - version: 1 # indicate the retro model version + # use RETROModel from megatron.core, since RETRO model inherited from gpt, mcore_gpt is used + mcore_gpt: True - # model parallelism - micro_batch_size: 4 - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 # has to be one. not supporting pipeline parallel yet + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 16 # limited by GPU memory + global_batch_size: 256 # will be overrided by value from RETRO preprocessed workdir + rampup_batch_size: null # Should be a list of 3 values: [, , ] + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline # model architecture - encoder_seq_length: 2048 - max_position_embeddings: ${.encoder_seq_length} - - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - dump_debug_info: False # dump out the debug information - dump_debug_info_to_file: False # dump out the debug information to files - - # retro architecture - chunk_size: 64 # the chunk size used to retrive - enc_num_layers: 4 # total number of encoder layers - dec_num_layers: 6 # total number of decoder layers - enc_cross_attention: [3] # layer numbers for cross attention in encoder - dec_cross_attention: [3, 5] # layer numbers for chunked cross attention in decoder - add_position_embedding: False # whether use the absolute position encoding - + encoder_seq_length: 512 # will be overrided by value from RETRO preprocessed workdir + max_position_embeddings: ${.encoder_seq_length} # will be overrided by value from RETRO preprocessed workdir + num_layers: 12 + hidden_size: 768 + ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size. 
+ num_attention_heads: 12 + init_method_std: 0.023 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention + ffn_dropout: 0.1 # Dropout probability in the feed-forward layer. + kv_channels: 64 # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. + normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. pre_process: True # add embedding post_process: True # add pooler - bert_binary_head: True # BERT binary head + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: True # Whether to use bias terms in all weight matrices. + activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. + transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] + openai_gelu: False # Use OpenAI's GELU instead of the default GeLU + normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. + position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + rotary_percentage: 0.5 # If using position_embedding_type=rope, then the per head dim is multiplied by this. + attention_type: 'multihead' # Attention type. Options ['multihead'] + share_embeddings_and_output_weights: True # Share embedding and output layer weights. + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. 
- grad_allreduce_chunk_size_mb: 125 - - megatron_lm_compatible: False # a flag to indicate whether the model is compatible with Megatron LM + retro: # specific arguments for RETRO model + retro_project_dir: null + retro_encoder_num_layers: 2 + retro_encoder_hidden_dropout: 0.1 + retro_encoder_attention_dropout: 0.1 + retro_num_neighbors: 2 + retro_num_retrieved_chunks: 2 + retro_verify_neighbor_count: True tokenizer: library: 'megatron' - type: 'GPT2BPETokenizer' - model: null - vocab_file: null - merge_file: null + type: null # will be overrided by value from RETRO preprocessed workdir + model: null # will be overrided by value from RETRO preprocessed workdir + vocab_file: null # will be overrided by value from RETRO preprocessed workdir + merge_file: null # will be overrided by value from RETRO preprocessed workdir delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. - # precision + # Mixed precision native_amp_init_scale: 4294967296 # 2 ** 32 native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - # miscellaneous - seed: 1234 + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + get_attention_mask_from_fusion: False # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + # Miscellaneous + seed: 1234 # will be overrided by value from RETRO preprocessed workdir + resume_from_checkpoint: null # manually set the checkpoint file to load from + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. 
+ activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Transformer Engine + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + ub_tp_comm_overlap: False + # Use userbuffer backend to overlap tensor-parallel communications with computes. 
+ # This feature is only available with Transformer Engine and squence parallelism enabled and, currently, supports only GPT models. + ub_tp_comm_overlap_cfg: null + # A yaml file with userbuffer communicator configurations. This file should provide `method`, `dtype`, `num_sm`, `num_splits`, + # `cga_size`, `num_splits`, `set_sm_margin`, and `aggregate` for the communicators to use custom settings. + # If the configuration file is not provided a default setting is used for all communicators. + + ## Flash Attention + use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True or mcore_gpt=True + data: - # Path to data must be specified by the user. - # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Path to data must be specified by the user. + # Supports List, String and Dictionary + # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", # Or see example below: # data_prefix: # - .5 # - /raid/data/pile/my-gpt3_00_text_document # - .5 # - /raid/data/pile/my-gpt3_01_text_document - data_prefix: ??? # list of training datasets - knn_index: ??? # list of KNN map index files - retrieval_prefix: ??? # a singe path to retrieval data + # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} + # Or see example below: + # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" + data_prefix: ??? # will be overrided by value from RETRO preprocessed workdir index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: retmmap # for retro model, this is the only allowed type - splits_string: 900,50,50 - seq_length: ${model.encoder_seq_length} # must be multiple of the chunk_size in your dataset + data_impl: mmap + splits_string: 98,2,0 + seq_length: ${model.encoder_seq_length} # will be overrided by value from RETRO preprocessed workdir skip_warmup: True - num_workers: 0 + num_workers: 2 dataloader_type: single # cyclic - neighbors: 2 # number of retrieved neighbors + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + shuffle_documents: True # Set to False to disable documents shuffling. 
Sample index will still be shuffled + exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem + retro_data: + retro_block_size: 10000 + retro_chunk_length: 64 + retro_split_preprocessing: 98,2,0 + retro_neighbor_dirs: null + + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 + name: distributed_fused_adam + lr: 6.0e-4 + weight_decay: 0.1 betas: - 0.9 - - 0.98 + - 0.95 sched: name: CosineAnnealing - warmup_steps: 500 - constant_steps: 50000 - min_lr: 1e-5 + min_lr: 6.0e-5 + warmup_steps: null + max_steps: 750000 + + gc_interval: 0 + # Interval of the host memory garbage collection. When it is zero, collectiion relies on the automatic garbage collector. + # If an interger value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`. diff --git a/examples/nlp/language_modeling/conf/megatron_retro_config_legacy.yaml b/examples/nlp/language_modeling/conf/megatron_retro_config_legacy.yaml new file mode 100644 index 000000000000..dafdcf542f11 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_retro_config_legacy.yaml @@ -0,0 +1,127 @@ +defaults: + - .@model: megatron_model_base_config + +name: test_retro +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 2 + num_nodes: 1 + accelerator: gpu + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: null + limit_test_batches: null + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_retro + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'megatron_retro--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + + +model: + version: 1 # indicate the retro model version + + # model parallelism + micro_batch_size: 4 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 # has to be one. 
not supporting pipeline parallel yet + + # model architecture + encoder_seq_length: 2048 + max_position_embeddings: ${.encoder_seq_length} + + gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + dump_debug_info: False # dump out the debug information + dump_debug_info_to_file: False # dump out the debug information to files + + # retro architecture + chunk_size: 64 # the chunk size used to retrive + enc_num_layers: 4 # total number of encoder layers + dec_num_layers: 6 # total number of decoder layers + enc_cross_attention: [3] # layer numbers for cross attention in encoder + dec_cross_attention: [3, 5] # layer numbers for chunked cross attention in decoder + add_position_embedding: False # whether use the absolute position encoding + + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + bert_binary_head: True # BERT binary head + + megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. + grad_allreduce_chunk_size_mb: 125 + + megatron_lm_compatible: False # a flag to indicate whether the model is compatible with Megatron LM + + tokenizer: + library: 'megatron' + type: 'GPT2BPETokenizer' + model: null + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # miscellaneous + seed: 1234 + + data: + # Path to data must be specified by the user. + # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + data_prefix: ??? # list of training datasets + knn_index: ??? # list of KNN map index files + retrieval_prefix: ??? # a singe path to retrieval data + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: retmmap # for retro model, this is the only allowed type + splits_string: 900,50,50 + seq_length: ${model.encoder_seq_length} # must be multiple of the chunk_size in your dataset + skip_warmup: True + num_workers: 0 + dataloader_type: single # cyclic + neighbors: 2 # number of retrieved neighbors + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 1e-5 diff --git a/examples/nlp/language_modeling/megatron_retro_pretraining.py b/examples/nlp/language_modeling/megatron_retro_pretraining.py index c84656d4b657..2a0c04f695f6 100644 --- a/examples/nlp/language_modeling/megatron_retro_pretraining.py +++ b/examples/nlp/language_modeling/megatron_retro_pretraining.py @@ -12,88 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
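
The legacy config above ties several sizes together: `seq_length` must be a multiple of `chunk_size`, and each chunk retrieves `neighbors` neighbor chunks of length `2 * chunk_size` (the neighbor plus its continuation), which is exactly the shape the legacy RETRODataset later in this patch produces. A purely illustrative helper showing that arithmetic with the defaults from this config:

    def retro_batch_shapes(seq_length: int = 2048, chunk_size: int = 64, neighbors: int = 2):
        # The legacy dataset asserts this: every sample splits into fixed-size chunks.
        assert seq_length % chunk_size == 0
        num_chunks = seq_length // chunk_size
        return {
            "tokens": (seq_length,),  # input ids; labels are the same sequence shifted by one
            "retrieved_ids": (num_chunks, neighbors, 2 * chunk_size),  # neighbor chunk + continuation
        }

    print(retro_batch_shapes())
    # {'tokens': (2048,), 'retrieved_ids': (32, 2, 128)}

The neighbor ids themselves come from the KNN index files listed under `knn_index`; the helper above only makes the resulting tensor shapes explicit.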
-import os +# To suppress BF16 compile related issue in the CI runs with turing/V100 +import torch._dynamo +import torch.multiprocessing as mp from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector -from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, -) +from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +torch._dynamo.config.suppress_errors = True + @hydra_runner(config_path="conf", config_name="megatron_retro_config") def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True if megatron_amp_O2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_O2: - plugins.append(MegatronHalfPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(MixedPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) - # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both - # precision plugins and precision to exist - cfg.trainer.precision = None - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - callbacks = [] - # enable_progress_bar is True by default. 
If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks - if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar: - callbacks.append(CustomProgressBar()) - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=callbacks) - + trainer = MegatronTrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) - # resume_from_checkpoint = uninject_model_parallel_rank(resume_from_checkpoint) - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - - # load existing nemo retro model - if cfg.get("restore_from_path", None) is not None: - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.restore_from_path): - save_restore_connector.model_extracted_dir = cfg.restore_from_path - model = MegatronRetrievalModel.restore_from( - restore_path=cfg.restore_from_path, - trainer=trainer, - override_config_path=cfg.model, - save_restore_connector=save_restore_connector, - strict=False, - ) - else: - model = MegatronRetrievalModel(cfg.model, trainer) + model = MegatronRetroModel(cfg.model, trainer) trainer.fit(model) diff --git a/examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py b/examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py new file mode 100644 index 000000000000..4653222b3438 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
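
The precision handling that MegatronTrainerBuilder now owns, and that this legacy script keeps inline (shown below), reduces to the following selection. This is a condensed sketch of that decision, not the builder's actual code; note that plain torch GradScaler is used here, while NeMo's own GradScaler subclass additionally accepts a `hysteresis` argument.

    from torch.cuda.amp import GradScaler

    def pick_precision_plugin(trainer_precision, model_cfg):
        """Reduced sketch of the fp16/bf16 plugin selection done around the trainer."""
        scaler = None
        if trainer_precision in (16, '16', '16-mixed'):
            # fp16 needs dynamic loss scaling; the scale bounds come from the model config.
            scaler = GradScaler(
                init_scale=model_cfg.get('native_amp_init_scale', 2 ** 32),
                growth_interval=model_cfg.get('native_amp_growth_interval', 1000),
            )
            plugin_precision = '16-mixed'
        else:
            # bf16 has enough dynamic range that no loss scaler is attached.
            plugin_precision = 'bf16-mixed'
        return plugin_precision, scaler

With megatron_amp_O2 enabled the resulting pair feeds MegatronHalfPrecisionPlugin, otherwise PTL's MixedPrecisionPlugin, and `cfg.trainer.precision` is then cleared because PTL >= 2.1 does not allow both a precision plugin and a precision value.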
+ +import os + +from omegaconf.omegaconf import OmegaConf, open_dict +from pytorch_lightning import Trainer +from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from pytorch_lightning.plugins.precision import MixedPrecisionPlugin +from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector + +from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo +from nemo.collections.nlp.parts.nlp_overrides import ( + CustomProgressBar, + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, +) +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + + +@hydra_runner(config_path="conf", config_name="megatron_retro_config_legacy") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True if megatron_amp_O2 else False, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) + + if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: + scaler = None + if cfg.trainer.precision in [16, '16', '16-mixed']: + scaler = GradScaler( + init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + growth_interval=cfg.model.get('native_amp_growth_interval', 1000), + hysteresis=cfg.model.get('hysteresis', 2), + ) + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' + if megatron_amp_O2: + plugins.append(MegatronHalfPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) + else: + plugins.append(MixedPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None + + if cfg.get('cluster_type', None) == 'BCP': + plugins.append(TorchElasticEnvironment()) + + callbacks = [] + # enable_progress_bar is True by default. 
If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks + if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar: + callbacks.append(CustomProgressBar()) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=callbacks) + + exp_manager(trainer, cfg.exp_manager) + + # resume_from_checkpoint = uninject_model_parallel_rank(resume_from_checkpoint) + logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') + + # load existing nemo retro model + if cfg.get("restore_from_path", None) is not None: + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.restore_from_path): + save_restore_connector.model_extracted_dir = cfg.restore_from_path + model = MegatronRetrievalModel.restore_from( + restore_path=cfg.restore_from_path, + trainer=trainer, + override_config_path=cfg.model, + save_restore_connector=save_restore_connector, + strict=False, + ) + else: + model = MegatronRetrievalModel(cfg.model, trainer) + + trainer.fit(model) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset.py index f0a501d7cc13..377bff309b7c 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset.py @@ -12,32 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""RETRO Style dataset.""" +"""RETRO style dataset.""" import os -from typing import List +import time import numpy as np import torch +from omegaconf.dictconfig import DictConfig from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( get_datasets_weights_and_num_samples, get_train_valid_test_split_, ) from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset -from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import ( - _build_index_mappings, - get_indexed_dataset_, -) -from nemo.collections.nlp.data.language_modeling.megatron.indexed_retrieval_dataset import ( - KNNIndex, - MMapRetrievalIndexedDataset, -) +from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import deallocate_indexed_dataset_memory +from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import make_dataset as make_indexed_dataset from nemo.core import Dataset from nemo.utils import logging try: - from megatron.core import parallel_state + from megatron.core import mpu, tensor_parallel + from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder + from megatron.core.datasets.retro.config import RetroGPTChunkDatasets + from megatron.core.datasets.retro.query.multi_split_gpt_dataset import ( + MultiSplitGPTDataset, + MultiSplitGPTDatasetConfig, + ) + from megatron.core.datasets.retro.query.retro_dataset import get_retro_datasets + from megatron.core.models.retro import RetroConfig + + from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids HAVE_MEGATRON_CORE = True @@ -45,425 +50,163 @@ HAVE_MEGATRON_CORE = False -__all__ = [ - "RETRODataset", - "build_train_valid_test_datasets", - "MockRETRODataset", - "build_mock_train_valid_test_datasets", -] - class RETRODataset(Dataset): - """ - Dataset for RETRO model. 
- - It constructs single data record from the training/retrieval indexed retrieval dataset and knn index file. - The KNN index file maps data chunk id to K-nearest neighbors in the the retrieval dataset chunk ids. - First, it loads a long sequence (2048) from training dataset. Then for each chunk in the sequence, it finds the kNN - chunks from the retrieval dataset using the KNN index. Lastly, compute the masks based on pad id. - """ - - def __init__( - self, - cfg, - trainer, - tokenizer, - name: str, - data_prefix: str, - documents, # document ids in the indexed_dataset used for this dataset - indexed_dataset: MMapRetrievalIndexedDataset, - num_samples: int, # number of data samples, max_steps * global_batch_size - seq_length: int, # input seq length - seed: int, - knn_index: KNNIndex, - retrieval_index: MMapRetrievalIndexedDataset, - ): - if not HAVE_MEGATRON_CORE: - raise ImportError( - "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." - ) - + def __init__(self, cfg, retro_config: RetroConfig, tokenizer, mcore_retro_dataset, number_samples_with_neighbors): super().__init__() - self.name = name - self.indexed_dataset: MMapRetrievalIndexedDataset = indexed_dataset - self.knn_index: KNNIndex = knn_index - self.retrieval_index: MMapRetrievalIndexedDataset = retrieval_index - self.chunk_size = self.indexed_dataset.chunk_size - - # make sure seq_length is a multiple of chunk_size - assert seq_length % self.chunk_size == 0 - # Checks - assert np.min(documents) >= 0 - assert np.max(documents) < indexed_dataset.sizes.shape[0] + self.reset_position_ids = cfg.data.get('reset_position_ids', False) + self.reset_attention_mask = cfg.data.get('reset_attention_mask', False) + self.eod_mask_loss = cfg.data.get('eod_mask_loss', False) self.eos_id = tokenizer.eos_id - self.pad_id = tokenizer.pad_id - - assert self.retrieval_index._index.retrieval_db - self._validate_pad_id() - - # save index mappings to a configurable dir - self.index_mapping_dir = cfg.data.get('index_mapping_dir', None) - self.neighbors = cfg.data.get('neighbors', self.knn_index.K) - # the number of neighbors cannot exceed the max number of neighbors in the index - assert self.neighbors <= self.knn_index.K - # create index_mapping_dir on rank 0 - if torch.distributed.is_available() and torch.distributed.is_initialized(): - if torch.distributed.get_rank() == 0: - if self.index_mapping_dir is not None and not os.path.isdir(self.index_mapping_dir): - os.makedirs(self.index_mapping_dir) - torch.distributed.barrier() - - # Build index mappings. 
- self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( - self.name, - data_prefix, - documents, - self.indexed_dataset.sizes, - num_samples, - seq_length, - seed, - index_mapping_dir=self.index_mapping_dir, - ) - if len(self.doc_idx) > np.iinfo('int32').max: - raise "number of epochs exceeds the maximum number for int32 used by sample_idx" - self.padding_context = np.ones(2 * self.chunk_size, dtype=self.retrieval_index._index.dtype) * self.pad_id - - def _validate_pad_id(self): - # validate the pad_id matches the dataset pad_id - ptr, size = self.retrieval_index._index[0] - ptr += size * np.dtype(self.retrieval_index._index.dtype).itemsize - # padded chunk_size of pad_ids at the end of the doc - retrieval_paddings = np.frombuffer( - self.retrieval_index._bin_buffer, - dtype=self.retrieval_index._index.dtype, - count=self.chunk_size, - offset=ptr, - ) - assert (retrieval_paddings == self.pad_id).all() + self.retro_config = retro_config + self.mcore_retro_dataset = mcore_retro_dataset + self.number_samples_with_neighbors = number_samples_with_neighbors # quick fix for problems of mismatch in processed/indexed retro data, # of GPT samples is different from # of samples with neighbors retrieved + self.tokenizer = tokenizer - ptr, size = self.indexed_dataset._index[0] - ptr += (size - 1) * np.dtype(self.indexed_dataset._index.dtype).itemsize - data_paddings = np.frombuffer( - self.indexed_dataset._bin_buffer, dtype=self.indexed_dataset._index.dtype, count=1, offset=ptr - ) - # the last element is either a padding or an eos - assert (data_paddings == self.pad_id).all() or (data_paddings == self.eos_id).all() + return def __len__(self): - # -1 is due to data structure used to retieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - return self.sample_idx.shape[0] - 1 - - def _get_chunks(self, chunk_id: int, num_chunks: int, chunks: List): - """ - starting from chunk_id, loop for num_chunks, get the - KNN chunk ids from retrieval dataset, and get the chunk token ids, - put them into the chunks list - """ - for i in range(chunk_id, chunk_id + num_chunks): - knn = self.knn_index.get_KNN_chunk_ids(i) - for rid in knn[: self.neighbors]: - if rid < 0: - # no neighbor, just pad it - one_chunk = self.padding_context - else: - one_chunk = self.retrieval_index.get_chunk(rid) - chunks.append(one_chunk) - - def _get_text(self, idx: int) -> np.ndarray: - # Get the shuffled index. - idx = self.shuffle_idx[idx] - # Start and end documents and offsets. - doc_index_f = self.sample_idx[idx][0] - doc_index_l = self.sample_idx[idx + 1][0] - offset_f = self.sample_idx[idx][1] - offset_l = self.sample_idx[idx + 1][1] - # If we are within the same document, just extract the chunk. - if doc_index_f == doc_index_l: - sample = self.indexed_dataset.get( - self.doc_idx[doc_index_f], offset=offset_f, length=offset_l - offset_f + 1 - ) - chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_f], offset_f) - num_chunks = (offset_l - offset_f) // self.chunk_size - chunks = [] - self._get_chunks(chunk_id, num_chunks, chunks) - chunks = np.stack(chunks, axis=0).reshape(num_chunks, self.neighbors, -1).astype(np.int64) - else: - # Otherwise, get the rest of the initial document. 
- sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)] - num_chunks = (self.indexed_dataset._index.sizes[self.doc_idx[doc_index_f]] - offset_f) // self.chunk_size - total_chunks = num_chunks - chunks = [] - chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_f], offset_f) - self._get_chunks(chunk_id, num_chunks, chunks) - # Loop over all in between documents and add the entire document. - for i in range(doc_index_f + 1, doc_index_l): - sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) - chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[i], 0) - num_chunks = self.indexed_dataset._index.sizes[self.doc_idx[i]] // self.chunk_size - total_chunks += num_chunks - self._get_chunks(chunk_id, num_chunks, chunks) - # And finally add the relevant portion of last document. - chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_l], 0) - num_chunks = (offset_l) // self.chunk_size - total_chunks += num_chunks - self._get_chunks(chunk_id, num_chunks, chunks) - sample_list.append(self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1)) - sample = np.concatenate(sample_list) - chunks = np.stack(chunks, axis=0).reshape(total_chunks, self.neighbors, -1).astype(np.int64) - return sample.astype(np.int64), chunks + return len(self.mcore_retro_dataset.chunk_dataset.sample_dataset) - def __getitem__(self, idx): - text, retrieved = self._get_text(idx) - text = torch.from_numpy(text) - retrieved = torch.from_numpy(retrieved) - tokens = text[:-1].contiguous() - labels = text[1:].contiguous() - hidden_mask = tokens != self.pad_id - context_mask = retrieved != self.pad_id - return { - 'tokens': tokens, - 'labels': labels, - 'tokens_mask': hidden_mask, - 'loss_mask': hidden_mask, - 'retrieved_emb_mask': context_mask, - 'retrieved_ids': retrieved, - } + def _get_text(self, idx: int): + # return the tokens ids of idx + # Caveat: these tokens are got from the already pre-tokenized data file, mcore's GPTDataset doesn't run __getitem__, only run _query_document_sample_shuffle_indices + return self.mcore_retro_dataset[idx] + def __getitem__(self, idx): -def build_train_valid_test_datasets( - cfg, - trainer, - data_prefix: List[str], - data_impl: str, - splits_string: str, - train_valid_test_num_samples, - seq_length: int, - seed: int, - skip_warmup: bool, - tokenizer, - retrieval_prefix: str, - knn_map_path: List[str], -): - """Build train, valid, and test RETRO datasets. - There is one to one mapping between data_prefix and knn_map_path. - Currently only supports one retrieval dataset. - """ - # make sure there is one to one mapping between data_prefix and knn_map_path - assert len(data_prefix) == len(knn_map_path) - - # Single dataset. - if len(data_prefix) == 1: - return _build_train_valid_test_datasets( - cfg, - trainer, - data_prefix[0], - data_impl, - splits_string, - train_valid_test_num_samples, - seq_length, - seed, - skip_warmup, - tokenizer, - retrieval_prefix, - knn_map_path[0], - ) - - # Blending dataset. - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - train_n, valid_n, test_n = map(sum, zip(*datasets_train_valid_test_num_samples)) - - # Build individual datasets. 
- train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - cfg, - trainer, - prefixes[i], - data_impl, - splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, - seed, - skip_warmup, - tokenizer, - retrieval_prefix, - knn_map_path[i], + # quick fix for problems of mismatch in processed/indexed retro data, # of GPT samples is different from # of samples with neighbors retrieved + idx = idx % self.number_samples_with_neighbors + + sample = self._get_text(idx) + + # Unpack + tokens_ = torch.from_numpy(sample['text']) + tokens_ = tokens_.long() # size should be [seq_length] + labels = tokens_[1:].contiguous() + tokens = tokens_[:-1].contiguous() + neighbor_tokens = torch.from_numpy(sample['neighbor_tokens']) + neighbor_tokens = neighbor_tokens.long() # size should be [l, k, r] + + # note: [l, k, r] => [l*k, r] + # note: 2x == neighbor, continuation + neighbor_tokens = neighbor_tokens.view(-1, self.retro_config.retro_retrieved_length).long() + + # Get the masks and postition ids for tokens and neighbor_tokens + tokens = torch.unsqueeze( + tokens, 0 + ) # get_ltor_masks_and_position_ids takes as input tokens arguments as a batch (2D tensor), so need to convert tokens from 1D to 2D + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, self.eos_id, self.reset_position_ids, self.reset_attention_mask, self.eod_mask_loss ) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights, train_n) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights, valid_n) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights, test_n) - - return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) - - -def _build_train_valid_test_datasets( - cfg, - trainer, - data_prefix: str, - data_impl: str, - splits_string: str, - train_valid_test_num_samples, - seq_length: int, - seed: int, - skip_warmup: bool, - tokenizer, - retrieval_prefix: str, - knn_map_path: str, -): - """Build train, valid, and test datasets.""" - - # Indexed dataset. - indexed_dataset: MMapRetrievalIndexedDataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) - knn_index: KNNIndex = KNNIndex(knn_map_path, skip_warmup) - retrieval_index: MMapRetrievalIndexedDataset = get_indexed_dataset_(retrieval_prefix, data_impl, skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - splits = get_train_valid_test_split_(splits_string, total_num_of_documents) - - # Print stats about the splits. 
- logging.info(' > dataset split:') - - def print_split_stats(name, index): - logging.info(' {}:'.format(name)) - logging.info( - ' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) + tokens, attention_mask, loss_mask, position_ids = tokens[0], attention_mask[0], loss_mask[0], position_ids[0] + _, _, neighbor_position_ids = get_ltor_masks_and_position_ids( # neighbor_tokens is already a 2D array + neighbor_tokens, self.eos_id, self.reset_position_ids, self.reset_attention_mask, self.eod_mask_loss ) - - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) - - def build_dataset(index, name): - dataset = None - if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32) - dataset = RETRODataset( - cfg, - trainer, - tokenizer, - name, - data_prefix, - documents, - indexed_dataset, - train_valid_test_num_samples[index], - seq_length, - seed, - knn_index, - retrieval_index, - ) - return dataset - - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') - - return (train_dataset, valid_dataset, test_dataset) - - -class MockRETRODataset(torch.utils.data.Dataset): - def __init__(self, cfg, trainer, tokenizer, name, size): - super().__init__() - self.name = name - self.tokenizer = tokenizer - self._cfg = cfg - self.size = size - seed_val = parallel_state.get_data_parallel_rank() * 131 + 97 - torch.manual_seed(seed_val) - - def __len__(self): - return self.size - - def __getitem__(self, idx): - vocab_size = self.tokenizer.vocab_size - - neighbors = self._cfg.data.neighbors - input_length = self._cfg.data.seq_length - chunks = input_length // self._cfg.chunk_size - chunk_size = self._cfg.chunk_size - pad_id = self.tokenizer.pad_id - - all_tokens = torch.randint(0, vocab_size, (input_length + 1,)) - # make sure the eod happens at the end of each chunk, can add paddings to it - # e.g. 
[..., id, id, pad, pad, pad, eod] each has chunk_size, each sentence - # has length of multiple of chunk_size - hidden = all_tokens[:-1] - labels = all_tokens[1:] - - hidden_mask = hidden != pad_id - # to mask out the token ids [id, id, eod, id, pad, eod, id, id] - # so attention is not across eod, mask should be: - # [false, true, true, true, true, true, true, true] - # [false, false, true, true, true, true, true, true] - # [false, false, false,true, true, true, true, true] - # [true, true, true, false, true, true, true, true] - # [true, true, true, true, true, true, true, true] - # [true, true, true, false, true, false, true, true] - # [true, true, true, true, true, true, false, true] - # [true, true, true, true, true, true, false, false] - retrieved = torch.randint(0, vocab_size, (chunks, neighbors, 2 * chunk_size)) - - context_mask = retrieved != pad_id + neighbor_attention_mask = torch.zeros( + [1, 1] + ) # just a dummy values, since the batch neighbor_attention_mask will be set to None in megatron_retro_model.py following Lawrence's implementation return { - 'tokens': hidden, + 'tokens': tokens, 'labels': labels, - 'tokens_mask': hidden_mask, - 'loss_mask': hidden_mask, - 'retrieved_emb_mask': context_mask, - 'retrieved_ids': retrieved, + 'loss_mask': loss_mask, + 'attention_mask': attention_mask, + 'position_ids': position_ids, + 'context_input_ids': neighbor_tokens, + 'context_attention_mask': neighbor_attention_mask, + 'context_position_ids': neighbor_position_ids, } -def build_mock_train_valid_test_datasets( - cfg, trainer, splits_string, tokenizer, mock_data_size, +def build_train_valid_test_datasets( + cfg, retro_config: RetroConfig, train_valid_test_num_samples, seq_length, tokenizer, ): - """Build train, valid, and test datasets.""" - - splits = get_train_valid_test_split_(splits_string, mock_data_size) - # Print stats about the splits. 
- logging.info(' > dataset split:') - - def print_split_stats(name, index): - logging.info(' {}:'.format(name)) - logging.info( - ' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) + # gpt dataset + train_ds, valid_ds, test_ds = gpt_train_valid_test_datasets_provider(cfg, train_valid_test_num_samples, tokenizer) + + gpt_datasets = { + "train": (train_ds, train_valid_test_num_samples[0]), + "valid": (valid_ds, train_valid_test_num_samples[1]), + "test": (test_ds, train_valid_test_num_samples[2]), + } + + retro_train_ds, retro_valid_ds, retro_test_ds = get_retro_datasets( + config=retro_config, gpt_datasets=gpt_datasets, sample_length=seq_length, eod_token_id=tokenizer.eos_id, + ) + + train_ds = ( + RETRODataset( + cfg=cfg, + retro_config=retro_config, + tokenizer=tokenizer, + mcore_retro_dataset=retro_train_ds, + number_samples_with_neighbors=train_valid_test_num_samples[0], + ) + if retro_train_ds + else None + ) + valid_ds = ( + RETRODataset( + cfg=cfg, + retro_config=retro_config, + tokenizer=tokenizer, + mcore_retro_dataset=retro_valid_ds, + number_samples_with_neighbors=train_valid_test_num_samples[1], + ) + if retro_valid_ds + else None + ) + test_ds = ( + RETRODataset( + cfg=cfg, + retro_config=retro_config, + tokenizer=tokenizer, + mcore_retro_dataset=retro_test_ds, + number_samples_with_neighbors=train_valid_test_num_samples[2], ) + if retro_test_ds + else None + ) - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) + return train_ds, valid_ds, test_ds - def build_dataset(index, name): - dataset = None - if splits[index + 1] > splits[index]: - dataset = MockRETRODataset(cfg, trainer, tokenizer, name, splits[index + 1] - splits[index],) - return dataset - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') +def gpt_train_valid_test_datasets_provider(cfg, train_val_test_num_samples, tokenizer): + """Build the train test and validation datasets. + Implemented from train_valid_test_datasets_provider in M-LM/pretrain_gpt.py + + Args: + train_val_test_num_samples : A list containing the number of samples in train test and validation. 
+ """ - return (train_dataset, valid_dataset, test_dataset) + def is_dataset_built_on_rank(): + return ( + mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage() + ) and mpu.get_tensor_model_parallel_rank() == 0 + + data_config = MultiSplitGPTDatasetConfig( + random_seed=cfg.seed, + sequence_length=cfg.data.seq_length, + blend=cfg.data.data_prefix, + split=cfg.data.splits_string, + split_preprocessing=cfg.data.retro_data.retro_split_preprocessing, + path_to_cache=None, + return_document_ids=False, + reset_position_ids=cfg.data.get('reset_position_ids', False), + reset_attention_mask=cfg.data.get('reset_attention_mask', False), + eod_mask_loss=cfg.data.get('eod_mask_loss', False), + tokenizer=tokenizer, + ) + + print("> building train, validation, and test datasets for GPT ...") + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + MultiSplitGPTDataset, train_val_test_num_samples, is_dataset_built_on_rank, data_config + ).build() + + print("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds diff --git a/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset_legacy.py b/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset_legacy.py new file mode 100644 index 000000000000..f0a501d7cc13 --- /dev/null +++ b/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset_legacy.py @@ -0,0 +1,469 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""RETRO Style dataset.""" + +import os +from typing import List + +import numpy as np +import torch + +from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( + get_datasets_weights_and_num_samples, + get_train_valid_test_split_, +) +from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset +from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import ( + _build_index_mappings, + get_indexed_dataset_, +) +from nemo.collections.nlp.data.language_modeling.megatron.indexed_retrieval_dataset import ( + KNNIndex, + MMapRetrievalIndexedDataset, +) +from nemo.core import Dataset +from nemo.utils import logging + +try: + from megatron.core import parallel_state + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + +__all__ = [ + "RETRODataset", + "build_train_valid_test_datasets", + "MockRETRODataset", + "build_mock_train_valid_test_datasets", +] + + +class RETRODataset(Dataset): + """ + Dataset for RETRO model. + + It constructs single data record from the training/retrieval indexed retrieval dataset and knn index file. + The KNN index file maps data chunk id to K-nearest neighbors in the the retrieval dataset chunk ids. + First, it loads a long sequence (2048) from training dataset. Then for each chunk in the sequence, it finds the kNN + chunks from the retrieval dataset using the KNN index. Lastly, compute the masks based on pad id. 
+ """ + + def __init__( + self, + cfg, + trainer, + tokenizer, + name: str, + data_prefix: str, + documents, # document ids in the indexed_dataset used for this dataset + indexed_dataset: MMapRetrievalIndexedDataset, + num_samples: int, # number of data samples, max_steps * global_batch_size + seq_length: int, # input seq length + seed: int, + knn_index: KNNIndex, + retrieval_index: MMapRetrievalIndexedDataset, + ): + if not HAVE_MEGATRON_CORE: + raise ImportError( + "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." + ) + + super().__init__() + self.name = name + self.indexed_dataset: MMapRetrievalIndexedDataset = indexed_dataset + self.knn_index: KNNIndex = knn_index + self.retrieval_index: MMapRetrievalIndexedDataset = retrieval_index + self.chunk_size = self.indexed_dataset.chunk_size + + # make sure seq_length is a multiple of chunk_size + assert seq_length % self.chunk_size == 0 + # Checks + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + self.eos_id = tokenizer.eos_id + self.pad_id = tokenizer.pad_id + + assert self.retrieval_index._index.retrieval_db + self._validate_pad_id() + + # save index mappings to a configurable dir + self.index_mapping_dir = cfg.data.get('index_mapping_dir', None) + self.neighbors = cfg.data.get('neighbors', self.knn_index.K) + # the number of neighbors cannot exceed the max number of neighbors in the index + assert self.neighbors <= self.knn_index.K + # create index_mapping_dir on rank 0 + if torch.distributed.is_available() and torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + if self.index_mapping_dir is not None and not os.path.isdir(self.index_mapping_dir): + os.makedirs(self.index_mapping_dir) + torch.distributed.barrier() + + # Build index mappings. 
+ self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + self.name, + data_prefix, + documents, + self.indexed_dataset.sizes, + num_samples, + seq_length, + seed, + index_mapping_dir=self.index_mapping_dir, + ) + if len(self.doc_idx) > np.iinfo('int32').max: + raise "number of epochs exceeds the maximum number for int32 used by sample_idx" + self.padding_context = np.ones(2 * self.chunk_size, dtype=self.retrieval_index._index.dtype) * self.pad_id + + def _validate_pad_id(self): + # validate the pad_id matches the dataset pad_id + ptr, size = self.retrieval_index._index[0] + ptr += size * np.dtype(self.retrieval_index._index.dtype).itemsize + # padded chunk_size of pad_ids at the end of the doc + retrieval_paddings = np.frombuffer( + self.retrieval_index._bin_buffer, + dtype=self.retrieval_index._index.dtype, + count=self.chunk_size, + offset=ptr, + ) + assert (retrieval_paddings == self.pad_id).all() + + ptr, size = self.indexed_dataset._index[0] + ptr += (size - 1) * np.dtype(self.indexed_dataset._index.dtype).itemsize + data_paddings = np.frombuffer( + self.indexed_dataset._bin_buffer, dtype=self.indexed_dataset._index.dtype, count=1, offset=ptr + ) + # the last element is either a padding or an eos + assert (data_paddings == self.pad_id).all() or (data_paddings == self.eos_id).all() + + def __len__(self): + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + return self.sample_idx.shape[0] - 1 + + def _get_chunks(self, chunk_id: int, num_chunks: int, chunks: List): + """ + starting from chunk_id, loop for num_chunks, get the + KNN chunk ids from retrieval dataset, and get the chunk token ids, + put them into the chunks list + """ + for i in range(chunk_id, chunk_id + num_chunks): + knn = self.knn_index.get_KNN_chunk_ids(i) + for rid in knn[: self.neighbors]: + if rid < 0: + # no neighbor, just pad it + one_chunk = self.padding_context + else: + one_chunk = self.retrieval_index.get_chunk(rid) + chunks.append(one_chunk) + + def _get_text(self, idx: int) -> np.ndarray: + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + if doc_index_f == doc_index_l: + sample = self.indexed_dataset.get( + self.doc_idx[doc_index_f], offset=offset_f, length=offset_l - offset_f + 1 + ) + chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_f], offset_f) + num_chunks = (offset_l - offset_f) // self.chunk_size + chunks = [] + self._get_chunks(chunk_id, num_chunks, chunks) + chunks = np.stack(chunks, axis=0).reshape(num_chunks, self.neighbors, -1).astype(np.int64) + else: + # Otherwise, get the rest of the initial document. + sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)] + num_chunks = (self.indexed_dataset._index.sizes[self.doc_idx[doc_index_f]] - offset_f) // self.chunk_size + total_chunks = num_chunks + chunks = [] + chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_f], offset_f) + self._get_chunks(chunk_id, num_chunks, chunks) + # Loop over all in between documents and add the entire document. 
+ for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[i], 0) + num_chunks = self.indexed_dataset._index.sizes[self.doc_idx[i]] // self.chunk_size + total_chunks += num_chunks + self._get_chunks(chunk_id, num_chunks, chunks) + # And finally add the relevant portion of last document. + chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_l], 0) + num_chunks = (offset_l) // self.chunk_size + total_chunks += num_chunks + self._get_chunks(chunk_id, num_chunks, chunks) + sample_list.append(self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1)) + sample = np.concatenate(sample_list) + chunks = np.stack(chunks, axis=0).reshape(total_chunks, self.neighbors, -1).astype(np.int64) + return sample.astype(np.int64), chunks + + def __getitem__(self, idx): + text, retrieved = self._get_text(idx) + text = torch.from_numpy(text) + retrieved = torch.from_numpy(retrieved) + tokens = text[:-1].contiguous() + labels = text[1:].contiguous() + hidden_mask = tokens != self.pad_id + context_mask = retrieved != self.pad_id + return { + 'tokens': tokens, + 'labels': labels, + 'tokens_mask': hidden_mask, + 'loss_mask': hidden_mask, + 'retrieved_emb_mask': context_mask, + 'retrieved_ids': retrieved, + } + + +def build_train_valid_test_datasets( + cfg, + trainer, + data_prefix: List[str], + data_impl: str, + splits_string: str, + train_valid_test_num_samples, + seq_length: int, + seed: int, + skip_warmup: bool, + tokenizer, + retrieval_prefix: str, + knn_map_path: List[str], +): + """Build train, valid, and test RETRO datasets. + There is one to one mapping between data_prefix and knn_map_path. + Currently only supports one retrieval dataset. + """ + # make sure there is one to one mapping between data_prefix and knn_map_path + assert len(data_prefix) == len(knn_map_path) + + # Single dataset. + if len(data_prefix) == 1: + return _build_train_valid_test_datasets( + cfg, + trainer, + data_prefix[0], + data_impl, + splits_string, + train_valid_test_num_samples, + seq_length, + seed, + skip_warmup, + tokenizer, + retrieval_prefix, + knn_map_path[0], + ) + + # Blending dataset. + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + train_n, valid_n, test_n = map(sum, zip(*datasets_train_valid_test_num_samples)) + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + cfg, + trainer, + prefixes[i], + data_impl, + splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, + skip_warmup, + tokenizer, + retrieval_prefix, + knn_map_path[i], + ) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. 
+ blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights, train_n) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights, valid_n) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights, test_n) + + return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) + + +def _build_train_valid_test_datasets( + cfg, + trainer, + data_prefix: str, + data_impl: str, + splits_string: str, + train_valid_test_num_samples, + seq_length: int, + seed: int, + skip_warmup: bool, + tokenizer, + retrieval_prefix: str, + knn_map_path: str, +): + """Build train, valid, and test datasets.""" + + # Indexed dataset. + indexed_dataset: MMapRetrievalIndexedDataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) + knn_index: KNNIndex = KNNIndex(knn_map_path, skip_warmup) + retrieval_index: MMapRetrievalIndexedDataset = get_indexed_dataset_(retrieval_prefix, data_impl, skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. + logging.info(' > dataset split:') + + def print_split_stats(name, index): + logging.info(' {}:'.format(name)) + logging.info( + ' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) + ) + + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange(start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32) + dataset = RETRODataset( + cfg, + trainer, + tokenizer, + name, + data_prefix, + documents, + indexed_dataset, + train_valid_test_num_samples[index], + seq_length, + seed, + knn_index, + retrieval_index, + ) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +class MockRETRODataset(torch.utils.data.Dataset): + def __init__(self, cfg, trainer, tokenizer, name, size): + super().__init__() + self.name = name + self.tokenizer = tokenizer + self._cfg = cfg + self.size = size + seed_val = parallel_state.get_data_parallel_rank() * 131 + 97 + torch.manual_seed(seed_val) + + def __len__(self): + return self.size + + def __getitem__(self, idx): + vocab_size = self.tokenizer.vocab_size + + neighbors = self._cfg.data.neighbors + input_length = self._cfg.data.seq_length + chunks = input_length // self._cfg.chunk_size + chunk_size = self._cfg.chunk_size + pad_id = self.tokenizer.pad_id + + all_tokens = torch.randint(0, vocab_size, (input_length + 1,)) + # make sure the eod happens at the end of each chunk, can add paddings to it + # e.g. 
[..., id, id, pad, pad, pad, eod] each has chunk_size, each sentence + # has length of multiple of chunk_size + hidden = all_tokens[:-1] + labels = all_tokens[1:] + + hidden_mask = hidden != pad_id + # to mask out the token ids [id, id, eod, id, pad, eod, id, id] + # so attention is not across eod, mask should be: + # [false, true, true, true, true, true, true, true] + # [false, false, true, true, true, true, true, true] + # [false, false, false,true, true, true, true, true] + # [true, true, true, false, true, true, true, true] + # [true, true, true, true, true, true, true, true] + # [true, true, true, false, true, false, true, true] + # [true, true, true, true, true, true, false, true] + # [true, true, true, true, true, true, false, false] + retrieved = torch.randint(0, vocab_size, (chunks, neighbors, 2 * chunk_size)) + + context_mask = retrieved != pad_id + + return { + 'tokens': hidden, + 'labels': labels, + 'tokens_mask': hidden_mask, + 'loss_mask': hidden_mask, + 'retrieved_emb_mask': context_mask, + 'retrieved_ids': retrieved, + } + + +def build_mock_train_valid_test_datasets( + cfg, trainer, splits_string, tokenizer, mock_data_size, +): + """Build train, valid, and test datasets.""" + + splits = get_train_valid_test_split_(splits_string, mock_data_size) + + # Print stats about the splits. + logging.info(' > dataset split:') + + def print_split_stats(name, index): + logging.info(' {}:'.format(name)) + logging.info( + ' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) + ) + + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + dataset = MockRETRODataset(cfg, trainer, tokenizer, name, splits[index + 1] - splits[index],) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) diff --git a/nemo/collections/nlp/models/language_modeling/__init__.py b/nemo/collections/nlp/models/language_modeling/__init__.py index f63d289f8925..437a7003483b 100644 --- a/nemo/collections/nlp/models/language_modeling/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/__init__.py @@ -17,4 +17,5 @@ MegatronGPTPromptLearningModel, ) from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel +from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel from nemo.collections.nlp.models.language_modeling.transformer_lm_model import TransformerLMModel diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 4493532f88bf..43cc8c26444f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1481,7 +1481,7 @@ def setup(self, stage=None): f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' f'Tensor model parallel rank: {parallel_state.get_tensor_model_parallel_rank()}, ' f'Number of model parameters on device: {num_parameters_on_device:.2e}. ' - f'Total number of model parameters: {total_num_parameters:.2e}.' + f'Number of precise model parameters on device: {total_num_parameters}.' 
) resume_checkpoint_path = self.trainer.ckpt_path @@ -1548,11 +1548,14 @@ def setup_validation_data(self, cfg): def setup_test_data(self, cfg): if hasattr(self, '_test_ds'): - consumed_samples = 0 - logging.info( - f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds)} and consumed samples: {consumed_samples}' - ) - self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) + if self._test_ds is not None: + consumed_samples = 0 + logging.info( + f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds)} and consumed samples: {consumed_samples}' + ) + self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) + else: + self._test_dl = None def generate( self, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index acd85261f7e5..42323e503f7d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -23,7 +23,7 @@ MegatronPretrainingRandomSampler, MegatronPretrainingSampler, ) -from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset import ( +from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset_legacy import ( build_mock_train_valid_test_datasets, build_train_valid_test_datasets, ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py new file mode 100644 index 000000000000..8cc39056554c --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py @@ -0,0 +1,651 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import itertools +import json +import os +import queue +import types +import warnings +from dataclasses import fields +from functools import partial +from typing import Any, Dict, Iterator, List, Optional, Union + +import torch +from omegaconf import OmegaConf, open_dict +from omegaconf.dictconfig import DictConfig +from pytorch_lightning.accelerators import CPUAccelerator +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( + MegatronPretrainingRandomSampler, + MegatronPretrainingSampler, +) + +# from nemo.collections.nlp.data.language_modeling.megatron.retro_dummy_dataset import build_train_valid_test_datasets as dummy_build_train_valid_test_datasets # turn on when running with dummy data +from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset import build_train_valid_test_datasets +from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.build_model import build_model +from nemo.collections.nlp.modules.common.megatron.module import Float16Module +from nemo.collections.nlp.modules.common.megatron.utils import ( + ApexGuardDefaults, + average_losses_across_data_parallel_group, + get_all_params_for_weight_decay_optimization, + get_ltor_masks_and_position_ids, + get_params_for_weight_decay_optimization, +) +from nemo.collections.nlp.modules.common.text_generation_strategy import TextGenerationStrategy +from nemo.collections.nlp.modules.common.text_generation_utils import ( + generate, + get_computeprob_response, + get_default_length_params, + get_default_sampling_params, + megatron_gpt_generate, +) +from nemo.collections.nlp.modules.common.transformer.text_generation import ( + LengthParam, + OutputType, + SamplingParam, + TextGeneration, +) +from nemo.collections.nlp.parts import utils_funcs +from nemo.collections.nlp.parts.utils_funcs import activation_to_func, get_last_rank +from nemo.core.classes import Exportable +from nemo.core.classes.common import PretrainedModelInfo +from nemo.core.neural_types import ChannelType, NeuralType +from nemo.utils import logging + +try: + import apex.transformer.pipeline_parallel.utils + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + + HAVE_APEX = True + +except (ImportError, ModuleNotFoundError): + + HAVE_APEX = False + +try: + from megatron.core import InferenceParams, parallel_state + from megatron.core.models.retro import RetroModel as MCoreRetroModel + from megatron.core.models.retro.config import RetroConfig + from megatron.core.models.retro.decoder_spec import get_retro_decoder_block_spec + from megatron.core.models.retro.utils import get_config_path as get_retro_config_path + from megatron.core.models.retro.utils import get_gpt_data_dir as get_retro_data_dir + from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.module import Float16Module as MCoreFloat16Module + from megatron.core.transformer.transformer_config import TransformerConfig + from megatron.core.utils import init_method_normal, scaled_init_method_normal + + # TODO @tmoon: Use once available in Megatron-LM + # from megatron.core.pipeline_parallel.schedules import DataIteratorList + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + TransformerConfig = ApexGuardDefaults + + 
HAVE_MEGATRON_CORE = False
+
+try:
+    import transformer_engine
+    from transformer_engine.pytorch import module as te_module
+
+    HAVE_TE = True
+
+except (ImportError, ModuleNotFoundError):
+    HAVE_TE = False
+
+
+class MegatronRetroModel(MegatronGPTModel):
+    """
+    Megatron Retro pretraining
+    """
+
+    def load_retro_config(self, cfg: DictConfig):
+        assert cfg.retro.get('retro_project_dir') is not None, "`--retro-project-dir` must be set to use Retro."
+
+        # Retro config path.
+        retro_config_path = get_retro_config_path(cfg.retro.get('retro_project_dir'))
+        assert os.path.exists(retro_config_path), "retro project dir missing config.json."
+
+        # Load retro config.
+        with open(retro_config_path) as f:
+
+            # Parse config.
+            retro_preprocess_config = types.SimpleNamespace(**json.load(f))
+
+            # Retro data path is relative to data path (via hard or soft links).
+            data_dir = get_retro_data_dir(cfg.retro.get('retro_project_dir'))
+            data_path = list(retro_preprocess_config.retro_gpt_data_path)
+            if len(data_path) % 2 == 0:
+                for i in range(len(data_path) - 1, -1, -2):
+                    data_path[i] = os.path.join(data_dir, data_path[i])
+            else:
+                assert len(data_path) == 1
+                data_path[0] = os.path.join(data_dir, data_path[0])
+
+            # Update args.
+            cfg.global_batch_size = retro_preprocess_config.retro_gpt_global_batch_size
+            cfg.seed = retro_preprocess_config.retro_gpt_seed
+            cfg.data.data_prefix = data_path
+            cfg.encoder_seq_length = retro_preprocess_config.retro_gpt_seq_length
+            cfg.data.seq_length = retro_preprocess_config.retro_gpt_seq_length
+            cfg.max_position_embeddings = retro_preprocess_config.retro_gpt_seq_length
+            # cfg.data.splits_string = retro_preprocess_config.retro_gpt_split  # removed because the latest RETRO data objects have separate RETRO training and RETRO preprocessing splits
+            cfg.tokenizer.model = (
+                cfg.retro.get('retro_project_dir') + '/' + retro_preprocess_config.retro_gpt_tokenizer_model
+            )
+            cfg.tokenizer.type = retro_preprocess_config.retro_gpt_tokenizer_type
+            cfg.tokenizer.vocab_file = retro_preprocess_config.retro_gpt_vocab_file
+            cfg.tokenizer.merge_file = retro_preprocess_config.retro_gpt_merge_file
+            with open_dict(cfg):
+                cfg.retro_train_samples_with_neighbors = retro_preprocess_config.retro_gpt_train_samples
+                cfg.retro_valid_samples_with_neighbors = retro_preprocess_config.retro_gpt_valid_samples
+                cfg.data.retro_data.retro_block_size = retro_preprocess_config.retro_block_size
+                cfg.data.retro_data.retro_chunk_length = retro_preprocess_config.retro_gpt_chunk_length
+                cfg.data.retro_data.retro_split_preprocessing = retro_preprocess_config.retro_gpt_split
+                cfg.data.retro_data.retro_neighbor_dirs = retro_preprocess_config.retro_neighbor_dirs
+
+        return cfg
+
+    def __init__(self, cfg: DictConfig, trainer: Trainer):
+
+        # override pre-processing arguments with retro pre-processing arguments
+        cfg = self.load_retro_config(cfg)
+
+        super().__init__(cfg, trainer=trainer)
+
+        logging.info(
+            "\n\n************** Experiment configuration (after overriding with RETRO's workdir values) ***********"
+        )
+        logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
+
+        return
+
+    def model_provider_func(self, pre_process, post_process):
+        """Model depends on pipeline parallelism."""
+        if self.mcore_gpt:
+            self.retro_model_config = self.build_retro_config()
+            model = MCoreRetroModel(
+                config=self.retro_model_config,
+                transformer_layer_spec=get_retro_decoder_block_spec(
+                    self.retro_model_config, use_transformer_engine=True
+                ),
+                vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size),
max_sequence_length=self.cfg.data.get('seq_length', 512), + pre_process=pre_process, + post_process=post_process, + parallel_output=True, + share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True), + position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'), + rotary_percent=self.cfg.get('rotary_percentage', 1.0), + seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), + ) + + return model + else: + assert self.mcore_gpt == True, "Currently only support mcore Retro." + + def forward( + self, tokens, text_position_ids, attention_mask, labels, context_input_ids, context_position_ids, context_mask + ): + output_tensor = self.model( + tokens, + text_position_ids, + attention_mask, + context_input_ids=context_input_ids, + context_position_ids=context_position_ids, + context_mask=context_mask, + labels=labels, + ) + return output_tensor + + def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None, **extra) -> Any: + # batch = {'prompts': List, 'neighbors': List[List]} + + inference_config = self.get_inference_config() + + if torch.distributed.get_rank() == 0: + logging.info("inference_config: ") + logging.info(inference_config) + + if inference_config is None: + return None + else: + # need to overwrite some configuration, make it immutable + inference_config = inference_config.copy() + compute_logprob = inference_config['compute_logprob'] + if compute_logprob: + inference_config['inputs'] = batch['prompts'] + inference_config['neighbors'] = batch['neighbors'] + inference_config['tokens_to_generate'] = 1 + inference_config['all_probs'] = True + inference_config["add_BOS"] = False + inference_config['greedy'] = True + inference_config['retro_inference'] = inference_config['retro_inference'] + response = generate(self, **inference_config) + compute_prob_response = get_computeprob_response(self.tokenizer, response, batch) + return compute_prob_response + else: + inference_config['inputs'] = batch['prompts'] + inference_config['neighbors'] = batch['neighbors'] + inference_config['retro_inference'] = inference_config['retro_inference'] + return generate(self, **inference_config) + + def get_batch(self, data_iterator): + """Generate a batch.""" + + # Broadcast data. 
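+        # Note: each element yielded by the iterator is expected to be a dict carrying both the standard GPT
+        # fields (tokens, labels, loss_mask, attention_mask, position_ids) and the RETRO retrieval fields
+        # (context_input_ids, context_attention_mask, context_position_ids), as assembled below.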
+ if data_iterator is not None: + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + data = next(data_iterator) + if isinstance(data, tuple): + data = data[0] + else: + data = None + + batch = { + 'tokens': data["tokens"], + 'labels': data["labels"], + 'loss_mask': data["loss_mask"], + 'attention_mask': data["attention_mask"], + 'position_ids': data["position_ids"], + 'context_input_ids': data["context_input_ids"], + 'context_attention_mask': data["context_attention_mask"], + 'context_position_ids': data["context_position_ids"], + } + + return batch + + def get_forward_output_and_loss_func(self, validation_step=False): + def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): + + # Get data batch + batch = self.get_batch(dataloader_iter) + + # Transfer needed data to GPU + required_keys = set() + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + required_keys.update(batch.keys()) + else: + required_keys.add('attention_mask') + if parallel_state.is_pipeline_first_stage(): + required_keys.update( + ('tokens', 'position_ids', 'context_input_ids', 'context_position_ids', 'context_mask') + ) + if parallel_state.is_pipeline_last_stage(): + required_keys.update(('labels', 'loss_mask')) + if self.get_attention_mask_from_fusion: + required_keys.remove('attention_mask') + batch = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in batch.items()} + + # reshape context_input_ids and context_position_ids for RETRO from [bs, l*k, r] => [bs*l*k, r] + context_input_ids = batch['context_input_ids'] + context_position_ids = batch['context_position_ids'] + context_input_ids = context_input_ids.view(-1, context_input_ids.shape[-1]).long() + context_position_ids = context_position_ids.view(-1, context_position_ids.shape[-1]).long() + batch['context_input_ids'] = context_input_ids + batch['context_position_ids'] = context_position_ids + + # slice batch along sequence dimension for context parallelism + batch = self.get_batch_on_this_context_parallel_rank(batch) + + # Model forward pass + forward_args = { + 'input_ids': batch['tokens'], + 'position_ids': batch['position_ids'], + 'attention_mask': batch['attention_mask'], + 'context_input_ids': batch['context_input_ids'], + 'context_position_ids': batch['context_position_ids'], + 'context_mask': None, # batch neighbor_attention_mask will be set to None following Lawrence's implementation + 'labels': batch['labels'], + 'loss_mask': batch['loss_mask'], + } + + if not self.mcore_gpt: + forward_args['checkpoint_activations_all_layers'] = checkpoint_activations_all_layers + if not self.use_loss_mask: + forward_args.pop('loss_mask') + else: + # TODO: @eharper can we add this to mcore? 
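+                # The mcore RETRO forward() does not accept `loss_mask` as an argument;
+                # masking is applied later in loss_func below via batch['loss_mask'].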
+                forward_args.pop('loss_mask')
+            output_tensor = model(**forward_args)
+
+            def loss_func(output_tensor):
+                # Loss for a micro-batch (ub)
+                loss_for_ub = self.loss_func(batch['loss_mask'], batch['num_valid_tokens_in_ub'], output_tensor)
+                if validation_step and not self.cfg.data.get('validation_drop_last', True):
+                    num_valid_tokens_in_ub = batch['loss_mask'].sum()
+                    if loss_for_ub.isnan():
+                        assert batch['loss_mask'].count_nonzero() == 0, 'Got NaN loss with non-empty input'
+                        loss_sum_for_ub = torch.zeros_like(num_valid_tokens_in_ub)
+                    else:
+                        loss_sum_for_ub = num_valid_tokens_in_ub * loss_for_ub
+
+                    loss_sum_and_ub_size_all_gpu = torch.cat(
+                        [
+                            loss_sum_for_ub.clone().detach().view(1),
+                            torch.tensor([num_valid_tokens_in_ub]).cuda().clone().detach(),
+                        ]
+                    )
+                    # Could potentially reduce num_valid_samples_in_microbatch and use that to aggregate instead of len(self._validation_ds)
+                    torch.distributed.all_reduce(
+                        loss_sum_and_ub_size_all_gpu, group=parallel_state.get_data_parallel_group()
+                    )
+                    return loss_for_ub, {'loss_sum_and_ub_size': loss_sum_and_ub_size_all_gpu}
+                else:
+                    reduced_loss = average_losses_across_data_parallel_group([loss_for_ub])
+                    return loss_for_ub, {'avg': reduced_loss}
+
+            return output_tensor, loss_func
+
+        return fwd_output_and_loss_func
+
+    def get_forward_output_only_func(self):
+        def fwd_output_only_func(dataloader_iter, model):
+            batch = next(dataloader_iter)
+            extra_arg = {}
+            if len(batch) == 6:
+                batch = [x.cuda() for x in batch]
+                tokens, attention_mask, position_ids, context_input_ids, context_position_ids, context_mask = batch
+                attention_mask = attention_mask[0:1]
+            else:
+                (
+                    tokens,
+                    attention_mask,
+                    position_ids,
+                    context_input_ids,
+                    context_position_ids,
+                    context_mask,
+                    set_inference_key_value_memory,
+                    inference_max_sequence_len,
+                ) = batch
+                tokens = tokens.cuda()
+                position_ids = position_ids.cuda()
+                if attention_mask is not None:
+                    attention_mask = attention_mask.cuda()
+                    attention_mask = attention_mask[0:1]
+                context_input_ids = context_input_ids.cuda()
+                context_position_ids = context_position_ids.cuda()
+                context_mask = None
+            if self.mcore_gpt:
+                # if first step, then clear KV cache, otherwise reuse inference_params
+                if set_inference_key_value_memory[0].item():
+                    self.inference_params = InferenceParams(
+                        max_batch_size=tokens.size(0), max_sequence_length=inference_max_sequence_len[0].item()
+                    )
+                extra_arg['inference_params'] = self.inference_params
+            else:
+                extra_arg['set_inference_key_value_memory'] = set_inference_key_value_memory[0].item()
+                extra_arg['inference_max_sequence_len'] = inference_max_sequence_len[0].item()
+            output_tensor = model(
+                tokens,
+                position_ids,
+                attention_mask,
+                context_input_ids=context_input_ids,
+                context_position_ids=context_position_ids,
+                context_mask=None,  # batch neighbor_attention_mask will be set to None following Lawrence's implementation
+                **extra_arg,
+            )
+
+            # Advance inference sequence offset.
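+            # The KV-cache offset grows by the length of the sequence dimension just processed;
+            # which axis holds the sequence depends on whether this is the last pipeline stage (see below).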
+            if self.inference_params:
+                # if last stage, then (final) output is [b, s, h], otherwise it's [s, b, h]
+                if parallel_state.is_pipeline_last_stage():
+                    self.inference_params.sequence_len_offset += output_tensor.size(1)
+                else:
+                    self.inference_params.sequence_len_offset += output_tensor.size(0)
+
+            def id_func(output_tensor):
+                return output_tensor, {'logits': output_tensor}
+
+            return output_tensor, id_func
+
+        return fwd_output_only_func
+
+    def build_retro_config(self) -> RetroConfig:
+        """ Builds a RetroConfig from the already constructed TransformerConfig
+            by adding the Retro-relevant variables. This method runs after build_transformer_config().
+        """
+        retro_config = self.transformer_config
+
+        # retro model args
+        retro_config.retro_project_dir = self.cfg.retro.get('retro_project_dir')
+        retro_config.retro_block_size = self.cfg.data.retro_data.get('retro_block_size')
+        retro_config.retro_chunk_length = self.cfg.data.retro_data.get('retro_chunk_length')
+        retro_config.retro_encoder_num_layers = self.cfg.retro.get('retro_encoder_num_layers', 2)
+        retro_config.retro_encoder_hidden_dropout = self.cfg.retro.get('retro_encoder_hidden_dropout', 0.1)
+        retro_config.retro_encoder_attention_dropout = self.cfg.retro.get('retro_encoder_attention_dropout', 0.1)
+        retro_config.retro_num_neighbors = self.cfg.retro.get('retro_num_neighbors', 2)
+        retro_config.retro_num_retrieved_chunks = self.cfg.retro.get('retro_num_retrieved_chunks', 2)
+        retro_config.retro_verify_neighbor_count = self.cfg.retro.get('retro_verify_neighbor_count', True)
+        retro_config.retro_retrieved_length = retro_config.retro_num_retrieved_chunks * retro_config.retro_chunk_length
+        retro_config.retro_split_preprocessing = self.cfg.data.retro_data.get('retro_split_preprocessing')
+        retro_config.retro_neighbor_dirs = self.cfg.data.retro_data.get('retro_neighbor_dirs')
+        logging.info("retro_config: ")
+        logging.info(retro_config)
+
+        # Validate Transformer Engine version.
+        from importlib.metadata import version
+
+        from pkg_resources import packaging
+
+        te_version = packaging.version.Version(version("transformer-engine"))
+        if te_version >= packaging.version.Version("1.3"):
+            try:
+                os.environ["NVTE_FLASH_ATTN"] = "0"
+                os.environ["NVTE_FUSED_ATTN"] = "0"
+                assert os.getenv("NVTE_FLASH_ATTN") == "0"
+                assert os.getenv("NVTE_FUSED_ATTN") == "0"
+            except Exception as e:
+                raise Exception(
+                    "When using Transformer Engine >= 1.3, the environment variables NVTE_FLASH_ATTN and NVTE_FUSED_ATTN must both be defined and set to '0'. Currently, NVTE_FLASH_ATTN == %s, NVTE_FUSED_ATTN == %s."
+                    % (os.getenv("NVTE_FLASH_ATTN", "[unset]"), os.getenv("NVTE_FUSED_ATTN", "[unset]"),)
+                )
+
+        return retro_config
+
+    def build_train_valid_test_datasets(self):
+        # Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step
+        # self._reconfigure_val_batches()
+        logging.info('Building mcore RETRO datasets.')
+        if self.trainer.limit_val_batches > 1.0 and isinstance(self.trainer.limit_val_batches, float):
+            raise ValueError("limit_val_batches must be an integer or float less than or equal to 1.0.")
+        global_batch_size = self.cfg.global_batch_size
+        # max_train_steps = self.trainer.max_steps
+        # eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches  # check this carefully, we want to match the mcore dataset value; should this be computed, or overridden?
+        # test_iters = self.trainer.limit_test_batches
+
+        # getting train_valid_test_num_samples from values in RETRO's workdir
+        train_valid_test_num_samples = [  # compute the number of training/validating samples from workdir/query/train_*; dividing number of chunks for (2048/64)
+            self.cfg.retro_train_samples_with_neighbors,
+            self.cfg.retro_valid_samples_with_neighbors,
+            0,
+        ]
+
+        if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float):
+            train_valid_test_num_samples[
+                1
+            ] = 1  # This is to make sure we only have one epoch on every validation iteration
+
+        self._train_ds, self._validation_ds, self._test_ds = build_train_valid_test_datasets(
+            cfg=self.cfg,
+            retro_config=self.retro_model_config,
+            train_valid_test_num_samples=train_valid_test_num_samples,
+            seq_length=self.cfg.data.seq_length,
+            tokenizer=self.tokenizer,
+        )
+
+        if self._train_ds is not None:
+            logging.info(f'Length of train dataset: {len(self._train_ds)}')
+        if self._validation_ds is not None:
+            logging.info(f'Length of val dataset: {len(self._validation_ds)}')
+        if self._test_ds is not None:
+            logging.info(f'Length of test dataset: {len(self._test_ds)}')
+        logging.info(f'Finished building mcore RETRO datasets.')
+
+        return self._train_ds, self._validation_ds, self._test_ds
+
+    def build_pretraining_data_loader(
+        self, dataset, consumed_samples, dataset_type=None, drop_last=True, pad_samples_to_global_batch_size=False
+    ):
+        """Build dataloader given an input dataset."""
+
+        logging.info(f'Building dataloader with consumed samples: {consumed_samples}')
+        # Megatron sampler
+        if hasattr(self.cfg.data, 'dataloader_type') and self.cfg.data.dataloader_type is not None:
+            if self.cfg.data.dataloader_type == 'single':
+                batch_sampler = MegatronPretrainingSampler(
+                    total_samples=len(dataset),
+                    consumed_samples=consumed_samples,
+                    micro_batch_size=self.cfg.micro_batch_size,
+                    data_parallel_rank=parallel_state.get_data_parallel_rank(),
+                    data_parallel_size=parallel_state.get_data_parallel_world_size(),
+                    drop_last=drop_last,
+                    global_batch_size=self.cfg.global_batch_size,
+                    rampup_batch_size=self.cfg.get('rampup_batch_size', None),
+                    pad_samples_to_global_batch_size=pad_samples_to_global_batch_size,
+                )
+            elif self.cfg.data.dataloader_type == 'cyclic':
+                batch_sampler = MegatronPretrainingRandomSampler(
+                    total_samples=len(dataset),
+                    consumed_samples=consumed_samples,
+                    micro_batch_size=self.cfg.micro_batch_size,
+                    data_parallel_rank=parallel_state.get_data_parallel_rank(),
+                    data_parallel_size=parallel_state.get_data_parallel_world_size(),
+                    drop_last=self.cfg.get('drop_last', True),
+                )
+            else:
+                raise ValueError('cfg.data.dataloader_type must be "single" or "cyclic"')
+        else:
+            raise ValueError('cfg.data.dataloader_type not found.
Must be "single" or "cyclic"') + + return torch.utils.data.DataLoader( + dataset, + batch_sampler=batch_sampler, + num_workers=self.cfg.data.num_workers, + pin_memory=True, + persistent_workers=True if self.cfg.data.num_workers > 0 else False, + ) + + def fwd_bwd_step(self, dataloader_iter, forward_only): + + # handle asynchronous grad reduction + no_sync_func = None + grad_sync_func = None + param_sync_func = None + if not forward_only and self.with_distributed_adam: + no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + grad_sync_func = self.reduce_overlap_gradients + param_sync_func = self.sync_overlap_parameters + + # pipeline schedules will get these from self.model.config + for module in self.get_model_module_list(): + module.config.no_sync_func = no_sync_func + module.config.grad_sync_func = grad_sync_func + module.config.param_sync_func = param_sync_func + + # run forward and backwards passes for an entire global batch + # we do this inside training_step to support pipeline parallelism + fwd_bwd_function = get_forward_backward_func() + + # TODO @akhattar: add num_micro_batches_with_partial_activation_checkpoints when ready + losses_reduced_per_micro_batch = fwd_bwd_function( + forward_step_func=self.get_forward_output_and_loss_func(forward_only), + data_iterator=self._make_data_iterator_list(dataloader_iter), + model=self.model, + num_microbatches=get_num_microbatches(), + forward_only=forward_only, + seq_length=self.cfg.encoder_seq_length, + micro_batch_size=self.cfg.micro_batch_size, + ) + + # only the last stages of the pipeline return losses + if losses_reduced_per_micro_batch: + if (not forward_only) or self.cfg.data.get('validation_drop_last', True): + # average loss across micro batches + loss_tensors_list = [loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch] + loss_tensor = torch.concat(loss_tensors_list) + loss_mean = loss_tensor.mean() + else: + # Get the total loss since micro batches sizes are not uniform + loss_sum_tensors_list = [ + loss_sum['loss_sum_and_ub_size'] + for loss_sum in losses_reduced_per_micro_batch + if loss_sum['loss_sum_and_ub_size'][1] > 0 + ] + loss_sum = ( + torch.vstack(loss_sum_tensors_list).sum(axis=0) + if len(loss_sum_tensors_list) > 0 + else torch.tensor([0.0, 0.0]).cuda() + ) + return loss_sum + else: + # we're not on the last pipeline stage so no losses + if forward_only: + loss_mean = [] + else: + loss_mean = torch.tensor(0.0).cuda() + + return loss_mean + + def validation_step(self, dataloader_iter, dataloader_idx=0): + """ + Our dataloaders produce a micro-batch and then we fetch + a number of microbatches depending on the global batch size and model parallel size + from the dataloader to produce a list of microbatches. + The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. + """ + mode = 'test' if self.trainer.testing else 'val' + # Initialize userbuffer communicators. 
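+        # Userbuffer communicators (used to overlap tensor-parallel communication with compute)
+        # are created lazily here on the first step for which `self.initialize_ub` is still set.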
+ if self.initialize_ub: + self.initialize_ub_func() + + if isinstance(self.model, list): + for model_module in self.model: + model_module.eval() + else: + self.model.eval() + + if self.cfg.get('fp8', False): + first_val_step = self.prev_step_training and not self.training + self.prev_step_training = self.training + else: + first_val_step = None + + with torch.no_grad(): + loss = self.fwd_bwd_step(dataloader_iter, True) + + if isinstance(self.model, list): + for model_module in self.model: + model_module.train() + else: + self.model.train() + + if mode == 'val': + # Append with the correct dataloader_idx in case of multiple dataloaders + if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: + self.validation_step_outputs[dataloader_idx].append(loss) + else: + self.validation_step_outputs.append(loss) + else: + if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + self.test_step_outputs[dataloader_idx].append(loss) + else: + self.test_step_outputs.append(loss) + + return loss diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py index 84df4a6965e1..67c94ae5d608 100644 --- a/nemo/collections/nlp/modules/common/tokenizer_utils.py +++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py @@ -114,6 +114,7 @@ def get_tokenizer( tokenizer_name = get_megatron_tokenizer(tokenizer_name) if tokenizer_name == 'sentencepiece': + logging.info("tokenizer_model: " + str(tokenizer_model)) return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer( model_path=tokenizer_model, special_tokens=special_tokens, legacy=True ) @@ -195,6 +196,14 @@ def get_nmt_tokenizer( logging.info(f'Using regex tokenization') return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file) elif library == 'megatron': + + if model_name == 'GPTSentencePieceTokenizer': + logging.info("tokenizer_model: ") + logging.info(tokenizer_model) + return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer( + model_path=tokenizer_model, legacy=legacy + ) + if model_name in megatron_tokenizer_model_map: model_name = megatron_tokenizer_model_map[model_name] logging.info( diff --git a/nemo/utils/callbacks/nemo_model_checkpoint.py b/nemo/utils/callbacks/nemo_model_checkpoint.py index 059ce4455977..e532297d9747 100644 --- a/nemo/utils/callbacks/nemo_model_checkpoint.py +++ b/nemo/utils/callbacks/nemo_model_checkpoint.py @@ -357,12 +357,15 @@ def remove_checkpoint_unfinished_marker(checkpoint_path: Union[Path, str], barri barrier_before: Synchronize ranks before removing the marker file. Defaults to False. """ - if barrier_before and torch.distributed.is_initialized(): - torch.distributed.barrier() - if is_global_rank_zero(): - marker_path = NeMoModelCheckpoint.format_checkpoint_unfinished_marker_path(checkpoint_path) - if marker_path.exists(): - marker_path.unlink() + try: + if barrier_before and torch.distributed.is_initialized(): + torch.distributed.barrier() + if is_global_rank_zero(): + marker_path = NeMoModelCheckpoint.format_checkpoint_unfinished_marker_path(checkpoint_path) + if marker_path.exists(): + marker_path.unlink() + except: + return def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) -> None: # barrier_after=True, so all ranks continue after the unfinished checkpoint marker is placed. 
diff --git a/tests/collections/nlp/test_indexed_retrieval_dataset.py b/tests/collections/nlp/test_indexed_retrieval_dataset.py index e35c3ab36840..5110651b34a6 100644 --- a/tests/collections/nlp/test_indexed_retrieval_dataset.py +++ b/tests/collections/nlp/test_indexed_retrieval_dataset.py @@ -28,7 +28,7 @@ MMapRetrievalIndexedDatasetBuilder, merge_knn_files, ) -from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset import RETRODataset +from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset_legacy import RETRODataset try: from megatron.core import parallel_state From d2ab8435222e4928b057ff271a957f143d3b75b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Tue, 16 Apr 2024 09:18:49 -0400 Subject: [PATCH 37/39] Extended input configuration + Lhotse multimodal (mixed audio and text-only) dataloading (#8581) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * wip Signed-off-by: Piotr Żelasko * Partially working config groups Signed-off-by: Piotr Żelasko * Working test with abasic group in the input config Signed-off-by: Piotr Żelasko * Working test with nested groups in input config Signed-off-by: Piotr Żelasko * Working test with specifying a YAML path for input_cfg Signed-off-by: Piotr Żelasko * a very rough example of text dataloading via lhotse Signed-off-by: Piotr Żelasko * Cleaner integration of multimodal audio/text loading that allows to control the effective audio vs text size (requires latest lhotse) Signed-off-by: Piotr Żelasko * remove obsolete test Signed-off-by: Piotr Żelasko * Fix an import in export_utils.py (#8571) Signed-off-by: w4-jinhyeonkim <131935801+w4-jinhyeonkim@users.noreply.github.com> Signed-off-by: Piotr Żelasko * Yttm deprecation (#8322) * yttm deprecation init commit Signed-off-by: AlexGrinch * removed tests Signed-off-by: AlexGrinch * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * bug fix Signed-off-by: AlexGrinch * path fix Signed-off-by: AlexGrinch * fixing path Signed-off-by: AlexGrinch * updated tests to spm Signed-off-by: AlexGrinch * updated Jenkinsfile Signed-off-by: AlexGrinch * new model with spm in tests Signed-off-by: AlexGrinch * yttm removed Signed-off-by: AlexGrinch * updated aayn config Signed-off-by: AlexGrinch --------- Signed-off-by: AlexGrinch Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Piotr Żelasko * Fixed missing copy import in rnnt_decoder.py (#8580) * Added copy import to rnnt_decoding.py Signed-off-by: Isaac McFadyen * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Isaac McFadyen Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Piotr Żelasko * Fix bug in RNNT Joint WER calculation for fused batch (#8587) Signed-off-by: smajumdar Signed-off-by: Piotr Żelasko * Fixed Context Parallel HtoD sync (#8557) * Fixed cp HtoD sync Signed-off-by: Selvaraj Anandaraj * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Selvaraj Anandaraj Co-authored-by: Selvaraj Anandaraj Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Piotr Żelasko * change default and add key to config files (#8594) Signed-off-by: Chen Cui Signed-off-by: Piotr Żelasko * Fix triton import guards (#8552) * Fix 
triton import guards Signed-off-by: Michal Futrega * Update attention.py Signed-off-by: Michal Futrega --------- Signed-off-by: Michal Futrega Signed-off-by: Piotr Żelasko * Add config key for dropout position in LoRA adapter (#8583) Signed-off-by: Michal Futrega Signed-off-by: Piotr Żelasko * fix ia3 mlp infused adapter (#8597) Signed-off-by: Chen Cui Signed-off-by: Piotr Żelasko * Prevent Redundant Gather for LoRA Sequence Parallel (#8602) * enable layernorm output gathered Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Piotr Żelasko * Accelerate `transcribe_speech.py` for short-form data: pre-sorting support (#8564) * POC using bucketing in transcribe_speech.py Signed-off-by: Piotr Żelasko * extend to multi task aed Signed-off-by: Piotr Żelasko * fixes for aed multi task text/lang field selectors Signed-off-by: Piotr Żelasko * remove assert Signed-off-by: Piotr Żelasko * fix Signed-off-by: Piotr Żelasko * expose option for bucket buffer size Signed-off-by: Piotr Żelasko * fixes, ctc support Signed-off-by: Piotr Żelasko * support pre-sorting manifests in transcribe_speech.py Signed-off-by: Piotr Żelasko * cleanup Signed-off-by: Piotr Żelasko * reorder transcriptions back to original manifest order Signed-off-by: Piotr Żelasko * remove bucketing entirely Signed-off-by: Piotr Żelasko * code review changes Signed-off-by: Piotr Żelasko * code review changes--amend Signed-off-by: Piotr Żelasko * refactor text_field/lang_field passing Signed-off-by: Piotr Żelasko * Fix reordering bug; disable presorting for multi task for now Signed-off-by: Piotr Żelasko * Add support for presort + multi task model Signed-off-by: Piotr Żelasko * Code reviews Signed-off-by: Piotr Żelasko * Fix jenkins tests, add user-friendly error msg for canary Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * fix tests Signed-off-by: Piotr Żelasko * Bump min required lhotse version Signed-off-by: Piotr Żelasko * Add some documentation about this config format and the multimodal features Signed-off-by: Piotr Żelasko * Add caution about multiple shards Signed-off-by: Piotr Żelasko * Address Tom's code review Signed-off-by: Piotr Żelasko * Add copyright header Signed-off-by: Piotr Żelasko * Fix (hopefully) issue with forced ascii encoding in CI Signed-off-by: Piotr Żelasko * Support resolving input_cfg path into config contents Signed-off-by: Piotr Żelasko * Code review changes in docs Signed-off-by: Piotr Żelasko * Fix unicode decode error Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Signed-off-by: w4-jinhyeonkim <131935801+w4-jinhyeonkim@users.noreply.github.com> Signed-off-by: AlexGrinch Signed-off-by: Isaac McFadyen Signed-off-by: smajumdar Signed-off-by: Selvaraj Anandaraj Signed-off-by: Chen Cui Signed-off-by: Michal Futrega Co-authored-by: w4-jinhyeonkim <131935801+w4-jinhyeonkim@users.noreply.github.com> Co-authored-by: Aleksey Grinchuk (Oleksii Hrinchuk) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Isaac McFadyen Co-authored-by: Somshubra Majumdar Co-authored-by: Selvaraj Anandaraj Co-authored-by: Selvaraj Anandaraj Co-authored-by: Chen Cui Co-authored-by: Michal Futrega Co-authored-by: Pablo Garay --- docs/source/asr/datasets.rst | 135 ++++++ nemo/collections/common/data/lhotse/cutset.py | 220 
++++++++- .../common/data/lhotse/dataloader.py | 156 +++++- .../common/data/lhotse/nemo_adapters.py | 16 +- .../common/data/lhotse/text_adapters.py | 97 ++++ .../tokenizers/sentencepiece_tokenizer.py | 6 +- requirements/requirements_asr.txt | 2 +- .../common/test_lhotse_dataloading.py | 453 +++++++++++++++++- 8 files changed, 1042 insertions(+), 43 deletions(-) create mode 100644 nemo/collections/common/data/lhotse/text_adapters.py diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst index 7612c6a3f630..ca16d0538a31 100644 --- a/docs/source/asr/datasets.rst +++ b/docs/source/asr/datasets.rst @@ -664,6 +664,141 @@ Some other Lhotse related arguments we support: The full and always up-to-date list of supported options can be found in ``LhotseDataLoadingConfig`` class. +Extended multi-dataset configuration format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Combining a large number of datasets and defining weights for them can be tricky. +We offer an extended configuration format that allows you to explicitly define datasets, +dataset groups, and their weights either inline in the experiment configuration, +or as a path to a separate YAML file. + +In addition to the features above, this format introduces a special ``tags`` dict-like field. +The keys and values in ``tags`` are automatically attached to every sampled example, which +is very useful when combining multiple datasets with different properties. +The dataset class which converts these examples to tensors can partition the mini-batch and apply +different processing to each group. +For example, you may want to construct different prompts for the model using metadata in ``tags``. + +.. note:: When fine-tuning a model that was trained with ``input_cfg`` option, typically you'd only need + to override the following options: ``input_cfg=null`` and ``manifest_filepath=path/to/manifest.json``. + +Example 1. Combine two datasets with equal weights and attach custom metadata in ``tags`` to each cut: + +.. code-block:: yaml + + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.4 + tags: + lang: en + pnc: no + - type: nemo_tarred + manifest_filepath: /path/to/other/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/other/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.6 + tags: + lang: pl + pnc: yes + +Example 2. Combine multiple (4) datasets, corresponding to different tasks (ASR, AST). +Each task gets its own group and its own weight. +Then within each task, each dataset get its own within-group weight as well. +The final weight is the product of outer and inner weight: + +.. 
code-block:: yaml + + input_cfg: + - type: group + weight: 0.7 + tags: + task: asr + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/asr1/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/asr1/audio__OP_0..512_CL_.tar + weight: 0.6 + tags: + source_lang: en + target_lang: en + - type: nemo_tarred + manifest_filepath: /path/to/asr2/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/asr2/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.4 + tags: + source_lang: pl + target_lang: pl + - type: group + weight: 0.3 + tags: + task: ast + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/ast1/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/ast1/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.2 + tags: + source_lang: en + target_lang: pl + - type: nemo_tarred + manifest_filepath: /path/to/ast2/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/ast2/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.8 + tags: + source_lang: pl + target_lang: en + +Configuring multi-modal dataloading +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Our configuration format supports specifying data sources from other modalities than just audio. +At this time, this support is extended to text-only data. We provide the following parser types: + +* ``txt`` for raw text files, sharded or unsharded. This can represent, for example, language modeling data. +* ``txt_pair`` for pairs of raw text files, sharded or unsharded. This can represent, for example, machine translation data. + +The key strength of this approach is that we can easily combine audio datasets and text datasets, +and benefit from every other technique we described above such as dynamic data mixing, data weighting, dynamic bucketing, and so on. +To enable multimodal dataloading, we provide several configuration options: + +* ``use_multimodal_sampling`` when set to True, we'll discard the settings of ``batch_duration`` and ``quadratic_duration`` and consider the settings below instead. + +* ``batch_tokens`` is the maximum number of tokens we want to find inside a mini-batch. Similarly to ``batch_duration``, this number does consider padding tokens too, therefore enabling bucketing is recommended to maximize the ratio of real vs padding tokens. + +* ``token_equivalent_duration`` is used to be able to measure audio examples in the number of "tokens". For example, if we're using fbank with 0.01s frame shift and an acoustic model that has a subsampling factor of 0.08, then a reasonable setting for this could be 0.08 (which means every subsampled frame counts as one token). Calibrate this value to fit your needs. Note that this value acts as a "balancer" between how much audio data vs text data gets sampled into a mini-batch. + +* ``quadratic_factor`` works the same way as ``quadratic_duration``, but is defined in the number of tokens. + +Example 3. Combine an ASR (audio-text) dataset with an MT (text-only) dataset so that mini-batches have some examples from both datasets. 
Provide a custom prompt field for both datasets (to be leveraged by a relevant dataset class): + +```yaml +use_multimodal_sampling: true +batch_tokens: 1024 +token_equivalent_duration: 0.08 # 0.01 frame shift * 8 subsampling factor +quadratic_factor: 50 +num_buckets: 30 +use_bucketing: true +input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.5 + tags: + lang: en + prompt: "Given the following recording, transcribe what the person is saying:" + - type: txt_pair + source_path: /path/to/en__OP_0..512_CL_.txt + target_path: /path/to/pl__OP_0..512_CL_.txt + source_language: en + target_language: pl + weight: 0.5 + tags: + prompt: "Translate the following text to Polish:" +``` + +.. caution:: We strongly recommend to use multiple shards for text files as well so that different nodes and dataloading workers are able to randomize the order of text iteration. Otherwise, multi-GPU training has a high risk of duplication of text examples. + Pre-computing bucket duration bins ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py index 028ea8bfef90..fa5ae5804c4b 100644 --- a/nemo/collections/common/data/lhotse/cutset.py +++ b/nemo/collections/common/data/lhotse/cutset.py @@ -14,30 +14,36 @@ import logging import warnings +from functools import partial from itertools import repeat from pathlib import Path from typing import Sequence, Tuple from lhotse import CutSet +from omegaconf import DictConfig, ListConfig, OmegaConf from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator, LazyNeMoTarredIterator +from nemo.collections.common.data.lhotse.text_adapters import LhotseTextAdapter, LhotseTextPairAdapter -def read_cutset_from_config(config) -> Tuple[CutSet, bool]: +def read_cutset_from_config(config: DictConfig) -> Tuple[CutSet, bool]: """ Reads NeMo configuration and creates a CutSet either from Lhotse or NeMo manifests. Returns a tuple of ``CutSet`` and a boolean indicating whether the data is tarred (True) or not (False). """ - # First, we'll figure out if we should read Lhotse manifest or NeMo manifest. - use_nemo_manifest = all(config[opt] is None for opt in ("cuts_path", "shar_path")) + # First, check if the dataset is specified in the new configuration format and use it if possible. + if config.get("input_cfg") is not None: + return read_dataset_config(config) + # Now, we'll figure out if we should read Lhotse manifest or NeMo manifest. + use_nemo_manifest = all(config.get(opt) is None for opt in ("cuts_path", "shar_path")) if use_nemo_manifest: assert ( - config.manifest_filepath is not None - ), "You must specify either: manifest_filepath, lhotse.cuts_path, or lhotse.shar_path" - is_tarred = config.tarred_audio_filepaths is not None + config.get("manifest_filepath") is not None + ), "You must specify either: manifest_filepath, cuts_path, or shar_path" + is_tarred = config.get("tarred_audio_filepaths") is not None else: - is_tarred = config.shar_path is not None + is_tarred = config.get("shar_path") is not None if use_nemo_manifest: # Read NeMo manifest -- use the right wrapper depending on tarred/non-tarred. 
cuts = read_nemo_manifest(config, is_tarred) @@ -47,6 +53,193 @@ def read_cutset_from_config(config) -> Tuple[CutSet, bool]: return cuts, is_tarred +KNOWN_DATASET_CONFIG_TYPES = frozenset(("nemo", "nemo_tarred", "lhotse", "lhotse_shar", "txt", "txt_pair", "group")) + + +def read_dataset_config(config) -> tuple[CutSet, bool]: + """ + Input configuration format examples. + Example 1. Combine two datasets with equal weights and attach custom metadata in ``tags`` to each cut:: + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.5 + tags: + lang: en + some_metadata: some_value + - type: nemo_tarred + manifest_filepath: /path/to/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.5 + tags: + lang: pl + some_metadata: some_value + Example 2. Combine multiple (4) datasets, with 2 corresponding to different tasks (ASR, AST). + There are two levels of weights: per task (outer) and per dataset (inner). + The final weight is the product of outer and inner weight:: + input_cfg: + - type: group + weight: 0.7 + tags: + task: asr + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/asr1/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/asr1/audio__OP_0..512_CL_.tar + weight: 0.6 + tags: + lang: en + some_metadata: some_value + - type: nemo_tarred + manifest_filepath: /path/to/asr2/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/asr2/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.4 + tags: + lang: pl + some_metadata: some_value + - type: group + weight: 0.3 + tags: + task: ast + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/ast1/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/ast1/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.2 + tags: + src_lang: en + tgt_lang: pl + - type: nemo_tarred + manifest_filepath: /path/to/ast2/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/ast2/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.8 + tags: + src_lang: pl + tgt_lang: en + """ + propagate_attrs = { + "shuffle": config.shuffle, + "shard_seed": config.shard_seed, + "text_field": config.text_field, + "lang_field": config.lang_field, + "missing_sampling_rate_ok": config.missing_sampling_rate_ok, + "max_open_streams": config.max_open_streams, + } + input_cfg = config.input_cfg + if isinstance(input_cfg, (str, Path)): + # Resolve /path/to/input_cfg.yaml into config contents if needed. + input_cfg = OmegaConf.load(input_cfg) + cuts, is_tarred = parse_and_combine_datasets(input_cfg, propagate_attrs=propagate_attrs) + return cuts, is_tarred + + +def parse_group(grp_cfg: DictConfig, propagate_attrs: dict) -> [CutSet, bool]: + assert grp_cfg.type in KNOWN_DATASET_CONFIG_TYPES, f"Unknown item type in dataset config list: {grp_cfg.type=}" + if grp_cfg.type == "nemo_tarred": + is_tarred = True + cuts = read_nemo_manifest(grp_cfg, is_tarred=is_tarred) + elif grp_cfg.type == "nemo": + is_tarred = False + cuts = read_nemo_manifest(grp_cfg, is_tarred=is_tarred) + elif grp_cfg.type == "lhotse_shar": + is_tarred = True + cuts = read_lhotse_manifest(grp_cfg, is_tarred=is_tarred) + elif grp_cfg.type == "lhotse": + is_tarred = False + cuts = read_lhotse_manifest(grp_cfg, is_tarred=is_tarred) + # Note: "txt" and "txt_pair" have "is_tarred" set to True. 
+ # The main reason is to enable combination of tarred audio and text dataloading, + # since we don't allow combination of tarred and non-tarred datasets. + # We choose to treat text as-if it was tarred, which also tends to be more + # efficient as it moves the text file iteration into dataloading subprocess. + elif grp_cfg.type == "txt": + is_tarred = True + cuts = read_txt_paths(grp_cfg) + elif grp_cfg.type == "txt_pair": + is_tarred = True + cuts = read_txt_pair_paths(grp_cfg) + elif grp_cfg.type == "group": + cuts, is_tarred = parse_and_combine_datasets(grp_cfg.input_cfg, propagate_attrs=propagate_attrs,) + else: + raise ValueError(f"Unrecognized group: {grp_cfg.type}") + # Attach extra tags to every utterance dynamically, if provided. + if (extra_tags := grp_cfg.get("tags")) is not None: + cuts = cuts.map(partial(attach_tags, tags=extra_tags), apply_fn=None) + return cuts, is_tarred + + +def read_txt_paths(config: DictConfig) -> CutSet: + return CutSet( + LhotseTextAdapter( + paths=config.paths, language=config.language, shuffle_shards=config.shuffle, shard_seed=config.shard_seed, + ) + ).repeat() + + +def read_txt_pair_paths(config: DictConfig) -> CutSet: + return CutSet( + LhotseTextPairAdapter( + source_paths=config.source_paths, + target_paths=config.target_paths, + source_language=config.source_language, + target_language=config.target_language, + shuffle_shards=config.shuffle, + shard_seed=config.shard_seed, + ) + ).repeat() + + +def attach_tags(cut, tags: dict): + for key, val in tags.items(): + setattr(cut, key, val) + return cut + + +def parse_and_combine_datasets( + config_list: list[DictConfig] | ListConfig, propagate_attrs: dict +) -> tuple[CutSet, bool]: + cuts = [] + weights = [] + tarred_status = [] + assert len(config_list) > 0, "Empty group in dataset config list." + + for item in config_list: + + # Check if we have any attributes that are propagated downwards to each item in the group. + # If a key already exists in the item, it takes precedence (we will not overwrite); + # otherwise we will assign it. + # We also update propagate_atts for the next sub-groups based on what's present in this group + next_propagate_attrs = propagate_attrs.copy() + for k, v in propagate_attrs.items(): + if k not in item: + item[k] = v + else: + next_propagate_attrs[k] = item[k] + + # Load the item (which may also be another group) as a CutSet. + item_cuts, item_is_tarred = parse_group(item, next_propagate_attrs) + cuts.append(item_cuts) + tarred_status.append(item_is_tarred) + if (w := item.get("weight")) is not None: + weights.append(w) + + assert all(t == tarred_status[0] for t in tarred_status), "Mixing tarred and non-tarred datasets is not supported." + assert len(weights) == 0 or len(cuts) == len( + weights + ), "Missing dataset weight. When weighting datasets, every dataset must have a specified weight." + if len(cuts) > 1: + cuts = mux( + *cuts, + weights=weights if weights else None, + max_open_streams=propagate_attrs["max_open_streams"], + seed=propagate_attrs["shard_seed"], + ) + else: + (cuts,) = cuts + return cuts, tarred_status[0] + + def read_lhotse_manifest(config, is_tarred: bool) -> CutSet: if is_tarred: # Lhotse Shar is the equivalent of NeMo's native "tarred" dataset. @@ -64,7 +257,7 @@ def read_lhotse_manifest(config, is_tarred: bool) -> CutSet: # - integer means we'll set a specific seed in every worker, and data would be duplicated across them. # This is mostly useful for unit testing or debugging. 
shard_seed = config.shard_seed - if config.cuts_path is not None: + if config.get("cuts_path") is not None: warnings.warn("Note: lhotse.cuts_path will be ignored because lhotse.shar_path was provided.") if isinstance(config.shar_path, (str, Path)): logging.info(f"Initializing Lhotse Shar CutSet (tarred) from a single data source: '{config.shar_path}'") @@ -119,9 +312,10 @@ def read_nemo_manifest(config, is_tarred: bool) -> CutSet: config.manifest_filepath, tar_paths=config.tarred_audio_filepaths, shuffle_shards=config.shuffle, + shard_seed=config.shard_seed, **common_kwargs, ) - ) + ).repeat() else: cuts = CutSet(LazyNeMoIterator(config.manifest_filepath, **notar_kwargs, **common_kwargs)) else: @@ -132,7 +326,7 @@ def read_nemo_manifest(config, is_tarred: bool) -> CutSet: # this ensures that we distribute the data from each source uniformly throughout each epoch. # Setting equal weights would exhaust the shorter data sources closer the towards the beginning # of an epoch (or over-sample it in the case of infinite CutSet iteration with .repeat()). - # Format option 1: + # Format option 2: # Assume it's [[path1, weight1], [path2, weight2], ...] (while tarred_audio_filepaths remain unchanged). # Note: this option allows to manually set the weights for multiple datasets. logging.info( @@ -148,7 +342,11 @@ def read_nemo_manifest(config, is_tarred: bool) -> CutSet: manifest_path = manifest_info[0] if is_tarred: nemo_iter = LazyNeMoTarredIterator( - manifest_path=manifest_path, tar_paths=tar_path, shuffle_shards=config.shuffle, **common_kwargs + manifest_path=manifest_path, + tar_paths=tar_path, + shuffle_shards=config.shuffle, + shard_seed=config.shard_seed, + **common_kwargs, ) else: nemo_iter = LazyNeMoIterator(manifest_path, **notar_kwargs, **common_kwargs) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 9eeb8800066a..fd2a69725a0e 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -15,12 +15,14 @@ import logging import warnings from dataclasses import dataclass -from functools import partial +from functools import partial, singledispatch from typing import Any, Optional +import numpy as np import torch from lhotse import CutSet from lhotse.cut import Cut +from lhotse.cut.text import TextExample, TextPairExample from lhotse.dataset import ( CutConcatenate, DynamicBucketingSampler, @@ -28,11 +30,14 @@ IterableDatasetWrapper, make_worker_init_fn, ) +from lhotse.dataset.sampling.base import SamplingConstraint, TimeConstraint, TokenConstraint from lhotse.lazy import LazyFlattener from lhotse.utils import fastcopy from omegaconf import DictConfig, OmegaConf +from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper from nemo.collections.common.data.lhotse.cutset import read_cutset_from_config +from nemo.collections.common.tokenizers import TokenizerSpec @dataclass @@ -46,6 +51,7 @@ class LhotseDataLoadingConfig: # 1. Data inputs. # a. "Classic" NeMo input path fields. + input_cfg: Any = None # TODO(pzelasko): typing manifest_filepath: Any = None # str | list[list[str | float]] | None = None tarred_audio_filepaths: Any = None # str | list[list[str]] | None = None # b. Lhotse CutSet manifest / Lhotse Shar tar dir paths. 
@@ -70,6 +76,12 @@ class LhotseDataLoadingConfig: shard_seed: int | str = "trng" max_open_streams: int | None = None + # 2.1 Multimodal sampling override options + use_multimodal_sampling: bool = False + token_equivalent_duration: float | None = None + batch_tokens: int | None = None + quadratic_factor: float | None = None + # 3. Supported existing NeMo options. shuffle: bool = False sample_rate: int = 16000 @@ -102,12 +114,16 @@ class LhotseDataLoadingConfig: def get_lhotse_dataloader_from_config( - config: DictConfig, global_rank: int, world_size: int, dataset: torch.utils.data.Dataset + config: DictConfig, + global_rank: int, + world_size: int, + dataset: torch.utils.data.Dataset, + tokenizer: TokenizerSpec | TokenizerWrapper = None, ) -> torch.utils.data.DataLoader: """ Set up a Lhotse training dataloder. - Expects a typical NeMo dataset configuration format, with additional fields: "use_lhotse=True" and "lhotse: ". + Expects a typical NeMo dataset configuration format, with additional fields: "use_lhotse=True". Some fields in the original NeMo configuration may be ignored. The ``dataset`` parameter should be an instance of a Lhotse-compatible PyTorch Dataset class. @@ -115,8 +131,15 @@ def get_lhotse_dataloader_from_config( This dataset is not expected to hold a reference to any actual data; it may be interpreted as a function mapping a Lhotse CutSet into a mini-batch of tensors. - For example, see: :class:`nemo.collections.asr.data.audio_to_text_lhotse.LhotseSpeechToTextBpeDataset`, + For an example, see: :class:`nemo.collections.asr.data.audio_to_text_lhotse.LhotseSpeechToTextBpeDataset`, which is constructed from just a tokenizer and essentially loads and collates audio and tokenizes the transcript. + + The ``tokenizer`` is used when text-only datasets are included in dataloading. + In these cases we will tokenize ``TextExample``s before sampling mini-batches so that + we can account for their number of tokens. + Note: this behaviour might eventually be extended to audio datasets too. + + Note that ``tokenizer`` can be any tokenizer type (e.g. both SentencePiece and Aggregate tokenizers work). """ logging.info("We will be using a Lhotse DataLoader.") @@ -132,7 +155,16 @@ def get_lhotse_dataloader_from_config( cuts = cuts.filter(DurationFilter(config.min_duration, config.max_duration)) # Expands cuts if multiple translations are provided. - cuts = CutSet(LazyFlattener(cuts.map(_flatten_alt_text))) + cuts = CutSet(LazyFlattener(cuts.map(_flatten_alt_text, apply_fn=None))) + + if config.use_multimodal_sampling: + assert ( + tokenizer is not None + ), "You must pass a tokenizer to `get_lhotse_dataloader_from_config` in order to read text-only datasets (enabled via use_multimodal_dataloading)" + if not isinstance(tokenizer, TokenizerWrapper): + tokenizer = TokenizerWrapper(tokenizer) + # Note this code can also pre-tokenize the text in cuts, but for now we disable it with apply_fn. + cuts = cuts.map(partial(tokenize, tokenizer=tokenizer), apply_fn=is_text) # 2. Optional augmentations. # 2.a. Noise mixing. 
@@ -149,6 +181,20 @@ def get_lhotse_dataloader_from_config( if config.perturb_speed: cuts = CutSet.mux(cuts, cuts.perturb_speed(0.9), cuts.perturb_speed(1.1),) + if config.use_multimodal_sampling: + constraint = MultimodalSamplingConstraint( + token_equivalent_duration=config.token_equivalent_duration, + batch_size=config.batch_size, + batch_tokens=config.batch_tokens, + quadratic_factor=config.quadratic_factor, + ) + else: + constraint = TimeConstraint( + max_cuts=config.batch_size, + max_duration=config.batch_duration, + quadratic_duration=config.quadratic_duration, + ) + # 3. The sampler. if config.use_bucketing: # Bucketing. Some differences from NeMo's native bucketing: @@ -161,12 +207,10 @@ def get_lhotse_dataloader_from_config( ) sampler = DynamicBucketingSampler( cuts, - max_duration=config.batch_duration, - max_cuts=config.batch_size, + constraint=constraint, shuffle=config.shuffle, drop_last=config.drop_last, shuffle_buffer_size=config.shuffle_buffer_size, - quadratic_duration=config.quadratic_duration, seed=config.seed, num_buckets=config.num_buckets, duration_bins=config.bucket_duration_bins, @@ -185,12 +229,10 @@ def get_lhotse_dataloader_from_config( ) sampler = DynamicCutSampler( cuts, - max_duration=config.batch_duration, - max_cuts=config.batch_size, + constraint=constraint, shuffle=config.shuffle, drop_last=config.drop_last, shuffle_buffer_size=config.shuffle_buffer_size, - quadratic_duration=config.quadratic_duration, seed=config.seed, rank=0 if is_tarred else global_rank, world_size=1 if is_tarred else world_size, @@ -260,6 +302,89 @@ def make_structured_with_schema_warnings(config: DictConfig) -> DictConfig: return OmegaConf.merge(default, config) +@dataclass +class MultimodalSamplingConstraint(SamplingConstraint): + # how many seconds of audio is a text token worth; balances audio to text ratio in a mini-batch + token_equivalent_duration: float + + # defines maximum batch size (may be lower than that if batch_length is also specified) + batch_size: int | None = None + + # defines the total number of tokens in a mini-batch + # setting this enables dynamic batch sizes + # we will use ``token_equivalent_duration`` to convert audio examples to token sizes + batch_tokens: int | None = None + + # when specified, this value is inversely proportional to the penalty we assign + # to longer examples when measuring their length/duration; + # i.e. 
large quadratic factor is a small penalty, small quadratic factor is a large penalty + # tweaking this helps equalize the GPU memory usage for dynamic batch sizes when using bucketing + quadratic_factor: float | None = None + + _internal = None + + def __post_init__(self): + self._internal = TokenConstraint( + max_tokens=self.batch_tokens, max_examples=self.batch_size, quadratic_length=self.quadratic_factor, + ) + + def add(self, example: Any) -> None: + if isinstance(example, Cut): + num_tokens = self.measure_length(example) + example.num_tokens = num_tokens + self._internal.add(example) + + def exceeded(self) -> bool: + return self._internal.exceeded() + + def close_to_exceeding(self) -> bool: + return self._internal.close_to_exceeding() + + def reset(self) -> None: + self._internal.reset() + + def measure_length(self, example: Any) -> float: + if isinstance(example, Cut): + return example.duration / self.token_equivalent_duration + if isinstance(example, (TextExample, TextPairExample)): + return example.num_tokens + raise RuntimeError(f"Unsupported example type: {type(example)}") + + +# The functions below are overloads for different types of examples. +# This is required for multi-modal dataloading since we will iterate +# over a union type now. + + +def is_text(example) -> bool: + return isinstance(example, (TextExample, TextPairExample)) + + +@singledispatch +def tokenize(example, tokenizer): + raise RuntimeError(f"Unsupported type of example: {type(example)}") + + +@tokenize.register +def _(example: Cut, tokenizer) -> Cut: + for s in example.supervisions: + s.tokens = np.asarray(tokenizer(s.text, s.language)) + return example + + +@tokenize.register +def _(example: TextExample, tokenizer) -> TextExample: + example.tokens = np.asarray(tokenizer(example.text, example.language)) + return example + + +@tokenize.register +def _(example: TextPairExample, tokenizer) -> TextPairExample: + example.source.tokens = np.asarray(tokenizer(example.source.text, example.source.language)) + example.target.tokens = np.asarray(tokenizer(example.source.text, example.target.language)) + return example + + # The helper callables below exist to avoid passing lambdas into lhotse CutSet map/filter methods. # Lambdas are not serializable across processes by pickle. # Note: lhotse offers LHOTSE_DILL_ENABLED=1 and ``lhotse.lazy.set_dill_enabled(True)`` @@ -273,8 +398,11 @@ def __init__(self, d_min: float, d_max: float) -> None: self.d_min = d_min self.d_max = d_max - def __call__(self, cut: Cut) -> bool: - return self.d_min <= cut.duration <= self.d_max + def __call__(self, example) -> bool: + if isinstance(example, Cut): + return self.d_min <= example.duration <= self.d_max + else: + return True # does not apply to text etc. def _normalize_loudness(cuts: CutSet, db_norm: float) -> CutSet: @@ -287,7 +415,7 @@ def _merge_supervisions(cuts: CutSet) -> CutSet: def _flatten_alt_text(cut) -> list: ans = [cut] - if cut.custom is None or cut.custom.get("alt_text") is None: + if not isinstance(cut, Cut) or cut.custom is None or cut.custom.get("alt_text") is None: return ans cut = cut.move_to_memory(audio_format="wav") # performs I/O once and holds audio in memory from now on # Popping to ease eyesight on debug. 
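To make the multimodal constraint concrete, a small back-of-the-envelope example (values are illustrative): ``measure_length`` converts an audio cut's duration into a token-equivalent count, so audio and text examples draw from the same ``batch_tokens`` budget.

# Illustrative arithmetic only; mirrors MultimodalSamplingConstraint.measure_length above.
token_equivalent_duration = 0.08   # seconds of audio treated as one token
batch_tokens = 1024                # per-batch token budget

audio_cut_duration = 4.0           # seconds
audio_as_tokens = audio_cut_duration / token_equivalent_duration   # 4.0 / 0.08 = 50.0

text_example_tokens = 37           # num_tokens of a tokenized TextExample

remaining_budget = batch_tokens - audio_as_tokens - text_example_tokens   # 937.0
print(audio_as_tokens, remaining_budget)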
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index 4fae72e6f467..02b3e1f4edda 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -18,12 +18,13 @@ import tarfile from io import BytesIO from pathlib import Path -from typing import Generator, Iterable, List +from typing import Generator, Iterable, List, Literal import soundfile from cytoolz import groupby from lhotse import AudioSource, Recording, SupervisionSegment from lhotse.cut import Cut +from lhotse.dataset.dataloading import resolve_seed from lhotse.lazy import LazyIteratorChain, LazyJsonlIterator from lhotse.serialization import open_best from lhotse.utils import compute_num_samples @@ -147,6 +148,12 @@ class LazyNeMoTarredIterator: Args ``manifest_path`` and ``tar_paths`` can be either a path/string to a single file, or a string in NeMo format that indicates multiple paths (e.g. "[[data/bucket0/tarred_audio_paths.json],[data/bucket1/...]]"). + The ``shard_seed`` argument is used to seed the RNG shuffling the shards. + By default it's ``trng`` which samples a seed number from OS-provided TRNG (see Python ``secrets`` module). + Seed is resolved lazily so that every dataloading worker may sample a different one. + Override with an integer value for deterministic behaviour and consult Lhotse documentation for details: + https://lhotse.readthedocs.io/en/latest/datasets.html#handling-random-seeds + Example of CutSet with inter-shard shuffling enabled:: >>> cuts = lhotse.CutSet(LazyNeMoTarredIterator( @@ -161,6 +168,7 @@ def __init__( manifest_path: str | Path, tar_paths: str | list, shuffle_shards: bool = False, + shard_seed: int | Literal["trng", "randomized"] = "trng", text_field: str = "text", lang_field: str = "lang", ) -> None: @@ -189,6 +197,7 @@ def strip_pipe(p): tar_paths = expand_sharded_filepaths(tar_paths) self.shard_id_to_tar_path: dict[int, str] = {int(strip_pipe(p).stem.split("_")[1]): p for p in tar_paths} self.shuffle_shards = shuffle_shards + self.shard_seed = shard_seed self.text_field = text_field self.lang_field = lang_field self._validate() @@ -205,6 +214,7 @@ def to_shards(self) -> List["LazyNeMoTarredIterator"]: manifest_path=path, tar_paths=tarpath, shuffle_shards=False, + shard_seed=self.shard_seed, text_field=self.text_field, lang_field=self.lang_field, ) @@ -227,8 +237,8 @@ def __iter__(self) -> Generator[Cut, None, None]: shard_ids = self.shard_ids if self.shuffle_shards: - # Use TRNG for 100% randomness - random.Random(secrets.randbelow(2 ** 32)).shuffle(shard_ids) + seed = resolve_seed(self.shard_seed) + random.Random(seed).shuffle(shard_ids) for sid in shard_ids: shard_manifest = self.shard_id_to_manifest[sid] diff --git a/nemo/collections/common/data/lhotse/text_adapters.py b/nemo/collections/common/data/lhotse/text_adapters.py new file mode 100644 index 000000000000..805ef5dd542f --- /dev/null +++ b/nemo/collections/common/data/lhotse/text_adapters.py @@ -0,0 +1,97 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +from dataclasses import dataclass +from pathlib import Path +from typing import Iterator, Literal + +from lhotse.cut.text import TextExample, TextPairExample +from lhotse.dataset.dataloading import resolve_seed +from lhotse.utils import Pathlike + +from nemo.collections.common.data.lhotse.nemo_adapters import expand_sharded_filepaths + + +@dataclass +class LhotseTextAdapter: + """ + ``LhotseTextAdapter`` is used to read a text file and wrap + each line into a ``TextExample``. + """ + + paths: Pathlike | list[Pathlike] + language: str | None = None + shuffle_shards: bool = False + shard_seed: int | Literal["trng", "randomized"] = "trng" + + def __post_init__(self): + self.paths = expand_sharded_filepaths(self.paths) + + def __iter__(self) -> Iterator[TextExample]: + paths = self.paths + if self.shuffle_shards: + seed = resolve_seed(self.shard_seed) + random.Random(seed).shuffle(paths) + for path in paths: + with open(path) as f: + for line in f: + example = TextExample(line) + if self.language is not None: + example.language = self.language + yield example + + +@dataclass +class LhotseTextPairAdapter: + """ + ``LhotseTextAdapter`` is used to read a tuple of N text files + (e.g., a pair of files with translations in different languages) + and wrap them in a ``TextExample`` object to enable dataloading + with Lhotse together with training examples in audio modality. + """ + + source_paths: Pathlike | list[Pathlike] + target_paths: Pathlike | list[Pathlike] + source_language: str | None = None + target_language: str | None = None + shuffle_shards: bool = False + shard_seed: int | Literal["trng", "randomized"] = "trng" + + def __post_init__(self): + ASSERT_MSG = "Both source and target must be a single path or lists of paths" + if isinstance(self.source_paths, (str, Path)): + assert isinstance(self.target_paths, (str, Path)), ASSERT_MSG + else: + assert isinstance(self.source_paths, list) and isinstance(self.target_paths, list), ASSERT_MSG + assert len(self.source_paths) == len( + self.target_paths + ), f"Source ({len(self.source_paths)}) and target ({len(self.target_paths)}) path lists must have the same number of items." 
+ self.source_paths = expand_sharded_filepaths(self.source_paths) + self.target_paths = expand_sharded_filepaths(self.target_paths) + + def __iter__(self) -> Iterator[TextPairExample]: + paths = list(zip(self.source_paths, self.target_paths)) + if self.shuffle_shards: + seed = resolve_seed(self.shard_seed) + random.Random(seed).shuffle(paths) + for source_path, target_path in paths: + with open(source_path) as fs, open(target_path) as ft: + for ls, lt in zip(fs, ft): + example = TextPairExample(source=TextExample(ls.strip()), target=TextExample(lt.strip())) + if self.source_language is not None: + example.source.language = self.source_language + if self.target_language is not None: + example.target.language = self.target_language + yield example diff --git a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py index bc10b67af880..b686322c0882 100644 --- a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py +++ b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py @@ -376,7 +376,9 @@ def create_spt_model( # Add BERT control symbols tokens = [] - with open(f"{output_dir}/tokenizer.vocab", "r") as f: + # Encoding arg is added for compatibility with systems which enforce + # ASCII encoding in Python. Sentencepiece always uses Unicode (UTF8). + with open(f"{output_dir}/tokenizer.vocab", "r", encoding="utf8") as f: # Read tokens from each line and parse for vocab for line in f: piece = line.split("\t")[0] @@ -394,7 +396,7 @@ def create_spt_model( # Save vocabulary to output file vocab_file = f'{output_dir}/vocab.txt' - with open(vocab_file, "w") as f: + with open(vocab_file, "w", encoding="utf8") as f: for token in vocab: f.write(f"{token}\n") return f'{output_dir}/tokenizer.model', vocab_file diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index 6df223209cc1..b7863714eb2d 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -5,7 +5,7 @@ ipywidgets jiwer kaldi-python-io kaldiio -lhotse>=1.20.0 +lhotse>=1.22.0 librosa>=0.10.0 marshmallow matplotlib diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index 955765019133..4e89a93e83e4 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -11,19 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +from collections import Counter from io import BytesIO from itertools import islice from pathlib import Path from typing import Dict, List, Tuple +import lhotse +import numpy as np import pytest import torch +from lhotse.cut import Cut +from lhotse.cut.text import TextPairExample from omegaconf import OmegaConf +from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config - -lhotse = pytest.importorskip("lhotse", reason="Lhotse + NeMo tests require Lhotse (pip install lhotse).") +from nemo.collections.common.data.lhotse.text_adapters import TextExample +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer, create_spt_model requires_torchaudio = pytest.mark.skipif( not lhotse.utils.is_torchaudio_available(), reason="Lhotse Shar format support requires torchaudio." 
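The new text adapters can also be exercised on their own, outside the config-driven dataloader. A self-contained sketch (files are generated on the fly, so all names and contents are hypothetical): each zipped pair of lines becomes a ``TextPairExample`` whose source and target carry the configured languages.

from pathlib import Path
from tempfile import TemporaryDirectory

from nemo.collections.common.data.lhotse.text_adapters import LhotseTextPairAdapter

with TemporaryDirectory() as d:
    src = Path(d) / "text.en"
    tgt = Path(d) / "text.es"
    src.write_text("Hello world.\nGood morning.\n")
    tgt.write_text("Hola mundo.\nBuenos dias.\n")

    adapter = LhotseTextPairAdapter(
        source_paths=str(src),
        target_paths=str(tgt),
        source_language="en",
        target_language="es",
    )
    for example in adapter:
        # example is a lhotse TextPairExample; .source and .target are TextExamples
        print(example.source.text, "->", example.target.text)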
@@ -332,7 +337,7 @@ def test_dataloader_from_tarred_nemo_manifest(nemo_tarred_manifest_path: tuple[s config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() ) - batches = [batch for batch in dl] + batches = [batch for batch in islice(dl, 4)] assert len(batches) == 4 b = batches[0] @@ -349,7 +354,7 @@ def test_dataloader_from_tarred_nemo_manifest(nemo_tarred_manifest_path: tuple[s b = batches[3] assert set(b.keys()) == {"audio", "audio_lens", "ids"} - assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 1 + assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 3 def test_dataloader_from_tarred_nemo_manifest_weighted_combination(nemo_tarred_manifest_path: tuple[str, str]): @@ -411,7 +416,7 @@ def test_dataloader_from_tarred_nemo_manifest_multi(nemo_tarred_manifest_path_mu config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() ) - batches = [batch for batch in dl] + batches = [batch for batch in islice(dl, 4)] assert len(batches) == 4 b = batches[0] @@ -428,7 +433,7 @@ def test_dataloader_from_tarred_nemo_manifest_multi(nemo_tarred_manifest_path_mu b = batches[3] assert set(b.keys()) == {"audio", "audio_lens", "ids"} - assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 1 + assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 3 def test_dataloader_from_tarred_nemo_manifest_multi_max_open_streams(nemo_tarred_manifest_path_multi: tuple[str, str]): @@ -489,7 +494,7 @@ def test_dataloader_from_tarred_nemo_manifest_concat(nemo_tarred_manifest_path: config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() ) - batches = [batch for batch in dl] + batches = [batch for batch in islice(dl, 4)] assert len(batches) == 4 @@ -513,8 +518,8 @@ def test_dataloader_from_tarred_nemo_manifest_concat(nemo_tarred_manifest_path: b = batches[3] assert set(b.keys()) == {"audio", "audio_lens", "ids"} - assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 1 - torch.testing.assert_close(b["audio_lens"], torch.tensor([16000], dtype=torch.int32)) + assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 2 + torch.testing.assert_close(b["audio_lens"], expected_audio_lens) @requires_torchaudio @@ -728,7 +733,7 @@ def test_lazy_nemo_iterator_with_offset_field(tmp_path: Path): assert cut.supervisions[0].text == "irrelevant" audio = cut.load_audio() assert audio.shape == (1, 8000) - np.testing.assert_equal(audio[0], expected_audio[8000:]) + np.testing.assert_allclose(audio[0], expected_audio[8000:], atol=5e-5) assert cuts[0].id != cuts[1].id @@ -736,6 +741,7 @@ def test_lazy_nemo_iterator_with_offset_field(tmp_path: Path): def test_lazy_nemo_iterator_with_relative_paths(tmp_path: Path): import numpy as np import soundfile as sf + from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator # Have to generate as INT16 to avoid quantization error after saving to 16-bit WAV @@ -764,4 +770,427 @@ def test_lazy_nemo_iterator_with_relative_paths(tmp_path: Path): assert cut.num_samples == 8000 assert cut.supervisions[0].text == "irrelevant" assert audio.shape == (1, 8000) - np.testing.assert_equal(audio[0], expected_audio[:8000]) + np.testing.assert_allclose(audio[0], expected_audio[:8000], atol=5e-5) + + +class Identity(torch.utils.data.Dataset): + def __getitem__(self, cuts: lhotse.CutSet) -> lhotse.CutSet: + return cuts + + +def test_extended_data_input_cfg(cutset_shar_path, nemo_tarred_manifest_path_multi): + config = OmegaConf.create( + { + "input_cfg": [ + { + "type": "nemo_tarred", + "manifest_filepath": 
nemo_tarred_manifest_path_multi[0], + "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + }, + { + "type": "lhotse_shar", + "shar_path": cutset_shar_path, + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + }, + ], + "sample_rate": 16000, + "shuffle": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + "shard_seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + # Note: we use islice here because the dataloader will be infinite. + batches = [batch for batch in islice(dl, 2)] + + b = batches[0] + assert isinstance(b, lhotse.CutSet) + assert all(c.custom["language"] == "en" for c in b) + assert all(c.custom["modality"] == "audio" for c in b) + assert sum(c.custom["dataset_name"] == "D1" for c in b) == 2 + assert sum(c.custom["dataset_name"] == "D2" for c in b) == 2 + + b = batches[1] + assert isinstance(b, lhotse.CutSet) + assert all(c.custom["language"] == "en" for c in b) + assert all(c.custom["modality"] == "audio" for c in b) + assert sum(c.custom["dataset_name"] == "D1" for c in b) == 1 + assert sum(c.custom["dataset_name"] == "D2" for c in b) == 3 + + +def test_extended_data_input_cfg_subgroup(cutset_shar_path, nemo_tarred_manifest_path_multi): + config = OmegaConf.create( + { + "input_cfg": [ + { + "type": "group", + "input_cfg": [ + { + "type": "nemo_tarred", + "manifest_filepath": nemo_tarred_manifest_path_multi[0], + "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + }, + { + "type": "lhotse_shar", + "shar_path": cutset_shar_path, + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + }, + ], + "weight": 0.2, + "tags": {"group_name": "G1",}, + }, + { + "type": "group", + "weight": 0.8, + "input_cfg": [ + { + "type": "nemo_tarred", + "manifest_filepath": nemo_tarred_manifest_path_multi[0], + "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D3",}, + }, + { + "type": "lhotse_shar", + "shar_path": cutset_shar_path, + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D4",}, + }, + ], + "tags": {"group_name": "G2",}, + }, + ], + "sample_rate": 16000, + "shuffle": True, + "num_workers": 0, + "batch_size": 32, + "seed": 0, + "shard_seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + # Sample 100 mini-batches and test statistical properties + group_occurrences = Counter() + dataset_occurrences = Counter() + for batch in islice(dl, 100): + for cut in batch: + group_occurrences[cut.group_name] += 1 + dataset_occurrences[cut.dataset_name] += 1 + + tot = sum(group_occurrences.values()) + for k in group_occurrences: + group_occurrences[k] /= tot + for k in dataset_occurrences: + dataset_occurrences[k] /= tot + + def almost(number): + return pytest.approx(number, abs=0.02) + + assert group_occurrences["G1"] == almost(0.2) # group weight: 0.2 + assert group_occurrences["G2"] == almost(0.8) # group weight: 0.8 + assert dataset_occurrences["D1"] == almost(0.1) # group weight: 0.2 * dataset weight 0.5 => 0.1 + assert dataset_occurrences["D2"] == almost(0.1) # group weight: 0.2 * dataset weight 0.5 => 0.1 + assert 
dataset_occurrences["D3"] == almost(0.4) # group weight: 0.8 * dataset weight 0.5 => 0.4 + assert dataset_occurrences["D4"] == almost(0.4) # group weight: 0.8 * dataset weight 0.5 => 0.4 + + +def test_extended_data_input_cfg_yaml_path(tmp_path, cutset_shar_path, nemo_tarred_manifest_path_multi): + input_cfg = [ + { + "type": "nemo_tarred", + "manifest_filepath": str(nemo_tarred_manifest_path_multi[0]), + "tarred_audio_filepaths": str(nemo_tarred_manifest_path_multi[1]), + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + }, + { + "type": "lhotse_shar", + "shar_path": str(cutset_shar_path), + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + }, + ] + + yaml_path = tmp_path / "input_cfg.yaml" + lhotse.serialization.save_to_yaml(input_cfg, yaml_path) + + config = OmegaConf.create( + { + "input_cfg": input_cfg, + "sample_rate": 16000, + "shuffle": True, + "num_workers": 0, + "batch_size": 32, + "seed": 0, + "shard_seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + batch = next(iter(dl)) + assert isinstance(batch, lhotse.CutSet) + for cut in batch: + assert cut.dataset_name in ("D1", "D2") + + +@pytest.fixture(scope="session") +def txt_en_path(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp("text_data") + en_path = tmp_path / "text.en" + en_path.write_text( + """Example text in English. +Another sentence. + """ + ) + return en_path + + +@pytest.fixture(scope="session") +def txt_es_path(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp("text_data") + es_path = tmp_path / "text.es" + es_path.write_text( + """Otro texto en ingles. +Otra frase.""" + ) + return es_path + + +def test_text_file_input(txt_en_path, txt_es_path): + config = OmegaConf.create( + { + "input_cfg": [{"type": "txt", "paths": txt_en_path, "language": "en",},], + "shuffle": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + "shard_seed": 0, + } + ) + + # Note: this test does not need to pass a tokenizer because we use static batch sizes + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + # Note: we use islice here because the dataloader will be infinite. + batches = [batch for batch in islice(dl, 2)] + + b = batches[0] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextExample) for c in b) + assert all(c.language == "en" for c in b) + + b = batches[1] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextExample) for c in b) + assert all(c.language == "en" for c in b) + + +def test_text_file_pairs_input(txt_en_path, txt_es_path): + config = OmegaConf.create( + { + "input_cfg": [ + { + "type": "txt_pair", + "source_paths": txt_en_path, + "target_paths": txt_es_path, + "source_language": "en", + "target_language": "es", + }, + ], + "shuffle": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + "shard_seed": 0, + } + ) + + # Note: this test does not need to pass a tokenizer because we use static batch sizes + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + # Note: we use islice here because the dataloader will be infinite. 
+ batches = [batch for batch in islice(dl, 2)] + + b = batches[0] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextPairExample) for c in b) + assert all(c.source.language == "en" for c in b) + assert all(c.target.language == "es" for c in b) + + b = batches[1] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextPairExample) for c in b) + assert all(c.source.language == "en" for c in b) + assert all(c.target.language == "es" for c in b) + + +@pytest.fixture(scope="session") +def txt_pair_paths_shards(tmp_path_factory, txt_en_path, txt_es_path): + tmp_path = tmp_path_factory.mktemp("text_data_shards") + + en_text = txt_en_path.read_text().splitlines() + (tmp_path / "en_0.txt").write_text("\n".join(en_text[:5])) + (tmp_path / "en_1.txt").write_text("\n".join(en_text[5:])) + + es_text = txt_es_path.read_text().splitlines() + (tmp_path / "es_0.txt").write_text("\n".join(es_text[:5])) + (tmp_path / "es_1.txt").write_text("\n".join(es_text[5:])) + + return f"{tmp_path}/en__OP_0..1_CL_.txt", f"{tmp_path}/es__OP_0..1_CL_.txt" + + +def test_text_file_pairs_shards_input(txt_pair_paths_shards: tuple[str, str]): + en_paths, es_paths = txt_pair_paths_shards + + config = OmegaConf.create( + { + "input_cfg": [ + { + "type": "txt_pair", + "source_paths": en_paths, + "target_paths": es_paths, + "source_language": "en", + "target_language": "es", + }, + ], + "shuffle": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + "shard_seed": 0, + } + ) + + # Note: this test does not need to pass a tokenizer because we use static batch sizes + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + # Note: we use islice here because the dataloader will be infinite. + batches = [batch for batch in islice(dl, 2)] + + b = batches[0] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextPairExample) for c in b) + assert all(c.source.language == "en" for c in b) + assert all(c.target.language == "es" for c in b) + + b = batches[1] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextPairExample) for c in b) + assert all(c.source.language == "en" for c in b) + assert all(c.target.language == "es" for c in b) + + +@pytest.fixture(scope="session") +def en_es_tokenizer(tmp_path_factory, txt_en_path, txt_es_path) -> TokenizerWrapper: + tmpdir = tmp_path_factory.mktemp("en_es_tokenizer") + text_path = tmpdir / "text.txt" + text_path.write_text(txt_en_path.read_text() + "\n" + txt_es_path.read_text()) + create_spt_model(text_path, vocab_size=128, sample_size=-1, do_lower_case=False, output_dir=str(tmpdir)) + return TokenizerWrapper(SentencePieceTokenizer(str(tmpdir / "tokenizer.model"))) + + +def test_multimodal_text_audio_dataloading( + txt_pair_paths_shards: tuple[str, str], + nemo_tarred_manifest_path_multi: tuple[str, str], + en_es_tokenizer: TokenizerWrapper, +): + en_paths, es_paths = txt_pair_paths_shards + manifest_filepath, tarred_audio_filepaths = nemo_tarred_manifest_path_multi + config = OmegaConf.create( + { + "input_cfg": [ + { + "type": "txt_pair", + "source_paths": en_paths, + "target_paths": es_paths, + "source_language": "en", + "target_language": "es", + "tags": {"modality": "text",}, + }, + { + "type": "nemo_tarred", + "manifest_filepath": manifest_filepath, + "tarred_audio_filepaths": tarred_audio_filepaths, + "tags": {"modality": "audio",}, + }, + ], + "shuffle": True, + "num_workers": 0, + "use_multimodal_sampling": True, + "batch_tokens": 1024, + # How to set token equivalent 
duration in actual training? + # assuming fbank frames: 0.01 is the base due to frame shift; + # + subsampling x8 gives us 0.08 + # assuming discrete audio tokens, with frame rate 50Hz, + # we'd get 0.02 + # in this test we'll just use 0.1 for simplicity + "token_equivalent_duration": 0.1, + "quadratic_factor": 50, + "seed": 0, + "shard_seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config( + config=config, global_rank=0, world_size=1, dataset=Identity(), tokenizer=en_es_tokenizer, + ) + + # Note: we use islice here because the dataloader will be infinite. + batches = [batch for batch in islice(dl, 2)] + + b = batches[0] + assert isinstance(b, lhotse.CutSet) + assert len(b) == 48 + assert sum(ex.num_tokens for ex in b) == pytest.approx(574.0) + assert min(ex.num_tokens for ex in b) == pytest.approx(10) + assert max(ex.num_tokens for ex in b) == pytest.approx(16) + assert sum(isinstance(ex, Cut) for ex in b) == 29 + assert sum(isinstance(ex, TextPairExample) for ex in b) == 19 + for ex in b: + if isinstance(ex, Cut): + assert ex.modality == "audio" + assert isinstance(ex.load_audio(), np.ndarray) + assert isinstance(ex.supervisions[0].text, str) + if isinstance(ex, TextPairExample): + assert ex.modality == "text" + assert ex.source.language == "en" + assert ex.target.language == "es" + assert isinstance(ex.source.text, str) + assert isinstance(ex.target.text, str) + assert isinstance(ex.source.tokens, np.ndarray) + assert isinstance(ex.target.tokens, np.ndarray) + + b = batches[1] + assert isinstance(b, lhotse.CutSet) + assert len(b) == 48 + assert sum(ex.num_tokens for ex in b) == pytest.approx(614.0) + assert min(ex.num_tokens for ex in b) == pytest.approx(10) + assert max(ex.num_tokens for ex in b) == pytest.approx(16) + assert sum(isinstance(ex, Cut) for ex in b) == 21 + assert sum(isinstance(ex, TextPairExample) for ex in b) == 27 + for ex in b: + if isinstance(ex, Cut): + assert ex.modality == "audio" + assert isinstance(ex.load_audio(), np.ndarray) + assert isinstance(ex.supervisions[0].text, str) + if isinstance(ex, TextPairExample): + assert ex.modality == "text" + assert ex.source.language == "en" + assert ex.target.language == "es" + assert isinstance(ex.source.text, str) + assert isinstance(ex.target.text, str) + assert isinstance(ex.source.tokens, np.ndarray) + assert isinstance(ex.target.tokens, np.ndarray) From 12e7cf9d24070e26a7d4e0fb4d3ebf0f4a7c09c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Tue, 16 Apr 2024 13:55:51 -0400 Subject: [PATCH 38/39] Lhotse AudioToAudio dataset (supports ref recording and embedding) (#8477) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Draft for Lhotse AudioToAudio dataset (supports ref recording and embedding) Signed-off-by: Piotr Żelasko * Integrate with speech enhancement models Signed-off-by: Piotr Żelasko * Fix absolute path + write cuts in the output manifest Signed-off-by: Ante Jukić * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Support channel selectors for input, reference, and target recordings Signed-off-by: Piotr Żelasko * Support on the fly truncation and/or cutting into windows Signed-off-by: Piotr Żelasko * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Bump min required lhotse version Signed-off-by: Piotr Żelasko * Add copyright headers Signed-off-by: Piotr Żelasko * Added unit tests checking lhotse dataloader is matching the existing 
dataset (#8619) Signed-off-by: Ante Jukić * Fix batch unpacking, test_ds, use nemo logging Signed-off-by: Ante Jukić * fixed some code scanning issues Signed-off-by: Ante Jukić * Fixed a couple CI issues Signed-off-by: Ante Jukić * Support NeMo-style resolution of relative paths in native lhotse cuts Signed-off-by: Piotr Żelasko * Added option to leave original paths or force absolute paths in the converted manifests Signed-off-by: Ante Jukić * Fix support for relative path resolution in lhotse arrays Signed-off-by: Piotr Żelasko * Fix unit tests Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Signed-off-by: Ante Jukić Co-authored-by: Ante Jukić Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: anteju <108555623+anteju@users.noreply.github.com> Co-authored-by: Pablo Garay --- examples/audio_tasks/audio_to_audio_eval.py | 7 + examples/audio_tasks/speech_enhancement.py | 2 +- .../asr/data/audio_to_audio_lhotse.py | 207 +++++++++++ .../asr/models/enhancement_models.py | 25 +- nemo/collections/common/data/lhotse/cutset.py | 56 ++- .../common/data/lhotse/dataloader.py | 32 +- .../audio_to_audio/convert_nemo_to_lhotse.py | 77 ++++ tests/collections/asr/test_asr_datasets.py | 336 ++++++++++++------ .../common/test_lhotse_dataloading.py | 99 +++++- 9 files changed, 727 insertions(+), 114 deletions(-) create mode 100644 nemo/collections/asr/data/audio_to_audio_lhotse.py create mode 100644 scripts/audio_to_audio/convert_nemo_to_lhotse.py diff --git a/examples/audio_tasks/audio_to_audio_eval.py b/examples/audio_tasks/audio_to_audio_eval.py index 57d7095057a9..4ac68dfc84e7 100644 --- a/examples/audio_tasks/audio_to_audio_eval.py +++ b/examples/audio_tasks/audio_to_audio_eval.py @@ -73,7 +73,9 @@ from tqdm import tqdm from nemo.collections.asr.data import audio_to_audio_dataset +from nemo.collections.asr.data.audio_to_audio_lhotse import LhotseAudioToTargetDataset from nemo.collections.asr.metrics.audio import AudioMetricWrapper +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.parts.preprocessing import manifest from nemo.core.config import hydra_runner from nemo.utils import logging @@ -103,6 +105,11 @@ class AudioEvaluationConfig(process_audio.ProcessConfig): def get_evaluation_dataloader(config): """Prepare a dataloader for evaluation. """ + if config.get("use_lhotse", False): + return get_lhotse_dataloader_from_config( + config, global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset = audio_to_audio_dataset.get_audio_to_target_dataset(config=config) return torch.utils.data.DataLoader( diff --git a/examples/audio_tasks/speech_enhancement.py b/examples/audio_tasks/speech_enhancement.py index 5b32d9b95298..250d212d2a25 100644 --- a/examples/audio_tasks/speech_enhancement.py +++ b/examples/audio_tasks/speech_enhancement.py @@ -51,7 +51,7 @@ def main(cfg): trainer.fit(model) # Run on test data, if available - if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + if hasattr(cfg.model, 'test_ds'): if trainer.is_global_zero: # Destroy the current process group and let the trainer initialize it again with a single device. 
if torch.distributed.is_initialized(): diff --git a/nemo/collections/asr/data/audio_to_audio_lhotse.py b/nemo/collections/asr/data/audio_to_audio_lhotse.py new file mode 100644 index 000000000000..6317d8a929c2 --- /dev/null +++ b/nemo/collections/asr/data/audio_to_audio_lhotse.py @@ -0,0 +1,207 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import numpy as np +import torch +from lhotse import AudioSource, CutSet, Recording +from lhotse.array import Array +from lhotse.audio import info +from lhotse.cut import MixedCut +from lhotse.dataset.collation import collate_audio, collate_custom_field +from lhotse.serialization import load_jsonl + +from nemo.collections.common.parts.preprocessing.manifest import get_full_path + +INPUT_CHANNEL_SELECTOR = "input_channel_selector" +TARGET_CHANNEL_SELECTOR = "target_channel_selector" +REFERENCE_CHANNEL_SELECTOR = "reference_channel_selector" +LHOTSE_TARGET_CHANNEL_SELECTOR = "target_recording_channel_selector" +LHOTSE_REFERENCE_CHANNEL_SELECTOR = "reference_recording_channel_selector" + + +class LhotseAudioToTargetDataset(torch.utils.data.Dataset): + """ + A dataset for audio-to-audio tasks where the goal is to use + an input signal to recover the corresponding target signal. + + .. note:: This is a Lhotse variant of :class:`nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset`. 
+ """ + + TARGET_KEY = "target_recording" + REFERENCE_KEY = "reference_recording" + EMBEDDING_KEY = "embedding_vector" + + def __getitem__(self, cuts: CutSet) -> dict[str, torch.Tensor]: + src_audio, src_audio_lens = collate_audio(cuts) + ans = { + "input_signal": src_audio, + "input_length": src_audio_lens, + } + if _key_available(cuts, self.TARGET_KEY): + tgt_audio, tgt_audio_lens = collate_audio(cuts, recording_field=self.TARGET_KEY) + ans.update(target_signal=tgt_audio, target_length=tgt_audio_lens) + if _key_available(cuts, self.REFERENCE_KEY): + ref_audio, ref_audio_lens = collate_audio(cuts, recording_field=self.REFERENCE_KEY) + ans.update(reference_signal=ref_audio, reference_length=ref_audio_lens) + if _key_available(cuts, self.EMBEDDING_KEY): + emb = collate_custom_field(cuts, field=self.EMBEDDING_KEY) + ans.update(embedding_signal=emb) + return ans + + +def _key_available(cuts: CutSet, key: str) -> bool: + for cut in cuts: + if isinstance(cut, MixedCut): + cut = cut._first_non_padding_cut + if cut.custom is not None and key in cut.custom: + continue + else: + return False + return True + + +def create_recording(path_or_paths: str | list[str]) -> Recording: + if isinstance(path_or_paths, list): + cur_channel_idx = 0 + sources = [] + infos = [] + for p in path_or_paths: + i = info(p) + infos.append(i) + sources.append( + AudioSource(type="file", channels=list(range(cur_channel_idx, cur_channel_idx + i.channels)), source=p) + ) + cur_channel_idx += i.channels + assert all( + i.samplerate == infos[0].samplerate for i in infos[1:] + ), f"Mismatched sampling rates for individual audio files in: {path_or_paths}" + recording = Recording( + id=path_or_paths[0], + sources=sources, + sampling_rate=infos[0].samplerate, + num_samples=infos[0].frames, + duration=infos[0].duration, + channel_ids=list(range(0, cur_channel_idx)), + ) + else: + recording = Recording.from_file(path_or_paths) + return recording + + +def create_array(path: str) -> Array: + assert path.endswith(".npy"), f"Currently only conversion of numpy files is supported (got: {path})" + arr = np.load(path) + parent, path = os.path.split(path) + return Array(storage_type="numpy_files", storage_path=parent, storage_key=path, shape=list(arr.shape),) + + +def convert_manifest_nemo_to_lhotse( + input_manifest: str, + output_manifest: str, + input_key: str = 'input_filepath', + target_key: str = 'target_filepath', + reference_key: str = 'reference_filepath', + embedding_key: str = 'embedding_filepath', + force_absolute_paths: bool = False, +): + """ + Convert an audio-to-audio manifest from NeMo format to Lhotse format. + + Args: + input_manifest: Path to the input NeMo manifest. + output_manifest: Path where we'll write the output Lhotse manifest (supported extensions: .jsonl.gz and .jsonl). + input_key: Key of the input recording, mapped to Lhotse's 'Cut.recording'. + target_key: Key of the target recording, mapped to Lhotse's 'Cut.target_recording'. + reference_key: Key of the reference recording, mapped to Lhotse's 'Cut.reference_recording'. + embedding_key: Key of the embedding, mapped to Lhotse's 'Cut.embedding_vector'. + force_absolute_paths: If True, the paths in the output manifest will be absolute. + """ + with CutSet.open_writer(output_manifest) as writer: + for item in load_jsonl(input_manifest): + + # Create Lhotse recording and cut object, apply offset and duration slicing if present. 
+ item_input_key = item.pop(input_key) + recording = create_recording(get_full_path(audio_file=item_input_key, manifest_file=input_manifest)) + cut = recording.to_cut().truncate(duration=item.pop("duration"), offset=item.pop("offset", 0.0)) + + _as_relative(cut.recording, item_input_key, enabled=not force_absolute_paths) + + if (channels := item.pop(INPUT_CHANNEL_SELECTOR, None)) is not None: + if cut.num_channels == 1: + assert ( + len(channels) == 1 and channels[0] == 0 + ), f"The input recording has only a single channel, but manifest specified {INPUT_CHANNEL_SELECTOR}={channels}" + else: + cut = cut.with_channels(channels) + + if target_key in item: + item_target_key = item.pop(target_key) + cut.target_recording = create_recording( + get_full_path(audio_file=item_target_key, manifest_file=input_manifest) + ) + + _as_relative(cut.target_recording, item_target_key, enabled=not force_absolute_paths) + + if (channels := item.pop(TARGET_CHANNEL_SELECTOR, None)) is not None: + if cut.target_recording.num_channels == 1: + assert ( + len(channels) == 1 and channels[0] == 0 + ), f"The target recording has only a single channel, but manifest specified {TARGET_CHANNEL_SELECTOR}={channels}" + else: + cut = cut.with_custom(LHOTSE_TARGET_CHANNEL_SELECTOR, channels) + + if reference_key in item: + item_reference_key = item.pop(reference_key) + cut.reference_recording = create_recording( + get_full_path(audio_file=item_reference_key, manifest_file=input_manifest) + ) + + _as_relative(cut.reference_recording, item_target_key, enabled=not force_absolute_paths) + + if (channels := item.pop(REFERENCE_CHANNEL_SELECTOR, None)) is not None: + if cut.reference_recording.num_channels == 1: + assert ( + len(channels) == 1 and channels[0] == 0 + ), f"The reference recording has only a single channel, but manifest specified {REFERENCE_CHANNEL_SELECTOR}={channels}" + else: + cut = cut.with_custom(LHOTSE_REFERENCE_CHANNEL_SELECTOR, channels) + + if embedding_key in item: + item_embedding_key = item.pop(embedding_key) + cut.embedding_vector = create_array( + get_full_path(audio_file=item_embedding_key, manifest_file=input_manifest) + ) + + if not force_absolute_paths: + # Use the same format for paths as in the original manifest + cut.embedding_vector.storage_path = "" + cut.embedding_vector.storage_key = item_embedding_key + + if item: + cut.custom.update(item) # any field that's still left goes to custom fields + + writer.write(cut) + + +def _as_relative(recording: Recording, paths: list[str] | str, enabled: bool) -> None: + if not enabled: + return + if isinstance(paths, str): + paths = [paths] + assert len(recording.sources) == len( + paths + ), f"Mismatched number of sources for lhotse Recording and the override list. 
Got {recording=} and {paths=}" + for source, path in zip(recording.sources, paths): + source.source = path diff --git a/nemo/collections/asr/models/enhancement_models.py b/nemo/collections/asr/models/enhancement_models.py index 7cc5c3d8459f..b80c357364aa 100644 --- a/nemo/collections/asr/models/enhancement_models.py +++ b/nemo/collections/asr/models/enhancement_models.py @@ -24,9 +24,11 @@ from tqdm import tqdm from nemo.collections.asr.data import audio_to_audio_dataset +from nemo.collections.asr.data.audio_to_audio_lhotse import LhotseAudioToTargetDataset from nemo.collections.asr.data.audio_to_text_dataset import inject_dataloader_value_from_model_config from nemo.collections.asr.models.audio_to_audio_model import AudioToAudioModel from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.core.neural_types import AudioSignal, LengthsType, NeuralType from nemo.utils import logging @@ -198,6 +200,11 @@ def process( def _setup_dataloader_from_config(self, config: Optional[Dict]): + if config.get("use_lhotse", False): + return get_lhotse_dataloader_from_config( + config, global_rank=self.global_rank, world_size=self.world_size, dataset=LhotseAudioToTargetDataset() + ) + is_concat = config.get('is_concat', False) if is_concat: raise NotImplementedError('Concat not implemented') @@ -398,7 +405,14 @@ def forward(self, input_signal, input_length=None): # PTL-specific methods def training_step(self, batch, batch_idx): - input_signal, input_length, target_signal, target_length = batch + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch['target_signal'] + else: + input_signal, input_length, target_signal, _ = batch # Expand channel dimension, if necessary # For consistency, the model uses multi-channel format, even if the channel dimension is 1 @@ -426,7 +440,14 @@ def training_step(self, batch, batch_idx): return loss def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = 'val'): - input_signal, input_length, target_signal, target_length = batch + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch['target_signal'] + else: + input_signal, input_length, target_signal, _ = batch # Expand channel dimension, if necessary # For consistency, the model uses multi-channel format, even if the channel dimension is 1 diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py index fa5ae5804c4b..6dd01479a501 100644 --- a/nemo/collections/common/data/lhotse/cutset.py +++ b/nemo/collections/common/data/lhotse/cutset.py @@ -19,11 +19,14 @@ from pathlib import Path from typing import Sequence, Tuple -from lhotse import CutSet +from lhotse import CutSet, Features, Recording +from lhotse.array import Array, TemporalArray +from lhotse.cut import Cut, MixedCut, PaddingCut from omegaconf import DictConfig, ListConfig, OmegaConf from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator, LazyNeMoTarredIterator from nemo.collections.common.data.lhotse.text_adapters import LhotseTextAdapter, LhotseTextPairAdapter +from nemo.collections.common.parts.preprocessing.manifest import get_full_path def 
read_cutset_from_config(config: DictConfig) -> Tuple[CutSet, bool]: @@ -291,10 +294,59 @@ def read_lhotse_manifest(config, is_tarred: bool) -> CutSet: cuts = mux(*cutsets, weights=weights, max_open_streams=config.max_open_streams, seed=config.shard_seed) else: # Regular Lhotse manifest points to individual audio files (like native NeMo manifest). - cuts = CutSet.from_file(config.cuts_path) + path = config.cuts_path + cuts = CutSet.from_file(path).map(partial(resolve_relative_paths, manifest_path=path)) return cuts +def resolve_relative_paths(cut: Cut, manifest_path: str) -> Cut: + if isinstance(cut, PaddingCut): + return cut + + if isinstance(cut, MixedCut): + for track in cut.tracks: + track.cut = resolve_relative_paths(track.cut, manifest_path) + return cut + + def resolve_recording(value): + for audio_source in value.sources: + if audio_source.type == "file": + audio_source.source = get_full_path(audio_source.source, manifest_file=manifest_path) + + def resolve_array(value): + if isinstance(value, TemporalArray): + value.array = resolve_array(value.array) + else: + if value.storage_type in ("numpy_files", "lilcom_files"): + abspath = Path( + get_full_path(str(Path(value.storage_path) / value.storage_key), manifest_file=manifest_path) + ) + value.storage_path = str(abspath.parent) + value.storage_key = str(abspath.name) + elif value.storage_type in ( + "kaldiio", + "chunked_lilcom_hdf5", + "lilcom_chunky", + "lilcom_hdf5", + "numpy_hdf5", + ): + value.storage_path = get_full_path(value.storage_path, manifest_file=manifest_path) + # ignore others i.e. url, in-memory data, etc. + + if cut.has_recording: + resolve_recording(cut.recording) + if cut.has_features: + resolve_array(cut.features) + if cut.custom is not None: + for key, value in cut.custom.items(): + if isinstance(value, Recording): + resolve_recording(value) + elif isinstance(value, (Array, TemporalArray, Features)): + resolve_array(value) + + return cut + + def read_nemo_manifest(config, is_tarred: bool) -> CutSet: common_kwargs = { "text_field": config.text_field, diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index fd2a69725a0e..83920660302b 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging import warnings from dataclasses import dataclass from functools import partial, singledispatch @@ -38,6 +37,7 @@ from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper from nemo.collections.common.data.lhotse.cutset import read_cutset_from_config from nemo.collections.common.tokenizers import TokenizerSpec +from nemo.utils import logging @dataclass @@ -104,6 +104,15 @@ class LhotseDataLoadingConfig: concatenate_duration_factor: float = 1.0 concatenate_merge_supervisions: bool = True db_norm: Optional[float] = -25.0 # from CodeSwitchingDataset + # d. 
On-the-fly cut truncation or window slicing + # I) truncate: select one chunk of a fixed duration for each cut + truncate_duration: Optional[float] = None # set this to enable + truncate_offset_type: str = "random" # "random" | "start" (fixed) | "end" (fixed, counted back) + # II) cut_into_windows: convert each cut to smaller cut using a sliding window (define hop for overlapping windows) + cut_into_windows_duration: Optional[float] = None # set this to enable + cut_into_windows_hop: Optional[float] = None + # III) common options + keep_excessive_supervisions: bool = True # when a cut is truncated in the middle of a supervision, should we keep them. # 5. Other Lhotse options. text_field: str = "text" # key to read the transcript from @@ -151,9 +160,6 @@ def get_lhotse_dataloader_from_config( # Resample as a safeguard; it's a no-op when SR is already OK cuts = cuts.resample(config.sample_rate) - # Duration filtering, same as native NeMo dataloaders. - cuts = cuts.filter(DurationFilter(config.min_duration, config.max_duration)) - # Expands cuts if multiple translations are provided. cuts = CutSet(LazyFlattener(cuts.map(_flatten_alt_text, apply_fn=None))) @@ -181,6 +187,24 @@ def get_lhotse_dataloader_from_config( if config.perturb_speed: cuts = CutSet.mux(cuts, cuts.perturb_speed(0.9), cuts.perturb_speed(1.1),) + # 2.d: truncation/slicing + if config.truncate_duration is not None: + cuts = cuts.truncate( + max_duration=config.truncate_duration, + offset_type=config.truncate_offset_type, + keep_excessive_supervisions=config.keep_excessive_supervisions, + ) + if config.cut_into_windows_duration is not None: + cuts = cuts.cut_into_windows( + duration=config.cut_into_windows_duration, + hop=config.cut_into_windows_hop, + keep_excessive_supervisions=config.keep_excessive_supervisions, + ) + + # Duration filtering, same as native NeMo dataloaders. + # We can filter after the augmentations because they are applied only when calling load_audio(). + cuts = cuts.filter(DurationFilter(config.min_duration, config.max_duration)) + if config.use_multimodal_sampling: constraint = MultimodalSamplingConstraint( token_equivalent_duration=config.token_equivalent_duration, diff --git a/scripts/audio_to_audio/convert_nemo_to_lhotse.py b/scripts/audio_to_audio/convert_nemo_to_lhotse.py new file mode 100644 index 000000000000..e498a3b2d460 --- /dev/null +++ b/scripts/audio_to_audio/convert_nemo_to_lhotse.py @@ -0,0 +1,77 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from nemo.collections.asr.data.audio_to_audio_lhotse import convert_manifest_nemo_to_lhotse + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert an audio-to-audio manifest from NeMo format to Lhotse format. " + "This step enables the use of Lhotse datasets for audio-to-audio processing. 
" + ) + parser.add_argument("input", help='Path to the input NeMo manifest.') + parser.add_argument( + "output", help="Path where we'll write the output Lhotse manifest (supported extensions: .jsonl.gz and .jsonl)" + ) + parser.add_argument( + "-i", + "--input_key", + default="audio_filepath", + help="Key of the input recording, mapped to Lhotse's 'Cut.recording'.", + ) + parser.add_argument( + "-t", + "--target_key", + default="target_filepath", + help="Key of the target recording, mapped to Lhotse's 'Cut.target_recording'.", + ) + parser.add_argument( + "-r", + "--reference_key", + default="reference_filepath", + help="Key of the reference recording, mapped to Lhotse's 'Cut.reference_recording'.", + ) + parser.add_argument( + "-e", + "--embedding_key", + default="embedding_filepath", + help="Key of the embedding, mapped to Lhotse's 'Cut.embedding_vector'.", + ) + parser.add_argument( + "-a", + "--force_absolute_paths", + action='store_true', + default=False, + help="Force absolute paths in the generated manifests.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + convert_manifest_nemo_to_lhotse( + input_manifest=args.input, + output_manifest=args.output, + input_key=args.input_key, + target_key=args.target_key, + reference_key=args.reference_key, + embedding_key=args.embedding_key, + force_absolute_paths=args.force_absolute_paths, + ) + + +if __name__ == "__main__": + main() diff --git a/tests/collections/asr/test_asr_datasets.py b/tests/collections/asr/test_asr_datasets.py index 4040227c5e3e..946acb614f11 100644 --- a/tests/collections/asr/test_asr_datasets.py +++ b/tests/collections/asr/test_asr_datasets.py @@ -34,6 +34,7 @@ AudioToTargetWithReferenceDataset, _audio_collate_fn, ) +from nemo.collections.asr.data.audio_to_audio_lhotse import LhotseAudioToTargetDataset, convert_manifest_nemo_to_lhotse from nemo.collections.asr.data.audio_to_text import ( DataStoreObject, TarredAudioToBPEDataset, @@ -52,6 +53,7 @@ from nemo.collections.asr.parts.utils.audio_utils import get_segment_start from nemo.collections.asr.parts.utils.manifest_utils import write_manifest from nemo.collections.common import tokenizers +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.utils import logging try: @@ -970,6 +972,27 @@ def test_audio_to_target_dataset(self): } dataset_factory = audio_to_audio_dataset.get_audio_to_target_dataset(config) + # Prepare lhotse manifest + cuts_path = manifest_filepath.replace('.json', '_cuts.jsonl') + convert_manifest_nemo_to_lhotse( + input_manifest=manifest_filepath, + output_manifest=cuts_path, + input_key=data_key['input_signal'], + target_key=data_key['target_signal'], + ) + + # Prepare lhotse dataset + config_lhotse = { + 'cuts_path': cuts_path, + 'use_lhotse': True, + 'sample_rate': sample_rate, + 'batch_size': 1, + } + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset_lhotse = [item for item in dl_lhotse] + # Test number of channels for signal in data: assert data_num_channels[signal] == dataset.num_channels( @@ -981,23 +1004,25 @@ def test_audio_to_target_dataset(self): # Test returned examples for n in range(num_examples): - item = dataset.__getitem__(n) - item_factory = dataset_factory.__getitem__(n) - for signal in data: - item_signal = item[signal].cpu().detach().numpy() golden_signal = data[signal][n] - assert ( - item_signal.shape == golden_signal.shape - ), f'Signal {signal}: item shape 
{item_signal.shape} not matching reference shape {golden_signal.shape}' - assert np.allclose( - item_signal, golden_signal, atol=atol - ), f'Test 1: Failed for example {n}, signal {signal} (random seed {random_seed})' - item_factory_signal = item_factory[signal].cpu().detach().numpy() - assert np.allclose( - item_factory_signal, golden_signal, atol=atol - ), f'Test 1: Failed for factory example {n}, signal {signal} (random seed {random_seed})' + for use_lhotse in [False, True]: + item_signal = ( + dataset_lhotse[n][signal].squeeze(0) if use_lhotse else dataset.__getitem__(n)[signal] + ) + item_factory_signal = dataset_factory.__getitem__(n)[signal] + + assert ( + item_signal.shape == golden_signal.shape + ), f'Test 1, use_lhotse={use_lhotse}: Signal {signal} item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' + assert np.allclose( + item_signal, golden_signal, atol=atol + ), f'Test 1, use_lhotse={use_lhotse}: Failed for example {n}, signal {signal} (random seed {random_seed})' + + assert np.allclose( + item_factory_signal, golden_signal, atol=atol + ), f'Test 1, use_lhotse={use_lhotse}: Failed for factory example {n}, signal {signal} (random seed {random_seed})' # Test 2 # - Filtering based on signal duration @@ -1013,20 +1038,36 @@ def test_audio_to_target_dataset(self): sample_rate=sample_rate, ) + # Prepare lhotse dataset + config_lhotse = { + 'cuts_path': cuts_path, + 'use_lhotse': True, + 'min_duration': min_duration, + 'max_duration': max_duration, + 'sample_rate': sample_rate, + 'batch_size': 1, + } + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset_lhotse = [item for item in dl_lhotse] + filtered_examples = [n for n, val in enumerate(data_duration) if min_duration <= val <= max_duration] for n in range(len(dataset)): - item = dataset.__getitem__(n) + for use_lhotse in [False, True]: + for signal in data: + item_signal = ( + dataset_lhotse[n][signal].squeeze(0) if use_lhotse else dataset.__getitem__(n)[signal] + ) + golden_signal = data[signal][filtered_examples[n]] + assert ( + item_signal.shape == golden_signal.shape + ), f'Test 2, use_lhotse={use_lhotse}: Signal {signal} item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' - for signal in data: - item_signal = item[signal].cpu().detach().numpy() - golden_signal = data[signal][filtered_examples[n]] - assert ( - item_signal.shape == golden_signal.shape - ), f'Signal {signal}: item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' - assert np.allclose( - item_signal, golden_signal, atol=atol - ), f'Test 2: Failed for example {n}, signal {signal} (random seed {random_seed})' + assert np.allclose( + item_signal, golden_signal, atol=atol + ), f'Test 2, use_lhotse={use_lhotse}: Failed for example {n}, signal {signal} (random seed {random_seed})' # Test 3 # - Use channel selector @@ -1078,58 +1119,98 @@ def test_audio_to_target_dataset(self): random_offset=random_offset, # random offset when selecting subsegment ) - for n in range(len(dataset)): - item = dataset.__getitem__(n) - - golden_start = golden_end = None - for signal in data: - item_signal = item[signal].cpu().detach().numpy() - full_golden_signal = data[signal][filtered_examples[n]] - - # Find random segment using correlation on the first channel - # of the first signal, and then use it fixed for other signals - if golden_start is None: - golden_start = get_segment_start( - 
signal=full_golden_signal[0, :], segment=item_signal[0, :] - ) - if not random_offset: - assert ( - golden_start == 0 - ), f'Expecting the signal to start at 0 when random_offset is False' - - golden_end = golden_start + audio_duration_samples - golden_signal = full_golden_signal[..., golden_start:golden_end] - - # Test length is correct - assert ( - item_signal.shape[-1] == audio_duration_samples - ), f'Test 4: Signal length ({item_signal.shape[-1]}) not matching the expected length ({audio_duration_samples})' + # Prepare lhotse dataset + config_lhotse = { + 'cuts_path': cuts_path, + 'use_lhotse': True, + 'min_duration': audio_duration, + 'truncate_duration': audio_duration, + 'truncate_offset_type': 'random' if random_offset else 'start', + 'sample_rate': sample_rate, + 'batch_size': 1, + } + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset_lhotse = [item for item in dl_lhotse] - assert ( - item_signal.shape == golden_signal.shape - ), f'Signal {signal}: item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' - # Test signal values - assert np.allclose( - item_signal, golden_signal, atol=atol - ), f'Test 4: Failed for example {n}, signal {signal} (random seed {random_seed})' + for n in range(len(dataset)): + for use_lhotse in [False, True]: + item = dataset_lhotse[n] if use_lhotse else dataset.__getitem__(n) + golden_start = golden_end = None + for signal in data: + item_signal = item[signal].squeeze(0) if use_lhotse else item[signal] + full_golden_signal = data[signal][filtered_examples[n]] + + # Find random segment using correlation on the first channel + # of the first signal, and then use it fixed for other signals + if golden_start is None: + golden_start = get_segment_start( + signal=full_golden_signal[0, :], segment=item_signal[0, :] + ) + if not random_offset: + assert ( + golden_start == 0 + ), f'Test 4, use_lhotse={use_lhotse}: Expecting the signal to start at 0 when random_offset is False' + + golden_end = golden_start + audio_duration_samples + golden_signal = full_golden_signal[..., golden_start:golden_end] + + # Test length is correct + assert ( + item_signal.shape[-1] == audio_duration_samples + ), f'Test 4, use_lhotse={use_lhotse}: Signal length ({item_signal.shape[-1]}) not matching the expected length ({audio_duration_samples})' + + assert ( + item_signal.shape == golden_signal.shape + ), f'Test 4, use_lhotse={use_lhotse}: Signal {signal} item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' + # Test signal values + assert np.allclose( + item_signal, golden_signal, atol=atol + ), f'Test 4, use_lhotse={use_lhotse}: Failed for example {n}, signal {signal} (random seed {random_seed})' # Test 5: # - Test collate_fn batch_size = 16 - batch = [dataset.__getitem__(n) for n in range(batch_size)] - batched = dataset.collate_fn(batch) - for n, signal in enumerate(data.keys()): - signal_shape = batched[2 * n].shape - signal_len = batched[2 * n + 1] + for use_lhotse in [False, True]: + if use_lhotse: + # Get batch from lhotse dataloader + config_lhotse['batch_size'] = batch_size + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), + global_rank=0, + world_size=1, + dataset=LhotseAudioToTargetDataset(), + ) + batched = next(iter(dl_lhotse)) + else: + # Get examples from dataset and collate into a batch + batch = [dataset.__getitem__(n) for n in range(batch_size)] + batched = 
dataset.collate_fn(batch) - assert signal_shape == ( - batch_size, - data_num_channels[signal], - audio_duration_samples, - ), f'Test 5: Unexpected signal {signal} shape {signal_shape}' - assert len(signal_len) == batch_size, f'Test 5: Unexpected length of signal_len ({len(signal_len)})' - assert all(signal_len == audio_duration_samples), f'Test 5: Unexpected signal_len {signal_len}' + # Test all shapes and lengths + for n, signal in enumerate(data.keys()): + length = signal.replace('_signal', '_length') + + if isinstance(batched, dict): + signal_shape = batched[signal].shape + signal_len = batched[length] + else: + signal_shape = batched[2 * n].shape + signal_len = batched[2 * n + 1] + + assert signal_shape == ( + batch_size, + data_num_channels[signal], + audio_duration_samples, + ), f'Test 5, use_lhotse={use_lhotse}: Unexpected signal {signal} shape {signal_shape}' + assert ( + len(signal_len) == batch_size + ), f'Test 5, use_lhotse={use_lhotse}: Unexpected length of signal_len ({len(signal_len)})' + assert all( + signal_len == audio_duration_samples + ), f'Test 5, use_lhotse={use_lhotse}: Unexpected signal_len {signal_len}' @pytest.mark.unit def test_audio_to_target_dataset_with_target_list(self): @@ -1237,28 +1318,49 @@ def test_audio_to_target_dataset_with_target_list(self): } dataset_factory = audio_to_audio_dataset.get_audio_to_target_dataset(config) - for n in range(num_examples): - item = dataset.__getitem__(n) - item_factory = dataset_factory.__getitem__(n) + # Prepare lhotse manifest + cuts_path = manifest_filepath.replace('.json', '_cuts.jsonl') + convert_manifest_nemo_to_lhotse( + input_manifest=manifest_filepath, + output_manifest=cuts_path, + input_key=data_key['input_signal'], + target_key=data_key['target_signal'], + ) - for signal in data: - item_signal = item[signal].cpu().detach().numpy() - golden_signal = data[signal][n] - assert ( - item_signal.shape == golden_signal.shape - ), f'Signal {signal}: item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' - assert np.allclose( - item_signal, golden_signal, atol=atol - ), f'Test 1: Failed for example {n}, signal {signal} (random seed {random_seed})' + # Prepare lhotse dataset + config_lhotse = { + 'cuts_path': cuts_path, + 'use_lhotse': True, + 'sample_rate': sample_rate, + 'batch_size': 1, + } + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset_lhotse = [item for item in dl_lhotse] - item_factory_signal = item_factory[signal].cpu().detach().numpy() - assert np.allclose( - item_factory_signal, golden_signal, atol=atol - ), f'Test 1: Failed for factory example {n}, signal {signal} (random seed {random_seed})' + for n in range(num_examples): + for use_lhotse in [False, True]: + item = dataset_lhotse[n] if use_lhotse else dataset.__getitem__(n) + item_factory = dataset_factory.__getitem__(n) + for signal in data: + item_signal = item[signal].squeeze(0) if use_lhotse else item[signal] + golden_signal = data[signal][n] + assert ( + item_signal.shape == golden_signal.shape + ), f'Test 1, use_lhotse={use_lhotse}: Signal {signal} item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' + assert np.allclose( + item_signal, golden_signal, atol=atol + ), f'Test 1, use_lhotse={use_lhotse}: Failed for example {n}, signal {signal} (random seed {random_seed})' + + assert np.allclose( + item_factory[signal], golden_signal, atol=atol + ), f'Test 1, use_lhotse={use_lhotse}: 
Failed for factory example {n}, signal {signal} (random seed {random_seed})' # Test 2 # Set target as the first channel of input_filepath and all files listed in target_filepath. # In this case, the target will have 3 channels. + # Note: this is currently not supported by lhotse, so we only test the default dataset here. dataset = AudioToTargetDataset( manifest_filepath=manifest_filepath, input_key=data_key['input_signal'], @@ -1367,29 +1469,55 @@ def test_audio_to_target_dataset_for_inference(self): } dataset_factory = audio_to_audio_dataset.get_audio_to_target_dataset(config) + # Prepare lhotse manifest + cuts_path = manifest_filepath.replace('.json', '_cuts.jsonl') + convert_manifest_nemo_to_lhotse( + input_manifest=manifest_filepath, + output_manifest=cuts_path, + input_key=data_key['input_signal'], + target_key=None, + ) + + # Prepare lhotse dataset + config_lhotse = { + 'cuts_path': cuts_path, + 'use_lhotse': True, + 'sample_rate': sample_rate, + 'batch_size': 1, + } + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset_lhotse = [item for item in dl_lhotse] + for n in range(num_examples): - item = dataset.__getitem__(n) - item_factory = dataset_factory.__getitem__(n) - # Check target is None - assert item['target_signal'].numel() == 0, 'target_signal is expected to be empty.' - assert item_factory['target_signal'].numel() == 0, 'target_signal is expected to be empty.' + for label in ['original', 'factory', 'lhotse']: - # Check valid signals - for signal in data: - item_signal = item[signal].cpu().detach().numpy() - golden_signal = data[signal][n] - assert ( - item_signal.shape == golden_signal.shape - ), f'Signal {signal}: item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' - assert np.allclose( - item_signal, golden_signal, atol=atol - ), f'Test 1: Failed for example {n}, signal {signal} (random seed {random_seed})' + if label == 'original': + item = dataset.__getitem__(n) + elif label == 'factory': + item = dataset_factory.__getitem__(n) + elif label == 'lhotse': + item = dataset_lhotse[n] + else: + raise ValueError(f'Unknown label {label}') - item_factory_signal = item_factory[signal].cpu().detach().numpy() - assert np.allclose( - item_factory_signal, golden_signal, atol=atol - ), f'Test 1: Failed for factory example {n}, signal {signal} (random seed {random_seed})' + # Check target is None + if 'target_signal' in item: + assert item['target_signal'].numel() == 0, f'{label}: target_signal is expected to be empty.' 
+ + # Check valid signals + for signal in data: + + item_signal = item[signal].squeeze(0) if label == 'lhotse' else item[signal] + golden_signal = data[signal][n] + assert ( + item_signal.shape == golden_signal.shape + ), f'{label} -- Signal {signal}: item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' + assert np.allclose( + item_signal, golden_signal, atol=atol + ), f'{label} -- Test 1: Failed for example {n}, signal {signal} (random seed {random_seed})' @pytest.mark.unit def test_audio_to_target_with_reference_dataset(self): diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index 4e89a93e83e4..791c5df1c018 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -21,6 +21,8 @@ import numpy as np import pytest import torch +from lhotse import CutSet, NumpyFilesWriter, Recording +from lhotse.audio import AudioLoadingError from lhotse.cut import Cut from lhotse.cut.text import TextPairExample from omegaconf import OmegaConf @@ -189,6 +191,62 @@ def test_dataloader_from_lhotse_cuts(cutset_path: Path): assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 1 +def test_dataloader_from_lhotse_cuts_truncate(cutset_path: Path): + config = OmegaConf.create( + { + "cuts_path": cutset_path, + "truncate_duration": 0.5, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config( + config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() + ) + + batches = [b for b in dl] + assert len(batches) == 3 + # 0.5s = 8000 samples, note the constant duration and batch size except for last batch + assert batches[0]["audio"].shape == (4, 8000) + assert batches[1]["audio"].shape == (4, 8000) + assert batches[2]["audio"].shape == (2, 8000) + # exactly 10 cuts were used + + +def test_dataloader_from_lhotse_cuts_cut_into_windows(cutset_path: Path): + config = OmegaConf.create( + { + "cuts_path": cutset_path, + "cut_into_windows_duration": 0.5, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config( + config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() + ) + + batches = [b for b in dl] + assert len(batches) == 5 + # 0.5s = 8000 samples, note the constant duration and batch size + assert batches[0]["audio"].shape == (4, 8000) + assert batches[1]["audio"].shape == (4, 8000) + assert batches[2]["audio"].shape == (4, 8000) + assert batches[3]["audio"].shape == (4, 8000) + assert batches[4]["audio"].shape == (4, 8000) + # exactly 20 cuts were used because we cut 10x 1s cuts into 20x 0.5s cuts + + @requires_torchaudio def test_dataloader_from_lhotse_shar_cuts(cutset_shar_path: Path): config = OmegaConf.create( @@ -770,7 +828,46 @@ def test_lazy_nemo_iterator_with_relative_paths(tmp_path: Path): assert cut.num_samples == 8000 assert cut.supervisions[0].text == "irrelevant" assert audio.shape == (1, 8000) - np.testing.assert_allclose(audio[0], expected_audio[:8000], atol=5e-5) + np.testing.assert_equal(audio[0], expected_audio[:8000]) + + +def test_lhotse_cuts_resolve_relative_paths(tmp_path: Path): + cuts_path = tmp_path / "cuts.jsonl.gz" + audio_path = tmp_path / "_relative_test_audio_.wav" + lhotse.audio.save_audio(audio_path, np.random.rand(16000) - 0.5, 16000) + cut = 
Recording.from_file(audio_path).to_cut() + cut.recording.sources[0].source = str(audio_path.name) # make the path relative + cut.target_recording = cut.recording # assign a custom field with relative path + with NumpyFilesWriter(tmp_path) as w: + cut.some_array = w.store_array(cut.id, np.random.randn(32)) + cut.some_array.storage_path = "" # relative path + + with pytest.raises(AudioLoadingError): + cut.load_audio() # Lhotse doesn't know about what the path should be relative to + cut.load_target_recording() + + CutSet([cut]).to_file(cuts_path) + + config = OmegaConf.create( + {"cuts_path": cuts_path, "sample_rate": 16000, "use_lhotse": True, "num_workers": 0, "batch_size": 2,} + ) + + class _Identity(torch.utils.data.Dataset): + def __getitem__(self, x): + return x + + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=_Identity()) + + batches = [batch for batch in dl] + assert len(batches) == 1 + + for cut in batches[0]: + assert cut.has_recording + cut.load_audio() # works + assert cut.has_custom("target_recording") + cut.load_target_recording() + assert cut.has_custom("some_array") + cut.load_some_array() class Identity(torch.utils.data.Dataset): From 468d5b6d369733909524d42b80a514f33bc19263 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:39:31 -0700 Subject: [PATCH 39/39] Akoumparouli/low mem mixtral ckpt converter (#8895) * add --low-mem option to enable conversion of large checkpoints with low ram requirements Signed-off-by: Alexandros Koumparoulis * delete param_to_weights Signed-off-by: Alexandros Koumparoulis * various fixes; set hf dtype to auto Signed-off-by: Alexandros Koumparoulis * remove unused lien Signed-off-by: Alexandros Koumparoulis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../convert_mixtral_hf_to_nemo.py | 109 ++++++++++++++---- 1 file changed, 88 insertions(+), 21 deletions(-) diff --git a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py index ac323757a2f6..98143c0328ec 100644 --- a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py @@ -24,6 +24,7 @@ import os from argparse import ArgumentParser from collections import OrderedDict +from pathlib import Path import megatron.core.parallel_state as parallel_state import torch @@ -43,6 +44,8 @@ ) from nemo.utils import logging +torch.set_grad_enabled(False) + def get_args(): parser = ArgumentParser() @@ -51,6 +54,8 @@ def get_args(): ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") parser.add_argument("--precision", type=str, default="32", help="Model precision") + parser.add_argument('--low-ram', action='store_true') + parser.add_argument('--tmp-dir', default='/tmp/mixtral_ckpt_parts/') args = parser.parse_args() return args @@ -108,6 +113,9 @@ def load_config(mixtral_config, tokenizer_path): # RMSNorm's epsilon. 
nemo_config.layernorm_epsilon = mixtral_config['rms_norm_eps'] nemo_config.normalization = 'rmsnorm' + nemo_config.micro_batch_size = 1 + nemo_config.global_batch_size = 1 + nemo_config.expert_model_parallel_size = 1 if 'num_key_value_heads' in mixtral_config: nemo_config.num_query_groups = mixtral_config['num_key_value_heads'] @@ -132,24 +140,28 @@ def load_config(mixtral_config, tokenizer_path): return nemo_config -def load_mixtral_ckpt(in_dir): +def load_hf_model_args(in_dir): params_file = os.path.join(in_dir, 'config.json') assert os.path.exists(params_file) with open(params_file, 'r') as fp: model_args = json.load(fp) + return model_args + - model = AutoModelForCausalLM.from_pretrained(in_dir) - ckpt = model.state_dict() +def load_mixtral_ckpt(in_dir, load_model=True): + model_args = load_hf_model_args(in_dir) + ckpt = None + if load_model: + model = AutoModelForCausalLM.from_pretrained(in_dir, torch_dtype='auto') + ckpt = model.state_dict() tokenizer = AutoTokenizer.from_pretrained(in_dir) assert tokenizer.vocab_size == model_args['vocab_size'] return model_args, ckpt, tokenizer -def convert(args): - logging.info(f"loading checkpoint {args.input_name_or_path}") - - model_args, ckpt, tokenizer = load_mixtral_ckpt(args.input_name_or_path) +def make_trainer(args, nemo_config): + model_args, ckpt, tokenizer = load_mixtral_ckpt(args.input_name_or_path, load_model=False) nemo_config = load_config(model_args, tokenizer.vocab_file) if args.precision in ["32", "16"]: @@ -195,6 +207,14 @@ def convert(args): print(f"nemo_config: {nemo_config}") trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) + return trainer, dtype + + +def convert(args): + logging.info(f"loading checkpoint {args.input_name_or_path}") + + model_args, ckpt, tokenizer = load_mixtral_ckpt(args.input_name_or_path) + nemo_config = load_config(model_args, tokenizer.vocab_file) hidden_size = nemo_config.hidden_size head_num = nemo_config.num_attention_heads @@ -207,8 +227,6 @@ def convert(args): 'transformer_engine', False ), "mcore_gpt transformer_engine must be enabled (or disabled) together." - param_to_weights = lambda param: param.float() - checkpoint = OrderedDict() checkpoint['state_dict'] = OrderedDict() @@ -217,7 +235,7 @@ def convert(args): embed_weights_base_name = f'model.embedding.word_embeddings.weight' else: embed_weights_base_name = f'model.language_model.embedding.word_embeddings.weight' - checkpoint['state_dict'][embed_weights_base_name] = param_to_weights(embed_weight) + checkpoint['state_dict'][embed_weights_base_name] = embed_weight if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num: num_query_groups = head_num @@ -227,6 +245,10 @@ def convert(args): if mcore_gpt: assert nemo_config.activation.startswith('fast-'), 'mcore only supports fast version of gated linear unit.' 
+ yield checkpoint + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() + for l in range(int(num_layers)): print(f"converting layer {l}") old_tensor_shape = ckpt[f'model.layers.{l}.self_attn.q_proj.weight'].size() @@ -249,7 +271,7 @@ def convert(args): qkv_weights_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.weight' else: qkv_weights_base_name = f'model.language_model.encoder.layers.{l}.self_attention.query_key_value.weight' - checkpoint['state_dict'][qkv_weights_base_name] = param_to_weights(qkv_weights) + checkpoint['state_dict'][qkv_weights_base_name] = qkv_weights # attention dense o_weight = ckpt[f'model.layers.{l}.self_attn.o_proj.weight'] @@ -257,7 +279,7 @@ def convert(args): o_weight_base_name = f'model.decoder.layers.{l}.self_attention.linear_proj.weight' else: o_weight_base_name = f'model.language_model.encoder.layers.{l}.self_attention.dense.weight' - checkpoint['state_dict'][o_weight_base_name] = param_to_weights(o_weight) + checkpoint['state_dict'][o_weight_base_name] = o_weight # # MLP # Handle gate @@ -266,7 +288,7 @@ def convert(args): moe_gate_name = f'model.decoder.layers.{l}.mlp.router.weight' else: raise Exception("not implemented") - checkpoint['state_dict'][moe_gate_name] = param_to_weights(moe_gate) + checkpoint['state_dict'][moe_gate_name] = moe_gate # Handle experts for i in range(nemo_config.num_moe_experts): gate_proj = ckpt[f'model.layers.{l}.block_sparse_moe.experts.{i}.w1.weight'] @@ -276,14 +298,14 @@ def convert(args): else: raise Exception("not implemented") mlp_down_weight = torch.cat((gate_proj, up_proj), axis=0) - checkpoint['state_dict'][mlp_down_base_name] = param_to_weights(mlp_down_weight) + checkpoint['state_dict'][mlp_down_base_name] = mlp_down_weight mlp_up_weight = ckpt[f'model.layers.{l}.block_sparse_moe.experts.{i}.w2.weight'] if mcore_gpt: mlp_up_base_name = f'model.decoder.layers.{l}.mlp.experts.local_experts.{i}.linear_fc2.weight' else: raise Exception("not implemented") - checkpoint['state_dict'][mlp_up_base_name] = param_to_weights(mlp_up_weight) + checkpoint['state_dict'][mlp_up_base_name] = mlp_up_weight # LayerNorm input_ln_weight = ckpt[f'model.layers.{l}.input_layernorm.weight'] @@ -292,7 +314,7 @@ def convert(args): input_ln_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight' else: input_ln_base_name = f'model.language_model.encoder.layers.{l}.input_layernorm.weight' - checkpoint['state_dict'][input_ln_base_name] = param_to_weights(input_ln_weight) + checkpoint['state_dict'][input_ln_base_name] = input_ln_weight post_attn_ln_weight = ckpt[f'model.layers.{l}.post_attention_layernorm.weight'] if mcore_gpt: @@ -301,28 +323,57 @@ def convert(args): post_attn_ln_base_name = f'model.decoder.layers.{l}.pre_mlp_layernorm.weight' else: post_attn_ln_base_name = f'model.language_model.encoder.layers.{l}.post_attention_layernorm.weight' - checkpoint['state_dict'][post_attn_ln_base_name] = param_to_weights(post_attn_ln_weight) + checkpoint['state_dict'][post_attn_ln_base_name] = post_attn_ln_weight print(f"done layer {l}") + yield checkpoint + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() + final_ln_weight = ckpt[f'model.norm.weight'] if mcore_gpt: final_ln_base_name = f'model.decoder.final_layernorm.weight' else: final_ln_base_name = f'model.language_model.encoder.final_layernorm.weight' - checkpoint['state_dict'][final_ln_base_name] = param_to_weights(final_ln_weight) + checkpoint['state_dict'][final_ln_base_name] = final_ln_weight 
output_layer_weight = ckpt[f'lm_head.weight'] if mcore_gpt: output_layer_base_name = f'model.output_layer.weight' else: output_layer_base_name = f'model.language_model.output_layer.weight' - checkpoint['state_dict'][output_layer_base_name] = param_to_weights(output_layer_weight) + checkpoint['state_dict'][output_layer_base_name] = output_layer_weight checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config - + yield checkpoint del ckpt + +def merge(a: dict, b: dict, path=[]): + is_dict = lambda x: isinstance(x, OrderedDict) or isinstance(x, dict) + for key in b: + if key in a: + if is_dict(a[key]) and is_dict(b[key]): + merge(a[key], b[key], path + [str(key)]) + elif a[key] != b[key]: + raise Exception('Value conflict: ' + '.'.join(path + [str(key)])) + else: + a[key] = b[key] + return a + + +def save_to_nemo(args, checkpoint): + + logging.info(f"loading checkpoint {args.input_name_or_path}") + model_args, ckpt, tokenizer = load_mixtral_ckpt(args.input_name_or_path, load_model=False) + nemo_config = load_config(model_args, tokenizer.vocab_file) + trainer, dtype = make_trainer(args, nemo_config) + + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY].use_cpu_initialization = True + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY].perform_initialization = False + if nemo_config.get('megatron_amp_O2', False): keys = list(checkpoint['state_dict'].keys()) for key in keys: @@ -342,5 +393,21 @@ def convert(args): if __name__ == '__main__': args = get_args() + if args.low_ram: + os.makedirs(args.tmp_dir, exist_ok=True) + parallel_state.set_expert_model_parallel_world_size(1) - convert(args) + checkpoint = OrderedDict() + for i, ckpt_part in enumerate(convert(args)): + if args.low_ram: + torch.save(ckpt_part, f'{args.tmp_dir}/nemo_ckpt_part_{i}.pth') + else: + checkpoint = merge(checkpoint, ckpt_part) + + if args.low_ram: + print("Loading partial checkpoints") + for path in map(str, Path(args.tmp_dir).rglob("*.pth")): + print(f"Loading checkpoint: {path}") + checkpoint = merge(checkpoint, torch.load(path, mmap=True)) + + save_to_nemo(args, checkpoint)