From e0b3fe51615ac23e14f5393ad200bf6ed74bfbf0 Mon Sep 17 00:00:00 2001 From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com> Date: Mon, 22 Apr 2024 16:55:42 -0400 Subject: [PATCH 01/30] Skip top_p computations when set to 1.0 (#8905) Signed-off-by: Olivier Delalleau <507137+odelalleau@users.noreply.github.com> Co-authored-by: Pablo Garay --- nemo/collections/nlp/modules/common/text_generation_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index 660fa04bb08d..3daf93ac0ed2 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -303,7 +303,7 @@ def top_k_logits(logits, top_k=0, top_p=0.0, filter_value=-float('Inf'), started else: logits[indices_to_remove] = filter_value - if top_p > 0.0: + if 0.0 < top_p < 1.0: # Cconvert to 1D sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) From 96187eac848ebf02c56e9fc658a57a500a56a842 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Mon, 22 Apr 2024 19:27:46 -0400 Subject: [PATCH 02/30] precision fix (#8962) Signed-off-by: Chen Cui Co-authored-by: Eric Harper Co-authored-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Co-authored-by: Pablo Garay --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index e5e48cdc10da..c2e1f0ed48b7 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -166,7 +166,7 @@ def forward(self, **kwargs): the superclass by the square root of the hidden size specified in the configuration. 
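        A minimal, self-contained sketch of the dtype-preserving scaling used in the
        changed return statement below (assumes only ``torch``; the hidden size and
        tensor shape are made-up values):

            >>> import torch
            >>> emb = torch.randn(2, 8, 4096, dtype=torch.bfloat16)
            >>> scale = torch.tensor(4096 ** 0.5, dtype=emb.dtype)
            >>> (emb * scale).dtype
            torch.bfloat16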
""" embeddings = super().forward(**kwargs) - return embeddings * (self.config.hidden_size ** 0.5) + return embeddings * torch.tensor(self.config.hidden_size ** 0.5, dtype=embeddings.dtype) class MegatronGPTExportableModel(torch.nn.Module, Exportable): From a6424e9c5cb3087d2448f9307ab0370fd14befd5 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Tue, 23 Apr 2024 08:26:02 +0200 Subject: [PATCH 03/30] [NeMo-UX] Adding megatron strategy (#8995) * Adding MegatronParallel Signed-off-by: Marc Romeyn * Move over _strategy_liMegatronCheckpointIO Signed-off-by: Marc Romeyn * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Marc Romeyn * add strategy lib test Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove unused test and add comment Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Marc Romeyn Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Chen Cui --- nemo/io/__init__.py | 0 nemo/io/pl.py | 167 ++++++ nemo/lightning/_strategy_lib.py | 438 ++++++++++++++++ nemo/lightning/pytorch/__init__.py | 0 nemo/lightning/pytorch/callbacks/__init__.py | 3 + nemo/lightning/pytorch/callbacks/progress.py | 67 +++ nemo/lightning/pytorch/strategies.py | 502 +++++++++++++++++++ tests/lightning/test_megatron_parallel.py | 24 +- tests/lightning/test_strategy_lib.py | 211 ++++++++ 9 files changed, 1400 insertions(+), 12 deletions(-) create mode 100644 nemo/io/__init__.py create mode 100644 nemo/io/pl.py create mode 100644 nemo/lightning/_strategy_lib.py create mode 100644 nemo/lightning/pytorch/__init__.py create mode 100644 nemo/lightning/pytorch/callbacks/__init__.py create mode 100644 nemo/lightning/pytorch/callbacks/progress.py create mode 100644 nemo/lightning/pytorch/strategies.py create mode 100644 tests/lightning/test_strategy_lib.py diff --git a/nemo/io/__init__.py b/nemo/io/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/io/pl.py b/nemo/io/pl.py new file mode 100644 index 000000000000..f6bf46557b43 --- /dev/null +++ b/nemo/io/pl.py @@ -0,0 +1,167 @@ +import logging +from pathlib import Path +from typing import Any, Callable, Dict, Optional, TypeVar, Union + +import lightning as L +import torch +from lightning.fabric.plugins.io.checkpoint_io import CheckpointIO +from lightning.fabric.utilities.cloud_io import get_filesystem +from lightning.fabric.utilities.types import _PATH +from torch import nn +from typing_extensions import override + + +log = logging.getLogger(__name__) + + +LightningModuleT = TypeVar("LightningModuleT", bound=L.LightningModule) +ModuleT = TypeVar("ModuleT", bound=nn.Module) + + +class MegatronCheckpointIO(CheckpointIO): + """CheckpointIO that utilizes :func:`torch.save` and :func:`torch.load` to save and load checkpoints respectively, + common for most use cases. + + .. warning:: This is an :ref:`experimental ` feature. + + """ + + @override + def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None: + """Save model/training states as a checkpoint file through state-dump and file-write. 
+ + Args: + checkpoint: dict containing model and trainer state + path: write-target path + storage_options: not used in ``TorchCheckpointIO.save_checkpoint`` + + Raises + ------ + TypeError: + If ``storage_options`` arg is passed in + + """ + from megatron.core import dist_checkpointing + + if storage_options is not None: + raise TypeError( + "`Trainer.save_checkpoint(..., storage_options=...)` with `storage_options` arg" + f" is not supported for `{self.__class__.__name__}`. Please implement your custom `CheckpointIO`" + " to define how you'd like to use `storage_options`." + ) + checkpoint_dir = ckpt_to_dir(path) + fs = get_filesystem(checkpoint_dir) + if fs.isdir(checkpoint_dir) and dist_checkpointing.check_is_distributed_checkpoint(checkpoint_dir): + logging.info(f'Distributed checkpoint at path {checkpoint_dir} already exists, skipping saving') + return + + fs.makedirs(checkpoint_dir, exist_ok=True) + dist_checkpointing.save(sharded_state_dict=checkpoint, checkpoint_dir=str(checkpoint_dir)) + + @override + def load_checkpoint( + self, path: _PATH, sharded_state_dict=None, map_location: Optional[Callable] = None + ) -> Dict[str, Any]: + """Loads checkpoint using :func:`torch.load`, with additional handling for ``fsspec`` remote loading of files. + + Args: + path: Path to checkpoint + map_location: a function, :class:`torch.device`, string or a dict specifying how to remap storage + locations. + + Returns: The loaded checkpoint. + + Raises + ------ + FileNotFoundError: If ``path`` is not found by the ``fsspec`` filesystem + + """ + from megatron.core import dist_checkpointing + + if map_location is not None: + raise ValueError("`map_location` argument is not supported for `MegatronCheckpointIO.load_checkpoint`.") + + # Try to read the checkpoint at `path`. If not exist, do not restore checkpoint. + fs = get_filesystem(path) + if not fs.exists(path): + raise FileNotFoundError(f"Checkpoint file not found: {path}") + if not fs.isdir(path): + raise ValueError(f"Distributed checkpoints should be a directory. Found: {path}.") + + # return pl_load(path, map_location=map_location) + + checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=str(path)) + checkpoint = _fix_tensors_device(checkpoint) + + return checkpoint + + @override + def remove_checkpoint(self, path: _PATH) -> None: + """Remove checkpoint file from the filesystem. + + Args: + path: Path to checkpoint + + """ + fs = get_filesystem(path) + if fs.exists(path): + fs.rm(path, recursive=True) + log.debug(f"Removed checkpoint: {path}") + + +def _fix_tensors_device(ckpt: Dict) -> Dict: + """Ensure checkpoint tensors are on the correct device.""" + assert torch.cuda.is_initialized(), (torch.cuda.is_available(), torch.cuda.is_initialized()) + cur_dev = torch.device("cuda", index=torch.cuda.current_device()) + + from megatron.core.dist_checkpointing.dict_utils import dict_list_map_outplace + + def _fix_device(t): + if isinstance(t, torch.Tensor) and t.is_cuda and t.device != cur_dev: + t = t.to(cur_dev) + return t + + return dict_list_map_outplace(_fix_device, ckpt) + + +def ckpt_to_dir(filepath: Union[str, Path]) -> Path: + """PTL considers checkpoints as .ckpt files. + This method removes the extension and returns a path + to be used as a directory for distributed checkpoints. 
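    Illustrative mapping on a POSIX filesystem (paths are made-up examples):

        >>> ckpt_to_dir("results/checkpoints/epoch=1-step=100.ckpt")
        PosixPath('results/checkpoints/epoch=1-step=100')
        >>> ckpt_to_dir("results/checkpoints/last")
        PosixPath('results/checkpoints/last')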
+ """ + filepath = Path(filepath) + + if not filepath.suffix == ".ckpt": + filepath = filepath.with_suffix(filepath.suffix + ".ckpt") + + # adding this assert because we will later remove directories based on the return value of this method + assert filepath.suffix == ".ckpt", f"filepath: {filepath} must have .ckpt extension" + + # create a new path whose name is the original filepath without the .ckpt extension + checkpoint_dir = filepath.with_name(filepath.stem) + + return checkpoint_dir + + +def is_distributed_ckpt(path) -> bool: + """Check if the given path corresponds to a distributed checkpoint directory. + + This function determines if the specified path is a directory that contains a distributed + checkpoint by checking the directory's metadata. + + Args: + path (Union[str, Path]): The path to check for being a distributed checkpoint. + + Returns + ------- + bool: True if the path is a distributed checkpoint directory, False otherwise. + + """ + from megatron.core import dist_checkpointing + + checkpoint_dir = ckpt_to_dir(path) + fs = get_filesystem(checkpoint_dir) + if fs.isdir(checkpoint_dir) and dist_checkpointing.check_is_distributed_checkpoint(checkpoint_dir): + return True + + return False diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py new file mode 100644 index 000000000000..e3f5f146ff12 --- /dev/null +++ b/nemo/lightning/_strategy_lib.py @@ -0,0 +1,438 @@ +import itertools +import os +from collections import defaultdict +from contextlib import contextmanager +from typing import TYPE_CHECKING, Any, Dict, Generator, Optional, Protocol, TypeVar + +import torch +from torch import nn + +NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE = "NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE" + + +if TYPE_CHECKING: + from lightning.fabric.utilities.types import Optimizable + from megatron.core.model_parallel_config import ModelParallelConfig + + +class SharedStateDictProtocol(Protocol): + def sharded_state_dict(self, prefix=""): + ... + + +def init_parallel_ranks( + world_size: int, global_rank: int, local_rank: int, parallel_config: "ModelParallelConfig", seed=1234, fp8=False, +) -> None: + """ + Initializes the parallel ranks for distributed training. + + This function sets up the parallel ranks based on the provided world size, global rank, local rank, + and parallel configuration. It also sets the seed for random number generation and determines whether + to use fp8 precision. + + Args: + world_size (int): The total number of processes participating in the distributed training. + global_rank (int): The rank of the current process in the distributed training setup. + local_rank (int): The rank of the current process within its machine. + parallel_config (ModelParallelConfig): The configuration object containing settings for model parallelism. + seed (int, optional): The seed for random number generation. Defaults to 1234. + fp8 (bool, optional): Whether to use fp8 precision for model parameters. Defaults to False. 
+ """ + from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo + from nemo.utils import AppState + + app_state = AppState() + + if os.environ.get(NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, "false").lower() == "true": + init_world_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size + init_global_rank = app_state.global_rank + init_local_rank = app_state.local_rank + else: + init_world_size = world_size + init_global_rank = global_rank + init_local_rank = local_rank + + initialize_model_parallel_for_nemo( + world_size=init_world_size, + global_rank=init_global_rank, + local_rank=init_local_rank, + tensor_model_parallel_size=parallel_config.tensor_model_parallel_size, + pipeline_model_parallel_size=parallel_config.pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size=parallel_config.virtual_pipeline_model_parallel_size, + seed=seed, + pipeline_model_parallel_split_rank=getattr(parallel_config, "pipeline_model_parallel_split_rank", None), + use_fp8=fp8, + init_mpi_proc_group=getattr(parallel_config, "ub_tp_comm_overlap", False), + # apex_transformer_log_level=self.cfg.get('apex_transformer_log_level', 30), + ) + + +def init_model_parallel(model: Optional[nn.Module] = None) -> None: + """Initializes Megatron-LM model parallel if using model parallelism.""" + import torch.distributed + from megatron.core import parallel_state + + from nemo.utils import AppState + + app_state = AppState() + + # we initialize megatron-lm model parallel and data parallel groups + # after initializing DDP with PTL. + if app_state.model_parallel_size is not None: + # destroy groups in case they have already been created + # this happens with multiple calls to trainer.test for example + parallel_state.destroy_model_parallel() + if torch.distributed.is_initialized(): + parallel_state.initialize_model_parallel( + tensor_model_parallel_size=app_state.tensor_model_parallel_size, + pipeline_model_parallel_size=app_state.pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size=app_state.virtual_pipeline_model_parallel_size, + pipeline_model_parallel_split_rank=app_state.pipeline_model_parallel_split_rank, + ) + + # assert that fake tp and pp rank match after model parallel init + assert app_state.tensor_model_parallel_rank == parallel_state.get_tensor_model_parallel_rank() + assert app_state.pipeline_model_parallel_rank == parallel_state.get_pipeline_model_parallel_rank() + + app_state.tensor_model_parallel_group = parallel_state.get_tensor_model_parallel_group() + app_state.data_parallel_group = parallel_state.get_data_parallel_group() + app_state.data_parallel_rank = parallel_state.get_data_parallel_rank() + app_state.data_parallel_size = parallel_state.get_data_parallel_world_size() + app_state.pipeline_model_parallel_group = parallel_state.get_pipeline_model_parallel_group() + + # create MPI process group for UCX-based communication APIs + if app_state.init_mpi_proc_group: + torch.distributed.new_group(backend="mpi") + + if model: + # Set TP group + # Deep iterate but skip self to avoid infinite recursion. 
+ for index, child in enumerate(model.modules()): + if index == 0: + continue + if hasattr(child, "set_tensor_parallel_group"): + tp_group = parallel_state.get_tensor_model_parallel_group() + child.set_tensor_parallel_group(tp_group) + + +@contextmanager +def megatron_lazy_init_context(config) -> Generator[None, None, None]: + def monkey_patched(c): + return {"device": "meta"} + + from megatron.core.transformer.custom_layers import transformer_engine as _te + + original = _te._get_extra_te_kwargs # noqa: SLF001 + _te._get_extra_te_kwargs = monkey_patched # noqa: SLF001 + + _orig_perform_initialization = config.perform_initialization + _orig_use_cpu_initialization = config.use_cpu_initialization + + config.perform_initialization = False + config.use_cpu_initialization = True + + yield + + _te._get_extra_te_kwargs = original # noqa: SLF001 + config.perform_initialization = _orig_perform_initialization + config.use_cpu_initialization = _orig_use_cpu_initialization + + +@contextmanager +def megatron_cpu_init_context(config) -> Generator[None, None, None]: + _orig_use_cpu_initialization = config.use_cpu_initialization + + config.use_cpu_initialization = True + + yield + + config.use_cpu_initialization = _orig_use_cpu_initialization + + +ModelT = TypeVar("ModelT", bound=nn.Module) + + +class GradScaler(torch.cuda.amp.GradScaler): + """ + Gradient sclaer for model-parallel inf check. The inf in gradients are checked across tensor-parallel + ranks in (1) executing optimizer step and (2) gradient scaler update. + + """ + + def __init__( + self, + init_scale=2.0 ** 16, + growth_factor=2.0, + backoff_factor=0.5, + growth_interval=2000, + enabled=True, + hysteresis=1, + ): + super().__init__( + init_scale=init_scale, + growth_factor=growth_factor, + backoff_factor=backoff_factor, + growth_interval=growth_interval, + enabled=enabled, + ) + self.optimizer_update_skipped: Optional[bool] = None + self.hysteresis = hysteresis + self._hysteresis_tracker = self.hysteresis + + def _unscale_grads_(self, optimizer, *args): + if getattr(optimizer, "_custom_amp_unscale_grads", False): + return optimizer.unscale_grads(*args) + else: + return super()._unscale_grads_(optimizer, *args) + + def _maybe_opt_step(self, optimizer, optimizer_state, *args, **kwargs): + from megatron.core import parallel_state + + retval = None + found_inf = torch.cuda.FloatTensor([sum(v.item() for v in optimizer_state["found_inf_per_device"].values())]) + + # Update across all model parallel instances. + torch.distributed.all_reduce( + found_inf, op=torch.distributed.ReduceOp.MAX, group=parallel_state.get_model_parallel_group(), + ) + + if found_inf.item() == 0: + retval = optimizer.step(*args, **kwargs) + self.optimizer_update_skipped = False + else: + self.optimizer_update_skipped = True + return retval + + def update(self, new_scale=None): + """ + Updates to native grad scaler update function. + 1. Check inf across model-parallel ranks. + 2. Update hysteresis tracker. + 3. Apply hysteresis to grad scale update. + """ + from megatron.core import parallel_state + + if not self._enabled: + return + + _scale, _growth_tracker = self._check_scale_growth_tracker("update") + + if new_scale is not None: + # Accept a new user-defined scale. + if isinstance(new_scale, float): + self._scale.fill_(new_scale) # type: ignore[union-attr] + else: + reason = ( + "new_scale should be a float or a 1-element torch.cuda.FloatTensor with" " requires_grad=False." 
+ ) + assert isinstance(new_scale, torch.cuda.FloatTensor), reason # type: ignore[attr-defined] + assert new_scale.numel() == 1, reason + assert new_scale.requires_grad is False, reason + self._scale.copy_(new_scale) # type: ignore[union-attr] + else: + # Consume shared inf/nan data collected from optimizers to update the scale. + # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous. + found_infs = [ + found_inf.to(device=_scale.device, non_blocking=True) + for state in self._per_optimizer_states.values() + for found_inf in state["found_inf_per_device"].values() + ] + + assert len(found_infs) > 0, "No inf checks were recorded prior to update." + + found_inf_combined = found_infs[0] + + # Update across all model parallel instances. + torch.distributed.all_reduce( + found_inf_combined, op=torch.distributed.ReduceOp.MAX, group=parallel_state.get_model_parallel_group(), + ) + + if len(found_infs) > 1: + for i in range(1, len(found_infs)): + found_inf = found_infs[i] + # Update across all model parallel instances. + torch.distributed.all_reduce( + found_inf, op=torch.distributed.ReduceOp.MAX, group=parallel_state.get_model_parallel_group(), + ) + found_inf_combined += found_inf + + if found_inf_combined > 0: + self._hysteresis_tracker -= 1 + if self._hysteresis_tracker <= 0: + # When hysteresis becomes zero, follow the native grad scale update rule. + # Increase scale and reset growth tracker + torch._amp_update_scale_( # noqa: SLF001 + _scale, + _growth_tracker, + found_inf_combined, + self._growth_factor, + self._backoff_factor, + self._growth_interval, + ) + else: + # Only reset the growth tracker when hysteresis is larger than zero + _growth_tracker.fill_(0.0) + else: + # When no inf found, follow the native grad scale update rule. + # Increment growth_tracker, update scale when growth tracker reaches the interval, and + # reset the hysteresis tracker. + torch._amp_update_scale_( # noqa: SLF001 + _scale, + _growth_tracker, + found_inf_combined, + self._growth_factor, + self._backoff_factor, + self._growth_interval, + ) + self._hysteresis_tracker = self.hysteresis + + # To prepare for next iteration, clear the data collected from optimizers this iteration. + self._per_optimizer_states = defaultdict( + torch.cuda.amp.grad_scaler._refresh_per_optimizer_state # noqa: SLF001 + ) + + def state_dict(self): + """ + Add hysteresis_tracker to the native functions' state_dict. + """ + return ( + { + "scale": self.get_scale(), + "growth_factor": self._growth_factor, + "backoff_factor": self._backoff_factor, + "growth_interval": self._growth_interval, + "_growth_tracker": self._get_growth_tracker(), + "_hysteresis_tracker": self._hysteresis_tracker, + } + if self._enabled + else {} + ) + + def load_state_dict(self, state_dict): + """ + Load hysteresis_tracker in addition to the state dict of the native function. + """ + if not self._enabled: + return + + if len(state_dict) == 0: + raise RuntimeError( + "The source state dict is empty, possibly because it was saved " + "from a disabled instance of GradScaler." 
+ ) + + self._init_scale = state_dict["scale"] + if self._scale is not None: + self._scale.fill_(state_dict["scale"]) + self._growth_factor = state_dict["growth_factor"] + self._backoff_factor = state_dict["backoff_factor"] + self._growth_interval = state_dict["growth_interval"] + self._init_growth_tracker = state_dict["_growth_tracker"] + if self._growth_tracker is not None: + self._growth_tracker.fill_(state_dict["_growth_tracker"]) + if "_hysterisis_tracker" in state_dict: + self._hysteresis_tracker = state_dict["_hysterisis_tracker"] + else: + self._hysteresis_tracker = 1 + + +def enable_nvidia_optimizations() -> None: + """These optimizations are present in NVIDIA NGC PyTorch Containers.""" + # NVIDIA container version check + nvidia_torch_version = os.getenv("NVIDIA_PYTORCH_VERSION", None) + if nvidia_torch_version is not None: + try: + NVIDIA_TORCH_MAJOR = int(nvidia_torch_version.split(".")[0]) + except Exception: + NVIDIA_TORCH_MAJOR = 0 + try: + NVIDIA_TORCH_MINOR = int(nvidia_torch_version.split(".")[1]) + except Exception: + NVIDIA_TORCH_MINOR = 0 + + # NVFUSER available starting with 21.11 + if NVIDIA_TORCH_MAJOR >= 21 or (NVIDIA_TORCH_MAJOR == 21 and NVIDIA_TORCH_MINOR >= 11): + # NVFUSER + torch._C._jit_set_profiling_executor(True) # noqa: SLF001 + torch._C._jit_set_profiling_mode(True) # noqa: SLF001 + torch._C._jit_override_can_fuse_on_cpu(False) # noqa: SLF001 + torch._C._jit_override_can_fuse_on_gpu(False) # noqa: SLF001 + torch._C._jit_set_texpr_fuser_enabled(False) # noqa: SLF001 + # torch._C._jit_set_nvfuser_enabled(True) + torch._C._debug_set_autodiff_subgraph_inlining(False) # noqa: SLF001 + else: + # Not a Nvidia container. NVFUSER Dependency check is on users + pass + + +def optimizer_sharded_state_dict(model: SharedStateDictProtocol, optimizer: "Optimizable") -> Dict[str, torch.Tensor]: + """ + Sharded state dictionary for an MainParamsOptimizerWrapper. + Used to save and load the optimizer state when training with distributed_checkpoint. + + Returns + ------- + dict: The sharded state dictionary for the optimizer + Raises: + ValueError: If a parameter ID does not match any model sharded parameter. + """ + from megatron.core.dist_checkpointing.optimizer import ( + get_param_id_to_sharded_param_map, + make_sharded_optimizer_tensor, + optim_state_to_sharding_state, + ) + + from nemo.core.optim import MainParamsOptimizerWrapper + from nemo.core.optim.optimizers import init_optimizer_states + + model_sharded_state_dict = model.sharded_state_dict() + + # remove _extra_state + model_sharded_state_dict = { + key: value for key, value in model_sharded_state_dict.items() if not key.endswith("_extra_state") + } + + if hasattr(optimizer, "sharded_state_dict"): + return optimizer.sharded_state_dict(model_sharded_state_dict) + + if not isinstance(optimizer, MainParamsOptimizerWrapper): + # Regular optimizer, e.g. 
Adam or FusedAdam + init_optimizer_states(optimizer) + optimizer_state_dict = optimizer.state_dict() + id_to_sharded_param_map = get_param_id_to_sharded_param_map( + model_sharded_state_dict=model_sharded_state_dict, + optim_params_iter=itertools.chain.from_iterable(g['params'] for g in optimizer.param_groups), + ) + optim_state_to_sharding_state(optimizer_state_dict, id_to_sharded_param_map) + return optimizer_state_dict + + optimizer_state_dict: Dict[str, Any] = optimizer.state_dict() + + id_to_sharded_param_map = get_param_id_to_sharded_param_map( + model_sharded_state_dict=model_sharded_state_dict, + optim_params_iter=itertools.chain.from_iterable(g for g in optimizer.float16_groups), + ) + + # Convert fp32_from_fp16_params + assert len(optimizer_state_dict["fp32_from_fp16_params"]) == len(optimizer_state_dict["optimizer"]["param_groups"]) + + def get_safe(param_id): + try: + return id_to_sharded_param_map[param_id] + except KeyError as e: + raise ValueError(f"Param id {param_id} does not match any model sharded param") from e + + optimizer_state_dict["fp32_from_fp16_params"] = [ + [ + make_sharded_optimizer_tensor(get_safe(param_id), fp32_param, prefix="optimizer.state.fp32_param") + for param_id, fp32_param in zip(state_group["params"], fp32_group) + ] + for fp32_group, state_group in zip( + optimizer_state_dict["fp32_from_fp16_params"], optimizer_state_dict["optimizer"]["param_groups"], + ) + ] + + # Convert state + optim_state_to_sharding_state(optimizer_state_dict["optimizer"], id_to_sharded_param_map) + + return optimizer_state_dict diff --git a/nemo/lightning/pytorch/__init__.py b/nemo/lightning/pytorch/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/lightning/pytorch/callbacks/__init__.py b/nemo/lightning/pytorch/callbacks/__init__.py new file mode 100644 index 000000000000..fcceedeb7090 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/__init__.py @@ -0,0 +1,3 @@ +from nemo_ext.lightning.pytorch.callbacks.progress import MegatronProgressBar + +__all__ = ["MegatronProgressBar"] diff --git a/nemo/lightning/pytorch/callbacks/progress.py b/nemo/lightning/pytorch/callbacks/progress.py new file mode 100644 index 000000000000..9d4d9b385da8 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/progress.py @@ -0,0 +1,67 @@ +from pytorch_lightning.callbacks.progress import TQDMProgressBar +from pytorch_lightning.callbacks.progress.tqdm_progress import _update_n + + +class MegatronProgressBar(TQDMProgressBar): + """ + Add MegatronProgressBar to remove 's/it' and display progress per step instead of per microbatch + for megatron models. + """ + + def get_current_epoch_step(self, trainer) -> int: + """ + Get the value of step within an epoch. + """ + return max( + trainer.fit_loop.epoch_loop.automatic_optimization.optim_progress.optimizer.step.current.completed, + trainer.fit_loop.epoch_loop.manual_optimization.optim_step_progress.current.completed, + ) + + def init_train_tqdm(self): + """ + Override bar_format to not have 's/it'. 
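        With this format the bar renders roughly as
        ``Epoch 0:  40%|████      | 40/100 [00:10<00:15]`` instead of ending in a
        rate such as ``, 2.50it/s]``.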
+ """ + self.bar = super().init_train_tqdm() + self.bar.bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}{postfix}]" + return self.bar + + def on_train_epoch_start(self, trainer, *_): + if trainer.max_steps > 0 and (trainer.ckpt_path is not None): + # while resuming from a ckpt use trainer.max_steps as the total for progress bar as trainer.num_training_batches + # is truncated to max_steps - step being resumed at + num_training_batches = trainer.max_steps + else: + num_training_batches = trainer.num_training_batches + + # from nemo.utils import AppState + # app_state = AppState() + # app_state. + + num_training_batches = num_training_batches // calculate_data_parallel_groups() + + self.train_progress_bar.reset(num_training_batches) + self.train_progress_bar.initial = 0 + self.train_progress_bar.set_description(f"Epoch {trainer.current_epoch}") + + def on_train_batch_end(self, trainer, pl_module, *_, **__): + """ + Override parent class on_train_batch_end to update progress bar per global batch instead of per microbatch. + """ + n = self.get_current_epoch_step(trainer) + if self._should_update(n, self.train_progress_bar.total): + _update_n(self.train_progress_bar, n) + self.train_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) + + +def calculate_data_parallel_groups() -> int: + from nemo.utils import AppState + + app_state = AppState() + + pipeline_model_parallel_size = app_state.pipeline_model_parallel_size + tensor_model_parallel_size = app_state.tensor_model_parallel_size + + world_size = app_state.world_size + data_parallel_group_len = world_size // (pipeline_model_parallel_size * tensor_model_parallel_size) + + return world_size // data_parallel_group_len diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py new file mode 100644 index 000000000000..0fa386cb45ef --- /dev/null +++ b/nemo/lightning/pytorch/strategies.py @@ -0,0 +1,502 @@ +import functools +import logging +import shutil +from collections import OrderedDict +from contextlib import ExitStack +from pathlib import Path +from typing import Any, ContextManager, Dict, List, Mapping, Optional, TypeVar, Union, cast + +import lightning.pytorch as pl +import torch +import torch.distributed +from lightning.fabric.plugins import CheckpointIO, ClusterEnvironment +from lightning.fabric.utilities.optimizer import _optimizers_to_device +from lightning.pytorch.accelerators import CPUAccelerator +from lightning.pytorch.callbacks.progress import TQDMProgressBar +from lightning.pytorch.loops import _AutomaticOptimization, evaluation_loop, fit_loop, prediction_loop +from lightning.pytorch.loops.fetchers import _DataLoaderIterDataFetcher +from lightning.pytorch.overrides.distributed import _sync_module_states +from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO +from lightning.pytorch.strategies.ddp import DDPStrategy +from lightning.pytorch.trainer.states import RunningStage, TrainerFn +from lightning.pytorch.utilities.model_helpers import is_overridden +from lightning.pytorch.utilities.types import STEP_OUTPUT +from torch import nn +from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data import DataLoader +from typing_extensions import override + +from nemo.io.pl import MegatronCheckpointIO +from nemo.lightning import _strategy_lib +from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction 
+from nemo.lightning.pytorch.callbacks import MegatronProgressBar + +ConfigT = TypeVar("ConfigT") + + +class MegatronStrategy(DDPStrategy): + """Megatron plugin for Pytorch Lightning. + + Args: + no_ddp_communication_hook: Disable DDP communication hook when using AMP-O2 + with FP32 gradient accumulation. + """ + + trainer: pl.Trainer + + def __init__( + self, + tensor_model_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + virtual_pipeline_model_parallel_size: Optional[int] = None, + sequence_parallel: bool = False, + # data_sampler: Optional[DataSampler] = None, + parallel_devices: Optional[List[torch.device]] = None, + cluster_environment=None, # TODO: Add type-hint + checkpoint_io=None, # TODO: Add type-hint + no_ddp_communication_hook: bool = True, + find_unused_parameters: bool = False, + lazy_init: bool = False, + **kwargs, + ) -> None: + super().__init__( + parallel_devices, + cluster_environment, + checkpoint_io, + find_unused_parameters=find_unused_parameters, + **kwargs, + ) + self.no_ddp_communication_hook = no_ddp_communication_hook + self.megatron_callbacks = CallbackConnector() + # self.data_sampler: Optional[DataSampler] = data_sampler + self.tensor_model_parallel_size = tensor_model_parallel_size + self.pipeline_model_parallel_size = pipeline_model_parallel_size + self.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size + self.sequence_parallel = sequence_parallel + self.lazy_init = lazy_init + + # used in NVIDIA NGC PyTorch containers + _strategy_lib.enable_nvidia_optimizations() + + @override + def connect(self, model: pl.LightningModule) -> None: + super().connect(model) + + # Right now mcore sub-classes ModelParellelConfig, we should remove that + # Given Lightning's structure it would be better if parallelism is a different object + # Since then it can be passed to the Strategy + + from megatron.core.transformer.transformer_config import TransformerConfig + + has_mcore_config = isinstance(getattr(model, "config", None), TransformerConfig) + if has_mcore_config and is_overridden("configure_model", model): + config: TransformerConfig = model.config + config.tensor_model_parallel_size = self.tensor_model_parallel_size + config.pipeline_model_parallel_size = self.pipeline_model_parallel_size + config.virtual_pipeline_model_parallel_size = self.virtual_pipeline_model_parallel_size + config.sequence_parallel = self.sequence_parallel + self._mcore_config = config + + @override + def setup(self, trainer: pl.Trainer) -> None: + assert self.accelerator is not None + self.accelerator.setup(trainer) + self.trainer = trainer + + # move the model to the correct device + # self.model_to_device() + + # skip wrapping the model if we are not fitting as no gradients need to be exchanged + trainer_fn = trainer.state.fn + + if trainer_fn == TrainerFn.FITTING and self._layer_sync: + assert self.model is not None + self.model = self._layer_sync.apply(self.model) + + datamodule = getattr(trainer, "datamodule", None) + if not self.data_sampler and hasattr(datamodule, "data_sampler"): + self.data_sampler = datamodule.data_sampler + self.data_sampler.setup(self.cluster_environment.global_rank()) + + if self.data_sampler: + self.data_sampler.connect(trainer) + + self._fix_progress_bar(trainer) + self.setup_megatron_parallel(trainer) + self.setup_precision_plugin() + + if trainer.num_sanity_val_steps > 1 and self.pipeline_model_parallel_size > 1: + # TODO: log here + trainer.num_sanity_val_steps = 0 + + for loop in [fit_loop, evaluation_loop, 
prediction_loop]: + loop._select_data_fetcher = _data_fetcher_wrapper(loop._select_data_fetcher) # noqa: SLF001 + + if trainer_fn == TrainerFn.FITTING: + # TODO: Make sure we don't always wrap the model in data-parallel + # See: https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/parts/nlp_overrides.py#L215-L217 + + # do not wrap with DDP if not fitting as there's no gradients to reduce + self.configure_ddp() + + trainer.fit_loop.epoch_loop.automatic_optimization = _MegatronAutomaticOptimization(trainer) + + # set up optimizers after the wrapped module has been moved to the device + self.setup_optimizers(trainer) + if hasattr(self.precision_plugin, "convert_optimizer"): + _optimizers = [*self.optimizers] + _optimizers[0] = self.precision_plugin.convert_optimizer(self.optimizers[0]) + self.optimizers = _optimizers + + _optimizers_to_device(self.optimizers, self.root_device) + + import torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook as post_localSGD + + if isinstance(self._ddp_comm_state, post_localSGD.PostLocalSGDState): + self._enable_model_averaging() + else: + # we need to manually synchronize the module's states since we aren't using the DDP wrapper + assert self.model is not None + _sync_module_states(self.model) + + @override + def setup_distributed(self) -> None: + self._setup_parallel_ranks() + super().setup_distributed() + + from megatron.core import parallel_state + from nemo.utils import AppState + + # init model parallel if needed + if not parallel_state.model_parallel_is_initialized(): + app_state = AppState() + + if app_state.model_parallel_size is not None: + _strategy_lib.init_model_parallel(self.model) + + if self.data_sampler: + assert isinstance(self.cluster_environment, ClusterEnvironment), "Cluster environment not initialized" + self.data_sampler.setup(self.cluster_environment.global_rank()) + + @override + def process_dataloader(self, dataloader: DataLoader) -> DataLoader: + if self.data_sampler: + return self.data_sampler.transform_dataloader(dataloader) + + return dataloader + + def setup_megatron_parallel(self, trainer: pl.Trainer) -> None: + assert self.model is not None, "Model is not set" + + self.megatron_parallel = MegatronParallel( + self.model, + precision_plugin=self.precision_plugin, + vp_size=self.virtual_pipeline_model_parallel_size, + cpu=isinstance(trainer.accelerator, CPUAccelerator), + ) + self.model = self.megatron_parallel + self.model.trainer = trainer + + if hasattr(self.precision_plugin, "convert_module"): + self.model = self.precision_plugin.convert_module(self.model) + self.model.callbacks.add(getattr(trainer, "callbacks")) + + if self.data_sampler: + self.model.callbacks.add(self.data_sampler) + + datamodule = getattr(trainer, "datamodule", None) + if datamodule: + self.model.callbacks.add(datamodule) + + @override + def configure_ddp(self) -> None: + logging.debug(f"{self.__class__.__name__}: configuring MegatronParallel") + self.model = self._setup_model(self.model) + self._register_ddp_hooks() + + @override + def _setup_model(self, model: nn.Module) -> DistributedDataParallel: + """Only called when we need to wrap the model for pytorch's ddp.""" + from megatron.core import parallel_state + from nemo.utils import AppState + + app_state = AppState() + if app_state.model_parallel_size is not None: + self._ddp_kwargs["process_group"] = parallel_state.get_data_parallel_group() + + dist_data_parallel: DistributedDataParallel = super()._setup_model(model) + if self.no_ddp_communication_hook: + # When using custom gradient 
accumulation and allreduce, disable + # DDP communication hook that works on the gradient bucket. + # Instead, use the custom gradient function and communication hook, + # which is defined in the master optimizer wrapper. + dist_data_parallel.require_backward_grad_sync = False + dist_data_parallel.register_comm_hook(None, noop_hook) + + return dist_data_parallel + + def _setup_parallel_ranks(self) -> None: + self.set_world_ranks() + env = cast(ClusterEnvironment, self.cluster_environment) + + _strategy_lib.init_parallel_ranks(env.world_size(), env.global_rank(), env.local_rank(), self.parallelism) + + @override + def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + assert self.lightning_module is not None + assert self.model is not None + kwargs = self._update_step_kwargs(dataloader_iter, kwargs, "training") + + with self.precision_plugin.train_step_context(): # TODO: Do we need this? + return self.model(dataloader_iter, *args, **kwargs) + + @override + def validation_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + assert self.lightning_module is not None + assert self.model is not None + kwargs = self._update_step_kwargs(dataloader_iter, kwargs, "validation") + + with self.precision_plugin.val_step_context(): # TODO: Do we need this? + return self.model(dataloader_iter, *args, **kwargs) + + @override + def test_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + assert self.lightning_module is not None + assert self.model is not None + kwargs = self._update_step_kwargs(dataloader_iter, kwargs, "test") + + with self.precision_plugin.test_step_context(): # TODO: Do we need this? + return self.model(dataloader_iter, *args, **kwargs) + + @override + def predict_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + assert self.lightning_module is not None + assert self.model is not None + kwargs = self._update_step_kwargs(dataloader_iter, kwargs, "predict") + + with self.precision_plugin.predict_step_context(): # TODO: Do we need this? 
+ return self.model(dataloader_iter, *args, **kwargs) + + @override + def teardown(self) -> None: + super().teardown() + + @override + def model_sharded_context(self) -> ContextManager: + if self.lazy_init and hasattr(self, "_mcore_config"): + stack = ExitStack() + stack.enter_context(_strategy_lib.megatron_lazy_init_context(self._mcore_config)) + return stack + + return super().model_sharded_context() + + def _update_step_kwargs(self, dataloader_iter, kwargs, step_name: str): + if "data_step" not in kwargs: + kwargs["data_step"] = self._get_data_step(step_name) + if "forward_step" not in kwargs: + kwargs["forward_step"] = self._get_forward_step(step_name) + if "loss_reduction" not in kwargs: + kwargs["loss_reduction"] = self._get_loss_reduction(step_name) + kwargs.update(self._data_config_kwargs(dataloader_iter)) + + return kwargs + + def _fix_progress_bar(self, trainer: pl.Trainer) -> None: + callbacks: List[pl.Callback] = cast(List[pl.Callback], getattr(trainer, "callbacks")) + contains_megatron_progress, contains_progress = False, False + for callback in callbacks: + if isinstance(callback, MegatronProgressBar): + contains_megatron_progress = True + if callback.__class__ == TQDMProgressBar: + contains_progress = True + if not contains_megatron_progress and contains_progress: + for callback in callbacks: + if isinstance(callback, TQDMProgressBar): + callback.__class__ = MegatronProgressBar + break + + def optimizer_sharded_state_dict(self): + """ + Sharded state dictionary for an MainParamsOptimizerWrapper. + Used to save and load the optimizer state when training with distributed_checkpoint. + + Returns + ------- + dict: The sharded state dictionary for the optimizer + Raises: + ValueError: If a parameter ID does not match any model sharded parameter. + """ + # TODO: Fix when MainParamsOptimizerWrapper is not used + + optimizer = self.lightning_module.optimizers(use_pl_optimizer=False) + + return _strategy_lib.optimizer_sharded_state_dict(self.megatron_parallel, optimizer) + + @override + def save_checkpoint( + self, checkpoint: Dict[str, Any], filepath: Union[str, Path], storage_options: Optional[Any] = None + ) -> None: + checkpoint['state_dict'] = OrderedDict([]) # remove device state_dict + checkpoint['sharded_state_dict'] = self.megatron_parallel.sharded_state_dict() + if self.trainer.state.fn == TrainerFn.FITTING: + checkpoint['optimizer_states'] = [self.optimizer_sharded_state_dict()] + + self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) + + @override + def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: + """PTL method which we override to integrate distributed checkpoints for model parallel models. + In order to load distributed checkpoints we need to provide the sharded_state_dict to + the distributed load function. We get the sharded_state_dict from self.lightning_module + which makes it convenient to have the loading logic happen at the strategy level. 
+ """ + torch.cuda.empty_cache() + + # After dist_checkpointing.load, sharded tensors will be replaced with tensors + sharded_state_dict = {} + sharded_state_dict["state_dict"] = self.megatron_parallel.sharded_state_dict() + + # if self.trainer.state.fn == TrainerFn.FITTING: + # if self.lightning_module.optimizers(use_pl_optimizer=False): + # sharded_state_dict["optimizer_states"] = [self.optimizer_sharded_state_dict()] + + checkpoint = self.checkpoint_io.load_checkpoint(checkpoint_path, sharded_state_dict=sharded_state_dict) + + return checkpoint + + def remove_checkpoint(self, filepath: Union[str, Path]) -> None: + if self.is_global_zero: + shutil.rmtree(ckpt_to_dir(filepath)) + + def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = True) -> None: + assert self.megatron_parallel is not None + from megatron.core import mpu + + for index, module in enumerate(self.megatron_parallel): + if mpu.get_virtual_pipeline_model_parallel_world_size() is not None: + checkpoint_state_dict = checkpoint['state_dict'][f'model_{index}'] + else: + checkpoint_state_dict = checkpoint['state_dict'] + # checkpoint_state_dict has "model." but module does not so we need to remove it when loading + checkpoint_state_dict = { + key.replace('model.', ''): checkpoint_state_dict.pop(key) for key in list(checkpoint_state_dict.keys()) + } + module.load_state_dict(checkpoint_state_dict, strict=strict) + + @property + @override + def checkpoint_io(self) -> CheckpointIO: + if self._checkpoint_io is None: + self._checkpoint_io = MegatronCheckpointIO() + elif isinstance(self._checkpoint_io, _WrappingCheckpointIO): + self._checkpoint_io.checkpoint_io = MegatronCheckpointIO() + + return self._checkpoint_io + + def _get_data_step(self, step_type: str) -> Optional[_ModuleStepFunction]: + for fn_name in [f"{step_type}_data_step", "data_step"]: + if hasattr(self.lightning_module, fn_name): + return _ModuleStepFunction(fn_name) + + return None + + def _get_forward_step(self, step_type: str) -> Optional[_ModuleStepFunction]: + from megatron.core import mpu + + if mpu.is_pipeline_last_stage(): + if not hasattr(self.lightning_module, f"{step_type}_step"): + raise ValueError(f"LightningModule does not have {step_type}_step method") + + return _ModuleStepFunction(f"{step_type}_step", includes_self=True) + + for fn_name in [f"{step_type}_forward_step", "forward_step"]: + if hasattr(self.lightning_module, fn_name): + return _ModuleStepFunction(fn_name, includes_self=True) + + return None + + def _get_loss_reduction(self, step_type: str) -> Optional[_ModuleStepFunction]: + for fn_name in [f"{step_type}_loss_reduction", "loss_reduction"]: + if hasattr(self.lightning_module, fn_name): + return _ModuleStepFunction(fn_name, is_property=True) + + return None + + def _data_config_kwargs(self, dataloader_iter) -> Dict[str, Any]: + if not hasattr(dataloader_iter, "data_config") and self.data_sampler: + if hasattr(self.data_sampler, "megatron_data_kwargs"): + return self.data_sampler.megatron_data_kwargs + + return {} + + @property + def distributed_sampler_kwargs(self) -> Dict[str, Any]: + from nemo.utils import AppState + + app_state = AppState() + if app_state.model_parallel_size is not None: + # When using model parallel, data parallel groups are non-trivial and they + # correspond to the logical GPUs. This means that the GPUs that form a + # single logical GPU all need to get the same batch of data. 
+ distributed_sampler_kwargs = dict( + num_replicas=app_state.data_parallel_size, rank=app_state.data_parallel_rank + ) + return distributed_sampler_kwargs + + else: + return super().distributed_sampler_kwargs + + @property + def restore_checkpoint_after_setup(self) -> bool: + """Needs to be True for distributed checkpointing because + we require the model to have configured the optimizer before + deserializing the checkpoint. + """ + return True + + @property + def parallelism(self): + from megatron.core.model_parallel_config import ModelParallelConfig + + return ModelParallelConfig( + tensor_model_parallel_size=self.tensor_model_parallel_size, + pipeline_model_parallel_size=self.pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size=self.virtual_pipeline_model_parallel_size, + sequence_parallel=self.sequence_parallel, + ) + + +def ckpt_to_dir(filepath: Union[str, Path]) -> Path: + """PTL considers checkpoints as .ckpt files. + This method removes the extension and returns a path + to be used as a directory for distributed checkpoints. + """ + filepath = Path(filepath) + + if filepath.suffix == ".ckpt": + return filepath.with_name(filepath.stem) + + return filepath + + +def _data_fetcher_wrapper(fn): + @functools.wraps(fn) + def wrapped(trainer: pl.Trainer, stage: RunningStage): + if isinstance(trainer.strategy, MegatronStrategy): + return _DataLoaderIterDataFetcher() + + return fn(trainer, stage) + + return wrapped + + +class _MegatronAutomaticOptimization(_AutomaticOptimization): + """ + Custom loop for automatic optimization, tailored to work with a specific training_step + implementation that involves custom data preparation, forward pass, and loss reduction steps. + """ + + def __init__(self, trainer: "pl.Trainer") -> None: + super().__init__(trainer) + self._skip_backward = True # megatron will do the backward pass diff --git a/tests/lightning/test_megatron_parallel.py b/tests/lightning/test_megatron_parallel.py index cac568747331..06e614d48251 100644 --- a/tests/lightning/test_megatron_parallel.py +++ b/tests/lightning/test_megatron_parallel.py @@ -52,8 +52,8 @@ def mock_loss_reduction(self, mocker): def test_init_with_defaults(self, mocker, mock_pipeline): """Test __init__ with default parameters.""" - mocker.patch('megatron.core.mpu.get_pipeline_model_parallel_world_size', return_value=1) - mocker.patch('megatron.core.mpu.model_parallel_is_initialized', return_value=False) + mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_world_size', return_value=1) + mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=False) megatron_parallel = mp.MegatronParallel(pipeline=mock_pipeline) @@ -76,8 +76,8 @@ def test_init_with_defaults(self, mocker, mock_pipeline): # mock_loss_reduction # ): # """Test __init__ with custom parameters.""" - # mocker.patch('megatron.core.mpu.get_pipeline_model_parallel_world_size', return_value=1) - # mocker.patch('megatron.core.mpu.model_parallel_is_initialized', return_value=False) + # mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_world_size', return_value=1) + # mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=False) # # megatron_parallel = mp.MegatronParallel( # pipeline=mock_pipeline, @@ -99,20 +99,20 @@ def test_init_with_defaults(self, mocker, mock_pipeline): # def test_init_with_virtual_pipeline(self, mocker, mock_pipeline): # """Test __init__ with virtual pipeline model parallel world size.""" # 
mocker.patch('torch.distributed.get_rank', return_value=1) - # mocker.patch('megatron.core.mpu.get_tensor_model_parallel_group', return_value=1) - # mocker.patch('megatron.core.mpu.get_pipeline_model_parallel_group', return_value=1) - # mocker.patch('megatron.core.mpu.get_pipeline_model_parallel_world_size', return_value=2) - # mocker.patch('megatron.core.mpu.model_parallel_is_initialized', return_value=True) - # mocker.patch('megatron.core.mpu.set_virtual_pipeline_model_parallel_world_size') - # mocker.patch('megatron.core.mpu.set_virtual_pipeline_model_parallel_rank') + # mocker.patch('megatron.core.parallel_state.get_tensor_model_parallel_group', return_value=1) + # mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_group', return_value=1) + # mocker.patch('megatron.core.parallel_state.get_pipeline_model_parallel_world_size', return_value=2) + # mocker.patch('megatron.core.parallel_state.model_parallel_is_initialized', return_value=True) + # mocker.patch('megatron.core.parallel_state.set_virtual_pipeline_model_parallel_world_size') + # mocker.patch('megatron.core.parallel_state.set_virtual_pipeline_model_parallel_rank') # mocker.patch('nemo_ext.lightning._strategy_lib.init_lightning_module', return_value=mock_pipeline) # megatron_parallel = mp.MegatronParallel(mock_pipeline, vp_size=2) # assert len(megatron_parallel.pipeline) == 2 # assert all(isinstance(mod, nn.Module) for mod in megatron_parallel.pipeline) - # megatron.core.mpu.set_virtual_pipeline_model_parallel_world_size.assert_called_once_with(2) - # assert megatron.core.mpu.set_virtual_pipeline_model_parallel_rank.call_count == 1 + # megatron.core.parallel_state.set_virtual_pipeline_model_parallel_world_size.assert_called_once_with(2) + # assert megatron.core.parallel_state.set_virtual_pipeline_model_parallel_rank.call_count == 1 class TestCallbackConnector: diff --git a/tests/lightning/test_strategy_lib.py b/tests/lightning/test_strategy_lib.py new file mode 100644 index 000000000000..96f5f2920bcf --- /dev/null +++ b/tests/lightning/test_strategy_lib.py @@ -0,0 +1,211 @@ +from unittest.mock import ANY, MagicMock, patch + +from torch import nn + +from nemo.lightning import _strategy_lib # , DataConfig + + +class Identity(nn.Identity): + def __init__(self): + super().__init__() + + +class WithCopy(nn.Identity): + def copy(self): + return WithCopy() + + +@patch('nemo.collections.nlp.modules.common.megatron.megatron_init.initialize_model_parallel_for_nemo') +def test_init_parallel_ranks(mock_initialize_model_parallel) -> None: + from nemo.utils import AppState + + app_state = AppState() + + app_state.tensor_model_parallel_size = 2 + app_state.pipeline_model_parallel_size = 3 + app_state.global_rank = 1 + app_state.local_rank = 0 + + mock_parallel_config = MagicMock() + mock_parallel_config.tensor_model_parallel_size = 2 + mock_parallel_config.pipeline_model_parallel_size = 3 + mock_parallel_config.virtual_pipeline_model_parallel_size = 4 + mock_parallel_config.ub_tp_comm_overlap = False + mock_parallel_config.pipeline_model_parallel_split_rank = None + + _strategy_lib.init_parallel_ranks( + world_size=2, global_rank=1, local_rank=0, parallel_config=mock_parallel_config, seed=1234, fp8=False, + ) + mock_initialize_model_parallel.assert_called_once_with( + world_size=2, + global_rank=1, + local_rank=0, + tensor_model_parallel_size=2, + pipeline_model_parallel_size=3, + virtual_pipeline_model_parallel_size=4, + seed=1234, + pipeline_model_parallel_split_rank=None, + use_fp8=False, + init_mpi_proc_group=False, + ) + 
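# Hedged end-to-end sketch (not part of the tests): how the MegatronStrategy added in
# this patch is expected to be wired into a Lightning Trainer. `model` and `datamodule`
# are assumed to exist; parallel sizes are illustrative.
#
#     import lightning.pytorch as pl
#     from nemo.lightning.pytorch.strategies import MegatronStrategy
#
#     strategy = MegatronStrategy(tensor_model_parallel_size=2, pipeline_model_parallel_size=2)
#     trainer = pl.Trainer(devices=8, accelerator="gpu", strategy=strategy)
#     trainer.fit(model, datamodule=datamodule)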
+ +@patch('torch.distributed.is_initialized', return_value=True) +@patch('megatron.core.parallel_state') +def test_init_model_parallel(mock_mpu, *args): + from nemo.utils import AppState + + app_state = AppState() + app_state.model_parallel_size = 1 + app_state.tensor_model_parallel_size = 2 + app_state.pipeline_model_parallel_size = 1 + app_state.pipeline_model_parallel_split_rank = None + app_state.init_mpi_proc_group = False + app_state.tensor_model_parallel_rank = 2 + app_state.pipeline_model_parallel_rank = 0 + + _mpu_tp_2(mock_mpu) + _strategy_lib.init_model_parallel(nn.Identity()) + + mock_mpu.initialize_model_parallel.assert_called_once_with( + tensor_model_parallel_size=2, + pipeline_model_parallel_size=1, + virtual_pipeline_model_parallel_size=None, + pipeline_model_parallel_split_rank=None, + ) + + +# TODO @chcui uncomment after DataConfig is merged +# @patch('nemo.lightning._strategy_lib.DataLoader', return_value=MagicMock()) +# @patch('megatron.core.parallel_state') +# def test_process_dataloader(mock_mpu, mock_dataloader) -> None: +# mock_dataloader_instance = MagicMock() +# mock_dataloader_instance.dataset = [1, 2, 3] +# mock_dataloader_instance.num_workers = 4 +# mock_dataloader_instance.pin_memory = True +# mock_dataloader_instance.persistent_workers = False +# +# data_config = DataConfig(256) +# data_config.micro_batch_size = 2 +# data_config.global_batch_size = 6 +# data_config.rampup_batch_size = 3 +# +# mock_mpu.get_data_parallel_rank.return_value = 0 +# mock_mpu.get_data_parallel_world_size.return_value = 1 +# +# out = _strategy_lib.process_dataloader(mock_dataloader_instance, data_config) +# assert isinstance(out.batch_sampler, MagicMock) +# mock_dataloader.assert_called_once_with( +# mock_dataloader_instance.dataset, +# batch_sampler=ANY, +# num_workers=4, +# pin_memory=True, +# persistent_workers=False, +# collate_fn=ANY +# ) + + +# @patch('nemo.lightning._strategy_lib.init_parallel_ranks') +# @patch('megatron.core.parallel_state') +# def test_setup_megatron_parallel_with_trainer(mock_mpu, mock_init_parallel_ranks) -> None: +# _mpu_tp_2(mock_mpu) +# mock_trainer = MagicMock(spec=pl.Trainer) +# mock_trainer.strategy = MegatronStrategy( +# ModelParallelConfig(tensor_model_parallel_size=2), +# DataConfig(256), +# ) +# mock_trainer.world_size = 2 +# mock_trainer.local_rank = 0 +# mock_trainer.global_rank = 1 + +# result = _strategy_lib.setup_megatron_parallel(mock_trainer, nn.Identity()) +# mock_init_parallel_ranks.assert_called_once() +# assert isinstance(result, LightningMegatronParallel) +# assert len(result) == 1 + +# # Test with function +# assert len(_strategy_lib.setup_megatron_parallel(mock_trainer, lambda: nn.Identity())) == 1 + + +# @patch('nemo.lightning._strategy_lib.init_parallel_ranks') +# @patch('megatron.core.parallel_state') +# def test_setup_megatron_parallel_virtual_pipelining(mock_mpu, mock_init_parallel_ranks) -> None: +# vp_size = 4 +# _mpu_tp_2(mock_mpu) +# mock_mpu.get_pipeline_model_parallel_world_size.return_value = 4 +# mock_trainer = MagicMock(spec=pl.Trainer) +# mock_trainer.strategy = MegatronStrategy( +# ModelParallelConfig( +# virtual_pipeline_model_parallel_size=vp_size, +# tensor_model_parallel_size=2, +# ), +# DataConfig(256), +# ) +# mock_trainer.world_size = 8 +# mock_trainer.local_rank = 0 +# mock_trainer.global_rank = 1 + +# result = _strategy_lib.setup_megatron_parallel(mock_trainer, Identity()) +# mock_init_parallel_ranks.assert_called_once() +# assert len(result) == vp_size + +# # Test with function +# assert 
len(_strategy_lib.setup_megatron_parallel(mock_trainer, lambda: nn.Identity())) == vp_size + +# # Test with a module with a copy method +# assert len(_strategy_lib.setup_megatron_parallel(mock_trainer, WithCopy())) == vp_size + +# with pytest.raises( +# ValueError, +# match="Model does not have a copy method. Please implement this or " + +# "pass in a function that returns the model" +# ): +# _strategy_lib.setup_megatron_parallel(mock_trainer, nn.Identity()) + + +# @patch('nemo.lightning._strategy_lib.init_parallel_ranks') +# @patch('megatron.core.parallel_state') +# def test_setup_megatron_parallel_with_fabric(mock_mpu, mock_init_parallel_ranks) -> None: +# _mpu_tp_2(mock_mpu) +# mock_trainer = MagicMock(spec=fl.Fabric) +# mock_trainer.strategy = FabricMegatronStrategy( +# ModelParallelConfig(tensor_model_parallel_size=2), +# DataConfig(256), +# ) +# mock_trainer.world_size = 2 +# mock_trainer.local_rank = 0 +# mock_trainer.global_rank = 1 + +# result = _strategy_lib.setup_megatron_parallel(mock_trainer, nn.Identity()) + +# mock_init_parallel_ranks.assert_called_once() +# assert isinstance(result, MegatronParallel) +# assert len(result) == 1 + + +# @patch('nemo.lightning._strategy_lib.init_parallel_ranks') +# @patch('megatron.core.parallel_state') +# def test_setup_megatron_parallel_with_strategy(mock_mpu, mock_init_parallel_ranks) -> None: +# _mpu_tp_2(mock_mpu) +# mock_trainer = MagicMock(spec=FabricMegatronStrategy) +# mock_trainer.configure_mock( +# parallelism=ModelParallelConfig(tensor_model_parallel_size=2), +# data_config=DataConfig(256), +# world_size=2, +# local_rank=0, +# global_rank=1 +# ) + +# result = _strategy_lib.setup_megatron_parallel(mock_trainer, nn.Identity()) + +# mock_init_parallel_ranks.assert_called_once() +# assert isinstance(result, MegatronParallel) +# assert len(result) == 1 + + +def _mpu_tp_2(mock_mpu) -> None: + mock_mpu.get_tensor_model_parallel_rank.return_value = 2 + mock_mpu.get_pipeline_model_parallel_rank.return_value = 0 + mock_mpu.get_pipeline_model_parallel_world_size.return_value = 1 + mock_mpu.get_pipeline_model_parallel_group.return_value = 0 + mock_mpu.get_tensor_model_parallel_group.return_value = 1 From 571a425a73eedd3cca71ccd7b7c46f8e06f11c80 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 23 Apr 2024 09:41:37 +0200 Subject: [PATCH 04/30] Quantized checkpoint support in export and deploy modules (#8859) * Resolve engine build command for int8_sq quantization Signed-off-by: Jan Lasek * Fix links and typos Signed-off-by: Jan Lasek * Add quantization docs to ToC Signed-off-by: Jan Lasek * Opt for using torchrun Signed-off-by: Jan Lasek * Enable exporting and running quantized qnemo checkpoints Signed-off-by: Jan Lasek * Report evaluation time and shorten passing results around Signed-off-by: Jan Lasek * Fix undefined model_info Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Unfold import path Signed-off-by: Jan Lasek * Enable HF tokenizer Signed-off-by: Jan Lasek * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add copyright headers Signed-off-by: Jan Lasek * Update AMMO to 0.9.4 Signed-off-by: Jan Lasek * Unpack qnemo checkpoint if it's a tarball Signed-off-by: Jan Lasek * Format results display Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- Dockerfile | 2 +- Jenkinsfile 
| 2 +- docs/source/index.rst | 1 + docs/source/nlp/quantization.rst | 46 +++++--- .../conf/megatron_llama_quantization.yaml | 3 +- nemo/export/quantize/quantizer.py | 44 +++++-- nemo/export/tarutils.py | 5 + nemo/export/tensorrt_llm.py | 76 +++++++----- nemo/export/trt_llm/qnemo/__init__.py | 16 +++ nemo/export/trt_llm/qnemo/align_config.py | 46 ++++++++ .../trt_llm/qnemo/qnemo_to_tensorrt_llm.py | 109 ++++++++++++++++++ nemo/export/trt_llm/qnemo/tokenizer_utils.py | 48 ++++++++ nemo/export/trt_llm/tensorrt_llm_run.py | 9 +- tests/export/run.sh | 9 +- tests/export/test_nemo_export.py | 77 +++++-------- tests/infer_data_path.py | 109 ++++++++++++++++++ 16 files changed, 497 insertions(+), 105 deletions(-) create mode 100644 nemo/export/trt_llm/qnemo/__init__.py create mode 100644 nemo/export/trt_llm/qnemo/align_config.py create mode 100644 nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py create mode 100644 nemo/export/trt_llm/qnemo/tokenizer_utils.py diff --git a/Dockerfile b/Dockerfile index fa825d61f015..4c39b5bad235 100644 --- a/Dockerfile +++ b/Dockerfile @@ -133,7 +133,7 @@ RUN pip install flash-attn # install numba for latest containers RUN pip install numba>=0.57.1 # install ammo -RUN pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir +RUN pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir # copy nemo source into a scratch image FROM scratch as nemo-src diff --git a/Jenkinsfile b/Jenkinsfile index 72cf51456d62..cbc52d20c41c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -97,7 +97,7 @@ pipeline { stage('AMMO installation') { steps { - sh 'pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir' + sh 'pip install nvidia-ammo~=0.9.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir' } } diff --git a/docs/source/index.rst b/docs/source/index.rst index 5795b57682a1..8dc74ecc771d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -60,6 +60,7 @@ For more information, browse the developer docs for your area of interest in the nlp/models nlp/machine_translation/machine_translation nlp/megatron_onnx_export + nlp/quantization nlp/api diff --git a/docs/source/nlp/quantization.rst b/docs/source/nlp/quantization.rst index feb5881ed09d..afe2e9eccbca 100644 --- a/docs/source/nlp/quantization.rst +++ b/docs/source/nlp/quantization.rst @@ -3,10 +3,10 @@ Quantization ========================== -Post Training Quantization (PTQ) +Post-Training Quantization (PTQ) -------------------------------- -PTQ enables deploying a model in a low-precision format -- FP8, INT4 or INT8 -- for efficient serving. Different quantization methods are available including FP8 quantization, INT8 SmoothQuant and INT4 AWQ. +PTQ enables deploying a model in a low-precision format -- FP8, INT4, or INT8 -- for efficient serving. Different quantization methods are available including FP8 quantization, INT8 SmoothQuant, and INT4 AWQ. Model quantization has two primary benefits: reduced model memory requirements and increased inference throughput. @@ -14,24 +14,24 @@ In NeMo, quantization is enabled by the Nvidia AMMO library -- a unified algorit The quantization process consists of the following steps: -1. Loading a model checkpoint using appropriate parallelism strategy for evaluation +1. Loading a model checkpoint using an appropriate parallelism strategy 2. Calibrating the model to obtain appropriate algorithm-specific scaling factors -3. 
Producing output directory or .qnemo tarball with model config (json), quantized weights (safetensors) and tokenizer config (yaml). +3. Producing an output directory or .qnemo tarball with model config (json), quantized weights (safetensors) and tokenizer config (yaml). -Loading models requires using AMMO spec defined in `megatron.core.deploy.gpt.model_specs module `_. Typically the calibration step is lightweight and uses a small dataset to obtain appropriate statistics for scaling tensors. The output directory produced (or a .qnemo tarball) is ready to be used to build a serving engine with the Nvidia TensorRT-LLM library. The engine build step is also soon to be the part of NeMo project and ``nemo.deploy`` and ``nemo.export`` modules, see https://github.com/NVIDIA/NeMo/pull/8690. +Loading models requires using an AMMO spec defined in `megatron.core.inference.gpt.model_specs.py `_ module. Typically the calibration step is lightweight and uses a small dataset to obtain appropriate statistics for scaling tensors. The output directory produced (or a .qnemo tarball) is ready to be used to build a serving engine with the Nvidia TensorRT-LLM library. The engine build step is also available in NeMo project in ``nemo.deploy`` and ``nemo.export`` modules. Quantization algorithm can also be conveniently set to ``"null"`` to perform only the weights export step using default precision for TensorRT-LLM deployment. This is useful to obtain baseline performance and accuracy results for comparison. Example ^^^^^^^ -The example below shows how to quantize the Llama2 70b model into FP8 precision, using tensor parallelism of 8 on a single DGX H100 node. The quantized model is intended for serving using 2 GPUs specified with ``export.inference_tensor_parallel`` parameter. +The example below shows how to quantize the Llama2 70b model into FP8 precision, using tensor parallelism of 8 on a single DGX H100 node. The quantized model is designed for serving using 2 GPUs specified with the ``export.inference_tensor_parallel`` parameter. -The script should be launched correctly with the number of processes equal to tensor parallelism. This is achieved with the ``mpirun`` command below. +The script must be launched correctly with the number of processes equal to tensor parallelism. This is achieved with the ``torchrun`` command below: .. code-block:: bash - mpirun -n 8 python examples/nlp/language_modeling/megatron_llama_quantization.py \ + torchrun --nproc-per-node 8 examples/nlp/language_modeling/megatron_llama_quantization.py \ model_file=llama2-70b-base-bf16.nemo \ tensor_model_parallel_size=8 \ pipeline_model_parallel_size=1 \ @@ -57,24 +57,38 @@ The output directory stores the following files: └── tokenizer_config.yaml -The TensorRT-LLM engine can be build with ``trtllm-build`` command, see `TensorRT-LLM documentation `_. +The TensorRT-LLM engine can be conveniently built and run using ``TensorRTLLM`` class available in ``nemo.export`` submodule: + +.. code-block:: python + + from nemo.export import TensorRTLLM + + + trt_llm_exporter = TensorRTLLM(model_dir="/path/to/trt_llm_engine_folder") + trt_llm_exporter.export( + nemo_checkpoint_path="llama2-70b-base-fp8-qnemo", + model_type="llama", + ) + trt_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"]) + + +Alternatively, it can also be built directly using ``trtllm-build`` command, see `TensorRT-LLM documentation `_: .. 
code-block:: bash trtllm-build \ --checkpoint_dir llama2-70b-base-fp8-qnemo \ - --output_dir engine_dir \ + --output_dir /path/to/trt_llm_engine_folder \ --max_batch_size 8 \ --max_input_len 2048 \ - --max_output_len 512 - + --max_output_len 512 \ + --strongly_typed Known issues ^^^^^^^^^^^^ -* Currently in NeMo quantizing and building TensorRT-LLM engines is limited to single-node use cases. -* Supported and tested model family is Llama2. Quantizing other model types is experimental and may not be fully supported. -* For INT8 SmoothQuant ``quantization.algorithm=int8_sq``, the TensorRT-LLM engine cannot be build with CLI ``trtllm-build`` command -- Python API and ``tensorrt_llm.builder`` should be used instead. +* Currently in NeMo, quantizing and building TensorRT-LLM engines is limited to single-node use cases. +* The supported and tested model family is Llama2. Quantizing other model types is experimental and may not be fully supported. Please refer to the following papers for more details on quantization techniques. @@ -82,6 +96,8 @@ Please refer to the following papers for more details on quantization techniques References ---------- +`Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation, 2020 `_ + `FP8 Formats for Deep Learning, 2022 `_ `SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models, 2022 `_ diff --git a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml index ac10f7224090..79a5bfbd8fe6 100644 --- a/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml +++ b/examples/nlp/language_modeling/conf/megatron_llama_quantization.yaml @@ -25,12 +25,13 @@ quantization: algorithm: fp8 # int8_sq, fp8, int8, int4_awq, null calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset num_calib_size: 512 # number of samples used for calibration + awq_block_size: 128 # block size for scaling factors in AWQ algorithm export: decoder_type: llama # gptnext, gpt2, llama inference_tensor_parallel: 1 # Default using 1 TP for inference + inference_pipeline_parallel: 1 # Default using 1 PP for inference dtype: 16 # Default precision data type - export_tensorrt_llm_config: true # export config to build TRT-LLM engine directly model_file: llama2-7b-fp16.nemo # Nemo file path model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index a2376b123023..2663f8fe9bac 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -17,6 +17,7 @@ from contextlib import nullcontext from typing import List, Optional +import torch import torch.distributed as dist from megatron.core import parallel_state from megatron.core.transformer.module import Float16Module @@ -34,7 +35,7 @@ try: import ammo.torch.quantization as atq - from ammo.torch.export import export_model_config + from ammo.torch.export import export_tensorrt_llm_checkpoint HAVE_AMMO = True @@ -80,7 +81,7 @@ def __init__( trainer_config: DictConfig, ): if not HAVE_AMMO: - raise RuntimeError("nvidia-ammo>=0.7 is needed to use Quantizer") from HAVE_AMMO_ERROR + raise RuntimeError("nvidia-ammo is needed to use Quantizer") from HAVE_AMMO_ERROR QUANT_CFG_CHOICES = { "int8": atq.INT8_DEFAULT_CFG, "int8_sq": atq.INT8_SMOOTHQUANT_CFG, @@ -97,10 +98,21 @@ def __init__( self.trainer_config = trainer_config if quantization_config.algorithm is not 
None: atq_config = QUANT_CFG_CHOICES[quantization_config.algorithm] - if quantization_config.algorithm != "fp8": - # disable quantization for the last output layer - atq_config = copy.deepcopy(atq_config) - atq_config["quant_cfg"]["*.output_layer.*"] = {"enable": False} + + if "awq" in quantization_config.algorithm: + weight_quantizer = atq_config["quant_cfg"]["*weight_quantizer"] + if isinstance(weight_quantizer, list): + weight_quantizer = weight_quantizer[0] + weight_quantizer["block_sizes"][-1] = quantization_config.awq_block_size + + # Always turn on FP8 kv cache to save memory footprint. + # For int8_sq, we use int8 kv cache. + atq_config["quant_cfg"]["*output_quantizer"] = { + "num_bits": 8 if quantization_config.algorithm == "int8_sq" else (4, 3), + "axis": None, + "enable": export_config.decoder_type != "gptnext", + } + self.atq_config = atq_config else: self.atq_config = None @@ -188,6 +200,22 @@ def forward_loop(): model.predict_step(batch, i) model = atq.quantize(model, self.atq_config, forward_loop) + + if self.export_config == "gptnext": + # We found squared_relu may have an under-calibration problem. + # Clamp the scaling_factor with a min threshold to avoid under-calibration. + maxbound = 0 + if self.quantization_config.algorithm == "fp8": + maxbound = 448 + elif self.quantization_config.quantization.algorithm == "int8_sq": + maxbound = 127 + model = atq.postprocess_amax( + model, "*input_quantizer", lambda amax: torch.clamp(amax, min=0.01 * maxbound) + ) + + if dist.get_rank() == 0: + atq.print_quant_summary(model) + return model def export(self, model, model_save: str): @@ -206,13 +234,13 @@ def export(self, model, model_save: str): export_handler = nullcontext(enter_result=model_save) with export_handler as export_dir: - export_model_config( + export_tensorrt_llm_checkpoint( model=model, decoder_type=self.export_config.decoder_type, dtype=torch_dtype, export_dir=export_dir, inference_tensor_parallel=self.export_config.inference_tensor_parallel, - export_tensorrt_llm_config=self.export_config.export_tensorrt_llm_config, + inference_pipeline_parallel=self.export_config.inference_pipeline_parallel, ) dist.barrier() # Wait until all ranks complete export_model_config step if dist.get_rank() == 0: diff --git a/nemo/export/tarutils.py b/nemo/export/tarutils.py index 2f20a5e33698..b93f65274120 100644 --- a/nemo/export/tarutils.py +++ b/nemo/export/tarutils.py @@ -202,3 +202,8 @@ def __delitem__(self, key): def keys(self): return self._path.iterdir() + + +def unpack_tarball(archive: str, dest_dir: str): + with tarfile.open(archive, mode="r") as tar: + tar.extractall(path=dest_dir) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index e7d538502800..40fb93816a33 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -27,10 +27,12 @@ import wrapt from nemo.deploy import ITritonDeployable -from nemo.export.tarutils import TarPath +from nemo.export.tarutils import TarPath, unpack_tarball from nemo.export.trt_llm.model_config_trt import model_config_to_tensorrt_llm from nemo.export.trt_llm.nemo.nemo_ckpt_convert import build_tokenizer from nemo.export.trt_llm.nemo_utils import get_tokenzier, nemo_llm_model_to_model_config, nemo_llm_to_model_config +from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm +from nemo.export.trt_llm.qnemo.tokenizer_utils import get_nmt_tokenizer from nemo.export.trt_llm.tensorrt_llm_run import generate, generate_streaming, load, load_refit from nemo.export.trt_llm.utils import is_nemo_file @@ -188,32 
+190,50 @@ def export( tmp_dir = tempfile.TemporaryDirectory() nemo_export_dir = Path(tmp_dir.name) - model_configs, self.tokenizer = nemo_llm_to_model_config( - in_file=nemo_checkpoint_path, - decoder_type=model_type, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - pipeline_parallel_size=pipeline_parallel_size, - nemo_export_dir=nemo_export_dir, - save_nemo_model_config=save_nemo_model_config, - ) + if nemo_checkpoint_path.endswith("qnemo"): + if os.path.isdir(nemo_checkpoint_path): + nemo_export_dir = nemo_checkpoint_path + else: + unpack_tarball(nemo_checkpoint_path, tmp_dir.name) + nemo_checkpoint_path = tmp_dir.name + self.tokenizer = get_nmt_tokenizer(nemo_checkpoint_path) + + qnemo_to_tensorrt_llm( + nemo_checkpoint_path=nemo_checkpoint_path, + engine_dir=self.model_dir, + max_input_len=max_input_token, + max_output_len=max_output_token, + max_batch_size=max_batch_size, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + lora_target_modules=lora_target_modules, + ) + else: + model_configs, self.tokenizer = nemo_llm_to_model_config( + in_file=nemo_checkpoint_path, + decoder_type=model_type, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + pipeline_parallel_size=pipeline_parallel_size, + nemo_export_dir=nemo_export_dir, + save_nemo_model_config=save_nemo_model_config, + ) - model_config_to_tensorrt_llm( - model_configs, - self.model_dir, - world_size=tensor_parallel_size * pipeline_parallel_size, - max_input_len=max_input_token, - max_output_len=max_output_token, - max_batch_size=max_batch_size, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - use_inflight_batching=use_inflight_batching, - paged_kv_cache=paged_kv_cache, - enable_context_fmha=enable_context_fmha, - enable_multi_block_mode=enable_multi_block_mode, - use_lora_plugin=use_lora_plugin, - lora_target_modules=lora_target_modules, - max_lora_rank=max_lora_rank, - ) + model_config_to_tensorrt_llm( + model_configs, + self.model_dir, + world_size=tensor_parallel_size * pipeline_parallel_size, + max_input_len=max_input_token, + max_output_len=max_output_token, + max_batch_size=max_batch_size, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + use_inflight_batching=use_inflight_batching, + paged_kv_cache=paged_kv_cache, + enable_context_fmha=enable_context_fmha, + enable_multi_block_mode=enable_multi_block_mode, + use_lora_plugin=use_lora_plugin, + lora_target_modules=lora_target_modules, + max_lora_rank=max_lora_rank, + ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") if os.path.exists(tokenizer_path): @@ -700,5 +720,5 @@ def _load(self): raise Exception( "Files in the TensorRT-LLM folder is corrupted and " "model needs to be exported again. " - "Error message: " + str(error) - ) + "Error message: " + repr(error) + ) from error diff --git a/nemo/export/trt_llm/qnemo/__init__.py b/nemo/export/trt_llm/qnemo/__init__.py new file mode 100644 index 000000000000..77832d749b66 --- /dev/null +++ b/nemo/export/trt_llm/qnemo/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .align_config import align_config +from .qnemo_to_tensorrt_llm import qnemo_to_tensorrt_llm diff --git a/nemo/export/trt_llm/qnemo/align_config.py b/nemo/export/trt_llm/qnemo/align_config.py new file mode 100644 index 000000000000..abc53224e4b3 --- /dev/null +++ b/nemo/export/trt_llm/qnemo/align_config.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from typing import Any, Dict + + +def align_config(config_trtllm_build: Dict[str, Any]) -> Dict[str, Any]: + """Function to align config produced by trtllm-build API for consistency + with how ModelConfig from tensorrt_llm.runtime is used in the project. + """ + config = {} + + config_trtllm_build = copy.deepcopy(config_trtllm_build) + + # Builder config + config["builder_config"] = {} + config["builder_config"]["name"] = "NeMo" + config["builder_config"].update(config_trtllm_build["build_config"]) + config["builder_config"].update(config_trtllm_build["pretrained_config"]) + + # Plugin config + config["plugin_config"] = config["builder_config"].pop("plugin_config") + + # Parallelism config + config["builder_config"]["world_size"] = config["builder_config"]["mapping"]["world_size"] + config["builder_config"]["tensor_parallel"] = config["builder_config"]["mapping"]["tp_size"] + config["builder_config"]["pipeline_parallel"] = config["builder_config"]["mapping"]["pp_size"] + + # Other parameters + config["builder_config"]["num_heads"] = config_trtllm_build["pretrained_config"]["num_attention_heads"] + config["builder_config"]["num_layers"] = config_trtllm_build["pretrained_config"]["num_hidden_layers"] + config["builder_config"]["add_bos"] = False + config["builder_config"]["precision"] = config["builder_config"]["dtype"] + return config diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py new file mode 100644 index 000000000000..4e74d8e5fb58 --- /dev/null +++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import subprocess +from typing import List, Optional + +from nemo.export.trt_llm.qnemo import align_config +from nemo.export.trt_llm.tensorrt_llm_build import MODEL_NAME, get_engine_name + +CONFIG_NAME = "config.json" +CONFIG_TRTLLM_BUILD_NAME = "config_trtllm_build.json" + + +def qnemo_to_tensorrt_llm( + nemo_checkpoint_path: str, + engine_dir: str, + max_input_len: int, + max_output_len: int, + max_batch_size: int, + max_prompt_embedding_table_size: int, + lora_target_modules: Optional[List[str]] = None, +): + """Build TRT-LLM engine via trtllm-build CLI API in a subprocess.""" + print( + "Note that setting n_gpus, tensor_parallel_size and pipeline_parallel_size parameters" + " for quantized models is possible only on export step via nemo.export.quantize module." + " These parameters are ignored when building and running TensorRT-LLM engine below." + ) + # Load config to explicitly pass selected parameters to trtllm-build command: + with open(os.path.join(nemo_checkpoint_path, CONFIG_NAME), "r") as f: + model_config = json.load(f) + command = [ + "trtllm-build", + "--checkpoint_dir", + nemo_checkpoint_path, + "--output_dir", + engine_dir, + "--max_batch_size", + str(max_batch_size), + "--max_input_len", + str(max_input_len), + "--max_output_len", + str(max_output_len), + "--max_prompt_embedding_table_size", + str(max_prompt_embedding_table_size), + "--gemm_plugin", + model_config["dtype"], + "--strongly_typed", + "--use_custom_all_reduce", + "disable", + "--workers", + str(model_config["mapping"]["world_size"]), + ] + command_str = " ".join(command) + print(f"Build command is:\n{command_str}") + print("Running trtllm-build, this may take a while...") + result = subprocess.run(command, capture_output=True) # TODO: consider streaming logs + if result.returncode != 0: + print(result.stdout.decode()) + print(result.stderr.decode()) + raise RuntimeError("Error encountered for trtllm-build command, please check logs.") + + print("Building engine done. Full logs are:") + print(result.stdout.decode()) + + # Alignment to make nemo-fw tensorrt_llm.runtime ModelConfig definition compatible with config + # produced by trtllm-build API. The new config is saved as "config.json" while the source build + # config is saved as "config_trtllm_build.json" in the engine directory for reference. 
+ os.rename(os.path.join(engine_dir, CONFIG_NAME), os.path.join(engine_dir, CONFIG_TRTLLM_BUILD_NAME)) + with open(os.path.join(engine_dir, CONFIG_TRTLLM_BUILD_NAME), "r") as f: + config_trtllm_build = json.load(f) + + config = align_config(config_trtllm_build) + + # Other parameters + assert lora_target_modules is None + config["builder_config"]["lora_target_modules"] = lora_target_modules + + with open(os.path.join(engine_dir, CONFIG_NAME), "w") as f: + json.dump(config, f, indent=2) + + # Rename for consistency with how engine is run later + for i in range(config["builder_config"]["world_size"]): + os.rename( + os.path.join(engine_dir, f"rank{i}.engine"), + os.path.join( + engine_dir, + get_engine_name( + MODEL_NAME, + config["builder_config"]["precision"], + config["builder_config"]["tensor_parallel"], + config["builder_config"]["pipeline_parallel"], + i, + ), + ), + ) diff --git a/nemo/export/trt_llm/qnemo/tokenizer_utils.py b/nemo/export/trt_llm/qnemo/tokenizer_utils.py new file mode 100644 index 000000000000..3fde26253af6 --- /dev/null +++ b/nemo/export/trt_llm/qnemo/tokenizer_utils.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from omegaconf import OmegaConf +from transformers import AutoTokenizer + +from nemo.export.trt_llm.nemo.sentencepiece_tokenizer import SentencePieceTokenizer + +# TODO: use get_nmt_tokenizer helper below to instantiate tokenizer once environment / dependencies get stable +# from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + +TOKENIZER_CONFIG_FILE = "tokenizer_config.yaml" + + +def get_nmt_tokenizer(nemo_checkpoint_path: str): + """Build tokenizer from Nemo tokenizer config.""" + + print(f"Initializing tokenizer from {TOKENIZER_CONFIG_FILE}") + tokenizer_cfg = OmegaConf.load(os.path.join(nemo_checkpoint_path, TOKENIZER_CONFIG_FILE)) + + library = tokenizer_cfg.library + legacy = tokenizer_cfg.get("sentencepiece_legacy", library == "sentencepiece") + + if library == "huggingface": + print(f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_cfg.type}") + tokenizer = AutoTokenizer.from_pretrained(tokenizer_cfg["type"], use_fast=tokenizer_cfg.get("use_fast", False)) + elif library == "sentencepiece": + print(f"Getting SentencePieceTokenizer with model: {tokenizer_cfg.model}") + tokenizer = SentencePieceTokenizer( + model_path=os.path.join(nemo_checkpoint_path, tokenizer_cfg.model), legacy=legacy + ) + else: + raise NotImplementedError("Currently we only support 'huggingface' and 'sentencepiece' tokenizer libraries.") + + return tokenizer diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index 1e24f4f207a4..d7e3e40c87a2 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -89,6 +89,13 @@ def _read_config(config_path: Path): else: tokens_per_block = config["builder_config"]["tokens_per_block"] + if quantization := 
config["builder_config"].get("quantization"): + # Field "quantization" (dict) is introduced for quantized Nemo checkpoints support. + # For regular Nemo checkpoints "quant_mode" field should be used (default: 0). + quant_mode = QuantMode.from_quant_algo(quantization['quant_algo'], quantization['kv_cache_quant_algo']) + else: + quant_mode = QuantMode(config["builder_config"]["quant_mode"]) + model_config = ModelConfig( model_name=config["builder_config"]["name"], max_batch_size=config["builder_config"]["max_batch_size"], @@ -107,7 +114,7 @@ def _read_config(config_path: Path): dtype=config["builder_config"]["precision"], lora_plugin=config["plugin_config"]["lora_plugin"], lora_target_modules=config["builder_config"]["lora_target_modules"], - quant_mode=QuantMode(config["builder_config"]["quant_mode"]), + quant_mode=quant_mode, use_custom_all_reduce=config["plugin_config"]["use_custom_all_reduce"], use_context_fmha_for_generation=config["plugin_config"]["use_context_fmha_for_generation"], gather_context_logits=config["builder_config"]["gather_context_logits"], diff --git a/tests/export/run.sh b/tests/export/run.sh index 8edac5a334e0..eca22e0d3684 100644 --- a/tests/export/run.sh +++ b/tests/export/run.sh @@ -28,9 +28,16 @@ python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_t python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base --existing_test_models --lora --min_gpus 1 --max_gpus 2 python tests/export/test_nemo_export.py --model_name LLAMA2-7B-code --existing_test_models --min_gpus 1 --max_gpus 2 +python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-fp8 --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-int4 --existing_test_models --min_gpus 1 --max_gpus 1 +python tests/export/test_nemo_export.py --model_name LLAMA2-7B-base-int8 --existing_test_models --min_gpus 1 --max_gpus 1 python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --min_gpus 1 --max_gpus 2 python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base --existing_test_models --ptuning --min_gpus 1 --max_gpus 2 +python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base-fp8 --existing_test_models --min_gpus 2 --max_gpus 2 +python tests/export/test_nemo_export.py --model_name LLAMA2-13B-base-int4 --existing_test_models --min_gpus 2 --max_gpus 2 python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base --existing_test_models --min_gpus 2 --max_gpus 8 +python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base-fp8 --existing_test_models --min_gpus 8 --max_gpus 8 +python tests/export/test_nemo_export.py --model_name LLAMA2-70B-base-int4 --existing_test_models --min_gpus 8 --max_gpus 8 python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Base-4k --existing_test_models --min_gpus 1 --max_gpus 8 python tests/export/test_nemo_export.py --model_name NV-GPT-8B-QA-4k --existing_test_models --min_gpus 1 --max_gpus 8 python tests/export/test_nemo_export.py --model_name NV-GPT-8B-Chat-4k-SFT --existing_test_models --min_gpus 1 --max_gpus 8 @@ -42,5 +49,3 @@ python tests/export/test_nemo_export.py --model_name FALCON-40B-base --existing_ python tests/export/test_nemo_export.py --model_name FALCON-180B-base --existing_test_models --min_gpus 8 --max_gpus 8 python tests/export/test_nemo_export.py --model_name 
STARCODER1-15B-base --existing_test_models --min_gpus 1 --max_gpus 1 python tests/export/test_nemo_export.py --model_name GEMMA-base --existing_test_models --min_gpus 1 --max_gpus 1 --run_accuracy --test_deployment True - - diff --git a/tests/export/test_nemo_export.py b/tests/export/test_nemo_export.py index 6eb41e8a09d7..0c5a9d9e2309 100644 --- a/tests/export/test_nemo_export.py +++ b/tests/export/test_nemo_export.py @@ -15,6 +15,7 @@ import argparse import json import shutil +import time from pathlib import Path import torch @@ -48,6 +49,7 @@ def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path=Non with open(test_data_path, 'r') as file: records = json.load(file) + eval_start = time.perf_counter() for record in records: prompt = record["text_before_last_word"] expected_output = record["last_word"].strip().lower() @@ -94,6 +96,7 @@ def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path=Non if len(trtllm_deployed_output) == 1 and len(expected_output) > 1: continue trtllm_deployed_correct_relaxed += 1 + eval_end = time.perf_counter() trtllm_accuracy = trtllm_correct / len(all_expected_outputs) trtllm_accuracy_relaxed = trtllm_correct_relaxed / len(all_expected_outputs) @@ -101,13 +104,14 @@ def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path=Non trtllm_deployed_accuracy = trtllm_deployed_correct / len(all_expected_outputs) trtllm_deployed_accuracy_relaxed = trtllm_deployed_correct_relaxed / len(all_expected_outputs) + evaluation_time = eval_end - eval_start + return ( trtllm_accuracy, trtllm_accuracy_relaxed, trtllm_deployed_accuracy, trtllm_deployed_accuracy_relaxed, - all_trtllm_outputs, - all_expected_outputs, + evaluation_time, ) @@ -141,10 +145,10 @@ def run_trt_llm_inference( if n_gpu > torch.cuda.device_count(): print( "Path: {0} and model: {1} with {2} gpus won't be tested since available # of gpus = {3}".format( - model_info["checkpoint"], model_name, n_gpu, torch.cuda.device_count() + checkpoint_path, model_name, n_gpu, torch.cuda.device_count() ) ) - return None, None, None, None + return None, None, None, None, None Path(trt_llm_model_dir).mkdir(parents=True, exist_ok=True) @@ -171,7 +175,7 @@ def run_trt_llm_inference( print("---- PTuning enabled.") else: print("---- PTuning could not be enabled and skipping the test.") - return None, None, None, None + return None, None, None, None, None lora_ckpt_list = None lora_uids = None @@ -188,7 +192,7 @@ def run_trt_llm_inference( print("---- LoRA enabled.") else: print("---- LoRA could not be enabled and skipping the test.") - return None, None, None, None + return None, None, None, None, None trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list) @@ -254,23 +258,16 @@ def run_trt_llm_inference( if run_accuracy: print("Start model accuracy testing ...") - ( - trtllm_accuracy, - trtllm_accuracy_relaxed, - trtllm_deployed_accuracy, - trtllm_deployed_accuracy_relaxed, - all_trtllm_outputs, - all_expected_outputs, - ) = get_accuracy_with_lambada(trt_llm_exporter, nq, task_ids, lora_uids, test_data_path) + result = get_accuracy_with_lambada(trt_llm_exporter, nq, task_ids, lora_uids, test_data_path) if test_deployment: nm.stop() shutil.rmtree(trt_llm_model_dir) - return trtllm_accuracy, trtllm_accuracy_relaxed, trtllm_deployed_accuracy, trtllm_deployed_accuracy_relaxed + return result if test_deployment: nm.stop() shutil.rmtree(trt_llm_model_dir) - return None, None, None, None + return None, None, None, None, None else: raise Exception("Checkpoint {0} could 
not be found.".format(checkpoint_path)) @@ -290,7 +287,7 @@ def run_existing_checkpoints( ): if n_gpus > torch.cuda.device_count(): print("Skipping the test due to not enough number of GPUs") - return None, None, None, None + return None, None, None, None, None test_data = get_infer_test_data() if not (model_name in test_data.keys()): @@ -300,7 +297,7 @@ def run_existing_checkpoints( if n_gpus < model_info["min_gpus"]: print("Min n_gpus for this model is {0}".format(n_gpus)) - return None, None, None, None + return None, None, None, None, None p_tuning_checkpoint = None if ptuning: @@ -445,12 +442,7 @@ def run_inference_tests(args): args.max_gpus = args.min_gpus while n_gpus <= args.max_gpus: - ( - trtllm_accuracy, - trtllm_accuracy_relaxed, - trtllm_deployed_accuracy, - trtllm_deployed_accuracy_relaxed, - ) = run_existing_checkpoints( + result_dic[n_gpus] = run_existing_checkpoints( model_name=args.model_name, n_gpus=n_gpus, ptuning=args.ptuning, @@ -462,12 +454,6 @@ def run_inference_tests(args): run_accuracy=args.run_accuracy, test_data_path=args.test_data_path, ) - result_dic[n_gpus] = ( - trtllm_accuracy, - trtllm_accuracy_relaxed, - trtllm_deployed_accuracy, - trtllm_deployed_accuracy_relaxed, - ) n_gpus = n_gpus * 2 else: @@ -477,12 +463,7 @@ def run_inference_tests(args): args.max_gpus = args.min_gpus while n_gpus <= args.max_gpus: - ( - trtllm_accuracy, - trtllm_accuracy_relaxed, - trtllm_deployed_accuracy, - trtllm_deployed_accuracy_relaxed, - ) = run_trt_llm_inference( + result_dic[n_gpus] = run_trt_llm_inference( model_name=args.model_name, model_type=args.model_type, prompt=prompt_template, @@ -507,29 +488,29 @@ def run_inference_tests(args): test_deployment=args.test_deployment, test_data_path=args.test_data_path, ) - result_dic[n_gpus] = ( - trtllm_accuracy, - trtllm_accuracy_relaxed, - trtllm_deployed_accuracy, - trtllm_deployed_accuracy_relaxed, - ) n_gpus = n_gpus * 2 test_result = "PASS" - print("======================================= Test Summary =======================================") + print_separator = False + print("============= Test Summary ============") for i, results in result_dic.items(): if not results[0] is None and not results[1] is None: + if print_separator: + print("---------------------------------------") print( - "Number of GPUS: {0}, Model Accuracy: {1}, Relaxed Model Accuracy: {2}, " - "Deployed Model Accuracy: {3}, Deployed Relaxed Model Accuracy: {4}".format( - i, results[0], results[1], results[2], results[3] - ) + "Number of GPUS: {}\n" + "Model Accuracy: {:.4f}\n" + "Relaxed Model Accuracy: {:.4f}\n" + "Deployed Model Accuracy: {:.4f}\n" + "Deployed Relaxed Model Accuracy: {:.4f}\n" + "Evaluation Time [s]: {:.2f}".format(i, *results) ) + print_separator = True if results[1] < 0.5: test_result = "FAIL" - print("=============================================================================================") + print("=======================================") print("TEST: " + test_result) if test_result == "FAIL": raise Exception("Model accuracy is below 0.5") diff --git a/tests/infer_data_path.py b/tests/infer_data_path.py index 32d733117c99..0d4d2d5e7b84 100644 --- a/tests/infer_data_path.py +++ b/tests/infer_data_path.py @@ -204,6 +204,115 @@ def get_infer_test_data(): test_data["LLAMA2-7B-code"]["max_output_token"] = 128 test_data["LLAMA2-7B-code"]["max_batch_size"] = 10 + test_data["LLAMA2-7B-base-fp8"] = {} + test_data["LLAMA2-7B-base-fp8"]["model_type"] = "llama" + test_data["LLAMA2-7B-base-fp8"]["min_gpus"] = 1 + 
test_data["LLAMA2-7B-base-fp8"]["location"] = "Local" + test_data["LLAMA2-7B-base-fp8"]["trt_llm_model_dir"] = "/tmp/LLAMA2-7B-base-fp8/trt_llm_model-1/" + test_data["LLAMA2-7B-base-fp8"]["checkpoint"] = "/opt/checkpoints/LLAMA2-7B-base-fp8/LLAMA2-7B-base-fp8-1.qnemo" + test_data["LLAMA2-7B-base-fp8"]["prompt_template"] = [ + "The capital of France is", + "Largest animal in the sea is", + "Fastest animal in the world is", + ] + test_data["LLAMA2-7B-base-fp8"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] + test_data["LLAMA2-7B-base-fp8"]["max_output_token"] = 128 + test_data["LLAMA2-7B-base-fp8"]["max_batch_size"] = 10 + + test_data["LLAMA2-7B-base-int4"] = {} + test_data["LLAMA2-7B-base-int4"]["model_type"] = "llama" + test_data["LLAMA2-7B-base-int4"]["min_gpus"] = 1 + test_data["LLAMA2-7B-base-int4"]["location"] = "Local" + test_data["LLAMA2-7B-base-int4"]["trt_llm_model_dir"] = "/tmp/LLAMA2-7B-base-int4/trt_llm_model-1/" + test_data["LLAMA2-7B-base-int4"]["checkpoint"] = "/opt/checkpoints/LLAMA2-7B-base-int4/LLAMA2-7B-base-int4-1.qnemo" + test_data["LLAMA2-7B-base-int4"]["prompt_template"] = [ + "The capital of France is", + "Largest animal in the sea is", + "Fastest animal in the world is", + ] + test_data["LLAMA2-7B-base-int4"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] + test_data["LLAMA2-7B-base-int4"]["max_output_token"] = 128 + test_data["LLAMA2-7B-base-int4"]["max_batch_size"] = 10 + + test_data["LLAMA2-7B-base-int8"] = {} + test_data["LLAMA2-7B-base-int8"]["model_type"] = "llama" + test_data["LLAMA2-7B-base-int8"]["min_gpus"] = 1 + test_data["LLAMA2-7B-base-int8"]["location"] = "Local" + test_data["LLAMA2-7B-base-int8"]["trt_llm_model_dir"] = "/tmp/LLAMA2-7B-base-int8/trt_llm_model-1/" + test_data["LLAMA2-7B-base-int8"]["checkpoint"] = "/opt/checkpoints/LLAMA2-7B-base-int8/LLAMA2-7B-base-int8-1.qnemo" + test_data["LLAMA2-7B-base-int8"]["prompt_template"] = [ + "The capital of France is", + "Largest animal in the sea is", + "Fastest animal in the world is", + ] + test_data["LLAMA2-7B-base-int8"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] + test_data["LLAMA2-7B-base-int8"]["max_output_token"] = 128 + test_data["LLAMA2-7B-base-int8"]["max_batch_size"] = 10 + + test_data["LLAMA2-13B-base-fp8"] = {} + test_data["LLAMA2-13B-base-fp8"]["model_type"] = "llama" + test_data["LLAMA2-13B-base-fp8"]["min_gpus"] = 2 + test_data["LLAMA2-13B-base-fp8"]["location"] = "Local" + test_data["LLAMA2-13B-base-fp8"]["trt_llm_model_dir"] = "/tmp/LLAMA2-13B-base-fp8/trt_llm_model-1/" + test_data["LLAMA2-13B-base-fp8"]["checkpoint"] = "/opt/checkpoints/LLAMA2-13B-base-fp8/LLAMA2-13B-base-fp8-1-qnemo" + test_data["LLAMA2-13B-base-fp8"]["prompt_template"] = [ + "The capital of France is", + "Largest animal in the sea is", + "Fastest animal in the world is", + ] + test_data["LLAMA2-13B-base-fp8"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] + test_data["LLAMA2-13B-base-fp8"]["max_output_token"] = 128 + test_data["LLAMA2-13B-base-fp8"]["max_batch_size"] = 10 + + test_data["LLAMA2-13B-base-int4"] = {} + test_data["LLAMA2-13B-base-int4"]["model_type"] = "llama" + test_data["LLAMA2-13B-base-int4"]["min_gpus"] = 2 + test_data["LLAMA2-13B-base-int4"]["location"] = "Local" + test_data["LLAMA2-13B-base-int4"]["trt_llm_model_dir"] = "/tmp/LLAMA2-13B-base-int4/trt_llm_model-1/" + test_data["LLAMA2-13B-base-int4"][ + "checkpoint" + ] = "/opt/checkpoints/LLAMA2-13B-base-int4/LLAMA2-13B-base-int4-1-qnemo" + test_data["LLAMA2-13B-base-int4"]["prompt_template"] = [ + "The capital of France 
is", + "Largest animal in the sea is", + "Fastest animal in the world is", + ] + test_data["LLAMA2-13B-base-int4"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] + test_data["LLAMA2-13B-base-int4"]["max_output_token"] = 128 + test_data["LLAMA2-13B-base-int4"]["max_batch_size"] = 10 + + test_data["LLAMA2-70B-base-fp8"] = {} + test_data["LLAMA2-70B-base-fp8"]["model_type"] = "llama" + test_data["LLAMA2-70B-base-fp8"]["min_gpus"] = 8 + test_data["LLAMA2-70B-base-fp8"]["location"] = "Local" + test_data["LLAMA2-70B-base-fp8"]["trt_llm_model_dir"] = "/tmp/LLAMA2-70B-base-fp8/trt_llm_model-1/" + test_data["LLAMA2-70B-base-fp8"]["checkpoint"] = "/opt/checkpoints/LLAMA2-70B-base-fp8/LLAMA2-70B-base-fp8-1-qnemo" + test_data["LLAMA2-70B-base-fp8"]["prompt_template"] = [ + "The capital of France is", + "Largest animal in the sea is", + "Fastest animal in the world is", + ] + test_data["LLAMA2-70B-base-fp8"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] + test_data["LLAMA2-70B-base-fp8"]["max_output_token"] = 128 + test_data["LLAMA2-70B-base-fp8"]["max_batch_size"] = 10 + + test_data["LLAMA2-70B-base-int4"] = {} + test_data["LLAMA2-70B-base-int4"]["model_type"] = "llama" + test_data["LLAMA2-70B-base-int4"]["min_gpus"] = 8 + test_data["LLAMA2-70B-base-int4"]["location"] = "Local" + test_data["LLAMA2-70B-base-int4"]["trt_llm_model_dir"] = "/tmp/LLAMA2-70B-base-int4/trt_llm_model-1/" + test_data["LLAMA2-70B-base-int4"][ + "checkpoint" + ] = "/opt/checkpoints/LLAMA2-70B-base-int4/LLAMA2-70B-base-int4-1-qnemo" + test_data["LLAMA2-70B-base-int4"]["prompt_template"] = [ + "The capital of France is", + "Largest animal in the sea is", + "Fastest animal in the world is", + ] + test_data["LLAMA2-70B-base-int4"]["expected_keyword"] = ["Paris", "Whale", "Cheetah"] + test_data["LLAMA2-70B-base-int4"]["max_output_token"] = 128 + test_data["LLAMA2-70B-base-int4"]["max_batch_size"] = 10 + test_data["FALCON-7B-base"] = {} test_data["FALCON-7B-base"]["model_type"] = "falcon" test_data["FALCON-7B-base"]["min_gpus"] = 1 From 815c5dece27134ecd7e5ddebe19dc8ad0ce262e9 Mon Sep 17 00:00:00 2001 From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com> Date: Tue, 23 Apr 2024 07:50:06 -0500 Subject: [PATCH 05/30] add geglu to mlp swap (#8999) * add geglu to mlp swap * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * match swiglu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../common/megatron/adapters/mcore_mixins.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py index 2aeb014c1b40..a5e886f3b479 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/mcore_mixins.py @@ -15,7 +15,9 @@ import torch import torch.nn.functional as F from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.fusions.fused_bias_geglu import bias_geglu_impl from megatron.core.fusions.fused_bias_gelu import bias_gelu_impl +from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding from 
megatron.core.models.common.embeddings.rotary_pos_embedding import apply_rotary_pos_emb from megatron.core.transformer.attention import SelfAttention @@ -279,10 +281,16 @@ def forward(self, hidden_states): if self.config.bias_activation_fusion: if self.activation_func == F.gelu: - assert self.config.add_bias_linear is True - intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) + if self.config.gated_linear_unit: + intermediate_parallel = bias_geglu_impl(intermediate_parallel, bias_parallel) + else: + assert self.config.add_bias_linear is True + intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel) elif self.activation_func == F.silu and self.config.gated_linear_unit: - intermediate_parallel = bias_swiglu_impl(intermediate_parallel, bias_parallel) + intermediate_parallel = bias_swiglu_impl( + intermediate_parallel, bias_parallel, self.config.activation_func_fp8_input_store, + ) + else: raise ValueError("Only support fusion of gelu and swiglu") else: From 153dae94690630c1e2a601fd90085b29f4811dae Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Tue, 23 Apr 2024 10:28:34 -0600 Subject: [PATCH 06/30] temporarily remove mcore dep (#9010) Signed-off-by: eharper --- requirements/requirements_nlp.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 5314fd8b5894..46e82089f0ea 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -10,7 +10,7 @@ ijson jieba markdown2 matplotlib>=3.3.2 -megatron_core>0.6.0 +#megatron_core>0.6.0 # add back once mcore on pypi is compatible again nltk>=3.6.5 opencc<1.1.7 pangu From a3825d54e25a3dc5b76e36cd116874473072ef81 Mon Sep 17 00:00:00 2001 From: phile Date: Wed, 24 Apr 2024 00:29:25 +0800 Subject: [PATCH 07/30] add timeout for new_group (#8998) * add timeout for new_group Signed-off-by: acphile * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: acphile Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- Dockerfile | 1 + nemo/collections/nlp/parts/nlp_overrides.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4c39b5bad235..396645d37019 100644 --- a/Dockerfile +++ b/Dockerfile @@ -67,6 +67,7 @@ WORKDIR /workspace/ RUN git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ git checkout 36e9b6bf3d8034b10c9bbd9fc357c2df2bd1515c && \ + git cherry-pick -n e69187bc3679ea5841030a165d587bb48b56ee77 && \ pip install . # Performance optimizations for distributed optimizer: https://github.com/NVIDIA/apex/pull/1771 diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 17a5ac705185..a6f68f0666b5 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -113,7 +113,9 @@ NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE = "NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE" -def init_model_parallel(sharp: bool, nccl_communicator_config_path: str = None) -> None: +def init_model_parallel( + sharp: bool, nccl_communicator_config_path: str = None, distributed_timeout_minutes: int = 30 +) -> None: """ Initializes Megatron-LM model parallel if using model parallelism. 
Args: @@ -139,6 +141,7 @@ def init_model_parallel(sharp: bool, nccl_communicator_config_path: str = None) use_sharp=sharp, expert_model_parallel_size=app_state.expert_model_parallel_size, order='tp-pp-dp' if app_state.use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp', + distributed_timeout_minutes=distributed_timeout_minutes, ) # assert that fake tp and pp rank match after model parallel init @@ -219,7 +222,11 @@ def setup_distributed(self, global_rank: int = None, world_size: int = None) -> app_state = AppState() if app_state.model_parallel_size is not None: - init_model_parallel(self.sharp, self.nccl_communicator_config_path) + init_model_parallel( + self.sharp, + self.nccl_communicator_config_path, + distributed_timeout_minutes=self._timeout.total_seconds() / 60, + ) def configure_ddp(self): """ Override LightningModule ddp if using model parallel. From 42b82f80a173a0a6829547015fefc044feb089a4 Mon Sep 17 00:00:00 2001 From: huvunvidia <86480512+huvunvidia@users.noreply.github.com> Date: Tue, 23 Apr 2024 17:28:31 -0400 Subject: [PATCH 08/30] Zero-shot evaluation pipeline for mcore RETRO (#8941) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update branch Signed-off-by: eharper * Add dist ckpt support for regular optimizers (#7749) * Add dist ckpt support for regular optimizers Signed-off-by: Mikołaj Błaż * [tutorial] fixed missing RIR scripts file. (#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * fix imports Signed-off-by: dimapihtar * imports fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ci imports fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert asr notebook Signed-off-by: dimapihtar * revert asr notebook Signed-off-by: dimapihtar --------- Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Pin lhotse=1.19.2 in r1.23.0 (#8303) Signed-off-by: Piotr Żelasko * Cache Aware Streaming tutorial notebook (#8296) * add notebook Signed-off-by: Elena Rastorgueva * rename old notebook to Buffered_Streaming Signed-off-by: Elena Rastorgueva * call setup_streaming_params in set_default_att_context_size method Signed-off-by: Elena Rastorgueva * update links in docs Signed-off-by: Elena Rastorgueva * update links to tutorials in docs Signed-off-by: Elena Rastorgueva * remove hard-coding Signed-off-by: Elena Rastorgueva * rename var Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva * fix path location and branch (#8304) * fix path location and branch Signed-off-by: Nithin Rao Koluguri * change to a floating point number Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri Co-authored-by: Somshubra Majumdar * add deallocate pipeline output optimization (#8279) * add deallocate pipeline output optimization Signed-off-by: Jimmy Zhang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jimmy Zhang Co-authored-by: Jimmy 
Zhang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fix memory leak caused by context parallelism hanging references by omegaconf (#8299) * save cp_size to self Signed-off-by: Jimmy Zhang * use parallel_state instead of self Signed-off-by: Jimmy Zhang --------- Signed-off-by: Jimmy Zhang Co-authored-by: Jimmy Zhang Co-authored-by: Eric Harper * remove assertion (#8302) Signed-off-by: dimapihtar * Update PEFT Doc (#8262) * update peft doc Signed-off-by: Chen Cui * remove old prompt learning doc and notebook Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * Merge branch 'r1.23.0' into chcui/update_peft_doc Signed-off-by: Chen Cui * revert accidental changes Signed-off-by: Chen Cui * revert accidental changes Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui * Attention encoder-decoder models for multiple speech-to-text tasks (#8242) (#8324) * Rebasing canary changes at current main Signed-off-by: Piotr Żelasko * Move the changes from asr transformer to nlp transformer as originally intended Signed-off-by: Piotr Żelasko * update eval to strip spaces before punctuations Signed-off-by: stevehuang52 * update pc strip Signed-off-by: stevehuang52 * [canary] Refactor: `PromptedAudioToTextLhotseDataset` and `EncDecMultiTaskModel` (#8247) * Create a separate CanaryDataset and use it inside `transformer_bpe_models.py`. Ditches `token_sequence_format`. Signed-off-by: Piotr Żelasko * [canary] Refactor: move changes in transformer_bpe_models.py to Canar… (#8252) * [canary] Refactor: move changes in transformer_bpe_models.py to CanaryModel Signed-off-by: Piotr Żelasko * Rename `CanaryModel` to `EncDecMultiTaskModel` and remove inheritance from `EncDecTransfModelBPE`; add a separate config for this model Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * Rename `CanaryDataset` to `PromptedAudioToTextLhotseDataset`; add `prompt_format_fn` argument; clean-up the `_canary_prompt_format` function a bit Signed-off-by: Piotr Żelasko * Move tokenization into `prompt_format_fn`, fix usage, add docs Signed-off-by: Piotr Żelasko * Backward-compatible utterance validation Signed-off-by: Piotr Żelasko * Improve type annotations Signed-off-by: Piotr Żelasko * config and prompt_fn registration changes from review Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * fix transcribe config Signed-off-by: stevehuang52 * Refactor Canary to follow schema of remaining ASR models (#8260) * Initial draft of multi task beam decoding strategy Signed-off-by: smajumdar * Stabilize inference Signed-off-by: smajumdar * Update AED Multi Task model to mostly conform to Archetype-Type format. 
Update config Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add change decoding strategy Signed-off-by: smajumdar * Remove redundant imports Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Cleanup Signed-off-by: smajumdar * Cleanup Signed-off-by: smajumdar * remove asr transformer dependency on nlp Signed-off-by: stevehuang52 * clean up Signed-off-by: stevehuang52 * copy token_classifier from nlp to asr Signed-off-by: stevehuang52 * Address comments Signed-off-by: smajumdar * Add typing to beam decoding Signed-off-by: smajumdar * Make prompt format configurable Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * drop asr dependency on nlp Signed-off-by: stevehuang52 --------- Signed-off-by: smajumdar Signed-off-by: stevehuang52 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: stevehuang52 * fix transcribe, update asr evaluator Signed-off-by: stevehuang52 * Extend the docs for the canary prompt_fn Signed-off-by: Piotr Żelasko * Incorporate changes from Nithin's code review Signed-off-by: Piotr Żelasko * training bug fix and adding launch script for speech_multitask (#8270) * bug fix and adding launch script for speech_multitask Signed-off-by: Krishna Puvvada * update launch script example in speech_to_text_aed.py Signed-off-by: Krishna Puvvada --------- Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada * Fix: drop_last must be true in validation/test otherwise the training will hang Signed-off-by: Piotr Żelasko * revert to current transcribe API Signed-off-by: stevehuang52 * revert changes to NLP, update docs Signed-off-by: stevehuang52 * update eval utils Signed-off-by: stevehuang52 * update docs Signed-off-by: stevehuang52 * Remove DALI; rename compute_audio_loss to compute_loss Signed-off-by: Piotr Żelasko * set default use_model_transcribe=False Signed-off-by: stevehuang52 * change os.path.dirname to pathlib Signed-off-by: stevehuang52 * [canary] Test for CanaryTokenizer + refactoring (#8285) * Test for CanaryTokenizer Signed-off-by: Piotr Żelasko * Attempt at refactor... 
Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * Update config for AED models (#8294) Signed-off-by: smajumdar * set default calculate_wer=False in transcribe_speech.py Signed-off-by: stevehuang52 * Attention encoder-decoder models for multiple speech-to-text tasks Signed-off-by: Piotr Żelasko * Apply suggestions from code review, part 1 Co-authored-by: Nithin Rao Signed-off-by: Piotr Żelasko * Apply suggestions from code review, part 2 Signed-off-by: Piotr Żelasko * Document compute_loss Signed-off-by: Piotr Żelasko * update transcribe_speech.py Signed-off-by: stevehuang52 * add docstring Signed-off-by: stevehuang52 * Attention encoder-decoder models for multiple speech-to-text tasks Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Signed-off-by: stevehuang52 Signed-off-by: smajumdar Signed-off-by: Krishna Puvvada Signed-off-by: Piotr Żelasko Co-authored-by: stevehuang52 Co-authored-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Co-authored-by: Krishna Puvvada Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Nithin Rao (cherry picked from commit d10726da72f74eb5a95056843d1f9e2562a5051c) Co-authored-by: Piotr Żelasko * add code for calling mcore_retro in NeMo * add code for calling mcore_retro in NeMo * runnable, training curve match retro mcore and nemo * working on retro inference * working on megatron_retro_eval.py and megatron_retro_inference.yaml * refactoring text_generation_utils code and retro inference relevant files * clean PR * resolving quick hacks (reading number of train/valid samples from workdir, discrepancy in total samples and samples with neighbors retrieved, tokenizers) * clean repository * revert changes to inference/eval code to original in main * clean code * runable training code, with already implemented eval code * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * Add Bert HF checkpoint converter (#8088) * Add Bert HF checkpoint converter Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reformat Signed-off-by: yaoyu-33 * Add BERT ONNX export * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add NeMo BERT to HF BERT script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clean code Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update argument names Signed-off-by: yaoyu-33 * Update build_transformer_config in Bert Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen * revert to original eval code files * revert to original eval code files 2 * revert to original eval code files 3 * revert to original eval code files 4 * clean code * clean code * update my code to support changes from lastest main * commit before rebase r1.23.0 * Multimodal r1.23.0 bug fix (#8315) * Rename quick-gelu Signed-off-by: yaoyu-33 * ddpm config guard Signed-off-by: yaoyu-33 * Fix ddpm edit api Signed-off-by: yaoyu-33 * Fix insert_image_token cfg issue Signed-off-by: yaoyu-33 * neva updates Signed-off-by: yaoyu-33 * reformat Signed-off-by: yaoyu-33 * Add back jenkins Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix jenkins Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix bugs Signed-off-by: yaoyu-33 * Update default neva template Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * copy paste files from r1.23.0 * clean PR * Fixes for MoE parameter passing & use of AutoTokenizer/Model for mistral. 
(#8272) Signed-off-by: Alexandros Koumparoulis * Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP>1 (#8334) Signed-off-by: Sangkug Lym Co-authored-by: Eric Harper * Remove asr webapp (#8347) Signed-off-by: smajumdar * remove _target_ at model level in aed config (#8351) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada * revert changes for tts and asr * Add change_vocabulary and save_tokenizers() support to Multitask ASR models (#8357) * Add change_vocabulary and save_tokenizers() support Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update nemo/collections/asr/models/aed_multitask_models.py Co-authored-by: Piotr Żelasko Signed-off-by: Somshubra Majumdar --------- Signed-off-by: smajumdar Signed-off-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko * Change default (#8371) Signed-off-by: smajumdar * implement retro's own fwd_bwd_step() and validation_step() to not have argument first_val_step, which the MLM commit doesn't support * adding megatron compile_helpers(), in future can be fixed with correct MLM commit * bug fix in fast-conformer-aed.yaml and adding jenkins test for speech_to_text_aed model (#8368) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada Co-authored-by: Somshubra Majumdar * Enable megatron core loggers for GPT pretraining (#8354) * Logging changes tested for gpt_pretraining Signed-off-by: Aishwarya Bhandare * Additional args Signed-off-by: Aishwarya Bhandare * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Aishwarya Bhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * mcore ds fix (#8283) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * mcore ds fix Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore Signed-off-by: dimapihtar * revert asr files Signed-off-by: dimapihtar * add comments Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * fix Bert unit tests Signed-off-by: dimapihtar * update bert tests Signed-off-by: dimapihtar * fix bert mcore test Signed-off-by: dimapihtar * fix gpt jenkins tests Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update apex & TE commits Signed-off-by: dimapihtar * revert apex installation Signed-off-by: dimapihtar * turn off the fusion for jenkins Signed-off-by: dimapihtar --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay * addressing Eric's reviews * adding existing implementation RETRO files * adding existing implementation RETRO files * Add Finetuning tutorial with HF Datasets (#8356) * Add Finetuning tutorial with HF Datasets Signed-off-by: Nithin Rao Koluguri * update on Som comments Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * release updates (#8378) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * mcore ds fix Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore Signed-off-by: dimapihtar * revert asr files Signed-off-by: dimapihtar * add comments Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * fix Bert unit tests Signed-off-by: dimapihtar * update bert tests Signed-off-by: dimapihtar * fix bert mcore test Signed-off-by: dimapihtar * fix gpt jenkins tests Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for dict data input type Signed-off-by: dimapihtar * add mock ds test Signed-off-by: dimapihtar * add test for dict data input type Signed-off-by: dimapihtar * mcore ds fix Signed-off-by: dimapihtar * data input fix Signed-off-by: dimapihtar --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay * MCore dataset compatibility for tokenizers (#8390) * Add unique_identifiers for all tokenizers and eod for SentencePieceTokenizer Signed-off-by: Valerie Sarge * Add generalized token aliases to TokenizerSpec to conform with MegatronTokenizer's interface. Remove now-redundant individual fixes from AutoTokenizer and SentencePieceTokenizer. Signed-off-by: Valerie Sarge --------- Signed-off-by: Valerie Sarge Co-authored-by: Pablo Garay * Mcore customization doc (#8298) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * Add Bert HF checkpoint converter (#8088) * Add Bert HF checkpoint converter Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reformat Signed-off-by: yaoyu-33 * Add BERT ONNX export * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add NeMo BERT to HF BERT script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clean code Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update argument names Signed-off-by: yaoyu-33 * Update build_transformer_config in Bert Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen * initial placeholder Signed-off-by: Huiying Li * add to intro/index.rst Signed-off-by: Huiying Li * initial content update Signed-off-by: Huiying Li * add diff images Signed-off-by: Huiying Li size Signed-off-by: Huiying Li * minor fixes * minor language change Signed-off-by: Chen Cui * clean changes --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: yaoyu-33 Signed-off-by: Huiying Li Signed-off-by: Huiying Li Signed-off-by: Chen Cui Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen Co-authored-by: Huiying Li Co-authored-by: Chen Cui * wer fix (#8404) Signed-off-by: Travis Bartley * updated link to pubmed (#8402) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * Update NFA video download link (#8406) * update nfa nasa video link Signed-off-by: Elena Rastorgueva * update link in markdown Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva * revert changes (#8410) Signed-off-by: Chen Cui * Fix dreambooth data sampler issue (#8400) * Turn on drop last Signed-off-by: yaoyu-33 * Some neva fixes Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fixed errors in the CTM gen functions (#8416) Signed-off-by: Taejin Park * add ensemble decoding fix (#8427) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * SDE bugfix log (#8430) Signed-off-by: George * mcore customization doc minor fix (#8421) Signed-off-by: Huiying Li * NeMo-Mistral to HF converter bugfix. 
(#8353) Signed-off-by: Alexandros Koumparoulis * Fixing mcore bert for TP, PP and SP (#8336) * Fixing mcore bert for TP, PP and SP * Fixing mcore bert for TP, PP and SP * Fixing mcore version * Fixing mcore version * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> --------- Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Eric Harper * Add settings to suppress bf16 compile errors in CI on V100 (#8481) * Add settings to suppress bf16 compile errors in CI on V100 Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Abhishree Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * MoE parameter passing (#8255) * MoE parameter passing Signed-off-by: Alexandros Koumparoulis * Pass EP/MoE params in consumer scripts. Signed-off-by: Alexandros Koumparoulis * PR fixes Signed-off-by: Alexandros Koumparoulis * Use latest commit of mcore-0.5 Signed-off-by: Alexandros Koumparoulis * CI fix Signed-off-by: Alexandros Koumparoulis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Alexandros Koumparoulis Co-authored-by: Alexandros Koumparoulis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update k2 version (#8478) (#8492) Signed-off-by: Vladimir Bataev * Add fp8 support for SD/Update notebook paths (#8489) * Add fp8 support for SD/Update notebook paths Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * pin to 0.5.0 (#8465) Signed-off-by: eharper * Update NeMo Multimodal Requirements (#8515) * Update requirements_multimodal.txt Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update github raw content link (#8517) Signed-off-by: Chen Cui * Add dep notice for notebooks (#8522) * add dep notice Signed-off-by: eharper * revert Signed-off-by: eharper --------- Signed-off-by: eharper * Revert FP8 integration (#8520) * Revert FP8 integration Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update data prep notebook (#8532) Signed-off-by: Mingyuan Ma * before update branch with latest r1.23.0 * update to run with MLM ae2817b3dde4efb1515061a5311d01d8f85bd99c (runnable training and saving checkpoint) * remove compile_helpers * reverse changes from main branch to r1.23.0 * adding *_legacy files * update MLM commit in Jenkinsfile to latest * debugging Jenkinstest: test different mcore import in retro_dataset * update Jenkinsfile edit 
megatron_retro_mutransfer_pretrain_legacy.py * removing all mcore RETRO to pass the Jenkinstest * fixing import legacy problem for tests/collections/nlp/test_indexed_retrieval_dataset.py * update Jenkinsfile file to use TE v0.7 * update NeMo to work with latest mcore RETRO (solving TE problems) * update TE commit Jenkinsfile to be the same with r1.23.0's Jenkinsfile * update commit for MLM * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * jenkinstest debugging * temporary fix RETRO's __init__ for jenkinstest * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * add model.data.dataloader_type=cyclic to jenkinsfile * runnable for inference * update code to work with latest megatron-lm main 81dab6067 * update M-LM commit in Jenkinsfile to latest main M-LM 81dab6067 * cleaning inference code * fix to by pass CI test bf16 problem (following this PR https://github.com/NVIDIA/NeMo/pull/8481/files) * isort and black * adjusting model.micro_batch_size to 1 * fix BRANCH = 'r1.23.0' * replace tutorials dir from main branch to huvu/mcore_retro * fix minor merges conflict * update Jenkinsfile * runnable with a temporary fix from Jacek (unfound -unfinished problem) * runnable with a temporary fix from Jacek (unfound -unfinished problem) * modified nlp_overrides.py back to original * fix checkpoint from Jacek Bieniusiewicz * config Jenkinsfile test * set RETRO Jenkins MBS to 1 * black fix * isort fix * update TE commit * update to latest Jenkinsfile with latest container and commits * remove new RETRO jenkinstest * merge latest main * put RETRO Jenkinstest to the right place * update code for megatron_retro_pretraining_legacy.py * update Jenkins and _legacy.py * update new RETRO jenkinstest to run faster * fixing errors from GitHub Advanced Security / CodeQL * fixing errors from GitHub Advanced Security / CodeQL * update manually branch to huvu/mcore_retro * remove DEBUGGING markers * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * copy paste scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt * update codes to fix Github warnings; adding cicd-main.yml action tests * cleaning code, addressing Shanmugam's comments * saving before pulling from main * cleaning code * adding deprecations note * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: eharper Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Signed-off-by: Piotr Żelasko Signed-off-by: Elena Rastorgueva Signed-off-by: Nithin Rao Koluguri Signed-off-by: Jimmy Zhang Signed-off-by: Chen Cui Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: yaoyu-33 Signed-off-by: Alexandros Koumparoulis Signed-off-by: Sangkug Lym Signed-off-by: smajumdar Signed-off-by: Krishna Puvvada Signed-off-by: Somshubra Majumdar Signed-off-by: Aishwarya Bhandare Signed-off-by: Dmytro Pykhtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Signed-off-by: Valerie Sarge Signed-off-by: Huiying Li Signed-off-by: Huiying Li Signed-off-by: Travis Bartley Signed-off-by: 
Taejin Park Signed-off-by: George Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Signed-off-by: Abhishree Signed-off-by: Vladimir Bataev Signed-off-by: Mingyuan Ma Co-authored-by: eharper Co-authored-by: mikolajblaz Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Co-authored-by: Nithin Rao Co-authored-by: Somshubra Majumdar Co-authored-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jimmy Zhang Co-authored-by: Chen Cui Co-authored-by: Huy Vu2 Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Bobby Chen Co-authored-by: akoumpa <153118171+akoumpa@users.noreply.github.com> Co-authored-by: Sangkug Lym Co-authored-by: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Co-authored-by: Krishna Puvvada Co-authored-by: ashbhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: Dmytro Pykhtar Co-authored-by: Pablo Garay Co-authored-by: Valerie Sarge Co-authored-by: Huiying Co-authored-by: Huiying Li Co-authored-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Co-authored-by: Taejin Park Co-authored-by: George <37293288+Jorjeous@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Alexandros Koumparoulis Co-authored-by: Vladimir Bataev Co-authored-by: Ming <111467530+Victor49152@users.noreply.github.com> Co-authored-by: Huy Vu2 Co-authored-by: root --- .../conf/megatron_retro_inference.yaml | 44 ++-- .../conf/megatron_retro_inference_legacy.yaml | 46 ++++ .../conf/megatron_retro_qatask.yaml | 40 +++ .../language_modeling/megatron_retro_eval.py | 183 +++++++------- .../megatron_retro_eval_legacy.py | 145 +++++++++++ .../megatron_retro_qatask_eval.py | 217 +++++++++++++++++ .../language_modeling/megatron_retro_model.py | 23 +- .../common/text_generation_strategy.py | 227 +++++++++++++++++- .../modules/common/text_generation_utils.py | 76 +++++- 9 files changed, 856 insertions(+), 145 deletions(-) create mode 100644 examples/nlp/language_modeling/conf/megatron_retro_inference_legacy.yaml create mode 100644 examples/nlp/language_modeling/conf/megatron_retro_qatask.yaml create mode 100644 examples/nlp/language_modeling/megatron_retro_eval_legacy.py create mode 100644 examples/nlp/language_modeling/megatron_retro_qatask_eval.py diff --git a/examples/nlp/language_modeling/conf/megatron_retro_inference.yaml b/examples/nlp/language_modeling/conf/megatron_retro_inference.yaml index 1b99a65f46ad..6cde27f55527 100644 --- a/examples/nlp/language_modeling/conf/megatron_retro_inference.yaml +++ b/examples/nlp/language_modeling/conf/megatron_retro_inference.yaml @@ -3,42 +3,40 @@ inference: top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. 
temperature: 1.0 # sampling temperature - add_BOS: True # add the bos token at the begining of the prompt + add_BOS: False # add the bos token at the begining of the prompt tokens_to_generate: 30 # The minimum length of the sequence to be generated. all_probs: False # whether return the log prob for all the tokens in vocab repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False - + end_strings: ["<|endoftext|>"] # generation will stop when one of these tokens is generated + # RETRO-specific arguments + retro_inference: + retro_gpt_retrieved_length: 128 + retro_num_neighbors: 2 + ft_neighbours: 0 + reuse_top: False trainer: devices: 1 num_nodes: 1 accelerator: gpu logger: False # logger provided by exp_manager - precision: 16 # 16, 32, or bf16 - -inference_batch_size: 2 + precision: 32 # 16, 32, or bf16 + use_distributed_sampler: False + tensor_model_parallel_size: -1 pipeline_model_parallel_size: -1 pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) -retro_model_file: null # RETRO nemo file path +megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory -use_predict_method: False # whether to use the predict method +retro_model_file: null # Retro nemo file path +checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the Retro training +checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading +hparams_file: null # model configuration file, only used for PTL checkpoint loading -prompts: # prompts for RETRO model inference - - "hello," - - "good morning," - - "good afternoon," - - "good evening," - -########### Faiss service parameters ######## -retrieval_service: - strategy: RetroModelTextGenerationStrategy # choose customized inference strategy - neighbors: 4 - frequent_query: False # for the current token generation, frequently update the retrieval context. If false, update it every 64 tokens - pad_tokens: True # pad the tokens at the beginning to make it minimum of 64 tokens for retrieving at least once - store_retrieved: False # whether store the retrieved documents, so it can be checked - combo_service: - service_ip: '0.0.0.0' - service_port: 17181 \ No newline at end of file +# RETRO inference +prompt: "sample prompt" +neighbors: + - "neighbor text 1" + - "neighbor text 2" \ No newline at end of file diff --git a/examples/nlp/language_modeling/conf/megatron_retro_inference_legacy.yaml b/examples/nlp/language_modeling/conf/megatron_retro_inference_legacy.yaml new file mode 100644 index 000000000000..83d88339b30b --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_retro_inference_legacy.yaml @@ -0,0 +1,46 @@ +# (This inferencing script for native NeMo RETRO will be soon deprecated. For new inferencing script for mcore RETRO, see ./megatron_retro_inference.yaml) + +inference: + greedy: False # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. 
+ temperature: 1.0 # sampling temperature + add_BOS: True # add the bos token at the begining of the prompt + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. + compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: 16 # 16, 32, or bf16 + +inference_batch_size: 2 +tensor_model_parallel_size: -1 +pipeline_model_parallel_size: -1 +pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) +retro_model_file: null # RETRO nemo file path + +use_predict_method: False # whether to use the predict method + +prompts: # prompts for RETRO model inference + - "hello," + - "good morning," + - "good afternoon," + - "good evening," + +########### Faiss service parameters ######## +retrieval_service: + strategy: RetroModelTextGenerationStrategy # choose customized inference strategy + neighbors: 4 + frequent_query: False # for the current token generation, frequently update the retrieval context. If false, update it every 64 tokens + pad_tokens: True # pad the tokens at the beginning to make it minimum of 64 tokens for retrieving at least once + store_retrieved: False # whether store the retrieved documents, so it can be checked + combo_service: + service_ip: '0.0.0.0' + service_port: 17181 \ No newline at end of file diff --git a/examples/nlp/language_modeling/conf/megatron_retro_qatask.yaml b/examples/nlp/language_modeling/conf/megatron_retro_qatask.yaml new file mode 100644 index 000000000000..a68d11e77087 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_retro_qatask.yaml @@ -0,0 +1,40 @@ +inference: + greedy: False # Whether or not to use sampling ; use greedy decoding otherwise + top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation. + temperature: 1.0 # sampling temperature + add_BOS: False # add the bos token at the begining of the prompt + tokens_to_generate: 30 # The minimum length of the sequence to be generated. + all_probs: False # whether return the log prob for all the tokens in vocab + repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty. + min_tokens_to_generate: 0 # The minimum length of the sequence to be generated. 
+ compute_logprob: False # a flag used to compute logprob of all the input text, a very special case of running inference, default False + end_strings: ["<|endoftext|>"] # generation will stop when one of these tokens is generated + # RETRO-specific arguments + retro_inference: + retro_gpt_retrieved_length: 128 + retro_num_neighbors: 2 + ft_neighbours: 0 + reuse_top: False + +trainer: + devices: 1 + num_nodes: 1 + accelerator: gpu + logger: False # logger provided by exp_manager + precision: 32 # 16, 32, or bf16 + use_distributed_sampler: False + +tensor_model_parallel_size: -1 +pipeline_model_parallel_size: -1 +pipeline_model_parallel_split_rank: -1 # used for encoder and decoder model (0 for others) +megatron_amp_O2: False # Enable O2-level automatic mixed precision to save memory + +retro_model_file: null # Retro nemo file path +checkpoint_dir: null # checkpoint file dir. This is used to load the PTL checkpoint generated during the Retro training +checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading +hparams_file: null # model configuration file, only used for PTL checkpoint loading + +# qa tasks +qa_file_path: null +pred_file_path: null diff --git a/examples/nlp/language_modeling/megatron_retro_eval.py b/examples/nlp/language_modeling/megatron_retro_eval.py index 9978bab78bfc..89e3fe9c3ddb 100644 --- a/examples/nlp/language_modeling/megatron_retro_eval.py +++ b/examples/nlp/language_modeling/megatron_retro_eval.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,128 +12,119 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import os -from examples.nlp.language_modeling.megatron_gpt_eval import RequestDataSet -from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from torch.utils.data import DataLoader +import torch +from omegaconf import OmegaConf +from pytorch_lightning.trainer.trainer import Trainer +from torch.utils.data import DataLoader, Dataset -from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel -from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam -from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy from nemo.core.config import hydra_runner - -try: - from megatron.core import parallel_state - - HAVE_MEGATRON_CORE = True - -except (ImportError, ModuleNotFoundError): - - HAVE_MEGATRON_CORE = False +from nemo.utils.app_state import AppState +from nemo.utils.model_utils import inject_model_parallel_rank """ -This is the script to run RETRO Model text generation. +This is the script to run Retro text generation. Usage: - Assume the model has TP=1, PP=1 - run greedy inference from a nemo file: + Currently, Mcore-based RETRO only support batch-size of 1. 
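(Editor's note, not part of the patch: a minimal sketch of how the new cfg.prompt and cfg.neighbors fields from megatron_retro_inference.yaml feed one prediction request; the values are the placeholders from that config and the wiring mirrors main() and RequestDataSet defined later in this script.)

    prompt = ["sample prompt"]                              # cfg.prompt, wrapped in a single-element list
    neighbors = [["neighbor text 1", "neighbor text 2"]]    # cfg.neighbors, retro_num_neighbors entries
    ds = RequestDataSet(prompt, neighbors)                  # dataset class added below in this patch
    ds[0]  # -> {'prompts': 'sample prompt', 'neighbors': ['neighbor text 1', 'neighbor text 2']}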
+ Example running greedy inference from a distributed checkpoint dir: python megatron_retro_eval.py \ + checkpoint_dir=PATH_TO_CHECKPOINT \ + checkpoint_name=CHECKPOINT_NAME \ + inference.greedy=True \ + inference.add_BOS=False \ trainer.devices=1 \ trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.precision=16 \ - inference.tokens_to_generate=128 \ - inference.greedy=True \ - retro_model_file=path_to_retro_nemo_file \ tensor_model_parallel_size=-1 \ pipeline_model_parallel_size=-1 \ - retrieval_service.faiss_devices='0' \ - retrieval_service.faiss_index=path_to_faiss_index \ - retrieval_service.retrieval_index=path_to_retrieval_dataset \ - retrieval_service.neighbors=20 -""" + prompt="sample prompt" \ + inference.retro_inference.retro_num_neighbors=2 \ + neighbors=["neighbor text 1", "neighbor text 2"] -@hydra_runner(config_path="conf", config_name="megatron_retro_inference") -def main(cfg) -> None: - trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) + ``` +""" - model_path = cfg.retro_model_file +if not torch.cuda.is_available(): + raise EnvironmentError("GPU is needed for the inference") - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(model_path): - save_restore_connector.model_extracted_dir = model_path +class RequestDataSet(Dataset): + def __init__(self, sentences, neighbors): + super().__init__() + self.sentences = sentences + self.neighbors = neighbors - model_cfg = MegatronRetrievalModel.restore_from( - model_path, trainer=trainer, return_config=True, save_restore_connector=save_restore_connector, - ) + def __len__(self,): + return len(self.sentences) - with open_dict(model_cfg): - model_cfg.precision = trainer.precision - model_cfg.sequence_parallel = False - model_cfg.activations_checkpoint_granularity = None - model_cfg.activations_checkpoint_method = None - - if ( - cfg.tensor_model_parallel_size < 0 - or cfg.pipeline_model_parallel_size < 0 - or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 - ): - with open_dict(cfg): - cfg.tensor_model_parallel_size = model_cfg.get('tensor_model_parallel_size', 1) - cfg.pipeline_model_parallel_size = model_cfg.get('pipeline_model_parallel_size', 1) - cfg.pipeline_model_parallel_split_rank = model_cfg.get('pipeline_model_parallel_split_rank', 0) - - model = MegatronRetrievalModel.restore_from( - model_path, trainer=trainer, save_restore_connector=save_restore_connector, override_config_path=model_cfg, - ) + def __getitem__(self, idx): + return {'prompts': self.sentences[idx], 'neighbors': self.neighbors[idx]} - length_params: LengthParam = { - "max_length": cfg.inference.tokens_to_generate, - "min_length": cfg.inference.min_tokens_to_generate, - } - sampling_params: SamplingParam = { - "use_greedy": cfg.inference.greedy, - "temperature": cfg.inference.temperature, - "top_k": cfg.inference.top_k, - "top_p": cfg.inference.top_p, - "repetition_penalty": cfg.inference.repetition_penalty, - "add_BOS": cfg.inference.add_BOS, - "all_probs": cfg.inference.all_probs, - "compute_logprob": cfg.inference.compute_logprob, - } +@hydra_runner(config_path="conf", config_name="megatron_retro_inference") +def main(cfg) -> None: + + # trainer required for restoring model parallel models + trainer = Trainer( + strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)), + **cfg.trainer, + callbacks=[CustomProgressBar()], + ) - # check whether the DDP is initialized - if not parallel_state.is_initialized(): + if cfg.checkpoint_dir: + app_state = AppState() + if cfg.tensor_model_parallel_size > 1 or 
cfg.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, + ) + checkpoint_path = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name) + # checkpoint_path is a dir in case of distributed checkpointing + if not os.path.isdir(checkpoint_path): + # legacy checkpoint needs model parallel rank injection + checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) + model = MegatronRetroModel.load_from_checkpoint( + checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer + ) + else: + raise ValueError("Requiring distributed checkpoint dir for loading Mcore RETRO.") - def dummy(): - return + model.freeze() - if model.trainer.strategy.launcher is not None: - model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) - model.trainer.strategy.setup_environment() + # Have to turn off activations_checkpoint_method for inference + try: + model.model.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + prompt = [cfg.prompt] + neighbors = [cfg.neighbors] + ds = RequestDataSet(prompt, neighbors) + bs = 1 + request_dl = DataLoader(dataset=ds, batch_size=bs) config = OmegaConf.to_container(cfg.inference) - retrieval_service = OmegaConf.to_container(cfg.retrieval_service) - model.set_inference_config(config, retrieval_service) - - if not cfg.use_predict_method: - # First method of running text generation, call model.generate method - response = model.generate( - inputs=OmegaConf.to_container(cfg.prompts), - length_params=length_params, - sampling_params=sampling_params, - strategy=model.inference_strategy, - ) - else: - # Second method of running text generation, call trainer.predict - ds = RequestDataSet(OmegaConf.to_container(cfg.prompts)) - request_dl = DataLoader(dataset=ds, batch_size=cfg.inference_batch_size) - response = trainer.predict(model, request_dl) + model.set_inference_config(config) + + response = trainer.predict(model, request_dl) print("***************************") print(response) diff --git a/examples/nlp/language_modeling/megatron_retro_eval_legacy.py b/examples/nlp/language_modeling/megatron_retro_eval_legacy.py new file mode 100644 index 000000000000..69222acedd34 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_retro_eval_legacy.py @@ -0,0 +1,145 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from examples.nlp.language_modeling.megatron_gpt_eval import RequestDataSet +from omegaconf.omegaconf import OmegaConf, open_dict +from pytorch_lightning import Trainer +from torch.utils.data import DataLoader + +from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel +from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.core.config import hydra_runner + +try: + from megatron.core import parallel_state + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + +""" +This is the script to run RETRO Model text generation. +(This inferencing script for native NeMo RETRO will be soon deprecated. For new inferencing script for mcore RETRO, see ./megatron_retro_eval.py) + +Usage: + Assume the model has TP=1, PP=1 + run greedy inference from a nemo file: + python megatron_retro_eval.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.accelerator=gpu \ + trainer.precision=16 \ + inference.tokens_to_generate=128 \ + inference.greedy=True \ + retro_model_file=path_to_retro_nemo_file \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + retrieval_service.faiss_devices='0' \ + retrieval_service.faiss_index=path_to_faiss_index \ + retrieval_service.retrieval_index=path_to_retrieval_dataset \ + retrieval_service.neighbors=20 +""" + + +@hydra_runner(config_path="conf", config_name="megatron_retro_inference_legacy") +def main(cfg) -> None: + trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer) + + model_path = cfg.retro_model_file + + save_restore_connector = NLPSaveRestoreConnector() + + if os.path.isdir(model_path): + save_restore_connector.model_extracted_dir = model_path + + model_cfg = MegatronRetrievalModel.restore_from( + model_path, trainer=trainer, return_config=True, save_restore_connector=save_restore_connector, + ) + + with open_dict(model_cfg): + model_cfg.precision = trainer.precision + model_cfg.sequence_parallel = False + model_cfg.activations_checkpoint_granularity = None + model_cfg.activations_checkpoint_method = None + + if ( + cfg.tensor_model_parallel_size < 0 + or cfg.pipeline_model_parallel_size < 0 + or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 + ): + with open_dict(cfg): + cfg.tensor_model_parallel_size = model_cfg.get('tensor_model_parallel_size', 1) + cfg.pipeline_model_parallel_size = model_cfg.get('pipeline_model_parallel_size', 1) + cfg.pipeline_model_parallel_split_rank = model_cfg.get('pipeline_model_parallel_split_rank', 0) + + model = MegatronRetrievalModel.restore_from( + model_path, trainer=trainer, save_restore_connector=save_restore_connector, override_config_path=model_cfg, + ) + + length_params: LengthParam = { + "max_length": cfg.inference.tokens_to_generate, + "min_length": cfg.inference.min_tokens_to_generate, + } + + sampling_params: SamplingParam = { + "use_greedy": cfg.inference.greedy, + "temperature": cfg.inference.temperature, + 
"top_k": cfg.inference.top_k, + "top_p": cfg.inference.top_p, + "repetition_penalty": cfg.inference.repetition_penalty, + "add_BOS": cfg.inference.add_BOS, + "all_probs": cfg.inference.all_probs, + "compute_logprob": cfg.inference.compute_logprob, + } + + # check whether the DDP is initialized + if parallel_state.is_unitialized(): + + def dummy(): + return + + if model.trainer.strategy.launcher is not None: + model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer) + model.trainer.strategy.setup_environment() + + config = OmegaConf.to_container(cfg.inference) + retrieval_service = OmegaConf.to_container(cfg.retrieval_service) + model.set_inference_config(config, retrieval_service) + + if not cfg.use_predict_method: + # First method of running text generation, call model.generate method + response = model.generate( + inputs=OmegaConf.to_container(cfg.prompts), + length_params=length_params, + sampling_params=sampling_params, + strategy=model.inference_strategy, + ) + else: + # Second method of running text generation, call trainer.predict + ds = RequestDataSet(OmegaConf.to_container(cfg.prompts)) + request_dl = DataLoader(dataset=ds, batch_size=cfg.inference_batch_size) + response = trainer.predict(model, request_dl) + + print("***************************") + print(response) + print("***************************") + + +if __name__ == '__main__': + main() diff --git a/examples/nlp/language_modeling/megatron_retro_qatask_eval.py b/examples/nlp/language_modeling/megatron_retro_qatask_eval.py new file mode 100644 index 000000000000..b99bcafbab02 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_retro_qatask_eval.py @@ -0,0 +1,217 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import json +import os + +import torch +from omegaconf import OmegaConf +from pytorch_lightning.trainer.trainer import Trainer +from torch.utils.data import DataLoader, Dataset + +from nemo.collections.nlp.data.question_answering.input_example.qa_input_example import QAExample +from nemo.collections.nlp.metrics.qa_metrics import QAMetrics +from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.parts.nlp_overrides import CustomProgressBar, NLPDDPStrategy +from nemo.core.config import hydra_runner +from nemo.utils.app_state import AppState +from nemo.utils.model_utils import inject_model_parallel_rank + +""" +This is the script to run Retro text generation for QA tasks, such as NQ, TQA. + +Usage: + Currently, Mcore-based RETRO only support batch-size of 1. 
+ Run greedy qa task inference from a distributed checkpoint dir: + python megatron_retro_eval.py \ + checkpoint_dir=PATH_TO_CHECKPOINT \ + checkpoint_name=CHECKPOINT_NAME \ + inference.greedy=True \ + inference.add_BOS=False \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + tensor_model_parallel_size=-1 \ + pipeline_model_parallel_size=-1 \ + inference.retro_inference.retro_num_neighbors=2 \ + qa_file_path=PATH_TO_QAFILE"\ + pred_file_path =PATH_TO_PREDFILE ""\ + + + ``` +""" + +if not torch.cuda.is_available(): + raise EnvironmentError("GPU is needed for the inference") + + +class RequestDataSet(Dataset): + def __init__(self, sentences, neighbors): + super().__init__() + self.sentences = sentences + self.neighbors = neighbors + + def __len__(self,): + return len(self.sentences) + + def __getitem__(self, idx): + return {'prompts': self.sentences[idx], 'neighbors': self.neighbors[idx]} + + +def process_qasample(sample, retro_num_neighbors=2, ft_neighbours=5): + # process prompt + question = sample['question'] + if not question.endswith("?"): + question = question + "?" + processed_prompt = "Question: {} Answer: The answer is".format(question) + + # process neighbors + neighbors = sample['ctxs'] + neighbors = ["title: " + ctx["title"] + ", source: " + ctx["text"] for ctx in neighbors] + processed_neighbors = neighbors[:retro_num_neighbors] + + # # concate neighbors to prompt + if ft_neighbours > 0: + contexts = "\n\n".join(neighbors[:ft_neighbours]) + "\n\n" + processed_prompt = contexts + processed_prompt + + return processed_prompt, processed_neighbors + + +def process_qaresponse(response): + prediction = response.split("The answer is")[1] + # truncate text + prediction = prediction.split(".")[0] + prediction = prediction.split("\n")[0] + prediction = prediction.split("\n\n")[0] + return prediction + + +@hydra_runner(config_path="conf", config_name="megatron_retro_qatask") +def main(cfg) -> None: + + # trainer required for restoring model parallel models + trainer = Trainer( + strategy=NLPDDPStrategy(timeout=datetime.timedelta(seconds=18000)), + **cfg.trainer, + callbacks=[CustomProgressBar()], + ) + + if cfg.checkpoint_dir: + app_state = AppState() + if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1: + app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size + app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size + app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size + ( + app_state.tensor_model_parallel_rank, + app_state.pipeline_model_parallel_rank, + app_state.model_parallel_size, + app_state.data_parallel_size, + app_state.pipeline_model_parallel_split_rank, + app_state.virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel( + world_size=app_state.model_parallel_size, + rank=trainer.global_rank, + tensor_model_parallel_size_=cfg.tensor_model_parallel_size, + pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size, + pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank, + ) + checkpoint_path = os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name) + # checkpoint_path is a dir in case of distributed checkpointing + if not os.path.isdir(checkpoint_path): + # legacy checkpoint needs model parallel rank injection + checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name)) + model = MegatronRetroModel.load_from_checkpoint( + checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer + ) + 
else: + raise ValueError("Requiring distributed checkpoint dir for loading Mcore RETRO.") + + model.freeze() + + # Have to turn off activations_checkpoint_method for inference + try: + model.model.language_model.encoder.activations_checkpoint_method = None + except AttributeError: + pass + + # Reading QA data files + qa_samples = [] + with open(cfg.qa_file_path, 'r', encoding='utf-8') as f: + qa_samples = json.load(f) + + # Processing prompts and neighbors + prompts = [] + neighbors = [] + ground_truths = [] + for sample in qa_samples: + processed_prompt, processed_neighbors = process_qasample( + sample, cfg.inference.retro_inference.retro_num_neighbors, cfg.inference.retro_inference.ft_neighbours + ) + prompts.append(processed_prompt) + neighbors.append(processed_neighbors) + ground_truths.append( + sample['answers'][0] + ) # Boxin only takes the first value of sample['answers'] (https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/blob/boxin/instructretro-internal-test/tools/retro/text_generation/evaluate.py?ref_type=heads#L85) + + # Running prediction + bs = 1 + ds = RequestDataSet(prompts, neighbors) + request_dl = DataLoader(dataset=ds, batch_size=bs) + config = OmegaConf.to_container(cfg.inference) + model.set_inference_config(config) + response = trainer.predict(model, request_dl) + + # Generating answers + print("***************************") + with open(cfg.pred_file_path, "w", encoding="utf-8") as pred_file: + for i in range(len(response)): + for sent in response[i]["sentences"]: + sent = sent.strip() + sent = sent.replace("\n", " ") + pred_file.write(sent + "\n") + for neighbor in neighbors[i]: + neighbor = neighbor.replace("\n", " ") + neighbor = "Neighbor: " + neighbor + pred_file.write(neighbor + "\n") + pred_file.write("---------\n") + print(f"Inference Complete, prediction file saved at {cfg.pred_file_path}") + print("***************************") + + # Compute metrics + predictions = [process_qaresponse(response[i]["sentences"][0]) for i in range(len(response))] + formatted_ground_truths = [] + formatted_predictions = [] + for i in range(len(predictions)): # formatting to use NeMo's QAMetrics methods + question_id = i + qaexample = QAExample( + qas_id=question_id, + answers=[{'text': ground_truths[i]}], + question_text="", + context_text="", + context_id="", + answer_text="", + start_position_character="", + title="", + ) + formatted_ground_truths.append(qaexample) + formatted_predictions.append(predictions[i]) + eval_results = QAMetrics.evaluate_predictions(formatted_ground_truths, formatted_predictions) + print("Eval_results: ", eval_results) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py index 8cc39056554c..377ccbee163b 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py @@ -219,10 +219,6 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] inference_config = self.get_inference_config() - if torch.distributed.get_rank() == 0: - logging.info("inference_config: ") - logging.info(inference_config) - if inference_config is None: return None else: @@ -359,9 +355,9 @@ def get_forward_output_only_func(self): def fwd_output_only_func(dataloader_iter, model): batch = next(dataloader_iter) extra_arg = {} - if len(batch) == 5: + if len(batch) == 6: batch = [x.cuda() for x in batch] - tokens, 
attention_mask, position_ids, context_input_ids, context_position_ids, context_mask = batch + tokens, attention_mask, position_ids, context_input_ids, context_mask, context_position_ids = batch attention_mask = attention_mask[0:1] else: ( @@ -369,26 +365,21 @@ def fwd_output_only_func(dataloader_iter, model): attention_mask, position_ids, context_input_ids, - context_position_ids, context_mask, + context_position_ids, set_inference_key_value_memory, inference_max_sequence_len, ) = batch + # Transfer needed data to GPU tokens = tokens.cuda() position_ids = position_ids.cuda() - if attention_mask is not None: - attention_mask = attention_mask.cuda() - attention_mask = attention_mask[0:1] context_input_ids = context_input_ids.cuda() context_position_ids = context_position_ids.cuda() context_mask = None if self.mcore_gpt: - # if first step, then clear KV cache, otherwise reuse inference_paarms - if set_inference_key_value_memory[0].item(): - self.inference_params = InferenceParams( - max_batch_size=tokens.size(0), max_sequence_length=inference_max_sequence_len[0].item() - ) - extra_arg['inference_params'] = self.inference_params + # No caching key, value because currently it's not supported for mcore RETRO in NeMo + pass + else: extra_arg['set_inference_key_value_memory'] = set_inference_key_value_memory[0].item() extra_arg['inference_max_sequence_len'] = inference_max_sequence_len[0].item() diff --git a/nemo/collections/nlp/modules/common/text_generation_strategy.py b/nemo/collections/nlp/modules/common/text_generation_strategy.py index 3abfda2a5e44..e29bb3423c4a 100644 --- a/nemo/collections/nlp/modules/common/text_generation_strategy.py +++ b/nemo/collections/nlp/modules/common/text_generation_strategy.py @@ -22,6 +22,7 @@ import torch from nemo.collections.nlp.modules.common.lm_utils import pad_batch +from nemo.collections.nlp.modules.common.megatron.module import Float16Module from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids try: @@ -34,6 +35,8 @@ try: from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.identity_op import IdentityOp + from megatron.core.transformer.module import Float16Module as MCoreFloat16Module HAVE_MEGATRON_CORE = True @@ -593,6 +596,225 @@ def post_process(self, tokens: torch.Tensor, new_tokens: torch.Tensor, context_l tokens[:, :context_length][(tokens[:, :context_length] >= pseudo_token_ids_start)] = tokenizer.unk_id +class McoreRetroModelTextGenerationStrategy(TextGenerationStrategy): + def __init__(self, model): + super().__init__(model) + self.forward_model = self.model.model + + def clip_max_len(self, maxlen: int) -> int: + """ clip the max len based on the LM model max sequence length""" + + # for positional embedding types that allow length extrapolation, don't clip the max length + if self.model.cfg.get("position_embedding_type", "learned_absolute") == "learned_absolute": + if maxlen > self.model.cfg.encoder_seq_length + 1: + maxlen = self.model.cfg.encoder_seq_length + 1 + return maxlen + + def tokenize_batch(self, sentences, max_len, add_BOS): + """ + convert the sentences into lists of tokens, pad them to the same length, add bos tokens if it is needed + Args: + sentences (List[str]): list of input sentences in str format. + max_len (int): max number of tokens to generate. + add_BOS (bool): whether to add the BOS token at the beginning + Returns: + Tuple[torch.Tensor], the tokenized and padded torch tensor and the token context length tensor. 
+ """ + tokenizer = self.model.tokenizer + if add_BOS: + context_tokens = [[tokenizer.bos_id] + tokenizer.text_to_ids(s) for s in sentences] + else: + context_tokens = [tokenizer.text_to_ids(s) for s in sentences] + + # attention, not pad_batch, padding will be done at init_batch + context_tokens, context_lengths = pad_batch(batch=context_tokens, pad_id=tokenizer.eos_id, max_len=0) + + context_tokens_tensor = torch.cuda.LongTensor(context_tokens) + context_length_tensor = torch.cuda.LongTensor(context_lengths) + return context_tokens_tensor, context_length_tensor + + def tokenize_neighbors_batch(self, neighbors, retro_args): + tokenizer = self.model.tokenizer + r = retro_args['retro_gpt_retrieved_length'] + retro_num_neighbors = retro_args['retro_num_neighbors'] + ft_neighbours = retro_args['ft_neighbours'] + reuse_top = retro_args['reuse_top'] + + padded_valid_neighbours_tokens = [] + for i in range(len(neighbors)): + onesample_neighbors = neighbors[i] + + # tokenize neighbors + onesample_neighbors_tokens = [] + for neighbor in onesample_neighbors: + onesample_neighbors_tokens.append(tokenizer.text_to_ids(neighbor)) + + # take top k neighbours + if reuse_top: + valid_onesample_neighbours_tokens = onesample_neighbors_tokens[:retro_num_neighbors] + else: + valid_onesample_neighbours_tokens = onesample_neighbors_tokens[ + ft_neighbours : retro_num_neighbors + ft_neighbours + ] + + # pad neighbors + padded_valid_onesample_neighbours_tokens = [] + for neighbour_tokens in valid_onesample_neighbours_tokens: + if len(neighbour_tokens) >= r: + padded_onesample_neighbour_tokens = neighbour_tokens[:r] + else: + padded_onesample_neighbour_tokens = neighbour_tokens + [tokenizer.eos_id] * ( + r - len(neighbour_tokens) + ) + padded_valid_onesample_neighbours_tokens.append(padded_onesample_neighbour_tokens) + + # check if have enough neighbors + if len(padded_valid_onesample_neighbours_tokens) < retro_num_neighbors: + assert ValueError("neighbours are not enough, add empty ones and create mask for those empty ones") + + # append to batch + padded_valid_neighbours_tokens.append(padded_valid_onesample_neighbours_tokens) + + # cast to torch tensor + padded_valid_neighbours_tokens = torch.cuda.LongTensor(padded_valid_neighbours_tokens) + padded_valid_neighbours_tokens_shape = torch.cuda.LongTensor(padded_valid_neighbours_tokens.shape) + + return padded_valid_neighbours_tokens, padded_valid_neighbours_tokens_shape + + def init_batch(self, context_tokens: torch.Tensor, context_length: int, compute_attention_mask: bool, **extra): + """initialize the batch data before the inference steps.""" + + # For Mcore retrieval RETRO model, modify tokens and neighbors to set them into 2 chunks, one for question, and one for answer, both having the same length of context_tokens.shape[1] + bs, context_tokens_length = context_tokens.shape + assert bs == 1 # similar to M-LM RETRO inference code, currently only support batch_size=1 + context_tokens = [context_tokens[0].tolist() + [self.model.tokenizer.eos_id] * context_tokens_length] + context_tokens = torch.cuda.LongTensor(context_tokens) + self.model.model.config.retro_gpt_chunk_length = context_tokens_length # set RetroConfig of M-LM's RETRO model + # reshape tensor extra['neighbors_tokens'] (currently: [k, 1, r]) to [bs, l, k, r] + neighbors_tokens = extra['neighbors_tokens'] + neighbors_tokens = neighbors_tokens.permute(1, 0, 2) + neighbors_tokens = neighbors_tokens.unsqueeze(0) + # duplicate into 2 chunks from [bs, l, k ,r] to [bs, 2*l, k ,r] + neighbors_tokens = 
neighbors_tokens.repeat(1, 2, 1, 1) + + # Move to GPU. + tokenizer = self.model.tokenizer + tokens = context_tokens.contiguous().cuda() + neighbors_tokens = neighbors_tokens.contiguous().cuda() + + # Get the attention mask and postition ids. + self.attention_mask, _, self.position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eos_id, + self.model.cfg.get('reset_position_ids', False), + self.model.cfg.get('reset_attention_mask', False), + self.model.cfg.get('eod_mask_loss', False), + compute_attention_mask=compute_attention_mask, + ) + + # Get the attention mask and postition ids for neighbors (retro_generation.retro_generate_tokens_probs_and_return_on_first_stage) + # Reshape neighbors_tokens tensor to 2D for get_ltor_masks_and_position_ids and as forward arg of RETRO model, original shape is 3D ([bs, k, r]) + [bs, l, k, r] = neighbors_tokens.shape + neighbors_tokens = neighbors_tokens.view(-1, r).long() + + _, _, self.neighbor_position_ids = get_ltor_masks_and_position_ids( + neighbors_tokens, + tokenizer.eos_id, + self.model.cfg.get('reset_position_ids', False), + self.model.cfg.get('reset_attention_mask', False), + self.model.cfg.get('eod_mask_loss', False), + ) + self.neighbor_attention_mask = torch.zeros( + [1, 1] + ) # dummy value, since the batch neighbor_attention_mask will be set to None in megatron_retro_model.py in Mcore implementation + self.neighbors_tokens = neighbors_tokens + + # For Mcore retrieval RETRO model, following ADLR's Mcore RETRO inferencing implementation, updating the arguments inside RETRO model (retro_num_neighbors, retro_chunk_length) with the inference's sample + inference_retro_num_neighbors = k + inference_retro_chunk_length = context_tokens_length + inference_retro_retrieved_length = r + self.forward_model.config.retro_num_neighbors = inference_retro_num_neighbors + self.forward_model.config.retro_chunk_length = inference_retro_chunk_length + self.forward_model.config.retro_retrieved_length = inference_retro_retrieved_length + contain_encoder = True + if isinstance(self.forward_model, (Float16Module, MCoreFloat16Module)): + layers = self.forward_model.module.decoder.layers + else: + layers = self.forward_model.decoder.layers + for layer in layers: + if not (isinstance(layer.cross_attention, IdentityOp)): # if this is encoder-decoder cross-attention layer + # updating RetroDecoder (RetroDecoderCrossAttention, RetroDecoderBiasDropoutAdd) + layer.cross_attention.retro_num_neighbors = inference_retro_num_neighbors + layer.cross_attention.retro_chunk_length = inference_retro_chunk_length + layer.cross_attention.retro_retrieved_length = inference_retro_retrieved_length + layer.cross_attn_bda.retro_chunk_length = inference_retro_chunk_length + + # updating RetroEncoder (RetroEncoderCrossAttention, RetroEncoderBiasDropoutAdd, RetroEncoderLayerNorm) + if contain_encoder: # the first cross-attention decoder layer contain encoder + layer.cross_attention.encoder.layers[ + 0 + ].cross_attention.retro_num_neighbors = inference_retro_num_neighbors + layer.cross_attention.encoder.layers[ + 0 + ].cross_attention.retro_chunk_length = inference_retro_chunk_length + layer.cross_attention.encoder.layers[ + 0 + ].cross_attention.retro_retrieved_length = inference_retro_retrieved_length + layer.cross_attention.encoder.layers[ + 0 + ].cross_attn_bda.retro_num_neighbors = inference_retro_num_neighbors + layer.cross_attention.encoder.layers[ + 0 + ].pre_mlp_layernorm.retro_num_neighbors = inference_retro_num_neighbors + contain_encoder = False + + return 
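The tensor reshaping performed in `init_batch` above is easiest to follow with toy sizes; the following sketch reproduces only the shape changes (k neighbors, retrieved length r, batch size 1).

.. code-block:: python

    import torch

    k, r = 2, 4
    neighbors_tokens = torch.zeros(k, 1, r, dtype=torch.long)  # [k, 1, r] as passed in

    x = neighbors_tokens.permute(1, 0, 2)  # -> [1, k, r]
    x = x.unsqueeze(0)                     # -> [1, 1, k, r], i.e. [bs, l, k, r] with l = 1
    x = x.repeat(1, 2, 1, 1)               # -> [1, 2, k, r]: one chunk for the question, one for the answer
    flat = x.view(-1, r)                   # -> [2 * k, r], the 2-D layout used for masks and position ids

    print(x.shape, flat.shape)  # torch.Size([1, 2, 2, 4]) torch.Size([4, 4])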
context_tokens + + def prepare_batch_at_step( + self, + tokens: torch.Tensor, + maxlen: int, + micro_batch_size: int, + step: int, + context_length: int, + compute_attention_mask: bool = True, + **extra, + ) -> Tuple[List[torch.Tensor], List[int]]: + """ + generate the batch used in inference for each of the steps + """ + + # For Mcore retrieval RETRO model, currently not support memory caching, always allocate memory for the entire context + # Allocate memory for the entire context. + set_inference_key_value_memory = True + tokens2use = tokens + positions2use = self.position_ids + attention_mask2use = self.attention_mask + + """Prepare batch for each of the inference steps""" + attention_mask_repeat = None + if compute_attention_mask: + attention_mask_repeat = torch.concat([attention_mask2use for _ in range(micro_batch_size)]) + + setkey_value_array = torch.tensor( + [set_inference_key_value_memory] * micro_batch_size, device=torch.cuda.current_device() + ) + len_array = torch.tensor([maxlen] * micro_batch_size, device=torch.cuda.current_device()) + + batch = [ + tokens2use, + attention_mask_repeat, + positions2use, + self.neighbors_tokens, + self.neighbor_attention_mask, + self.neighbor_position_ids, + setkey_value_array, + len_array, + ] + tensor_shape = [tokens2use.shape[1], micro_batch_size, self.model.cfg.hidden_size] + return batch, tensor_shape + + def model_inference_strategy_dispatcher(model, **args): from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import MegatronNevaModel from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -600,6 +822,7 @@ def model_inference_strategy_dispatcher(model, **args): MegatronGPTPromptLearningModel, ) from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel + from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel from nemo.collections.nlp.modules.common.retro_inference_strategies import ( RetroFileQAModelTextGenerationStrategy, RetroModelTextGenerationStrategy, @@ -610,7 +833,7 @@ def model_inference_strategy_dispatcher(model, **args): return NevaModelTextGenerationStrategy(model) if isinstance(model, MegatronGPTPromptLearningModel): return PromptLearningModelTextGenerationStrategy(model, **args) - elif isinstance(model, MegatronGPTModel): + elif isinstance(model, MegatronGPTModel) and not (isinstance(model, MegatronRetroModel)): return GPTModelTextGenerationStrategy(model) elif isinstance(model, MegatronRetrievalModel): strategy_name = args['strategy'] @@ -625,6 +848,8 @@ def model_inference_strategy_dispatcher(model, **args): return RetroFileQAModelTextGenerationStrategy(model, **args) else: raise ValueError(f'{strategy_name} is not supported for inference') + elif isinstance(model, MegatronRetroModel): + return McoreRetroModelTextGenerationStrategy(model) else: raise ValueError(f'{model} is not supported for inference') diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py index 3daf93ac0ed2..d130322404b6 100644 --- a/nemo/collections/nlp/modules/common/text_generation_utils.py +++ b/nemo/collections/nlp/modules/common/text_generation_utils.py @@ -473,6 +473,7 @@ def synced_generate( end_strings=[], min_tokens_to_generate=0, image_list=None, + **strategy_args, ): context_length = context_length_tensor.min().item() tokenizer = model.tokenizer @@ -488,6 +489,19 @@ def synced_generate( temperature=temperature, ) 
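As an illustration of how `prepare_batch_at_step` above replicates the shared tensors across the micro batch, a toy version of the mask and flag expansion might look like this; shapes and values are illustrative only.

.. code-block:: python

    import torch

    micro_batch_size, seq_len = 2, 8
    attention_mask = torch.tril(torch.ones(1, 1, seq_len, seq_len)).bool()  # [1, 1, s, s]

    # One mask copy per sample in the micro batch, plus per-sample scalar flags.
    attention_mask_repeat = torch.concat([attention_mask for _ in range(micro_batch_size)])
    setkey_value_array = torch.tensor([True] * micro_batch_size)
    len_array = torch.tensor([seq_len] * micro_batch_size)

    print(attention_mask_repeat.shape)  # torch.Size([2, 1, 8, 8])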
else: + + extra = { + "top_p": top_p, + "top_k": top_k, + "greedy": greedy, + "repetition_penalty": repetition_penalty, + "min_tokens_to_generate": min_tokens_to_generate, + } + + # if input containing neighbors (for Mcore retrieval RETRO model) + if "neighbors_tokens" in strategy_args: + extra['neighbors_tokens'] = strategy_args['neighbors_tokens'] + batch_token_iterator = sample_sequence_batch( model, inference_strategy, @@ -500,13 +514,7 @@ def synced_generate( temperature=temperature, end_strings=end_strings, image_list=image_list, - extra={ - "top_p": top_p, - "top_k": top_k, - "greedy": greedy, - "repetition_penalty": repetition_penalty, - "min_tokens_to_generate": min_tokens_to_generate, - }, + extra=extra, ) for tokens, lengths, output_logits, full_logits in batch_token_iterator: @@ -626,6 +634,22 @@ def generate( end_strings, random_seed, ) + + # tokenize neighbors and broadcast (for Mcore retrieval RETRO model) + if 'neighbors' in strategy_args: + # tokenize neighbors + neighbors_tokens_tensor, neighbors_tokens_tensor_shape = inference_strategy.tokenize_neighbors_batch( + strategy_args['neighbors'], strategy_args['retro_inference'] + ) + + # send neighbors tensors to all ranks + model_parallel_group = parallel_state.get_model_parallel_group() + src = get_model_parallel_src_rank() + torch.distributed.broadcast(neighbors_tokens_tensor_shape, src, model_parallel_group) + torch.distributed.broadcast(neighbors_tokens_tensor, src, model_parallel_group) + else: + neighbors_tokens_tensor = None + else: ( context_length_tensor, @@ -643,6 +667,27 @@ def generate( random_seed, ) = receive_generate_info() + # receive broadcast (for Mcore retrieval RETRO model) + if 'neighbors' in strategy_args: + # receive neighbors tensors to all ranks + model_parallel_group = parallel_state.get_model_parallel_group() + src = get_model_parallel_src_rank() + neighbors_tokens_tensor_shape = torch.empty(2, dtype=torch.float32, device=torch.cuda.current_device()) + torch.distributed.broadcast(neighbors_tokens_tensor_shape, src, model_parallel_group) + neighbors_tokens_tensor = torch.empty( + neighbors_tokens_tensor_shape[0], + neighbors_tokens_tensor_shape[1], + dtype=torch.int64, + device=torch.cuda.current_device(), + ) + torch.distributed.broadcast(neighbors_tokens_tensor, src, model_parallel_group) + else: + neighbors_tokens_tensor = None + + # add neighbors to strategy_args (for retrieval RETRO model) + if 'neighbors' in strategy_args: + strategy_args['neighbors_tokens'] = neighbors_tokens_tensor + if random_seed is not None: seed_everything(random_seed) @@ -663,6 +708,7 @@ def generate( end_strings=end_strings, min_tokens_to_generate=min_tokens_to_generate, image_list=image_list, + **strategy_args, ) special_tokens = set() if hasattr(tokenizer, 'pad_token') and tokenizer.pad_token is not None: @@ -771,7 +817,15 @@ def sample_sequence_batch( # initialize the batch with torch.no_grad(): context_length = context_lengths.min().item() - inference_strategy.init_batch(context_tokens, context_length, compute_attention_mask) + if 'neighbors_tokens' in extra: # for Mcore retrieval RETRO model + + # For Mcore retrieval RETRO model, context_tokens tensors are updated after init_batch() (the length is doubled after processing) + context_tokens = inference_strategy.init_batch( + context_tokens, context_length, compute_attention_mask, **extra + ) + + else: + inference_strategy.init_batch(context_tokens, context_length, compute_attention_mask) # added eos_id to support the function generate_samples_eval that passes # 
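The neighbor broadcast above follows the common two-step pattern of sending the shape first so that non-source ranks can allocate a buffer of the right size before receiving the payload. A generic sketch, assuming an already initialized process group, is:

.. code-block:: python

    import torch
    import torch.distributed as dist

    def broadcast_2d_long_tensor(tensor, src, group, device):
        # Step 1: broadcast the shape so every rank can size its receive buffer.
        if dist.get_rank() == src:
            shape = torch.tensor(tensor.shape, dtype=torch.int64, device=device)
        else:
            shape = torch.empty(2, dtype=torch.int64, device=device)
        dist.broadcast(shape, src, group=group)

        # Step 2: broadcast the payload into a buffer of that shape.
        if dist.get_rank() != src:
            tensor = torch.empty(int(shape[0]), int(shape[1]), dtype=torch.int64, device=device)
        dist.broadcast(tensor, src, group=group)
        return tensor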
eos_id as an argument and needs termination when that id id found. eod_id = tokenizer.eos_id @@ -809,7 +863,11 @@ def sample_sequence_batch( logits = output[:, -1].view(batch_size, -1).contiguous() else: - logits = output[0]['logits'][:, -1].contiguous() + if 'neighbors_tokens' in extra: # for Mcore retrieval RETRO model + # for Mcore RETRO inference, disimilar to GPT, we will get the logits of the (context_length - 1)th token, instead of the last token + logits = output[0]['logits'][:, context_length - 1].contiguous() + else: + logits = output[0]['logits'][:, -1].contiguous() logits = tensor_parallel.gather_from_tensor_model_parallel_region(logits) assert logits is not None logits = logits.view(batch_size, -1) From d9f20774fd51c90e8a66afff9fb59c37b74cb2f0 Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Tue, 23 Apr 2024 14:30:17 -0700 Subject: [PATCH 09/30] Added fusion for squared relu (#8963) * Added fusion for squared relu Signed-off-by: Selvaraj Anandaraj * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Selvaraj Anandaraj Co-authored-by: Selvaraj Anandaraj Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- nemo/collections/nlp/modules/common/megatron/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nemo/collections/nlp/modules/common/megatron/utils.py b/nemo/collections/nlp/modules/common/megatron/utils.py index 97022ab5e459..48234459453e 100644 --- a/nemo/collections/nlp/modules/common/megatron/utils.py +++ b/nemo/collections/nlp/modules/common/megatron/utils.py @@ -175,6 +175,13 @@ def openai_gelu(x): return gelu_impl(x) +try: + jit_fuser = torch.compile +except: + jit_fuser = torch.jit.script + + +@jit_fuser def squared_relu(x): return torch.pow(torch.nn.functional.relu(x), 2) From b0bb807081623310c8cf0ab9a3052169fd875625 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Wed, 24 Apr 2024 15:48:52 -0400 Subject: [PATCH 10/30] update gemme for trt-llm 0.9 (#8974) Signed-off-by: Onur Yilmaz Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Co-authored-by: Pablo Garay --- nemo/export/trt_llm/decoder/gemma.py | 89 +--------------------------- tests/export/run.sh | 2 +- 2 files changed, 3 insertions(+), 88 deletions(-) diff --git a/nemo/export/trt_llm/decoder/gemma.py b/nemo/export/trt_llm/decoder/gemma.py index 88196a80dd2b..10301c7a47d7 100644 --- a/nemo/export/trt_llm/decoder/gemma.py +++ b/nemo/export/trt_llm/decoder/gemma.py @@ -15,10 +15,8 @@ from typing import Optional from tensorrt_llm.functional import non_gated_version -from tensorrt_llm.layers import Attention, AttentionMaskType, GatedMLP, PositionEmbeddingType, RmsNorm +from tensorrt_llm.models.gemma.model import GemmaDecoderLayer, QuantConfig from tensorrt_llm.models.modeling_utils import PretrainedConfig -from tensorrt_llm.module import Module -from tensorrt_llm.quantization import QuantMode from typing_extensions import override from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder @@ -32,88 +30,6 @@ ) -class GemmaDecoderLayer(Module): - def __init__(self, config, layer_idx): - super().__init__() - self.layer_idx = layer_idx - self.config = config - - self.input_layernorm = RmsNorm( - normalized_shape=config.hidden_size, eps=config.norm_epsilon, dtype=config.dtype - ) - - self.attention = Attention( - hidden_size=config.hidden_size, - 
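The squared-ReLU fusion added above is easy to sanity-check against an unfused reference; a small sketch, independent of the NeMo module, is:

.. code-block:: python

    import torch
    import torch.nn.functional as F

    def squared_relu_reference(x: torch.Tensor) -> torch.Tensor:
        # Unfused reference: relu(x) ** 2, the same math as the fused kernel above.
        return F.relu(x) ** 2

    x = torch.randn(4, 8)
    assert torch.allclose(squared_relu_reference(x), torch.pow(F.relu(x), 2))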
num_attention_heads=config.num_attention_heads, - num_kv_heads=config.num_key_value_heads, - attention_head_size=config.head_size, - max_position_embeddings=config.max_position_embeddings, - dtype=config.dtype, - attention_mask_type=AttentionMaskType.causal, - bias=config.attn_bias, - position_embedding_type=PositionEmbeddingType.rope_gpt_neox, - rotary_embedding_base=config.rotary_base, - rotary_embedding_scaling=config.rotary_scaling, - tp_group=config.mapping.tp_group, - tp_size=config.mapping.tp_size, - quant_mode=config.quant_mode, - ) - - mlp_hidden_size = config.hidden_size * 4 if config.intermediate_size is None else config.intermediate_size - - self.mlp = GatedMLP( - hidden_size=config.hidden_size, - ffn_hidden_size=mlp_hidden_size, - hidden_act=config.hidden_act, - dtype=config.dtype, - bias=config.mlp_bias, - tp_group=config.mapping.tp_group, - tp_size=config.mapping.tp_size, - quant_mode=config.quant_mode, - ) - self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size, eps=config.norm_epsilon, dtype=config.dtype) - - def forward( - self, - hidden_states, - attention_mask=None, - medusa_packed_mask=None, # For Medusa support - medusa_position_offsets=None, - use_cache=False, - kv_cache_params=None, - attention_params=None, - lora_layer_params=None, - ): - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - - attention_output = self.attention( - hidden_states, - attention_mask=attention_mask, - medusa_packed_mask=medusa_packed_mask, # For Medusa support - medusa_position_offsets=medusa_position_offsets, - use_cache=use_cache, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - lora_layer_params=lora_layer_params, - ) - - if use_cache: - attention_output, presents = attention_output - - hidden_states = residual + attention_output - - residual = hidden_states - hidden_states = self.post_layernorm(hidden_states) - - hidden_states = self.mlp(hidden_states, lora_layer_params=lora_layer_params) - - hidden_states = residual + hidden_states - if use_cache: - return (hidden_states, presents) - return hidden_states - - class GemmaDecoderLayerConfigBuilder(DecoderLayerConfigBuilder): """The LLAMA implementation of the DecoderLayerConfigBuilder.""" @@ -200,8 +116,7 @@ def build_decoder(self, layer): world_size=self.tensor_parallel, tp_size=self.tensor_parallel, pp_size=1, - quant_mode=QuantMode(0), - quant_kwargs=None, + quantization=QuantConfig(), max_lora_rank=layer.max_lora_rank, ) diff --git a/tests/export/run.sh b/tests/export/run.sh index eca22e0d3684..0071b1351113 100644 --- a/tests/export/run.sh +++ b/tests/export/run.sh @@ -48,4 +48,4 @@ python tests/export/test_nemo_export.py --model_name FALCON-7B-base --existing_t python tests/export/test_nemo_export.py --model_name FALCON-40B-base --existing_test_models --min_gpus 2 --max_gpus 8 python tests/export/test_nemo_export.py --model_name FALCON-180B-base --existing_test_models --min_gpus 8 --max_gpus 8 python tests/export/test_nemo_export.py --model_name STARCODER1-15B-base --existing_test_models --min_gpus 1 --max_gpus 1 -python tests/export/test_nemo_export.py --model_name GEMMA-base --existing_test_models --min_gpus 1 --max_gpus 1 --run_accuracy --test_deployment True +python tests/export/test_nemo_export.py --model_name GEMMA-base --existing_test_models --min_gpus 1 --max_gpus 1 \ No newline at end of file From 74a2dd321628f4a6293944ec43f23fc5c24c9ef7 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 24 Apr 2024 16:40:16 -0700 Subject: [PATCH 11/30] further specialize 
runners for more parallelism (#9036) --- .github/workflows/cicd-main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index f34326b0e16b..091e18e58ebc 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -31,7 +31,7 @@ jobs: nvidia-smi cicd-cluster-clean: - runs-on: self-hosted-azure-cpu + runs-on: self-hosted-azure-builder steps: - name: Clean server from old files run: | @@ -53,7 +53,7 @@ jobs: cicd-test-container-setup: needs: [cicd-cluster-clean] - runs-on: self-hosted-azure-cpu + runs-on: self-hosted-azure-builder # uses: actions/cache@v2 #container: # image: nvcr.io/nvidia/pytorch:24.01-py3 @@ -176,7 +176,7 @@ jobs: runs-on: self-hosted-azure container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - options: + options: # --user 0:128 --device=/dev/nvidia0 --gpus all From 5e91630f8e45019d0b6705a14157bad6b24fac12 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Thu, 25 Apr 2024 12:57:55 -0400 Subject: [PATCH 12/30] Update mm dataprep notebook based on feedback (#9029) * update mm dataprep notebook based on feedback Signed-off-by: Chen Cui * update mm dataprep notebook based on feedback Signed-off-by: Chen Cui * Revert "update mm dataprep notebook based on feedback" This reverts commit 969aca8a0160487d0379f2cec60820246817cb93. --------- Signed-off-by: Chen Cui --- .../Multimodal Data Preparation.ipynb | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/tutorials/multimodal/Multimodal Data Preparation.ipynb b/tutorials/multimodal/Multimodal Data Preparation.ipynb index bc297a4e1f58..e506bbd4d4b4 100644 --- a/tutorials/multimodal/Multimodal Data Preparation.ipynb +++ b/tutorials/multimodal/Multimodal Data Preparation.ipynb @@ -36,14 +36,28 @@ "\n", "## Install dependencies\n", "! pip install img2dataset\n", - "! pip uninstall -y opencv-python-headless\n", - "! pip install opencv-python==4.8.0.74 # https://github.com/opencv/opencv-python/issues/884\n", "\n", "### Install NeMo\n", "BRANCH = 'main'\n", "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" ] }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "\"\"\"\n", + "For both running this notebook locally and in a nemo container:\n", + "We need to downgrade opencv version to resolve this issue: https://github.com/opencv/opencv-python/issues/884\n", + "\"\"\"\n", + "! pip uninstall -y opencv-python-headless\n", + "! pip install opencv-python==4.8.0.74" + ], + "metadata": { + "collapsed": false + } + }, { "attachments": {}, "cell_type": "markdown", @@ -512,14 +526,22 @@ "metadata": {}, "outputs": [], "source": [ - "! pip install --upgrade webdataset\n", "! SLURM_ARRAY_TASK_ID=0 SLURM_ARRAY_TASK_COUNT=2 python $SCRIPT_DIR/precache_encodings.py \\\n", " input_dir=$DATA_DIR/tarfiles_reorganized \\\n", " output_dir=$DATA_DIR/tarfiles_precached \\\n", " tar_chunk_size=1000 \\\n", - " precache_config_path=conf/precache_sd_example.yaml" + " precache_config_path=$CONF_DIR/precache_sd_example.yaml" ] }, + { + "cell_type": "markdown", + "source": [ + "If you encounter a nemo import problem with the cell above, please also running it in the terminal directly." 
+ ], + "metadata": { + "collapsed": false + } + }, { "attachments": {}, "cell_type": "markdown", @@ -660,4 +682,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From fbc77af1158c88ee93ac29dec6139f0b2f3a673e Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Thu, 25 Apr 2024 12:58:10 -0400 Subject: [PATCH 13/30] Fix import in lora merge script (#9032) * fix import Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adi Renduchintala --- .../merge_lora_weights/merge.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/scripts/nlp_language_modeling/merge_lora_weights/merge.py b/scripts/nlp_language_modeling/merge_lora_weights/merge.py index 14fe3db80690..c74a13e7f493 100644 --- a/scripts/nlp_language_modeling/merge_lora_weights/merge.py +++ b/scripts/nlp_language_modeling/merge_lora_weights/merge.py @@ -28,13 +28,13 @@ import os +import re import tempfile from typing import Any, Dict, List import torch from omegaconf import OmegaConf, open_dict from pytorch_lightning.trainer.trainer import Trainer -from scripts.nlp_language_modeling.merge_lora_weights.convert_lora_parallelism import replace_number_add_offset from torch.utils.data import DataLoader, Dataset from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel @@ -69,6 +69,27 @@ def __getitem__(self, idx): return self.sentences[idx] +def replace_number_add_offset(key, offset_value): + # This function uses regular expression to find layer number in the state dict key + # and replaces it with its value plus an offset value + + if offset_value == 0: + return key + + # Define the pattern to match numbers in the string + pattern = r'layers.(\d+)' + + # Function to be used as replacement + # It converts the found number to integer, adds offset, and returns as string + def add_offset(match): + return "layers." 
+ str(int(match.group(1)) + offset_value) + + # Use re.sub() to replace all occurrences of the pattern with the result of add_offset + result_string = re.sub(pattern, add_offset, key) + + return result_string + + def load_lora(lora_nemo): with tempfile.TemporaryDirectory() as tmpdir: NLPSaveRestoreConnector._unpack_nemo_file(lora_nemo, tmpdir) From 83ef77c7697fa60b15afbc550f2ef035cbd22308 Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Thu, 25 Apr 2024 12:25:17 -0700 Subject: [PATCH 14/30] NeMo dev doc restructure (#8896) * Update intro and why nemo in dev doc * Categorize tutorials * Update tutorials link * update index * Restructure * Restructure * Restructure * Restructure * Restructure * Restructure * Restructure * Restructure * Update flash attention * Update flash attention * Fix few structure issue * Fix migration * Fix structure * Fix structure * Few updates * Add few more scripts * Fix scripts * Fix few things * Fix tutorial table * Restructure * Rename * Few fixes and moves * Move sections * Fix bib * Refactor files * Fixes * Fix * Fix few issues * remove scripts * Update docs --------- Co-authored-by: Pablo Garay --- README.rst | 6 +- docs/source/ckpt_converters/convert_mlm.rst | 32 ++ docs/source/ckpt_converters/intro.rst | 22 ++ docs/source/collections.rst | 70 ++++ docs/source/core/core_index.rst | 4 +- docs/source/features/memory_optimizations.rst | 48 +++ docs/source/features/mixed_precision.rst | 6 + .../parallelisms.rst | 18 +- .../throughput_optimizations.rst} | 8 +- docs/source/index.rst | 110 ++----- docs/source/multimodal/mllm/neva.rst | 2 +- docs/source/multimodal/text2img/sd.rst | 2 +- .../nlp/nemo_megatron/flash_attention.rst | 28 -- docs/source/nlp/nemo_megatron/intro.rst | 4 - .../nlp/nemo_megatron/mlm_migration.rst | 24 -- .../nemo_megatron/positional_embeddings.rst | 34 +- docs/source/starthere/best-practices.rst | 301 +++--------------- docs/source/starthere/intro.rst | 189 ++++++----- docs/source/starthere/tutorials.rst | 204 ++++++------ docs/source/tools/intro.rst | 4 +- tutorials/asr/ASR_Context_Biasing.ipynb | 2 +- tutorials/asr/ASR_with_NeMo.ipynb | 2 +- tutorials/asr/README.md | 2 +- tutorials/multimodal/NeVA Tutorial.ipynb | 4 +- .../Stable Diffusion Tutorial.ipynb | 2 +- ...a_Preprocessing_and_Cleaning_for_NMT.ipynb | 2 +- 26 files changed, 513 insertions(+), 617 deletions(-) create mode 100644 docs/source/ckpt_converters/convert_mlm.rst create mode 100644 docs/source/ckpt_converters/intro.rst create mode 100644 docs/source/collections.rst create mode 100644 docs/source/features/memory_optimizations.rst create mode 100644 docs/source/features/mixed_precision.rst rename docs/source/{nlp/nemo_megatron => features}/parallelisms.rst (74%) rename docs/source/{nlp/nemo_megatron/packed_sequence.rst => features/throughput_optimizations.rst} (96%) delete mode 100644 docs/source/nlp/nemo_megatron/flash_attention.rst delete mode 100644 docs/source/nlp/nemo_megatron/mlm_migration.rst diff --git a/README.rst b/README.rst index c93c48e355d8..66b3a5806c2d 100644 --- a/README.rst +++ b/README.rst @@ -46,7 +46,7 @@ Latest News
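A usage sketch for the `replace_number_add_offset` helper defined above; the state-dict key below is illustrative rather than taken from a real checkpoint.

.. code-block:: python

    key = "model.decoder.layers.3.self_attention.linear_qkv.weight"  # hypothetical key
    print(replace_number_add_offset(key, 4))
    # model.decoder.layers.7.self_attention.linear_qkv.weight
    print(replace_number_add_offset(key, 0))
    # offset 0 returns the key unchanged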
Large Language Models and Multimodal
- Accelerate your generative AI journey with NVIDIA NeMo framework on GKE (2024/03/16) + Accelerate your generative AI journey with NVIDIA NeMo Framework on GKE (2024/03/16) An end-to-end walkthrough to train generative AI models on the Google Kubernetes Engine (GKE) using the NVIDIA NeMo Framework is available at https://github.com/GoogleCloudPlatform/nvidia-nemo-on-gke. The walkthrough includes detailed instructions on how to set up a Google Cloud Project and pre-train a GPT model using the NeMo Framework.

@@ -71,7 +71,7 @@ Latest News
NVIDIA now powers training for Amazon Titan Foundation models (2023/11/28) - NVIDIA NeMo framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs). The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock. The NeMo Framework provides a versatile framework for building, customizing, and running LLMs. + NVIDIA NeMo Framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs). The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock. The NeMo Framework provides a versatile framework for building, customizing, and running LLMs.

@@ -486,7 +486,7 @@ We welcome community contributions! Please refer to `CONTRIBUTING.md `_ that utilize the NeMo framework. +We provide an ever-growing list of `publications `_ that utilize the NeMo Framework. If you would like to add your own article to the list, you are welcome to do so via a pull request to this repository's ``gh-pages-src`` branch. Please refer to the instructions in the `README of that branch `_. diff --git a/docs/source/ckpt_converters/convert_mlm.rst b/docs/source/ckpt_converters/convert_mlm.rst new file mode 100644 index 000000000000..61b5b2802e8a --- /dev/null +++ b/docs/source/ckpt_converters/convert_mlm.rst @@ -0,0 +1,32 @@ +Converting from Megatron-LM +=========================== + +NVIDIA NeMo and NVIDIA Megatron-LM share many underlying technologies. This document provides guidance for migrating your project from Megatron-LM to NVIDIA NeMo. + +Converting Checkpoints +---------------------- + +You can convert your GPT-style model checkpoints trained with Megatron-LM into the NeMo Framework using the provided example script. This script facilitates the conversion of Megatron-LM checkpoints to NeMo compatible formats. + +.. code-block:: bash + + /examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ + --checkpoint_folder \ + --checkpoint_name megatron_gpt--val_loss=99.99-step={steps}-consumed_samples={consumed}.0 \ + --nemo_file_path \ + --model_type \ + --tensor_model_parallel_size \ + --pipeline_model_parallel_size \ + --gpus_per_node + +Resuming Training +----------------- + +To resume training from a converted Megatron-LM checkpoint, it is crucial to correctly set up the training parameters to match the previous learning rate schedule. Use the following setting for the `trainer.max_steps` parameter in your NeMo training configuration: + +.. code-block:: none + + trainer.max_steps=round(lr-warmup-fraction * lr-decay-iters + lr-decay-iters) + +This configuration ensures that the learning rate scheduler in NeMo continues from where it left off in Megatron-LM, using the `lr-warmup-fraction` and `lr-decay-iters` arguments from the original Megatron-LM training setup. + diff --git a/docs/source/ckpt_converters/intro.rst b/docs/source/ckpt_converters/intro.rst new file mode 100644 index 000000000000..6d4da83499fa --- /dev/null +++ b/docs/source/ckpt_converters/intro.rst @@ -0,0 +1,22 @@ +Community Checkpoint Converter +============================== + +We provide easy-to-use tools that enable users to convert community checkpoints into the NeMo format. These tools facilitate various operations, including resuming training, Sparse Fine-Tuning (SFT), Parameter-Efficient Fine-Tuning (PEFT), and deployment. For detailed instructions and guidelines, please refer to our documentation. + +We offer comprehensive guides to assist both end users and developers: + +- **User Guide**: Detailed steps on how to convert community model checkpoints for further training or deployment within NeMo. For more information, please see our :doc:`user_guide`. + +- **Developer Guide**: Instructions for developers on how to implement converters for community model checkpoints, allowing for broader compatibility and integration within the NeMo ecosystem. For development details, refer to our :doc:`dev_guide`. + +- **Megatron-LM Checkpoint Conversion**: NVIDIA NeMo and NVIDIA Megatron-LM share several foundational technologies. You can convert your GPT-style model checkpoints trained with Megatron-LM into the NeMo Framework using our scripts, see our :doc:`convert_mlm`. 
+ +Access the user and developer guides directly through the links below: + +.. toctree:: + :maxdepth: 1 + :caption: Conversion Guides + + user_guide + dev_guide + convert_mlm diff --git a/docs/source/collections.rst b/docs/source/collections.rst new file mode 100644 index 000000000000..1cc7a654b9c1 --- /dev/null +++ b/docs/source/collections.rst @@ -0,0 +1,70 @@ +================ +NeMo Collections +================ + +Documentation for the individual collections + +.. toctree:: + :maxdepth: 1 + :caption: Large Language Models (LLMs) + :name: Large Language Models + :titlesonly: + + nlp/nemo_megatron/intro + nlp/models + nlp/machine_translation/machine_translation + nlp/megatron_onnx_export + nlp/quantization + nlp/api + + +.. toctree:: + :maxdepth: 1 + :caption: Speech AI + :name: Speech AI + :titlesonly: + + asr/intro + asr/speech_classification/intro + asr/speaker_recognition/intro + asr/speaker_diarization/intro + asr/ssl/intro + asr/speech_intent_slot/intro + + +.. toctree:: + :maxdepth: 1 + :caption: Multimodal Models (MMs) + :name: Multimodal + :titlesonly: + + multimodal/mllm/intro + multimodal/vlm/intro + multimodal/text2img/intro + multimodal/nerf/intro + multimodal/api + + +.. toctree:: + :maxdepth: 1 + :caption: Text To Speech (TTS) + :name: Text To Speech + :titlesonly: + + tts/intro + +.. toctree:: + :maxdepth: 1 + :caption: Vision (CV) + :name: vision + :titlesonly: + + vision/intro + +.. toctree:: + :maxdepth: 1 + :caption: Common + :name: Common + :titlesonly: + + common/intro \ No newline at end of file diff --git a/docs/source/core/core_index.rst b/docs/source/core/core_index.rst index 28cd149bdcb5..01977c1b5101 100644 --- a/docs/source/core/core_index.rst +++ b/docs/source/core/core_index.rst @@ -1,5 +1,5 @@ ========= -NeMo Core +NeMo APIs ========= You can learn more about the underlying principles of the NeMo codebase in this section. @@ -30,7 +30,7 @@ Alternatively, you can jump straight to the documentation for the individual col * :doc:`Automatic Speech Recognition (ASR) <../asr/intro>` -* :doc:`Multimodal (MM) Models <../multimodal/mllm/intro>` +* :doc:`Multimodal Models (MMs) <../multimodal/mllm/intro>` * :doc:`Text-to-Speech (TTS) <../tts/intro>` diff --git a/docs/source/features/memory_optimizations.rst b/docs/source/features/memory_optimizations.rst new file mode 100644 index 000000000000..0e0b3ad84402 --- /dev/null +++ b/docs/source/features/memory_optimizations.rst @@ -0,0 +1,48 @@ +Memory Optimizations +==================== + +Parallelism +----------- +Refer to :doc:`Parallelism <./parallelism>`. + +Flash Attention +--------------- + +Overview +^^^^^^^^ + +Flash Attention is a method designed to enhance the efficiency of Transformer models, which are widely utilized in applications such as Natural Language Processing (NLP). Traditional Transformers are slow and consume a lot of memory, especially with long sequences, due to the quadratic time and memory complexity of self-attention. FlashAttention, an IO-aware exact attention algorithm that leverages tiling to minimize the number of memory reads/writes between the GPU's high bandwidth memory (HBM) and on-chip SRAM. This approach is designed to be more efficient in terms of IO complexity compared to standard attention mechanisms. + +Turn Flash Attention On and Off +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In the NeMo Framework, Flash Attention is supported through the Transformer Engine with the inclusion of Flash Attention 2. 
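Returning to the `trainer.max_steps` setting in the Megatron-LM conversion guide above, a worked example with made-up values shows the intended arithmetic.

.. code-block:: python

    # Hypothetical values from a Megatron-LM run: lr-warmup-fraction 0.01, lr-decay-iters 300000.
    lr_warmup_fraction = 0.01
    lr_decay_iters = 300_000

    max_steps = round(lr_warmup_fraction * lr_decay_iters + lr_decay_iters)
    print(max_steps)  # 303000 -> trainer.max_steps=303000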
By default, Flash Attention is enabled, but the Transformer Engine may switch to a different kernel if the tensor dimensions are not optimal for Flash Attention. Users can completely disable Flash Attention by setting the environment variable ``NVTE_FLASH_ATTN=0``. + +For more details on the supported Dot Attention backend, please refer to the Transformer Engine source code available at `Transformer Engine's Attention Mechanism `_. + +.. bibliography:: ./nlp_all.bib + :style: plain + :labelprefix: nlp-megatron + :keyprefix: nlp-megatron- + +Overview +^^^^^^^^ + +Full Activation Recomputation +""""""""""""""""""""""""""""" +This method recalculates all the intermediate activations during the backward pass of a model's training, instead of storing them during the forward pass. This technique maximizes memory efficiency at the cost of computational overhead, as each activation is recomputed when needed. + +Partial Activation Recomputation +"""""""""""""""""""""""""""""""" +This method recomputes only a subset of layers during the backward phase. It is a trade-off between the full recomputation and no recomputation, balancing memory savings with computational efficiency. + +Selective Activation Recomputation +"""""""""""""""""""""""""""""""""" +This method reduces memory footprint of activations significantly via smart activation checkpointing. This approach involves selectively storing only crucial activations and recomputing the others as needed. It is particularly useful in large models to minimize memory usage while controlling the computational cost. + +Refer to "Reducing Activation Recomputation in Large Transformer Models" for more details: https://arxiv.org/abs/2205.05198 + +.. bibliography:: ./nlp_all.bib + :style: plain + :labelprefix: nlp-megatron + :keyprefix: nlp-megatron- \ No newline at end of file diff --git a/docs/source/features/mixed_precision.rst b/docs/source/features/mixed_precision.rst new file mode 100644 index 000000000000..d193752e5475 --- /dev/null +++ b/docs/source/features/mixed_precision.rst @@ -0,0 +1,6 @@ +.. _mix_precision: + +Mixed Precision Training +------------------------ + +Mixed precision training significantly enhances computational efficiency by conducting operations in half-precision and fp8 formats, while selectively maintaining minimal data in single-precision to preserve critical information throughout key areas of the network. NeMo now supports FP16, BF16, and FP8 (via Transformer Engine) across most models. Further details will be provided shortly. diff --git a/docs/source/nlp/nemo_megatron/parallelisms.rst b/docs/source/features/parallelisms.rst similarity index 74% rename from docs/source/nlp/nemo_megatron/parallelisms.rst rename to docs/source/features/parallelisms.rst index 9129963ef021..b10477e4232c 100644 --- a/docs/source/nlp/nemo_megatron/parallelisms.rst +++ b/docs/source/features/parallelisms.rst @@ -3,13 +3,13 @@ Parallelisms ------------ -NeMo Megatron supports 5 types of parallelisms (which can be mixed together arbitraritly): +NeMo Megatron supports 5 types of parallelisms (which can be mixed together arbitrarily): -Distributed Data parallelism +Distributed Data Parallelism ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Distributed Data parallelism (DDP) creates idential copies of the model across multiple GPUs. +Distributed Data Parallelism (DDP) creates idential copies of the model across multiple GPUs. -.. image:: images/ddp.gif +.. 
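As a concrete example of the opt-out mentioned above, the variable can be set from Python before any Transformer Engine modules are built; setting it in the shell that launches training works equally well.

.. code-block:: python

    import os

    # Disable Flash Attention for this process; must be set before attention modules are constructed.
    os.environ["NVTE_FLASH_ATTN"] = "0"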
image:: ../nlp/nemo_megatron/images/ddp.gif :align: center :width: 800px :alt: Distributed Data Parallel @@ -20,7 +20,7 @@ Tensor Parallelism With Tensor Paralellism (TP) a tensor is split into non-overlapping pieces and different parts are distributed and processed on separate GPUs. -.. image:: images/tp.gif +.. image:: ../nlp/nemo_megatron/images/tp.gif :align: center :width: 800px :alt: Tensor Parallel @@ -29,7 +29,7 @@ Pipeline Parallelism ^^^^^^^^^^^^^^^^^^^^ With Pipeline Paralellism (PP) consecutive layer chunks are assigned to different GPUs. -.. image:: images/pp.gif +.. image:: ../nlp/nemo_megatron/images/pp.gif :align: center :width: 800px :alt: Pipeline Parallel @@ -37,7 +37,7 @@ With Pipeline Paralellism (PP) consecutive layer chunks are assigned to differen Sequence Parallelism ^^^^^^^^^^^^^^^^^^^^ -.. image:: images/sp.gif +.. image:: ../nlp/nemo_megatron/images/sp.gif :align: center :width: 800px :alt: Sequence Parallel @@ -47,7 +47,7 @@ Expert Parallelism Expert Paralellim (EP) distributes experts across GPUs. -.. image:: images/ep.png +.. image:: ../nlp/nemo_megatron/images/ep.png :align: center :width: 800px :alt: Expert Parallelism @@ -57,7 +57,7 @@ Parallelism nomenclature When reading and modifying NeMo Megatron code you will encounter the following terms. -.. image:: images/pnom.gif +.. image:: ../nlp/nemo_megatron/images/pnom.gif :align: center :width: 800px :alt: Parallelism nomenclature diff --git a/docs/source/nlp/nemo_megatron/packed_sequence.rst b/docs/source/features/throughput_optimizations.rst similarity index 96% rename from docs/source/nlp/nemo_megatron/packed_sequence.rst rename to docs/source/features/throughput_optimizations.rst index e31444fe1e60..825c3add5dfb 100644 --- a/docs/source/nlp/nemo_megatron/packed_sequence.rst +++ b/docs/source/features/throughput_optimizations.rst @@ -1,7 +1,9 @@ +Throughput Optimizations +======================== + Sequence Packing for SFT/PEFT ----------------------------- - Overview ^^^^^^^^ @@ -133,6 +135,10 @@ To train with packed sequences, you need to change four items in the SFT/PEFT co Now you are all set to finetune your model with a much improved throughput! +Communication Overlap +--------------------- +NeMo leverages Megatron-Core's optimizations to enhance bandwidth utilization and effectively overlap computation with communication. Additional details will be provided soon. + .. rubric:: Footnotes diff --git a/docs/source/index.rst b/docs/source/index.rst index 8dc74ecc771d..82d3359480ca 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,7 +1,19 @@ NVIDIA NeMo Framework Developer Docs ==================================== -NVIDIA NeMo Framework is an end-to-end, cloud-native framework to build, customize, and deploy generative AI models anywhere. +NVIDIA NeMo Framework is an end-to-end, cloud-native framework designed to build, customize, and deploy generative AI models anywhere. 
+ +`NVIDIA NeMo Framework `_ supports large-scale training features, including: + +- Mixed Precision Training +- Parallelism +- Distributed Optimizer +- Fully Sharded Data Parallel (FSDP) +- Flash Attention +- Activation Recomputation +- Positional Embeddings and Positional Interpolation +- Post-Training Quantization (PTQ) with Ammo +- Sequence Packing `NVIDIA NeMo Framework `_ has separate collections for: @@ -9,7 +21,7 @@ NVIDIA NeMo Framework is an end-to-end, cloud-native framework to build, customi * :doc:`Automatic Speech Recognition (ASR) ` -* :doc:`Multimodal (MM) Models ` +* :doc:`Multimodal Models (MMs) ` * :doc:`Text-to-Speech (TTS) ` @@ -29,105 +41,49 @@ For quick guides and tutorials, see the "Getting started" section below. :titlesonly: starthere/intro - starthere/tutorials starthere/best-practices + starthere/tutorials For more information, browse the developer docs for your area of interest in the contents section below or on the left sidebar. + .. toctree:: :maxdepth: 1 - :caption: NeMo Core - :name: core - :titlesonly: + :caption: Key Optimizations + :name: Key Optimizations - core/core_index + features/mixed_precision + features/parallelisms + features/memory_optimizations + features/throughput_optimizations .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Community Model Converters :name: CheckpointConverters - ckpt_converters/user_guide - ckpt_converters/dev_guide - -.. toctree:: - :maxdepth: 1 - :caption: Large Language Models (LLMs) - :name: Large Language Models - :titlesonly: - - nlp/nemo_megatron/intro - nlp/models - nlp/machine_translation/machine_translation - nlp/megatron_onnx_export - nlp/quantization - nlp/api - + ckpt_converters/intro .. toctree:: :maxdepth: 1 - :caption: Speech AI - :name: Speech AI + :caption: APIs + :name: APIs :titlesonly: - asr/intro - asr/speech_classification/intro - asr/speaker_recognition/intro - asr/speaker_diarization/intro - asr/ssl/intro - asr/speech_intent_slot/intro - + core/core_index .. toctree:: :maxdepth: 1 - :caption: Multimodal (MM) - :name: Multimodal + :caption: Collections + :name: Collections :titlesonly: - multimodal/mllm/intro - multimodal/vlm/intro - multimodal/text2img/intro - multimodal/nerf/intro - multimodal/api - + collections .. toctree:: :maxdepth: 1 - :caption: Text To Speech (TTS) - :name: Text To Speech - :titlesonly: - - tts/intro - -.. toctree:: - :maxdepth: 2 - :caption: Vision (CV) - :name: vision - :titlesonly: - - vision/intro - -.. toctree:: - :maxdepth: 2 - :caption: Common - :name: Common - :titlesonly: - - common/intro - - -.. toctree:: - :maxdepth: 2 - :caption: Speech Tools - :name: Speech Tools - :titlesonly: - - tools/intro - -.. toctree:: - :maxdepth: 2 - :caption: Upgrade Guide - :name: Upgrade Guide + :caption: Speech AI Tools + :name: Speech AI Tools :titlesonly: - starthere/migration-guide \ No newline at end of file + tools/intro \ No newline at end of file diff --git a/docs/source/multimodal/mllm/neva.rst b/docs/source/multimodal/mllm/neva.rst index 83fb6b681e29..5484ab358c2f 100644 --- a/docs/source/multimodal/mllm/neva.rst +++ b/docs/source/multimodal/mllm/neva.rst @@ -25,7 +25,7 @@ In NeMo, the text encoder is anchored in the :class:`~nemo.collections.nlp.model Vision Model ^^^^^^^^^^^^ -For visual interpretation, NeVA harnesses the power of the pre-trained CLIP visual encoder, ViT-L/14, recognized for its visual comprehension acumen. Images are first partitioned into standardized patches, for instance, 16x16 pixels. 
These patches are linearly embedded, forming a flattened vector that subsequently feeds into the transformer. The culmination of the transformer's processing is a unified image representation. In the NeMo framework, the NeVA vision model, anchored on the CLIP visual encoder ViT-L/14, can either be instantiated via the :class:`~nemo.collections.multimodal.models.multimodal_llm.clip.megatron_clip_models.CLIPVisionTransformer` class or initiated through the `transformers` package from Hugging Face. +For visual interpretation, NeVA harnesses the power of the pre-trained CLIP visual encoder, ViT-L/14, recognized for its visual comprehension acumen. Images are first partitioned into standardized patches, for instance, 16x16 pixels. These patches are linearly embedded, forming a flattened vector that subsequently feeds into the transformer. The culmination of the transformer's processing is a unified image representation. In the NeMo Framework, the NeVA vision model, anchored on the CLIP visual encoder ViT-L/14, can either be instantiated via the :class:`~nemo.collections.multimodal.models.multimodal_llm.clip.megatron_clip_models.CLIPVisionTransformer` class or initiated through the `transformers` package from Hugging Face. Projection and Integration ^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/multimodal/text2img/sd.rst b/docs/source/multimodal/text2img/sd.rst index 11ccfd010058..6f5092f93f5f 100644 --- a/docs/source/multimodal/text2img/sd.rst +++ b/docs/source/multimodal/text2img/sd.rst @@ -1,7 +1,7 @@ Stable Diffusion ================ -This section gives a brief overview of the stable diffusion model in NeMo framework. +This section gives a brief overview of the stable diffusion model in NeMo Framework. Model Introduction -------------------- diff --git a/docs/source/nlp/nemo_megatron/flash_attention.rst b/docs/source/nlp/nemo_megatron/flash_attention.rst deleted file mode 100644 index b00b7a38d63a..000000000000 --- a/docs/source/nlp/nemo_megatron/flash_attention.rst +++ /dev/null @@ -1,28 +0,0 @@ -Flash attention ---------------- -Flash Attention :cite:`nlp-megatron-dao2022flashattention` is a method designed to enhance the efficiency of Transformer models, which are widely utilized in applications such as natural language processing. Traditional Transformers are slow and consume a lot of memory, especially with long sequences, due to the quadratic time and memory complexity of self-attention. FlashAttention, an IO-aware exact attention algorithm that leverages tiling to minimize the number of memory reads/writes between the GPU's high bandwidth memory (HBM) and on-chip SRAM. This approach is designed to be more efficient in terms of IO complexity compared to standard attention mechanisms. - -GPT -^^^ -To enable Flash Attention while Megatron GPT model training or fine-tuning, modify the following configuration: - -.. code:: - - model.use_flash_attention=True - -T5 -^^ -To enable Flash Attention while Megatron T5 model training, modify the following configuration: - -.. code:: - - model.encoder.use_flash_attention=True - model.decoder.use_flash_attention=True - -References ----------- - -.. 
bibliography:: ../nlp_all.bib - :style: plain - :labelprefix: nlp-megatron - :keyprefix: nlp-megatron- diff --git a/docs/source/nlp/nemo_megatron/intro.rst b/docs/source/nlp/nemo_megatron/intro.rst index c582edbffd61..fab448f3d4f2 100644 --- a/docs/source/nlp/nemo_megatron/intro.rst +++ b/docs/source/nlp/nemo_megatron/intro.rst @@ -12,18 +12,14 @@ To learn more about using NeMo to train Large Language Models at scale, please r .. toctree:: :maxdepth: 1 - mlm_migration gpt/gpt_training batching - parallelisms prompt_learning retro/retro_model hiddens/hiddens_module peft/landing_page - flash_attention positional_embeddings mcore_customization - packed_sequence References diff --git a/docs/source/nlp/nemo_megatron/mlm_migration.rst b/docs/source/nlp/nemo_megatron/mlm_migration.rst deleted file mode 100644 index ffe9764615b5..000000000000 --- a/docs/source/nlp/nemo_megatron/mlm_migration.rst +++ /dev/null @@ -1,24 +0,0 @@ -Migrating from Megatron-LM --------------------------- - -NeMo Megatron and Megatron-LM share many underlying technology. You should be able to convert your GPT model checkpoints trained with Megatron-LM into NeMo Megatron. -Example conversion script: - -.. code-block:: bash - - /examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py \ - --checkpoint_folder \ - --checkpoint_name megatron_gpt--val_loss=99.99-step={steps}-consumed_samples={consumed}.0 \ - --nemo_file_path \ - --model_type \ - --tensor_model_parallel_size \ - --pipeline_model_parallel_size \ - --gpus_per_node - - - -To resume the training from converted MegatronLM checkpoint, make sure to set the -`trainer.max_steps=round(lr-warmup-fraction * lr-decay-iters + lr-decay-iters)` -where `lr-warmup-fraction` and `lr-decay-iters` are arguments from MegatronLM training -so the learning rate scheduler will follow the same curve. - diff --git a/docs/source/nlp/nemo_megatron/positional_embeddings.rst b/docs/source/nlp/nemo_megatron/positional_embeddings.rst index b8dea5280c28..332ce304049d 100644 --- a/docs/source/nlp/nemo_megatron/positional_embeddings.rst +++ b/docs/source/nlp/nemo_megatron/positional_embeddings.rst @@ -18,26 +18,26 @@ GPT - .. code:: model.position_embedding_type='learned_absolute' - - Absolute Position Encodings :cite:`nlp-megatron-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. + - Absolute Position Encodings :cite:`nlp-megatron-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. * - **rope** - .. 
code:: - + model.position_embedding_type='rope' model.rotary_percentage=1.0 - - Rotary Position Embedding (RoPE) :cite:`nlp-megatron-su2022roformer` incorporates positional information by utilizing a rotation matrix to encode the absolute positions of tokens while maintaining relative positional relationships in self-attention formulations by leveraging the geometric properties of vectors and complex numbers, applying a rotation based on a preset non-zero constant and the relative positions of the tokens to the word embeddings. - + - Rotary Position Embedding (RoPE) :cite:`nlp-megatron-su2022roformer` incorporates positional information by utilizing a rotation matrix to encode the absolute positions of tokens while maintaining relative positional relationships in self-attention formulations. It achieves this by leveraging the geometric properties of vectors and complex numbers and applying a rotation based on a preset non-zero constant and the relative positions of the tokens to the word embeddings. + * - **alibi** - .. code:: - + model.position_embedding_type='alibi' - - Attention with Linear Biases (ALiBi) :cite:`nlp-megatron-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. + - Attention with Linear Biases (ALiBi) :cite:`nlp-megatron-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. * - **kerple** - .. code:: model.position_embedding_type='kerple' - - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`nlp-megatron-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using conditionally positive definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. + - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`nlp-megatron-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using Conditionally Positive Definite (CPD) kernels known for generalizing distance metrics. 
They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. * - **xpos** - .. code:: @@ -64,43 +64,43 @@ T5 * - **learned_absolute** - .. code:: - + model.encoder.position_embedding_type='learned_absolute' model.decoder.position_embedding_type='learned_absolute' - - Absolute Position Encodings :cite:`nlp-megatron-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. + - Absolute Position Encodings :cite:`nlp-megatron-vaswani2023attention` are position embeddings used in Transformer-based models, added to input embeddings in the encoder and decoder sections. These encodings match the dimension of embeddings and are created using sine and cosine functions of various frequencies. Each dimension in the encoding corresponds to a sinusoid with wavelengths forming a geometric progression. * - **relative** - .. code:: - + model.encoder.position_embedding_type='relative' model.decoder.position_embedding_type='relative' - Relative Position Representations :cite:`nlp-megatron-shaw2018selfattention` * - **alibi** - .. code:: - + model.encoder.position_embedding_type='alibi' model.decoder.position_embedding_type='alibi' - - Attention with Linear Biases (ALiBi) :cite:`nlp-megatron-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. + - Attention with Linear Biases (ALiBi) :cite:`nlp-megatron-press2022train` modifies the way attention scores are computed in the attention sublayer of the network. ALiBi introduces a static, non-learned bias after the query-key dot product during the computation of attention scores. This bias is added in the form of a head-specific slope that is determined before training, creating a geometric sequence of slopes for the different heads in the model. The method has an inductive bias towards recency, penalizing attention scores between distant query-key pairs with the penalty increasing as the distance grows, and it leverages different rates of penalty increase across different heads based on the slope magnitude. * - **kerple** - .. code:: - + model.encoder.position_embedding_type='kerple' model.decoder.position_embedding_type='kerple' - - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`nlp-megatron-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using conditionally positive definite (CPD) kernels known for generalizing distance metrics. 
They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. + - Kernelized Relative Positional Embedding for Length Extrapolation (KERPLE) :cite:`nlp-megatron-chi2022kerple` generalizes relative positional embeddings (RPE) by kernelizing positional differences using Conditionally Positive Definite (CPD) kernels known for generalizing distance metrics. They transform CPD kernels into positive definite (PD) kernels by adding a constant offset, which is absorbed during softmax normalization in the self-attention mechanism of transformers. This approach allows for a variety of RPEs that facilitate length extrapolation in a principled manner. Positional interpolation ------------------------ Position Interpolation (PI) :cite:`nlp-megatron-chen2023extending` is a method introduced to extend the context window sizes of Rotary Position Embedding (RoPE)-based pretrained large language models (LLMs). The central principle of PI is to reduce the position indices so that they align with the initial context window size through interpolation. -Positional Interpolation is supported in Megatron GPT SFT models. Set RoPE Interpolation factor for sequence length :code:`seq_len_interpolation_factor` to enable it. +Positional Interpolation is supported in Megatron GPT SFT models. Set RoPE Interpolation factor for sequence length :code:`seq_len_interpolation_factor` to enable it. .. code:: - + model.position_embedding_type='rope' model.rotary_percentage=1.0 - model.seq_len_interpolation_factor: 2 + model.seq_len_interpolation_factor: 2 References ---------- diff --git a/docs/source/starthere/best-practices.rst b/docs/source/starthere/best-practices.rst index 5e2f5db23cfb..ec0fea1985cc 100644 --- a/docs/source/starthere/best-practices.rst +++ b/docs/source/starthere/best-practices.rst @@ -1,299 +1,72 @@ .. _best-practices: -Best Practices -============== - -The NVIDIA NeMo Toolkit is available on GitHub as `open source `_ as well as -a `Docker container on NGC `_. It's assumed the user has -already installed NeMo by following the :ref:`quick_start_guide` instructions. - -The conversational AI pipeline consists of three major stages: - -- Automatic Speech Recognition (ASR) -- Natural Language Processing (NLP) or Natural Language Understanding (NLU) -- Text-to-Speech (TTS) Synthesis - -As you talk to a computer, the ASR phase converts the audio signal into text, the NLP stage interprets the question -and generates a smart response, and finally the TTS phase converts the text into speech signals to generate audio for -the user. The toolkit enables development and training of deep learning models involved in conversational AI and easily -chain them together. - Why NeMo? ---------- - -Deep learning model development for conversational AI is complex. It involves defining, building, and training several -models in specific domains; experimenting several times to get high accuracy, fine tuning on multiple tasks and domain -specific data, ensuring training performance and making sure the models are ready for deployment to inference applications. -Neural modules are logical blocks of AI applications which take some typed inputs and produce certain typed outputs. 
By -separating a model into its essential components in a building block manner, NeMo helps researchers develop state-of-the-art -accuracy models for domain specific data faster and easier. - -Collections of modules for core tasks as well as specific to speech recognition, natural language, speech synthesis help -develop modular, flexible, and reusable pipelines. - -A neural module’s inputs/outputs have a neural type, that describes the semantics, the axis order and meaning, and the dimensions -of the input/output tensors. This typing allows neural modules to be safely chained together to build models for applications. - -NeMo can be used to train new models or perform transfer learning on existing pre-trained models. Pre-trained weights per module -(such as encoder, decoder) help accelerate model training for domain specific data. - -ASR, NLP and TTS pre-trained models are trained on multiple datasets (including some languages such as Mandarin) and optimized -for high accuracy. They can be used for transfer learning as well. - -NeMo supports developing models that work with Mandarin Chinese data. Tutorials help users train or fine tune models for -conversational AI with the Mandarin Chinese language. The export method provided in NeMo makes it easy to transform a trained -model into inference ready format for deployment. - -A key area of development in the toolkit is interoperability with other tools used by speech researchers. Data layer for Kaldi -compatibility is one such example. +========= -NeMo, PyTorch Lightning, And Hydra ----------------------------------- +Developing deep learning models for Gen AI is a complex process, encompassing the design, construction, and training of models across specific domains. Achieving high accuracy requires extensive experimentation, fine-tuning for diverse tasks and domain-specific datasets, ensuring optimal training performance, and preparing models for deployment. -Conversational AI architectures are typically very large and require a lot of data and compute for training. NeMo uses -`Pytorch Lightning `_ for easy and performant multi-GPU/multi-node -mixed precision training. +NeMo simplifies this intricate development landscape through its modular approach. It introduces neural modules—logical blocks of AI applications with typed inputs and outputs—facilitating the seamless construction of models by chaining these blocks based on neural types. This methodology accelerates development, improves model accuracy on domain-specific data, and promotes modularity, flexibility, and reusability within AI workflows. -Pytorch Lightning is a high-performance PyTorch wrapper that organizes PyTorch code, scales model training, and reduces -boilerplate. PyTorch Lightning has two main components, the ``LightningModule`` and the Trainer. The ``LightningModule`` is -used to organize PyTorch code so that deep learning experiments can be easily understood and reproduced. The Pytorch Lightning -Trainer is then able to take the ``LightningModule`` and automate everything needed for deep learning training. +Further enhancing its utility, NeMo provides collections of modules designed for core tasks in speech recognition, natural language processing, and speech synthesis. It supports the training of new models or fine-tuning of existing pre-trained modules, leveraging pre-trained weights to expedite the training process. -NeMo models are LightningModules that come equipped with all supporting infrastructure for training and reproducibility. 
This -includes the deep learning model architecture, data preprocessing, optimizer, check-pointing and experiment logging. NeMo -models, like LightningModules, are also PyTorch modules and are fully compatible with the broader PyTorch ecosystem. Any NeMo -model can be taken and plugged into any PyTorch workflow. +The framework encompasses models trained and optimized for multiple languages, including Mandarin, and offers extensive tutorials for conversational AI development across these languages. NeMo's emphasis on interoperability with other research tools broadens its applicability and ease of use. -Configuring conversational AI applications is difficult due to the need to bring together many different Python libraries into -one end-to-end system. NeMo uses Hydra for configuring both NeMo models and the PyTorch Lightning Trainer. `Hydra `_ -is a flexible solution that makes it easy to configure all of these libraries from a configuration file or from the command-line. +Large Language Models & Multimodal (LLM & MM) +--------------------------------------------- -Every NeMo model has an example configuration file and a corresponding script that contains all configurations needed for training -to state-of-the-art accuracy. NeMo models have the same look and feel so that it is easy to do conversational AI research across -multiple domains. +NeMo excels in training large-scale LLMs and MMs, utilizing optimizations from Megatron-LM and Transformer Engine to deliver state-of-the-art performance. It includes a comprehensive feature set for large-scale training: -Using Optimized Pretrained Models With NeMo -------------------------------------------- +- Supports Multi-GPU and Multi-Node computing to enable scalability. +- Precision options including FP32/TF32, FP16, BF16, and TransformerEngine/FP8. +- Parallelism strategies: Data parallelism, Tensor parallelism, Pipeline parallelism, Interleaved Pipeline parallelism, Sequence parallelism and Context parallelism, Distributed Optimizer, and Fully Sharded Data Parallel. +- Optimized utilities such as Flash Attention, Activation Recomputation, and Communication Overlap. +- Advanced checkpointing through the Distributed Checkpoint Format. -`NVIDIA GPU Cloud (NGC) `_ is a software repository that has containers and models optimized -for deep learning. NGC hosts many conversational AI models developed with NeMo that have been trained to state-of-the-art accuracy -on large datasets. NeMo models on NGC can be automatically downloaded and used for transfer learning tasks. Pretrained models -are the quickest way to get started with conversational AI on your own data. NeMo has many `example scripts `_ -and `Jupyter Notebook tutorials `_ showing step-by-step how to fine-tune pretrained NeMo -models on your own domain-specific datasets. - -For BERT based models, the model weights provided are ready for -downstream NLU tasks. For speech models, it can be helpful to start with a pretrained model and then continue pretraining on your -own domain-specific data. Jasper and QuartzNet base model pretrained weights have been known to be very efficient when used as -base models. For an easy to follow guide on transfer learning and building domain specific ASR models, you can follow this `blog `_. -All pre-trained NeMo models can be found on the `NGC NeMo Collection `_. Everything needed to quickly get started -with NeMo ASR, NLP, and TTS models is there. 
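As a concrete illustration of that workflow, here is a minimal sketch of pulling one of these NGC checkpoints into a script; the model name ``stt_en_conformer_ctc_large`` is only an example, and the names actually available should be checked with ``list_available_models()`` in your NeMo version.

.. code-block:: python

    import nemo.collections.asr as nemo_asr

    # See which pre-trained ASR checkpoints are published on NGC for this model class
    print(nemo_asr.models.ASRModel.list_available_models())

    # Download one checkpoint (cached locally as a .nemo file) and run inference with it
    asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name="stt_en_conformer_ctc_large")
    print(asr_model.transcribe(["sample.wav"]))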
- -Pre-trained models are packaged as a ``.nemo`` file and contain the PyTorch checkpoint along with everything needed to use the model. -NeMo models are trained to state-of-the-art accuracy and trained on multiple datasets so that they are robust to small differences -in data. NeMo contains a large variety of models such as speaker identification and Megatron BERT and the best models in speech and -language are constantly being added as they become available. NeMo is the premier toolkit for conversational AI model building and -training. - -For a list of supported models, refer to the :ref:`tutorials` section. - -ASR Guidance ------------- - -This section is to help guide your decision making by answering our most asked ASR questions. - -**Q: Is there a way to add domain specific vocabulary in NeMo? If so, how do I do that?** -A: QuartzNet and Jasper models are character-based. So pretrained models we provide for these two output lowercase English -letters and ‘. Users can re-retrain them on vocabulary with upper case letters and punctuation symbols. - -**Q: When training, there are “Reference” lines and “Decoded” lines that are printed out. It seems like the reference line should -be the “truth” line and the decoded line should be what the ASR is transcribing. Why do I see that even the reference lines do not -appear to be correct?** -A: Because our pre-trained models can only output lowercase letters and apostrophe, everything else is dropped. So the model will -transcribe 10 as ten. The best way forward is to prepare the training data first by transforming everything to lowercase and convert -the numbers from digit representation to word representation using a simple library such as `inflect `_. Then, add the uppercase letters -and punctuation back using the NLP punctuation model. Here is an example of how this is incorporated: `NeMo voice swap demo `_. - -**Q: What languages are supported in NeMo currently?** -A: Along with English, we provide pre-trained models for Zh, Es, Fr, De, Ru, It, Ca and Pl languages. -For more information, see `NeMo Speech Models `_. +Speech AI +-------- Data Augmentation ------------------ - -Data augmentation in ASR is invaluable. It comes at the cost of increased training time if samples are augmented during training -time. To save training time, it is recommended to pre-process the dataset offline for a one time preprocessing cost and then train -the dataset on this augmented training set. +~~~~~~~~~~~~~~~~~ -For example, processing a single sample involves: - -- Speed perturbation -- Time stretch perturbation (sample level) -- Noise perturbation -- Impulse perturbation -- Time stretch augmentation (batch level, neural module) - -A simple tutorial guides users on how to use these utilities provided in `GitHub: NeMo `_. +Augmenting ASR data is essential but can be time-consuming during training. NeMo advocates for offline dataset preprocessing to conserve training time, illustrated in a tutorial covering speed perturbation and noise augmentation techniques. Speech Data Explorer --------------------- - -Speech data explorer is a `Dash-based tool `_ for interactive exploration of ASR/TTS datasets. 
+~~~~~~~~~~~~~~~~~~~~ -Speech data explorer collects: - -- dataset statistics (alphabet, vocabulary, and duration-based histograms) -- navigation across datasets (sorting and filtering) -- inspections of individual utterances (waveform, spectrogram, and audio player) -- errors analysis (word error rate, character error rate, word match rate, mean word accuracy, and diff) - -In order to use the tool, it needs to be installed separately. Perform the steps `here `_ to install speech data explorer. +A Dash-based tool for interactive exploration of ASR/TTS datasets, providing insights into dataset statistics, utterance inspections, and error analysis. Installation instructions for this tool are available in NeMo’s GitHub repository. Using Kaldi Formatted Data --------------------------- - -The `Kaldi Speech Recognition Toolkit `_ project began in 2009 at `Johns Hopkins University `. It is a toolkit written in C++. If -researchers have used Kaldi and have datasets that are formatted to be used with the toolkit; they can use NeMo to develop models -based on that data. - -To load Kaldi-formatted data, you can simply use ``KaldiFeatureDataLayer`` instead of ``AudioToTextDataLayer``. The ``KaldiFeatureDataLayer`` -takes in the argument ``kaldi_dir`` instead of a ``manifest_filepath``. The ``manifest_filepath`` argument should be set to the directory -that contains the files ``feats.scp`` and ``text``. - -Using Speech Command Recognition Task For ASR Models ----------------------------------------------------- - -Speech Command Recognition is the task of classifying an input audio pattern into a set of discrete classes. It is a subset of ASR, -sometimes referred to as Key Word Spotting, in which a model is constantly analyzing speech patterns to detect certain ``action`` classes. - -Upon detection of these commands, a specific action can be taken. An example Jupyter notebook provided in NeMo shows how to train a -QuartzNet model with a modified decoder head trained on a speech commands dataset. - -.. note:: It is preferred that you use absolute paths to ``data_dir`` when preprocessing the dataset. - -NLP Fine-Tuning BERT --------------------- - -BERT, or Bidirectional Encoder Representations from Transformers, is a neural approach to pre-train language representations which -obtains near state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks, including the GLUE benchmark and -SQuAD Question & Answering dataset. - -BERT model checkpoints (`BERT-large-uncased `_ and `BERT-base-uncased `_) are provided can be used for either fine tuning BERT on your custom -dataset, or fine tuning downstream tasks, including GLUE benchmark tasks, Question & Answering tasks, Joint Intent & Slot detection, -Punctuation and Capitalization, Named Entity Recognition, and Speech Recognition post processing model to correct mistakes. - -.. note:: Almost all NLP examples also support RoBERTa and ALBERT models for downstream fine-tuning tasks (see the list of all supported models by calling ``nemo.collections.nlp.modules.common.lm_utils.get_pretrained_lm_models_list()``). The user needs to specify the name of the model desired while running the example scripts. +~~~~~~~~~~~~~~~~~~~~~~~~~~ -BioMegatron Medical BERT ------------------------- +NeMo supports Kaldi-formatted datasets, enabling the development of models with existing Kaldi data by substituting the AudioToTextDataLayer with the KaldiFeatureDataLayer. 
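As a schematic sketch of that substitution (the ``manifest_filepath`` and ``kaldi_dir`` arguments come from the description above; the remaining arguments, such as ``labels`` and ``batch_size``, are assumptions that should be checked against the data-layer signatures in your NeMo version):

.. code-block:: python

    import nemo.collections.asr as nemo_asr

    labels = [" ", "a", "b", "c"]  # hypothetical character vocabulary

    # Default path: a JSON-manifest data layer driven by ``manifest_filepath``
    train_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath="train_manifest.json", labels=labels, batch_size=32,
    )

    # Kaldi-formatted data: same role, but ``kaldi_dir`` points to the directory
    # that contains ``feats.scp`` and ``text``
    kaldi_layer = nemo_asr.KaldiFeatureDataLayer(
        kaldi_dir="data/train", labels=labels, batch_size=32,
    )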
-BioMegatron is a large language model (Megatron-LM) trained on larger domain text corpus (PubMed abstract + full-text-commercial). -It achieves state-of-the-art results for certain tasks such as Relationship Extraction, Named Entity Recognition and Question & -Answering. Follow these tutorials to learn how to train and fine tune BioMegatron; pretrained models are provided on NGC: +Speech Command Recognition +~~~~~~~~~~~~~~~~~~~~~~~~~~ -- `Relation Extraction BioMegatron `_ -- `Token Classification BioMegatron `_ +Specialized training for speech command recognition is covered in a dedicated NeMo Jupyter notebook, guiding users through the process of training a QuartzNet model on a speech commands dataset. -Efficient Training With NeMo ----------------------------- +General Optimizations +--------------------- -Using Mixed Precision -^^^^^^^^^^^^^^^^^^^^^ +Mixed Precision Training +~~~~~~~~~~~~~~~~~~~~~~~~ -Mixed precision accelerates training speed while protecting against noticeable loss. Tensor Cores is a specific hardware unit that -comes starting with the Volta and Turing architectures to accelerate large matrix to matrix multiply-add operations by operating them -on half precision inputs and returning the result in full precision. - -Neural networks which usually use massive matrix multiplications can be significantly sped up with mixed precision and Tensor Cores. -However, some neural network layers are numerically more sensitive than others. Apex AMP is an NVIDIA library that maximizes the -benefit of mixed precision and Tensor Cores usage for a given network. +Utilizing NVIDIA’s Apex AMP, mixed precision training enhances training speeds with minimal precision loss, especially on hardware equipped with Tensor Cores. Multi-GPU Training -^^^^^^^^^^^^^^^^^^ - -This section is to help guide your decision making by answering our most asked multi-GPU training questions. - -**Q: Why is multi-GPU training preferred over other types of training?** -A: Multi-GPU training can reduce the total training time by distributing the workload onto multiple compute instances. This is -particularly important for large neural networks which would otherwise take weeks to train until convergence. Since NeMo supports -multi-GPU training, no code change is needed to move from single to multi-GPU training, only a slight change in your launch command -is required. - -**Q: What are the advantages of mixed precision training?** -A: Mixed precision accelerates training speed while protecting against noticeable loss in precision. Tensor Cores is a specific -hardware unit that comes starting with the Volta and Turing architectures to accelerate large matrix multiply-add operations by -operating on half precision inputs and returning the result in full precision in order to prevent loss in precision. Neural -networks which usually use massive matrix multiplications can be significantly sped up with mixed precision and Tensor Cores. -However, some neural network layers are numerically more sensitive than others. Apex AMP is a NVIDIA library that maximizes the -benefit of mixed precision and Tensor Core usage for a given network. - -**Q: What is the difference between multi-GPU and multi-node training?** -A: Multi-node is an abstraction of multi-GPU training, which requires a distributed compute cluster, where each node can have multiple -GPUs. Multi-node training is needed to scale training beyond a single node to large amounts of GPUs. 
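As a minimal sketch of what that scaling step looks like at the PyTorch Lightning level (the values below are placeholders; in a NeMo recipe the same fields normally live under ``trainer.*`` in the YAML config):

.. code-block:: python

    import pytorch_lightning as pl

    # Single-GPU baseline with mixed precision
    trainer = pl.Trainer(accelerator="gpu", devices=1, precision=16)

    # Multi-GPU, multi-node: the model code is unchanged; only the trainer settings
    # (or the corresponding trainer.* entries in the training config) change
    trainer = pl.Trainer(
        accelerator="gpu",
        devices=8,        # GPUs per node
        num_nodes=2,      # number of nodes in the job
        strategy="ddp",   # distributed data parallel
        precision=16,     # mixed precision on Tensor Cores
    )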
- -From the framework perspective, nothing changes from moving to multi-node training. However, a master address and port needs to be set -up for inter-node communication. Multi-GPU training will then be launched on each node with passed information. You might also consider -the underlying inter-node network topology and type to achieve full performance, such as HPC-style hardware such as NVLink, InfiniBand -networking, or Ethernet. - - -Recommendations For Optimization And FAQs ------------------------------------------ - -This section is to help guide your decision making by answering our most asked NeMo questions. - -**Q: Are there areas where performance can be increased?** -A: You should try using mixed precision for improved performance. Note that typically when using mixed precision, memory consumption -is decreased and larger batch sizes could be used to further improve the performance. - -When fine-tuning ASR models on your data, it is almost always possible to take advantage of NeMo's pre-trained modules. Even if you -have a different target vocabulary, or even a different language; you can still try starting with pre-trained weights from Jasper or -QuartzNet ``encoder`` and only adjust the ``decoder`` for your needs. - -**Q: What is the recommended sampling rate for ASR?** -A: The released models are based on 16 KHz audio, therefore, ensure you use models with 16 KHz audio. Reduced performance should be -expected for any audio that is up-sampled from a sampling frequency less than 16 KHz data. +~~~~~~~~~~~~~~~~~~ -**Q: How do we use this toolkit for audio with different types of compression and frequency than the training domain for ASR?** -A: You have to match the compression and frequency. +NeMo enables multi-GPU training, substantially reducing training durations for large models. This section clarifies the advantages of mixed precision and the distinctions between multi-GPU and multi-node training. -**Q: How do you replace the 6-gram out of the ASR model with a custom language model? What is the language format supported in NeMo?** -A: NeMo’s Beam Search decoder with Levenberg-Marquardt (LM) neural module supports the KenLM language model. +NeMo, PyTorch Lightning, and Hydra +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- You should retrain the KenLM language model on your own dataset. Refer to `KenLM’s documentation `_. -- If you want to use a different language model, other than KenLM, you will need to implement a corresponding decoder module. -- Transformer-XL example is present in OS2S. It would need to be updated to work with NeMo. `Here is the code `_. +Integrating PyTorch Lightning for training efficiency and Hydra for configuration management, NeMo streamlines conversational AI research by organizing PyTorch code and automating training workflows. -**Q: How do I use text-to-speech (TTS) synthesis?** -A: +Optimized Pretrained Models +~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- Obtain speech data ideally at 22050 Hz or alternatively at a higher sample rate and then down sample to 22050 Hz. - - If less than 22050 Hz and at least 16000 Hz: - - Retrain WaveGlow on your own dataset. - - Tweak the spectrogram generation parameters, namely the ``window_size`` and the ``window_stride`` for their fourier transforms. - - For below 16000 Hz, look into obtaining new data. -- In terms of bitrate/quantization, the general advice is the higher the better. We have not experimented enough to state how much - this impacts quality. 
-- For the amount of data, again the more the better, and the more diverse in terms of phonemes the better. Aim for around 20 hours - of speech after filtering for silences and non-speech audio. -- Most open speech datasets are in ~10 second format so training spectrogram generators on audio on the order of 10s - 20s per sample is known - to work. Additionally, the longer the speech samples, the more difficult it will be to train them. -- Audio files should be clean. There should be little background noise or music. Data recorded from a studio mic is likely to be easier - to train compared to data captured using a phone. -- To ensure pronunciation of words are accurate; the technical challenge is related to the dataset, text to phonetic spelling is - required, use phonetic alphabet (notation) that has the name correctly pronounced. -- Here are some example parameters you can use to train spectrogram generators: - - use single speaker dataset - - Use AMP level O0 - - Trim long silences in the beginning and end - - ``optimizer="adam"`` - - ``beta1 = 0.9`` - - ``beta2 = 0.999`` - - ``lr=0.001 (constant)`` - - ``amp_opt_level="O0"`` - - ``weight_decay=1e-6`` - - ``batch_size=48 (per GPU)`` - - ``trim_silence=True`` +Through NVIDIA GPU Cloud (NGC), NeMo offers a collection of optimized, pre-trained models for various conversational AI applications, facilitating easy integration into research projects and providing a head start in conversational AI development. Resources --------- diff --git a/docs/source/starthere/intro.rst b/docs/source/starthere/intro.rst index eaeab3c212d0..63fdcfb0406e 100644 --- a/docs/source/starthere/intro.rst +++ b/docs/source/starthere/intro.rst @@ -8,42 +8,125 @@ Introduction .. _dummy_header: -NVIDIA NeMo Framework is an end-to-end, cloud-native framework to build, customize, and deploy generative AI models anywhere. -To learn more about using NeMo in generative AI workflows, please refer to the `NeMo Framework User Guide `_. +NVIDIA NeMo Framework is an end-to-end, cloud-native framework for building, customizing, and deploying generative AI models anywhere. It allows for the creation of state-of-the-art models across a wide array of domains, including speech, language, and vision. For detailed information on utilizing NeMo in your generative AI workflows, refer to the `NeMo Framework User Guide `_. -`NVIDIA NeMo Framework `_ has separate collections for Large Language Models (LLMs), -Multimodal (MM), Computer Vision (CV), Automatic Speech Recognition (ASR), -and Text-to-Speech (TTS) models. Each collection consists of -prebuilt modules that include everything needed to train on your data. -Every module can easily be customized, extended, and composed to create new generative AI -model architectures. +Training generative AI architectures typically requires significant data and computing resources. NeMo utilizes `PyTorch Lightning `_ for efficient and performant multi-GPU/multi-node mixed-precision training. +NeMo is built on top of NVIDIA's powerful Megatron-LM and Transformer Engine for its Large Language Models (LLMs) and Multimodal Models (MMs), leveraging cutting-edge advancements in model training and optimization. For Speech AI applications, Automatic Speech Recognition (ASR) and Text-to-Speech (TTS), NeMo is developed with native PyTorch and PyTorch Lightning, ensuring seamless integration and ease of use. Future updates are planned to align Speech AI models with the Megatron framework, enhancing training efficiency and model performance. 
-Generative AI architectures are typically large and require a lot of data and compute -for training. NeMo uses `PyTorch Lightning `_ for easy and performant multi-GPU/multi-node -mixed-precision training. -`Pre-trained NeMo models `_ are available -in 14+ languages. +`NVIDIA NeMo Framework `_ features separate collections for Large Language Models (LLMs), Multimodal Models (MMs), Computer Vision (CV), Automatic Speech Recognition (ASR), and Text-to-Speech (TTS) models. Each collection comprises prebuilt modules that include everything needed to train on your data. These modules can be easily customized, extended, and composed to create new generative AI model architectures. + +(TODO: Still valid? LLM is not included here.) `Pre-trained NeMo models `_ are available in 14+ languages. Prerequisites ------------- -Before you begin using NeMo, it's assumed you meet the following prerequisites. +Before using NeMo, make sure you meet the following prerequisites: + +#. Python version 3.10 or above. + +#. Pytorch version 1.13.1 or 2.0+. + +#. Access to an NVIDIA GPU for model training. + +Installation +------------ + +**Using NVIDIA PyTorch Container** + +To leverage all optimizations for LLM training, including 3D Model Parallel, fused kernels, FP8, and more, we recommend using the NVIDIA PyTorch container. + +.. code-block:: bash + + docker pull nvcr.io/nvidia/pytorch:24.01-py3 + docker run --gpus all -it nvcr.io/nvidia/pytorch:24.01-py3 + +Within the container, you can install NeMo and its dependencies as follows: + +NeMo Installation + +.. code-block:: bash + + apt-get update && apt-get install -y libsndfile1 ffmpeg + pip install Cython + pip install nemo_toolkit['all'] + +Transformer Engine Installation + +This step involves cloning the Transformer Engine repository, checking out a specific commit, and installing it with specific flags. -#. You have Python version 3.10 or above. +.. code-block:: bash + + git clone https://github.com/NVIDIA/TransformerEngine.git && \ + cd TransformerEngine && \ + git fetch origin 8c9abbb80dba196f086b8b602a7cf1bce0040a6a && \ + git checkout FETCH_HEAD && \ + git submodule init && git submodule update && \ + NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . + +Apex Installation + +This step includes a bug fix for Apex in the PyTorch 23.11 container. + +.. code-block:: bash + + git clone https://github.com/NVIDIA/apex.git && \ + cd apex && \ + git checkout c07a4cf67102b9cd3f97d1ba36690f985bae4227 && \ + cp -R apex /usr/local/lib/python3.10/dist-packages + +PyTorch Lightning Installation + +This step involves installing a bug-fixed version of PyTorch Lightning from a specific branch. + +.. code-block:: bash -#. You have Pytorch version 1.13.1 or 2.0+. + git clone -b bug_fix https://github.com/athitten/pytorch-lightning.git && \ + cd pytorch-lightning && \ + PACKAGE_NAME=pytorch pip install -e . -#. You have access to an NVIDIA GPU, if you intend to do model training. +Megatron Core Installation -.. _quick_start_guide: +This section details the steps to clone and install the Megatron Core. + +.. code-block:: bash + + git clone https://github.com/NVIDIA/Megatron-LM.git && \ + cd Megatron-LM && \ + git checkout a5415fcfacef2a37416259bd38b7c4b673583675 && \ + pip install . + +AMMO Installation + +This final step involves installing the AMMO package. + +.. code-block:: bash + + pip install nvidia-ammo~=0.7.0 --extra-index-url https://pypi.nvidia.com --no-cache-dir + + +.. 
code-block:: bash + + apt-get update && apt-get install -y libsndfile1 ffmpeg + pip install Cython + pip install nemo_toolkit['all'] + +**Conda Installation** + +If you do not use the NVIDIA PyTorch container, we recommend installing NeMo in a clean Conda environment. + +.. code-block:: bash + + conda create --name nemo python==3.10.12 + conda activate nemo + +Refer to the PyTorch configurator for instructions on installing PyTorch. `configurator `_ Quick Start Guide ----------------- -You can try out NeMo's ASR, LLM and TTS functionality with the example below, which is based on the `Audio Translation `_ tutorial. +To explore NeMo's capabilities in LLM, ASR, and TTS, follow the example below based on the `Audio Translation `_ tutorial. Ensure NeMo is :ref:`installed ` before proceeding. -Once you have :ref:`installed NeMo `, then you can run the code below: .. code-block:: python @@ -66,7 +149,7 @@ Once you have :ref:`installed NeMo `, then you can run the code be english_text = nmt_model.translate(mandarin_text) print(english_text) - # Instantiate a spectrogram generator (which converts text -> spectrogram) + # Instantiate a spectrogram generator (which converts text -> spectrogram) # and vocoder model (which converts spectrogram -> audio waveform) spectrogram_generator = nemo_tts.models.FastPitchModel.from_pretrained(model_name="tts_en_fastpitch") vocoder = nemo_tts.models.HifiGanModel.from_pretrained(model_name="tts_en_hifigan") @@ -80,67 +163,19 @@ Once you have :ref:`installed NeMo `, then you can run the code be import soundfile as sf sf.write("output_audio.wav", audio.to('cpu').detach().numpy()[0], 22050) -You can learn more by about specific tasks you are interested in by checking out the NeMo :doc:`tutorials <./tutorials>`, or documentation (e.g. read :doc:`here <../asr/intro>` to learn more about ASR). - -You can also learn more about NeMo in the `NeMo Primer `_ tutorial, which introduces NeMo, PyTorch Lightning, and OmegaConf, and shows how to use, modify, save, and restore NeMo models. Additionally, the `NeMo Models `__ tutorial explains the fundamentals of how NeMo models are created. These concepts are also explained in detail in the :doc:`NeMo Core <../core/core>` documentation. - - -Introductory videos -------------------- - -See the two introductory videos below for a high level overview of NeMo. - -**Developing State-Of-The-Art Conversational AI Models in Three Lines of Code** +For detailed tutorials and documentation on specific tasks or to learn more about NeMo, check out the NeMo :doc:`tutorials <./tutorials>` or dive deeper into the documentation, such as learning about ASR in :doc:`here <../asr/intro>`. -.. raw:: html - -
- -
- -.. _installation: - -Installation ------------- - -The simplest way to install NeMo is via pip, see info below. - -.. note:: Full NeMo installation instructions (with more ways to install NeMo, and how to handle optional dependencies) can be found in the `GitHub README `_. - -Conda -~~~~~ - -We recommend installing NeMo in a fresh Conda environment. - -.. code-block:: bash - - conda create --name nemo python==3.10.12 - conda activate nemo - -Install PyTorch using their `configurator `_. - -Pip -~~~ -Use this installation mode if you want the latest released version. - -.. code-block:: bash - - apt-get update && apt-get install -y libsndfile1 ffmpeg - pip install Cython - pip install nemo_toolkit['all'] - -Depending on the shell used, you may need to use ``"nemo_toolkit[all]"`` instead in the above command. - -Discussion board +Discussion Board ---------------- -For more information and questions, visit the `NVIDIA NeMo Discussion Board `_. -Contributing ------------- +For additional information and questions, visit the `NVIDIA NeMo Discussion Board `_. + +Contribute to NeMo +------------------ -We welcome community contributions! Refer to the `CONTRIBUTING.md `_ file for the process. +Community contributions are welcome! See the `CONTRIBUTING.md `_ file for how to contribute. License ------- -NeMo is released under an `Apache 2.0 license `_. \ No newline at end of file +NeMo is released under the `Apache 2.0 license `_. diff --git a/docs/source/starthere/tutorials.rst b/docs/source/starthere/tutorials.rst index a61c078175f5..5ca48904ed9b 100644 --- a/docs/source/starthere/tutorials.rst +++ b/docs/source/starthere/tutorials.rst @@ -3,40 +3,74 @@ Tutorials ========= -The best way to get started with NeMo is to start with one of our tutorials. +The best way to get started with NeMo is to start with one of our tutorials. These tutorials cover various domains and provide both introductory and advanced topics. They are designed to help you understand and use the NeMo toolkit effectively. + +Running Tutorials on Colab +-------------------------- Most NeMo tutorials can be run on `Google's Colab `_. To run a tutorial: -#. Click the **Colab** link (see table below). -#. Connect to an instance with a GPU. For example, click **Runtime** > **Change runtime type** and select **GPU** for the hardware accelerator. +1. Click the **Colab** link associated with the tutorial you are interested in from the table below. +2. Once in Colab, connect to an instance with a GPU by clicking **Runtime** > **Change runtime type** and selecting **GPU** as the hardware accelerator. + +Tutorial Overview +----------------- -.. list-table:: **Tutorials** - :widths: 15 25 25 +.. list-table:: **General Tutorials** + :widths: 15 25 60 :header-rows: 1 * - Domain - Title - GitHub URL * - General - - Getting Started: Exploring Nemo Fundamentals + - Getting Started: NeMo Fundamentals - `NeMo Fundamentals `_ * - General - - Getting Started: Sample Conversational AI application + - Getting Started: Audio translator example - `Audio translator example `_ * - General - - Getting Started: Voice swap application + - Getting Started: Voice swap example - `Voice swap example `_ * - General - - Exploring NeMo Model Construction + - Getting Started: NeMo Models - `NeMo Models `_ * - General - - Exploring NeMo Adapters + - Getting Started: NeMo Adapters - `NeMo Adapters `_ * - General - - Publishing NeMo models on Hugging Face Hub + - Getting Started: NeMo Models on Hugging Face Hub - `NeMo Models on HF Hub `_ + +.. 
list-table:: **Multimodal Tutorials** + :widths: 20 25 55 + :header-rows: 1 + + * - Domain + - Title + - GitHub URL + * - Multimodal + - Preparations and Advanced Applications: Multimodal Data Preparation + - `Multimodal Data Preparation `_ + * - Multimodal + - Preparations and Advanced Applications: NeVA (LLaVA) Tutorial + - `NeVA (LLaVA) Tutorial `_ + * - Multimodal + - Preparations and Advanced Applications: Stable Diffusion Tutorial + - `Stable Diffusion Tutorial `_ + * - Multimodal + - Preparations and Advanced Applications: DreamBooth Tutorial + - `DreamBooth Tutorial `_ + +.. list-table:: **Automatic Speech Recognition (ASR) Tutorials** + :widths: 15 30 55 + :header-rows: 1 + + * - Domain + - Title + - GitHub URL * - ASR - ASR with NeMo - `ASR with NeMo `_ @@ -44,16 +78,16 @@ To run a tutorial: - ASR with Subword Tokenization - `ASR with Subword Tokenization `_ * - ASR - - Offline ASR Inference with Beam Search and External Language Model Rescoring + - Offline ASR - `Offline ASR `_ * - ASR - - Online ASR inference with Microphone (Cache-Aware Streaming) + - Online ASR Microphone Cache Aware Streaming - `Online ASR Microphone Cache Aware Streaming `_ * - ASR - - Online ASR inference with Microphone (Buffered Streaming) + - Online ASR Microphone Buffered Streaming - `Online ASR Microphone Buffered Streaming `_ * - ASR - - Fine-tuning CTC Models on New Languages + - ASR CTC Language Fine-Tuning - `ASR CTC Language Fine-Tuning `_ * - ASR - Intro to Transducers @@ -68,13 +102,13 @@ To run a tutorial: - Speech Commands - `Speech Commands `_ * - ASR - - Online and Offline Speech Commands Inference + - Online Offline Microphone Speech Commands - `Online Offline Microphone Speech Commands `_ * - ASR - - Voice Activity Detection (VAD) + - Voice Activity Detection - `Voice Activity Detection `_ * - ASR - - Online and Offline VAD Inference + - Online Offline Microphone VAD - `Online Offline Microphone VAD `_ * - ASR - Speaker Recognition and Verification @@ -92,19 +126,19 @@ To run a tutorial: - ASR for Telephony Speech - `ASR for Telephony Speech `_ * - ASR - - Streaming inference for ASR + - Streaming inference - `Streaming inference `_ * - ASR - - Buffered Transducer inference for ASR + - Buffered Transducer inference - `Buffered Transducer inference `_ * - ASR - - Buffered Transducer inference with LCS Merge Algorithm + - Buffered Transducer inference with LCS Merge - `Buffered Transducer inference with LCS Merge `_ * - ASR - Offline ASR with VAD for CTC models - `Offline ASR with VAD for CTC models `_ * - ASR - - Self-supervised pre-training for ASR + - Self-supervised Pre-training for ASR - `Self-supervised Pre-training for ASR `_ * - ASR - Multi-lingual ASR @@ -118,105 +152,75 @@ To run a tutorial: * - ASR - Confidence-based Ensembles - `Confidence-based Ensembles `_ - * - NLP - - Using Pretrained Language Models for Downstream Tasks - - `Pretrained Language Models for Downstream Tasks `_ - * - NLP - - Exploring NeMo NLP Tokenizers - - `NLP Tokenizers `_ - * - NLP - - Text Classification (Sentiment Analysis) with BERT - - `Text Classification (Sentiment Analysis) `_ - * - NLP - - Question Answering - - `Question Answering `_ - * - NLP - - Token Classification (Named Entity Recognition) - - `Token Classification: Named Entity Recognition `_ - * - NLP - - Joint Intent Classification and Slot Filling - - `Joint Intent and Slot Classification `_ - * - NLP - - GLUE Benchmark - - `GLUE Benchmark `_ - * - NLP - - Punctuation and Capitalization - - `Punctuation and Capitalization `_ - * - NLP - 
- Spellchecking ASR Customization - SpellMapper - - `Spellchecking ASR Customization - SpellMapper `_ - * - NLP - - Entity Linking - - `Entity Linking `_ - * - NLP - - Named Entity Recognition - BioMegatron - - `Named Entity Recognition - BioMegatron `_ - * - NLP - - Relation Extraction - BioMegatron - - `Relation Extraction - BioMegatron `_ - * - NLP - - P-Tuning/Prompt-Tuning - - `P-Tuning/Prompt-Tuning `_ - * - NLP - - Synthetic Tabular Data Generation - - `Synthetic Tabular Data Generation `_ - * - Multimodal - - Multimodal Data Preparation - - `Multimodal Data Preparation `_ - * - Multimodal - - NeVA (LLaVA) Tutorial - - `NeVA (LLaVA) Tutorial `_ - * - Multimodal - - Stable Diffusion Tutorial - - `Stable Diffusion Tutorial `_ - * - Multimodal - - DreamBooth Tutorial - - `DreamBooth Tutorial `_ + +.. list-table:: **Text-to-Speech (TTS) Tutorials** + :widths: 15 35 50 + :header-rows: 1 + + * - Domain + - Title + - GitHub URL * - TTS - - NeMo TTS Primer + - Basic and Advanced: NeMo TTS Primer - `NeMo TTS Primer `_ * - TTS - - TTS Speech/Text Aligner Inference + - Basic and Advanced: TTS Speech/Text Aligner Inference - `TTS Speech/Text Aligner Inference `_ * - TTS - - FastPitch and MixerTTS Model Training + - Basic and Advanced: FastPitch and MixerTTS Model Training - `FastPitch and MixerTTS Model Training `_ * - TTS - - FastPitch Finetuning + - Basic and Advanced: FastPitch Finetuning - `FastPitch Finetuning `_ * - TTS - - FastPitch and HiFiGAN Model Training for German + - Basic and Advanced: FastPitch and HiFiGAN Model Training for German - `FastPitch and HiFiGAN Model Training for German `_ * - TTS - - Tacotron2 Model Training + - Basic and Advanced: Tacotron2 Model Training - `Tacotron2 Model Training `_ * - TTS - - FastPitch Duration and Pitch Control + - Basic and Advanced: FastPitch Duration and Pitch Control - `FastPitch Duration and Pitch Control `_ * - TTS - - FastPitch Speaker Interpolation + - Basic and Advanced: FastPitch Speaker Interpolation - `FastPitch Speaker Interpolation `_ * - TTS - - Inference and Model Selection + - Basic and Advanced: TTS Inference and Model Selection - `TTS Inference and Model Selection `_ * - TTS - - Pronunciation_customization - - `TTS Pronunciation_customization `_ - * - Tools - - NeMo Forced Aligner + - Basic and Advanced: TTS Pronunciation Customization + - `TTS Pronunciation Customization `_ + +.. list-table:: **Tools and Utilities** + :widths: 15 25 60 + :header-rows: 1 + + * - Domain + - Title + - GitHub URL + * - Utility Tools + - Utility Tools for Speech and Text: NeMo Forced Aligner - `NeMo Forced Aligner `_ - * - Tools - - Speech Data Explorer - - `Speech Data Explorer `_ - * - Tools - - CTC Segmentation + * - Utility Tools + - Utility Tools for Speech and Text: Speech Data Explorer + - `Speech Data Explorer `_ + * - Utility Tools + - Utility Tools for Speech and Text: CTC Segmentation - `CTC Segmentation `_ - * - Text Processing (TN/ITN) - - Text Normalization and Inverse Normalization for ASR and TTS + +.. 
list-table:: **Text Processing (TN/ITN) Tutorials** + :widths: 25 35 60 + :header-rows: 1 + + * - Domain + - Title + - GitHub URL + * - Text Processing + - Text Normalization Techniques: Text Normalization - `Text Normalization `_ - * - Text Processing (TN/ITN) - - Inverse Text Normalization for ASR - Thutmose Tagger + * - Text Processing + - Text Normalization Techniques: Inverse Text Normalization with Thutmose Tagger - `Inverse Text Normalization with Thutmose Tagger `_ - * - Text Processing (TN/ITN) - - Constructing Normalization Grammars with WFSTs + * - Text Processing + - Text Normalization Techniques: WFST Tutorial - `WFST Tutorial `_ diff --git a/docs/source/tools/intro.rst b/docs/source/tools/intro.rst index 9e1b19f83b9e..5a08d05f3405 100644 --- a/docs/source/tools/intro.rst +++ b/docs/source/tools/intro.rst @@ -1,5 +1,5 @@ -Tools -===== +Speech AI Tools +=============== NeMo provides a set of tools useful for developing Automatic Speech Recognitions (ASR) and Text-to-Speech (TTS) synthesis models: \ `https://github.com/NVIDIA/NeMo/tree/stable/tools `__ . diff --git a/tutorials/asr/ASR_Context_Biasing.ipynb b/tutorials/asr/ASR_Context_Biasing.ipynb index f001ce3d65a2..bca4585e45cb 100644 --- a/tutorials/asr/ASR_Context_Biasing.ipynb +++ b/tutorials/asr/ASR_Context_Biasing.ipynb @@ -13,7 +13,7 @@ "id": "1156d1d1", "metadata": {}, "source": [ - "This tutorial aims to show how to improve the recognition accuracy of specific words in NeMo framework\n", + "This tutorial aims to show how to improve the recognition accuracy of specific words in NeMo Framework\n", "for CTC and Trasducer (RNN-T) ASR models by using the fast context-biasing method with CTC-based Word Spotter.\n", "\n", "## Tutorial content:\n", diff --git a/tutorials/asr/ASR_with_NeMo.ipynb b/tutorials/asr/ASR_with_NeMo.ipynb index f88dc7bbd8c1..bd95c7194655 100644 --- a/tutorials/asr/ASR_with_NeMo.ipynb +++ b/tutorials/asr/ASR_with_NeMo.ipynb @@ -75,7 +75,7 @@ "source": [ "# Introduction to End-To-End Automatic Speech Recognition\n", "\n", - "This notebook contains a basic tutorial of Automatic Speech Recognition (ASR) concepts, introduced with code snippets using the [NeMo framework](https://github.com/NVIDIA/NeMo).\n", + "This notebook contains a basic tutorial of Automatic Speech Recognition (ASR) concepts, introduced with code snippets using the [NeMo Framework](https://github.com/NVIDIA/NeMo).\n", "We will first introduce the basics of the main concepts behind speech recognition, then explore concrete examples of what the data looks like and walk through putting together a simple end-to-end ASR pipeline.\n", "\n", "We assume that you are familiar with general machine learning concepts and can follow Python code, and we'll be using the [AN4 dataset from CMU](http://www.speech.cs.cmu.edu/databases/an4/) (with processing using `sox`)." diff --git a/tutorials/asr/README.md b/tutorials/asr/README.md index 138e13f58a08..565e9eafd9d3 100644 --- a/tutorials/asr/README.md +++ b/tutorials/asr/README.md @@ -34,7 +34,7 @@ In this repository, you will find several tutorials discussing what is Automatic 13) `ASR_Example_CommonVoice_Finetuning`: Learn how to fine-tune an ASR model using CommonVoice to a new alphabet, Esperanto. We walk through the data processing steps of MCV data using HuggingFace Datasets, preparation of the tokenizer, model and then setup fine-tuning. 
-14) `ASR_Context_Biasing`: This tutorial aims to show how to improve the recognition accuracy of specific words in NeMo framework for CTC and Trasducer (RNN-T) ASR models by using the fast context-biasing method with CTC-based Word Spotter. +14) `ASR_Context_Biasing`: This tutorial aims to show how to improve the recognition accuracy of specific words in NeMo Framework for CTC and Trasducer (RNN-T) ASR models by using the fast context-biasing method with CTC-based Word Spotter. ---------------- diff --git a/tutorials/multimodal/NeVA Tutorial.ipynb b/tutorials/multimodal/NeVA Tutorial.ipynb index 7a9a1a3a7b4b..20b5e5a1c82c 100644 --- a/tutorials/multimodal/NeVA Tutorial.ipynb +++ b/tutorials/multimodal/NeVA Tutorial.ipynb @@ -18,7 +18,7 @@ "\n", "## Introduction\n", "\n", - "This notebook illustrates how to train and perform inference using NeVA with the NeMo Toolkit. NeVA originates from [LLaVA](https://github.com/haotian-liu/LLaVA) (Large Language and Vision Assistant) and is a powerful multimodal image-text instruction tuned model optimized within the NeMo framework. \n", + "This notebook illustrates how to train and perform inference using NeVA with the NeMo Toolkit. NeVA originates from [LLaVA](https://github.com/haotian-liu/LLaVA) (Large Language and Vision Assistant) and is a powerful multimodal image-text instruction tuned model optimized within the NeMo Framework. \n", "\n", "\n", "This tutorial will guide you through the following topics:\n", @@ -270,7 +270,7 @@ "source": [ "### Running Inference\n", "\n", - "NeVA inference via the NeMo framework can be quickly spun up via the NeMo Launcher and a few modifications to use the default NeVA inference config file.\n", + "NeVA inference via the NeMo Framework can be quickly spun up via the NeMo Launcher and a few modifications to use the default NeVA inference config file.\n", "\n", "Inference can be run with a similar command leveraging the provided inference script `neva_evaluation.py` within the container.\n", "\n", diff --git a/tutorials/multimodal/Stable Diffusion Tutorial.ipynb b/tutorials/multimodal/Stable Diffusion Tutorial.ipynb index 48da90dcb23d..8df695a994ef 100644 --- a/tutorials/multimodal/Stable Diffusion Tutorial.ipynb +++ b/tutorials/multimodal/Stable Diffusion Tutorial.ipynb @@ -86,7 +86,7 @@ "\n", "**Note**: if you want to customize the saved location, make sure it is also reflected in your training config.\n", "#### B. Prepare Text Encoder\n", - "For the text encoder used in Stable Diffusion, you can either use [HuggingFace CLIP ViT-L/14 model](https://huggingface.co/openai/clip-vit-large-patch14) or use NeMo's CLIP-ViT. NeMo Stable Diffusion also supports native CLIP ViT model trained in NeMo framework.\n", + "For the text encoder used in Stable Diffusion, you can either use [HuggingFace CLIP ViT-L/14 model](https://huggingface.co/openai/clip-vit-large-patch14) or use NeMo's CLIP-ViT. 
NeMo Stable Diffusion also supports native CLIP ViT model trained in NeMo Framework.\n", "\n", "Make sure the following settings are used in `cond_stage_config`:\n", "\n", diff --git a/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb b/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb index 323bfa1c49b8..df5ac458dc9c 100644 --- a/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb +++ b/tutorials/nlp/Data_Preprocessing_and_Cleaning_for_NMT.ipynb @@ -20,7 +20,7 @@ "source": [ "# Data Preprocessing & Cleaning for NMT\n", "\n", - "This notebook contains a tutorial of data processing and cleaning for NMT (Neural Machine Translation) to train translation models with the [NeMo framework](https://github.com/NVIDIA/NeMo).\n", + "This notebook contains a tutorial of data processing and cleaning for NMT (Neural Machine Translation) to train translation models with the [NeMo Framework](https://github.com/NVIDIA/NeMo).\n", "\n", "A pre-requisite to train supervised neural machine translation systems is the availability of *parallel corpora* of reasonable quality.\n", "\n", From 32b9ae6020da5298e66ecadbeda407d084507192 Mon Sep 17 00:00:00 2001 From: huvunvidia <86480512+huvunvidia@users.noreply.github.com> Date: Fri, 26 Apr 2024 13:32:37 -0400 Subject: [PATCH 15/30] Developer Documents for mcore RETRO (#9026) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update branch Signed-off-by: eharper * Add dist ckpt support for regular optimizers (#7749) * Add dist ckpt support for regular optimizers Signed-off-by: Mikołaj Błaż * [tutorial] fixed missing RIR scripts file. (#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * fix imports Signed-off-by: dimapihtar * imports fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ci imports fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert asr notebook Signed-off-by: dimapihtar * revert asr notebook Signed-off-by: dimapihtar --------- Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Pin lhotse=1.19.2 in r1.23.0 (#8303) Signed-off-by: Piotr Żelasko * Cache Aware Streaming tutorial notebook (#8296) * add notebook Signed-off-by: Elena Rastorgueva * rename old notebook to Buffered_Streaming Signed-off-by: Elena Rastorgueva * call setup_streaming_params in set_default_att_context_size method Signed-off-by: Elena Rastorgueva * update links in docs Signed-off-by: Elena Rastorgueva * update links to tutorials in docs Signed-off-by: Elena Rastorgueva * remove hard-coding Signed-off-by: Elena Rastorgueva * rename var Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva * fix path location and branch (#8304) * fix path location and branch Signed-off-by: Nithin Rao Koluguri * change to a floating point number Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri Co-authored-by: Somshubra Majumdar * add deallocate pipeline output optimization 
(#8279) * add deallocate pipeline output optimization Signed-off-by: Jimmy Zhang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jimmy Zhang Co-authored-by: Jimmy Zhang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fix memory leak caused by context parallelism hanging references by omegaconf (#8299) * save cp_size to self Signed-off-by: Jimmy Zhang * use parallel_state instead of self Signed-off-by: Jimmy Zhang --------- Signed-off-by: Jimmy Zhang Co-authored-by: Jimmy Zhang Co-authored-by: Eric Harper * remove assertion (#8302) Signed-off-by: dimapihtar * Update PEFT Doc (#8262) * update peft doc Signed-off-by: Chen Cui * remove old prompt learning doc and notebook Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * Merge branch 'r1.23.0' into chcui/update_peft_doc Signed-off-by: Chen Cui * revert accidental changes Signed-off-by: Chen Cui * revert accidental changes Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui * Attention encoder-decoder models for multiple speech-to-text tasks (#8242) (#8324) * Rebasing canary changes at current main Signed-off-by: Piotr Żelasko * Move the changes from asr transformer to nlp transformer as originally intended Signed-off-by: Piotr Żelasko * update eval to strip spaces before punctuations Signed-off-by: stevehuang52 * update pc strip Signed-off-by: stevehuang52 * [canary] Refactor: `PromptedAudioToTextLhotseDataset` and `EncDecMultiTaskModel` (#8247) * Create a separate CanaryDataset and use it inside `transformer_bpe_models.py`. Ditches `token_sequence_format`. Signed-off-by: Piotr Żelasko * [canary] Refactor: move changes in transformer_bpe_models.py to Canar… (#8252) * [canary] Refactor: move changes in transformer_bpe_models.py to CanaryModel Signed-off-by: Piotr Żelasko * Rename `CanaryModel` to `EncDecMultiTaskModel` and remove inheritance from `EncDecTransfModelBPE`; add a separate config for this model Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * Rename `CanaryDataset` to `PromptedAudioToTextLhotseDataset`; add `prompt_format_fn` argument; clean-up the `_canary_prompt_format` function a bit Signed-off-by: Piotr Żelasko * Move tokenization into `prompt_format_fn`, fix usage, add docs Signed-off-by: Piotr Żelasko * Backward-compatible utterance validation Signed-off-by: Piotr Żelasko * Improve type annotations Signed-off-by: Piotr Żelasko * config and prompt_fn registration changes from review Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * fix transcribe config Signed-off-by: stevehuang52 * Refactor Canary to follow schema of remaining ASR models (#8260) * Initial draft of multi task beam decoding strategy Signed-off-by: smajumdar * Stabilize inference Signed-off-by: smajumdar * Update AED Multi Task model to mostly conform to Archetype-Type format. 
Update config Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add change decoding strategy Signed-off-by: smajumdar * Remove redundant imports Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Cleanup Signed-off-by: smajumdar * Cleanup Signed-off-by: smajumdar * remove asr transformer dependency on nlp Signed-off-by: stevehuang52 * clean up Signed-off-by: stevehuang52 * copy token_classifier from nlp to asr Signed-off-by: stevehuang52 * Address comments Signed-off-by: smajumdar * Add typing to beam decoding Signed-off-by: smajumdar * Make prompt format configurable Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * drop asr dependency on nlp Signed-off-by: stevehuang52 --------- Signed-off-by: smajumdar Signed-off-by: stevehuang52 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: stevehuang52 * fix transcribe, update asr evaluator Signed-off-by: stevehuang52 * Extend the docs for the canary prompt_fn Signed-off-by: Piotr Żelasko * Incorporate changes from Nithin's code review Signed-off-by: Piotr Żelasko * training bug fix and adding launch script for speech_multitask (#8270) * bug fix and adding launch script for speech_multitask Signed-off-by: Krishna Puvvada * update launch script example in speech_to_text_aed.py Signed-off-by: Krishna Puvvada --------- Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada * Fix: drop_last must be true in validation/test otherwise the training will hang Signed-off-by: Piotr Żelasko * revert to current transcribe API Signed-off-by: stevehuang52 * revert changes to NLP, update docs Signed-off-by: stevehuang52 * update eval utils Signed-off-by: stevehuang52 * update docs Signed-off-by: stevehuang52 * Remove DALI; rename compute_audio_loss to compute_loss Signed-off-by: Piotr Żelasko * set default use_model_transcribe=False Signed-off-by: stevehuang52 * change os.path.dirname to pathlib Signed-off-by: stevehuang52 * [canary] Test for CanaryTokenizer + refactoring (#8285) * Test for CanaryTokenizer Signed-off-by: Piotr Żelasko * Attempt at refactor... 
Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * Update config for AED models (#8294) Signed-off-by: smajumdar * set default calculate_wer=False in transcribe_speech.py Signed-off-by: stevehuang52 * Attention encoder-decoder models for multiple speech-to-text tasks Signed-off-by: Piotr Żelasko * Apply suggestions from code review, part 1 Co-authored-by: Nithin Rao Signed-off-by: Piotr Żelasko * Apply suggestions from code review, part 2 Signed-off-by: Piotr Żelasko * Document compute_loss Signed-off-by: Piotr Żelasko * update transcribe_speech.py Signed-off-by: stevehuang52 * add docstring Signed-off-by: stevehuang52 * Attention encoder-decoder models for multiple speech-to-text tasks Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Signed-off-by: stevehuang52 Signed-off-by: smajumdar Signed-off-by: Krishna Puvvada Signed-off-by: Piotr Żelasko Co-authored-by: stevehuang52 Co-authored-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Co-authored-by: Krishna Puvvada Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Nithin Rao (cherry picked from commit d10726da72f74eb5a95056843d1f9e2562a5051c) Co-authored-by: Piotr Żelasko * add code for calling mcore_retro in NeMo * add code for calling mcore_retro in NeMo * runnable, training curve match retro mcore and nemo * working on retro inference * working on megatron_retro_eval.py and megatron_retro_inference.yaml * refactoring text_generation_utils code and retro inference relevant files * clean PR * resolving quick hacks (reading number of train/valid samples from workdir, discrepancy in total samples and samples with neighbors retrieved, tokenizers) * clean repository * revert changes to inference/eval code to original in main * clean code * runable training code, with already implemented eval code * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * Add Bert HF checkpoint converter (#8088) * Add Bert HF checkpoint converter Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reformat Signed-off-by: yaoyu-33 * Add BERT ONNX export * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add NeMo BERT to HF BERT script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clean code Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update argument names Signed-off-by: yaoyu-33 * Update build_transformer_config in Bert Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen * revert to original eval code files * revert to original eval code files 2 * revert to original eval code files 3 * revert to original eval code files 4 * clean code * clean code * update my code to support changes from lastest main * commit before rebase r1.23.0 * Multimodal r1.23.0 bug fix (#8315) * Rename quick-gelu Signed-off-by: yaoyu-33 * ddpm config guard Signed-off-by: yaoyu-33 * Fix ddpm edit api Signed-off-by: yaoyu-33 * Fix insert_image_token cfg issue Signed-off-by: yaoyu-33 * neva updates Signed-off-by: yaoyu-33 * reformat Signed-off-by: yaoyu-33 * Add back jenkins Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix jenkins Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix bugs Signed-off-by: yaoyu-33 * Update default neva template Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * copy paste files from r1.23.0 * clean PR * Fixes for MoE parameter passing & use of AutoTokenizer/Model for mistral. 
(#8272) Signed-off-by: Alexandros Koumparoulis * Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP>1 (#8334) Signed-off-by: Sangkug Lym Co-authored-by: Eric Harper * Remove asr webapp (#8347) Signed-off-by: smajumdar * remove _target_ at model level in aed config (#8351) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada * revert changes for tts and asr * Add change_vocabulary and save_tokenizers() support to Multitask ASR models (#8357) * Add change_vocabulary and save_tokenizers() support Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update nemo/collections/asr/models/aed_multitask_models.py Co-authored-by: Piotr Żelasko Signed-off-by: Somshubra Majumdar --------- Signed-off-by: smajumdar Signed-off-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko * Change default (#8371) Signed-off-by: smajumdar * implement retro's own fwd_bwd_step() and validation_step() to not have argument first_val_step, which the MLM commit doesn't support * adding megatron compile_helpers(), in future can be fixed with correct MLM commit * bug fix in fast-conformer-aed.yaml and adding jenkins test for speech_to_text_aed model (#8368) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada Co-authored-by: Somshubra Majumdar * Enable megatron core loggers for GPT pretraining (#8354) * Logging changes tested for gpt_pretraining Signed-off-by: Aishwarya Bhandare * Additional args Signed-off-by: Aishwarya Bhandare * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Aishwarya Bhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * mcore ds fix (#8283) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * mcore ds fix Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore Signed-off-by: dimapihtar * revert asr files Signed-off-by: dimapihtar * add comments Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * fix Bert unit tests Signed-off-by: dimapihtar * update bert tests Signed-off-by: dimapihtar * fix bert mcore test Signed-off-by: dimapihtar * fix gpt jenkins tests Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update apex & TE commits Signed-off-by: dimapihtar * revert apex installation Signed-off-by: dimapihtar * turn off the fusion for jenkins Signed-off-by: dimapihtar --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay * addressing Eric's reviews * adding existing implementation RETRO files * adding existing implementation RETRO files * Add Finetuning tutorial with HF Datasets (#8356) * Add Finetuning tutorial with HF Datasets Signed-off-by: Nithin Rao Koluguri * update on Som comments Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * release updates (#8378) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * mcore ds fix Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore Signed-off-by: dimapihtar * revert asr files Signed-off-by: dimapihtar * add comments Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * fix Bert unit tests Signed-off-by: dimapihtar * update bert tests Signed-off-by: dimapihtar * fix bert mcore test Signed-off-by: dimapihtar * fix gpt jenkins tests Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for dict data input type Signed-off-by: dimapihtar * add mock ds test Signed-off-by: dimapihtar * add test for dict data input type Signed-off-by: dimapihtar * mcore ds fix Signed-off-by: dimapihtar * data input fix Signed-off-by: dimapihtar --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay * MCore dataset compatibility for tokenizers (#8390) * Add unique_identifiers for all tokenizers and eod for SentencePieceTokenizer Signed-off-by: Valerie Sarge * Add generalized token aliases to TokenizerSpec to conform with MegatronTokenizer's interface. Remove now-redundant individual fixes from AutoTokenizer and SentencePieceTokenizer. Signed-off-by: Valerie Sarge --------- Signed-off-by: Valerie Sarge Co-authored-by: Pablo Garay * Mcore customization doc (#8298) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * Add Bert HF checkpoint converter (#8088) * Add Bert HF checkpoint converter Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reformat Signed-off-by: yaoyu-33 * Add BERT ONNX export * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add NeMo BERT to HF BERT script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clean code Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update argument names Signed-off-by: yaoyu-33 * Update build_transformer_config in Bert Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen * initial placeholder Signed-off-by: Huiying Li * add to intro/index.rst Signed-off-by: Huiying Li * initial content update Signed-off-by: Huiying Li * add diff images Signed-off-by: Huiying Li size Signed-off-by: Huiying Li * minor fixes * minor language change Signed-off-by: Chen Cui * clean changes --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: yaoyu-33 Signed-off-by: Huiying Li Signed-off-by: Huiying Li Signed-off-by: Chen Cui Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen Co-authored-by: Huiying Li Co-authored-by: Chen Cui * wer fix (#8404) Signed-off-by: Travis Bartley * updated link to pubmed (#8402) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * Update NFA video download link (#8406) * update nfa nasa video link Signed-off-by: Elena Rastorgueva * update link in markdown Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva * revert changes (#8410) Signed-off-by: Chen Cui * Fix dreambooth data sampler issue (#8400) * Turn on drop last Signed-off-by: yaoyu-33 * Some neva fixes Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fixed errors in the CTM gen functions (#8416) Signed-off-by: Taejin Park * add ensemble decoding fix (#8427) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * SDE bugfix log (#8430) Signed-off-by: George * mcore customization doc minor fix (#8421) Signed-off-by: Huiying Li * NeMo-Mistral to HF converter bugfix. 
(#8353) Signed-off-by: Alexandros Koumparoulis * Fixing mcore bert for TP, PP and SP (#8336) * Fixing mcore bert for TP, PP and SP * Fixing mcore bert for TP, PP and SP * Fixing mcore version * Fixing mcore version * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> --------- Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Eric Harper * Add settings to suppress bf16 compile errors in CI on V100 (#8481) * Add settings to suppress bf16 compile errors in CI on V100 Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Abhishree Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * MoE parameter passing (#8255) * MoE parameter passing Signed-off-by: Alexandros Koumparoulis * Pass EP/MoE params in consumer scripts. Signed-off-by: Alexandros Koumparoulis * PR fixes Signed-off-by: Alexandros Koumparoulis * Use latest commit of mcore-0.5 Signed-off-by: Alexandros Koumparoulis * CI fix Signed-off-by: Alexandros Koumparoulis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Alexandros Koumparoulis Co-authored-by: Alexandros Koumparoulis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update k2 version (#8478) (#8492) Signed-off-by: Vladimir Bataev * Add fp8 support for SD/Update notebook paths (#8489) * Add fp8 support for SD/Update notebook paths Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * pin to 0.5.0 (#8465) Signed-off-by: eharper * Update NeMo Multimodal Requirements (#8515) * Update requirements_multimodal.txt Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update github raw content link (#8517) Signed-off-by: Chen Cui * Add dep notice for notebooks (#8522) * add dep notice Signed-off-by: eharper * revert Signed-off-by: eharper --------- Signed-off-by: eharper * Revert FP8 integration (#8520) * Revert FP8 integration Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update data prep notebook (#8532) Signed-off-by: Mingyuan Ma * before update branch with latest r1.23.0 * update to run with MLM ae2817b3dde4efb1515061a5311d01d8f85bd99c (runnable training and saving checkpoint) * remove compile_helpers * reverse changes from main branch to r1.23.0 * adding *_legacy files * update MLM commit in Jenkinsfile to latest * debugging Jenkinstest: test different mcore import in retro_dataset * update Jenkinsfile edit 
megatron_retro_mutransfer_pretrain_legacy.py * removing all mcore RETRO to pass the Jenkinstest * fixing import legacy problem for tests/collections/nlp/test_indexed_retrieval_dataset.py * update Jenkinsfile file to use TE v0.7 * update NeMo to work with latest mcore RETRO (solving TE problems) * update TE commit Jenkinsfile to be the same with r1.23.0's Jenkinsfile * update commit for MLM * jenkinstest debugging * temporary fix RETRO's __init__ for jenkinstest * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * add model.data.dataloader_type=cyclic to jenkinsfile * update code to work with latest megatron-lm main 81dab6067 * update M-LM commit in Jenkinsfile to latest main M-LM 81dab6067 * fix to by pass CI test bf16 problem (following this PR https://github.com/NVIDIA/NeMo/pull/8481/files) * isort and black * adjusting model.micro_batch_size to 1 * fix BRANCH = 'r1.23.0' * replace tutorials dir from main branch to huvu/mcore_retro * fix minor merges conflict * update Jenkinsfile * runnable with a temporary fix from Jacek (unfound -unfinished problem) * runnable with a temporary fix from Jacek (unfound -unfinished problem) * modified nlp_overrides.py back to original * fix checkpoint from Jacek Bieniusiewicz * config Jenkinsfile test * set RETRO Jenkins MBS to 1 * black fix * isort fix * update TE commit * update to latest Jenkinsfile with latest container and commits * remove new RETRO jenkinstest * merge latest main * put RETRO Jenkinstest to the right place * update code for megatron_retro_pretraining_legacy.py * untrack ipa_cmudict-0.7b_nv23.01.txt * untrack ipa_cmudict-0.7b_nv23.01.txt * set config in megatron_retro_pretraining_legacy.py to megatron_retro_config_legacy * update new RETRO jenkinstest to run faster * merging latest main, and edit Jenkinstest * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * huvu/mcore_retro_docs first commit * update with main * update RETRO docs * fix scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt * update docs * update docs * udpate RETRO docs * update with Jennifer's comments --------- Signed-off-by: eharper Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Signed-off-by: Piotr Żelasko Signed-off-by: Elena Rastorgueva Signed-off-by: Nithin Rao Koluguri Signed-off-by: Jimmy Zhang Signed-off-by: Chen Cui Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: yaoyu-33 Signed-off-by: Alexandros Koumparoulis Signed-off-by: Sangkug Lym Signed-off-by: smajumdar Signed-off-by: Krishna Puvvada Signed-off-by: Somshubra Majumdar Signed-off-by: Aishwarya Bhandare Signed-off-by: Dmytro Pykhtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Signed-off-by: Valerie Sarge Signed-off-by: Huiying Li Signed-off-by: Huiying Li Signed-off-by: Travis Bartley Signed-off-by: Taejin Park Signed-off-by: George Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Signed-off-by: Abhishree Signed-off-by: Vladimir Bataev Signed-off-by: Mingyuan Ma Co-authored-by: eharper Co-authored-by: mikolajblaz Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang 
<1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Co-authored-by: Nithin Rao Co-authored-by: Somshubra Majumdar Co-authored-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jimmy Zhang Co-authored-by: Chen Cui Co-authored-by: Huy Vu2 Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Bobby Chen Co-authored-by: akoumpa <153118171+akoumpa@users.noreply.github.com> Co-authored-by: Sangkug Lym Co-authored-by: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Co-authored-by: Krishna Puvvada Co-authored-by: ashbhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: Dmytro Pykhtar Co-authored-by: Pablo Garay Co-authored-by: Valerie Sarge Co-authored-by: Huiying Co-authored-by: Huiying Li Co-authored-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Co-authored-by: Taejin Park Co-authored-by: George <37293288+Jorjeous@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Alexandros Koumparoulis Co-authored-by: Vladimir Bataev Co-authored-by: Ming <111467530+Victor49152@users.noreply.github.com> Co-authored-by: Huy Vu2 --- .../nlp/nemo_megatron/retro/retro_model.rst | 512 ++++-------------- .../{retro => retro_legacy}/images/arch.png | Bin .../retro_legacy/retro_model_legacy.rst | 469 ++++++++++++++++ 3 files changed, 574 insertions(+), 407 deletions(-) rename docs/source/nlp/nemo_megatron/{retro => retro_legacy}/images/arch.png (100%) create mode 100644 docs/source/nlp/nemo_megatron/retro_legacy/retro_model_legacy.rst diff --git a/docs/source/nlp/nemo_megatron/retro/retro_model.rst b/docs/source/nlp/nemo_megatron/retro/retro_model.rst index e490b70797d4..5bd7f03f77ac 100644 --- a/docs/source/nlp/nemo_megatron/retro/retro_model.rst +++ b/docs/source/nlp/nemo_megatron/retro/retro_model.rst @@ -1,281 +1,92 @@ -NeMo RETRO Model +RETRO Model ================ -The Retrieval-Enhanced Transformer (RETRO) model is an autoregressive language model that takes into account document chunks retrieved from a large -corpus when making predictions. The RETRO model has a similar architecture to the GPT model, but it includes an encoder that encodes the retrieved -context and cross-attention layers that integrate the context to improve the model's output. Below is a simple diagram of the RETRO model architecture. +The Retrieval-Enhanced Transformer (RETRO) `(Borgeaud et al., 2022) `_ is an autoregressive decoder-only language model (LM) +pretrained with retrieval-augmentation. +RETRO features practical scalability to support large-scale pretraining from scratch by retrieving from trillions of +tokens. +Pretraining with retrieval provides a more efficient storage mechanism of factual knowledge, when compared to storing factual knowledge implicitly within the network's parameters. This approach significantly reduces the model's parameter count while achieving lower perplexity than the standard GPT model. 
+RETRO also provides the flexibility to update the +knowledge stored in LMs `(Wang et al., 2023a) `_ +by updating the retrieval database without training LMs again. -.. image:: images/arch.png - :align: center - :width: 800px - :alt: RETRO model architecture +For the legacy native NeMo RETRO model documentation, please see `NeMo RETRO Model (Legacy) `_. -For more detailed information on the model, please refer to the `RETRO paper `_ :cite:`nlp-retro-borgeaud2021improving` by Deepmind. -The NeMo RETRO Model is an open-source implementation of the paper, and it has the following differences/features compared to Deepmind's proposed implementation: - -1. The NeMo RETRO Model is built on top of NeMo Megatron code, allowing for efficient training of large language models in a cluster environment. -2. The NeMo RETRO Model uses `Faiss `_ :cite:`nlp-retro-jegou2022faiss` as the K$N search library, which can be accelerated by GPUs. -3. The NeMo RETRO uses `RoPe relative positional encoding `_ :cite:`nlp-retro-su2021roformer`. -4. The NeMo RETRO uses `SentenceTransformers `_ :cite:`nlp-retro-reimers2019sentence` as the retriever encoder. -5. The NeMo RETRO supports `mu-Transfer `_ :cite:`nlp-retro-yang2022tensor`, allowing for scalable training of the RETRO model via Zero-Shot Hyperparameter Transfer. - -Quick start +Quick Start ************ -Steps below demonstrate training and evaluating a NeMo RETRO model +The following instructions demonstrate how to preprocess the data as well as train and evaluate a RETRO model. -Data pre-processing +Data Preprocessing ------------------- -Step 1: Collect training data -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The RETRO model uses two types of data: training data, which typically consists of 64-token chunks, and retrieval data, which typically consists of 128-token chunks. -The training data is used to train the model, while the retrieval data is used to supplement the language model. -It's possible to use the same data for both training and retrieval, as long as duplicates are removed properly, as described below. -Both types of data are stored in a loose JSON format, with each line containing a single text sample. For example: - -.. code-block:: json - - {"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"} - {"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"} - -The name of the text field of the json can be changed by using the ``--json-key`` flag in ``preprocess_data_for_megatron.py``. The other metadata are optional and are not used in training. +For detailed information on data preprocessing, refer to the `Megatron-LM Github `_ repository. This repository contains scripts and comprehensive instructions for the entire preprocessing procedure, specifically focusing on `RETRO Data Preparation `_. The main stages of the process are summarized below. -Step 2: Convert training data into memory map format -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The outcome of the preparation step yields a processed RETRO data directory, fully primed for pre-training. Specifically, this directory encompasses the following key files and subdirectories: -The loose json is then processed into a binary format for training and retrieval. To convert the json into mmap, cached index file. -Set the ``--dataset-impl`` flag to `retmmap`, which is the memory map format dedicated for RETRO model. - -An example script to prepare data for RETRO training is: - -.. 
code-block:: bash - - python scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ - --input=/dataset/pubmed_train.jsonl \ - --json-keys=text \ - --tokenizer-library=megatron \ - --apply-ftfy \ - --dataset-impl=retmmap \ - --merge-file=/dataset/gpt2-merges.txt \ - --vocab-file=/dataset/gpt2-vocab.json \ - --tokenizer-type=GPT2BPETokenizer \ - --output-prefix=/result/pubmed_train \ - --need-pad-id \ - --append-eod \ - --retrieval-db \ - --chunk_size=64 \ - --workers=48 - -The RETRO model processes chunked documents using 64 tokens as the default chunk size. The RETRO memory map dataset will add padding -tokens to the end of each document to make it a multiple of 64. The ``--need-pad-id`` argument adds a padding token to the tokenizer -if it doesn't already have one. The ``--append-eod`` argument controls whether to add ``end-of-document`` tokens to the preprocessed -data, and the ``--retrieval-db`` argument indicates whether to create a retrieval database for the preprocessed data. If ``--retrieval-db`` -is used, it will add an additional 64 padding tokens at the end of the document. The ``--chunk_size`` and ``--workers`` arguments -control the size of the data chunks to be processed and the number of worker processes to use, respectively. - -Following is the retro memory map index data format: - -.. list-table:: - :widths: 25 25 25 25 25 25 - - * - 'MMIDRET\x00\x00' (header 9 bytes) - - 1 (version 8 byte) - - dtype code :sup:`1` (1 byte) - - sentence count (8 byte) - - chunk size (8 byte) - - chunk count (8 byte) - * - retrieved db :sup:`2` (1 byte) - - number of tokens for each of sentences ( int32 array) - - start of sentence address in byte (int64 array) - - start of chunk id (int64 array) - - chunk id address in byte (int64 array) - - - -:sup:`1` 1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float64, 7: np.double, 8: np.uint16 - -:sup:`2` When building the indexed dataset, we pad each sentence to be a multiple of ``chunk_size`` with ``pad_id`` from the tokenizer. -The number of tokens for each sentence includes the padded token ids. For retrieval data, there is an extra ``chunk_size`` padding at -the end of each sentence, and the ``retrieved_db`` flag is set to True. However, the number of tokens for each sentence excludes this extra ``chunk_size`` padding. - -Following is the retro memory map binary data format: - -.. list-table:: - :widths: 65 - - * - token id array for sentence 0,1, 2 ... (dtype :sup:`3` array) - -:sup:`3` np.uint16 vocab_size < 65500 else np.int32 - -Step 3: Create Faiss index for retrieval data -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -After creating the memory map retrieval data binary file and index files, we can build a Faiss index that can quickly find the K-nearest neighbors of a given -chunk ID based on a query embedding vector. Because the retrieval data is typically very large, we break this process down into three steps. - -Step 3.1: Train the Faiss index structure -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In this step, it uses a subset of the retrieval data to train a empty Faiss index. An example script is: - -.. code-block:: bash +* ``config.json``: contains the hyperparameters used in the data preparation step, which will then be retrieved to use in the pre-training step for consistency. For example: sample length, chunk length, data splits, tokenizer files, etc. +* ``data``: contains the original data before any preprocessing. +* ``tokenizer``: contains tokenizer files used in the preparation step. 
+* ``db``: contains the chunk database of processed and chunked text used for retrieving neighbors. +* ``index``: contains the Faiss index of the chunk database for retrieval. +* ``query``: contains the queried neighboring chunks for all training samples. - python scripts/nlp_language_modeling/build_retrieval_index.py \ - --input_file=/result/pubmed_train_text_document \ - --tokenizer-library=megatron \ - --tokenizer-type=GPT2BPETokenizer \ - --merge-file=/dataset/gpt2-merges.txt \ - --vocab-file=/dataset/gpt2-vocab.json \ - --percent=1.0 \ - --sentence_transformer_model=all-mpnet-base-v2 \ - --batch_size=1024 \ - --train_index_size=2000000 \ - --workers=2 \ - --devices=0,1,2,3,4,5,6,7 \ - --stage=0 \ - --output_file=/result/pubmed_faiss_learn.index - -This command is used to build an empty Faiss index using the 2000000 training data in ``pubmed_train_text_document``. -The ``all-mpnet-base-v2`` sentence transformer model is used to encode the chunk tokens into an embedding vector. -The index will be saved in the result directory as ``pubmed_faiss_learn.index``. This command specifies using 8 GPUs to train the Faiss index. - -Step 3.2: Add retrieval data into sharding index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This step adds all the retrieval data to the empty Faiss index created in the previous step. An example script is: -.. code-block:: bash +The data preparation process contains the following main stages: - python scripts/nlp_language_modeling/build_retrieval_index.py \ - --input_file=/result/pubmed_train_text_document \ - --tokenizer-library=megatron \ - --tokenizer-type=GPT2BPETokenizer \ - --merge-file=/dataset/gpt2-merges.txt \ - --vocab-file=/dataset/gpt2-vocab.json \ - --percent=1.0 \ - --sentence_transformer_model=all-mpnet-base-v2 \ - --batch_size=1024 \ - --shard_id=0 \ - --total_shards=10 \ - --workers=2 \ - --devices=0,1,2,3,4,5,6,7 \ - --stage=1 \ - --learned_index=/result/pubmed_faiss_learn.index \ - --output_file=/result/pubmed_faiss_shard0.save - -This command breaks the retrieval data into ``total_shards`` shards and adds the data in the shard specified by ``shard_id``. -The result is saved to a file specified by ``output_file``. In the example above, 10 sharding indexes are created. - -Step 3.3: Merge the sharding indexes into final Faiss index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This step merges all the sharding indexes created in the previous step into the final Faiss index. An example script is: +Build Retrieval Chunk Database +############################## -.. code-block:: bash +This stage involves creating a database of text chunks from a corpus such as Wikipedia to be used for retrievals. The chunks are non-overlapping and extracted from the original GPT token dataset, with each chunk traditionally being 64 tokens in length. The database is stored as a 2-D array and is not a relational database. - python scripts/nlp_language_modeling/build_retrieval_index.py \ - --stage=2 \ - --devices=0,1,2,3,4,5,6,7 \ - --learned_index=/result/pubmed_faiss_learn.index \ - --shard_index_input=/result/pubmed_faiss_shard \ - --output_file=/result/pubmed_faiss_final.index +The main output of this stage is: -Step 4: Build KNN index -^^^^^^^^^^^^^^^^^^^^^^^ +* ``/db/merged/train.hdf5``: the database containing all processed and chunked text. +* ``/db/merged/sampled.hdf5``: the database containing a small portion of all chunks, only used for training the index in the next stage. 
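To make the layout of the chunk database concrete, the sketch below shows the core idea: every document is padded to a multiple of the chunk length and stored as rows of a 2-D token array. This is an illustration only, not the Megatron-LM preparation code; the ``pad_id`` value, file name, and HDF5 dataset key are hypothetical.

.. code-block:: python

    # Illustrative sketch of a chunk database: a 2-D array with one
    # fixed-length chunk of token IDs per row.
    import numpy as np
    import h5py

    CHUNK_LENGTH = 64  # tokens per chunk

    def chunk_document(token_ids: np.ndarray, pad_id: int) -> np.ndarray:
        """Split one tokenized document into non-overlapping chunks,
        padding the tail so every row has exactly CHUNK_LENGTH tokens."""
        n_chunks = -(-len(token_ids) // CHUNK_LENGTH)  # ceiling division
        padded = np.full(n_chunks * CHUNK_LENGTH, pad_id, dtype=np.int32)
        padded[: len(token_ids)] = token_ids
        return padded.reshape(n_chunks, CHUNK_LENGTH)

    # Two toy "documents" written to a toy HDF5 file.
    docs = [np.arange(150, dtype=np.int32), np.arange(40, dtype=np.int32)]
    chunks = np.concatenate([chunk_document(d, pad_id=0) for d in docs])
    with h5py.File("toy_chunk_db.hdf5", "w") as f:
        f.create_dataset("chunks", data=chunks)  # dataset key is illustrative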
-During training, it is inefficient to run a query to find the K-nearest neighbor chunk IDs for each training data point. -This can be pre-calculated by building a KNN index before training. The KNN index maps the training data chunk IDs to the K-nearest neighbor chunk IDs -in the retrieval data. As with building the Faiss index, this process is divided into two steps. +Build Index for Similarity Search +################################# -Following is the KNN index data format: +The second stage is to build a search index using Faiss, a library for efficient similarity search. The index is trained on a subset of the chunks ``sampled.hdf5`` from the database. After training, all chunks are added to the index to enable querying. The index accepts 1-D floating point vectors, so chunks must be embedded using Bert embeddings before they can be added to the index. Particularly, the stage is comprised of two sub-stages: -.. list-table:: - :widths: 25 25 25 25 45 + \- Extract BERT embeddings from the sampled chunk database (``sampled.hdf5``) and use them to train a Faiss index. - * - 'KNNRETM\x00\x00' (header 9 bytes) - - 1 (version 8 byte) - - K number of neighbors (8 byte) - - Number chunks (8 byte) - - Map to K retrieval data chunk IDs, shape (number_chunks, K) ( int64 array) + \- Extract BERT embeddings for each chunk in the all chunks database (``train.hdf5``) and add them to the trained Faiss index. -Step 4.1: Build KNN sharding index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The main output of this stage is: -The KNN index is built using the memory-mapped training data created by the ``preprocess_data_for_megatron.py`` script and the Faiss index -file for the retrieval data built by the ``build_retrieval_index.py`` script. +* ``/index///added.faissindex``: the trained index, with all chunks in the database added to it -An example script is: +Query Pretraining Neighbors +########################### -.. code-block:: bash +To speed up the RETRO pretraining process, you pre-retrieve neighbors for all training samples instead of retrieving them on-the-fly. In this stage, the pretraining datasets are processed to find and save k-nearest neighbors for each chunk in each sample. The neighbors are saved to disk and labeled with unique properties to ensure they match the pretraining configuration. Query-time hyperparameters can be tuned to improve the quality of the neighbors. - python scripts/nlp_language_modeling/build_knn_map_index.py \ - --input_file=/result/pubmed_eval_text_document \ - --tokenizer-library=megatron \ - --tokenizer-type=GPT2BPETokenizer \ - --merge-file=/dataset/gpt2-merges.txt \ - --vocab-file=/dataset/gpt2-vocab.json \ - --process_chunk_size=10000 \ - --sentence_transformer_model=all-mpnet-base-v2 \ - --batch_size=1024 \ - --K_neighbors=50 \ - --workers=2 \ - --devices=0,1,2,3,4,5,6,7 \ - --remove_duplicate \ - --dedup_margin=70 \ - --nprobe=100 \ - --shard_id=0 \ - --total_shards=10 \ - --stage=1 \ - --output_file=/dataset/pubmed_knn_shard0.save \ - --faiss_index=/result/pubmed_faiss_final.index - -In this example, the training data is broken into ``total_shards`` shards, and the KNN index is calculated for the shard specified by ``shard_id``. -The result is saved to a file specified by ``output_file``. In the example above, 10 KNN sharding indexes are created. - -Use the ``remove_duplicate`` flag if the training data and retrieval data are the same to remove neighbors from the same document. 
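As a rough illustration of these two sub-stages (and of the neighbor queries used in the next stage), the sketch below trains a small Faiss index on a sampled subset, adds all chunk vectors, and runs a k-nearest-neighbor search. The random vectors stand in for BERT chunk embeddings, and the dimension, index type, and sizes are illustrative rather than the production configuration.

.. code-block:: python

    import numpy as np
    import faiss

    dim = 768  # embedding dimension (illustrative)
    rng = np.random.default_rng(0)
    sampled = rng.random((5_000, dim), dtype=np.float32)      # stands in for sampled.hdf5 embeddings
    all_chunks = rng.random((20_000, dim), dtype=np.float32)  # stands in for train.hdf5 embeddings

    # Sub-stage 1: train the index structure on the sampled subset.
    quantizer = faiss.IndexFlatL2(dim)
    index = faiss.IndexIVFFlat(quantizer, dim, 128)  # 128 coarse clusters (illustrative)
    index.train(sampled)

    # Sub-stage 2: add every chunk embedding and save the result.
    index.add(all_chunks)
    faiss.write_index(index, "added.faissindex")

    # Neighbor query: k nearest chunk IDs for a batch of query embeddings.
    distances, neighbor_ids = index.search(all_chunks[:4], 2)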
- -Step 4.2: Merge KNN sharding index into final KNN index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -An example script is: +The main output of this stage is: -.. code-block:: bash +* ``train_``: directory containing retrieved neighbors for all training samples. +* ``valid_``: directory containing retrieved neighbors for all validation samples. - python scripts/nlp_language_modeling/build_knn_map_index.py \ - --stage=2 \ - --output_file=pubmed_knn_final.save \ - --shard_index_input=pubmed_knn_shard -Train NeMo RETRO Model +Train RETRO Model ----------------------- -Once the training data, retrieval data, KNN index, and Faiss index are prepared, we are ready to train the RETRO model. In the NeMo implementation, -the RETRO model can be pre-trained with or without the `mu-Transfer `_ :cite:`nlp-retro-yang2022tensor` feature. We will introduce both ways. - +Once the training samples, pre-retrieved neighbors, and other data are prepared, you are ready to train the RETRO model. The training process will use the output directory from the data preparation step. We set the path to this directory with the ``retro.retro_project_dir`` argument. Many of the data hyperparameters will be retrieved from the ``config.json`` file in this directory, including data splits, sequence length, chunk length, number of training and validation samples, tokenizer, etc. -The table below lists some of the common parameters that can be configured for model pre-training. +The table below lists some of the common architecture and optimizer parameters that can be configured for model pre-training. Many of these values are set in ``examples/nlp/language_modeling/conf/megatron_retro_config.yaml``, which is used when training unless overridden on the command line. Note that, unlike other NeMo models, the `model.data.data_prefix` value is set to None, because all data information will be retrieved from `model.retro.retro_project_dir`.
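Since the project directory drives most of the data settings, a quick way to see what was recorded at preparation time is simply to read that ``config.json``. The path below is a placeholder, and no particular key names are assumed; the exact set of keys depends on the preparation run.

.. code-block:: python

    # Inspect the preparation-time settings carried by a RETRO project directory.
    import json
    from pathlib import Path

    project_dir = Path("/path/to/retro_workdir")  # placeholder path
    with open(project_dir / "config.json") as f:
        prep_config = json.load(f)

    # The keys cover items such as data splits, chunk length, and tokenizer files.
    print(sorted(prep_config))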
+----------------------------------+-------------+----------------------------------------------------------------------------------------+ | **Parameter** | **Default** | **Description** | +==================================+=============+========================================================================================+ -| model.micro_batch_size | 4 | the micro batch size used for training | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.tensor_model_parallel_size | 1 | tensor model parallel size | +| retro_data.retro_chunk_length | 64 | the chunk size used to retrieve | +----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.encoder_seq_length | 2048 | token sequence length | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.chunk_size | 64 | the chunk size used to retrieve | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.enc_num_layers | 4 | total number of encoder layers | +| retro.retro_num_neighbors | 2 | number of neighbor chunks to retrieve for each chunk | +----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.dec_num_layers | 6 | total number of decoder layers | +| retro_encoder_num_layers | 2 | total number of encoder layers | +----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.enc_cross_attention | [3] | layer numbers for cross attention in encoder | +| model.num_layers | 12 | total number of decoder layers | +----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.dec_cross_attention | [3,4,5] | layer numbers for chunked cross attention in decoder | -+----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.add_position_embedding | FALSE | whether to add the absolute position encoding | +| model.encoder_seq_length | 2048 | token sequence length | +----------------------------------+-------------+----------------------------------------------------------------------------------------+ | model.hidden_size | 768 | model hidden size | +----------------------------------+-------------+----------------------------------------------------------------------------------------+ | model.ffn_hidden_size | 3072 | transformer FFN hidden size (usually 4 * hidden_size) | +----------------------------------+-------------+----------------------------------------------------------------------------------------+ | model.num_attention_heads | 12 | number of attention heads | +----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.init_method_std | 0.02 | standard deviation of the zero mean normal distribution used for weight initialization | +| model.init_method_std | 0.023 | standard deviation of the zero mean normal distribution used for weight initialization |
+----------------------------------+-------------+----------------------------------------------------------------------------------------+ | model.hidden_dropout | 0.1 | dropout probability for hidden state transformer | +----------------------------------+-------------+----------------------------------------------------------------------------------------+ | model.attention_dropout | 0.1 | dropout probability in the attention layer | +----------------------------------+-------------+----------------------------------------------------------------------------------------+ -| model.ffn_dropout | 0 | dropout probability in the feed-forward layer | +| model.ffn_dropout | 0.1 | dropout probability in the feed-forward layer | +----------------------------------+-------------+----------------------------------------------------------------------------------------+ - -Option 1: Train the NeMo RETRO model *without* mu-Transfer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -An example RETRO pre-training script is: - -.. code-block:: bash - - python examples/nlp/language_modeling/megatron_retro_pretraining.py \ - trainer.devices=8 \ - trainer.num_nodes=2 \ - trainer.accelerator=gpu \ - trainer.max_steps=800000 \ - trainer.precision=16 \ - exp_manager.exp_dir=/result/retro_model \ - model.apply_query_key_layer_scaling=False \ - model.tensor_model_parallel_size=8 \ - model.optim.name=adamw \ - model.enc_num_layers=2 \ - model.dec_num_layers=32 \ - model.enc_cross_attention=[0] \ - model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ - model.hidden_size=4096 \ - model.ffn_hidden_size=16384 \ - model.num_attention_heads=32 \ - model.tokenizer.merge_file=/dataset/gpt2-merges.txt \ - model.tokenizer.vocab_file=/dataset/gpt2-vocab.json \ - model.data.data_prefix=[/result/pubmed_eval_text_document] \ - model.data.knn_index=[dataset/pubmed_knn_final.save] \ - model.data.retrieval_prefix=/result/pubmed_eval_text_document \ - model.micro_batch_size=8 - -During the training, launch Tensorboard to monitor training like so: - -.. code-block:: bash - - tensorboard --logdir /result/retro_model --bind_all - -.. note:: Weights and Biases (WandB) is supported too. Add ``exp_manager.create_wandb_logger=True`` to the model training arguments to enable it. - -After the training, the model nemo file can be found at the result checkpoint directory. - -Option 2: Train the NeMo RETRO model *with* mu-Transfer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -`mu-Transfer `_ :cite:`nlp-retro-yang2022tensor` paper proposed a method to zero-shot transfer hyperparameter to train a larger model. -This can be done in 3 steps in NeMo RETRO implementation. - - -Step 1. find optimal hyper parameter for a small base model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Use the pre-training code in Option 1, either manually or automatically ind a set of optimal hyperparameter for a small base RETRO -model. This is can be done cheaply ans fast due to the small model size. - -Step 2. calculate the shape file that can be used to run mu-Transfer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The shape file determines which hyperparameters will be scaled up, allowing the model to adjust the learning rate, weight scaling factor, etc. - -Here is an example shape file calculation script: - +The following example shows a RETRO pre-training script. The rest of the argument values are retrieved from ``examples/nlp/language_modeling/conf/megatron_retro_config.yaml``. .. 
code-block:: bash - python examples/nlp/language_modeling/megatron_retro_cal_shape.py \ - trainer.devices=8 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - exp_manager.exp_dir=/result/retro_model \ - base_model.enc_num_layers=2 \ - delta_model.enc_num_layers=2 \ - base_model.dec_num_layers=32 \ - delta_model.dec_num_layers=32 \ - base_model.tensor_model_parallel_size=8 \ - delta_model.tensor_model_parallel_size=8 \ - base_model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ - delta_model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ - base_model.enc_cross_attention=[0] \ - delta_model.enc_cross_attention=[0] \ - base_model.hidden_size=768 \ - base_model.ffn_hidden_size=3072 \ - delta_model.hidden_size=96 \ - delta_model.ffn_hidden_size=384 \ - base_model.num_attention_heads=16 \ - delta_model.num_attention_heads=16 \ - model.shape_file=tp8_32depth_o1_rel_shape_info.yaml - -In this example, the ``base_model`` refers to the small base model for which an optimal set of hyperparameters has been determined. -The ``delta_model`` refers to a model with certain hyperparameters that have been scaled up or down. In this case, -the ``hidden_size`` and ``ffn_hidden_size`` have been changed in the ``delta_model``, allowing these two parameters to be scaled freely later. - -Step 3. Pretrain mu-Transfer RETRO model -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Once the shape file is created, we can start training a RETRO model. The model training can be scale up freely using the hyperparameters -specified by the delta model and the shape file. - -An example mu-Transfer pre-training script is: - -.. code-block:: bash - - python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ - trainer.devices=8 \ - trainer.num_nodes=2 \ - trainer.accelerator=gpu \ - trainer.max_steps=500000 \ - trainer.precision=16 \ - exp_manager.exp_dir=/result/retro_model \ - model.apply_query_key_layer_scaling=False \ - model.tensor_model_parallel_size=8 \ - model.optim.name=muadamw \ - model.enc_num_layers=2 \ - model.dec_num_layers=32 \ - model.enc_cross_attention=[0] \ - model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ - model.hidden_size=4096 \ - model.ffn_hidden_size=16384 \ - model.num_attention_heads=32 \ - model.tokenizer.merge_file=/dataset/gpt2-merges.txt \ - model.tokenizer.vocab_file=/dataset/gpt2-vocab.json \ - model.data.data_prefix=[/result/pubmed_eval_text_document] \ - model.data.knn_index=[dataset/pubmed_knn_final.save] \ - model.data.retrieval_prefix=/result/pubmed_eval_text_document \ - model.micro_batch_size=8 \ - model.shape_file=tp8_32depth_o1_rel_shape_info.yaml - -.. note:: We have chosen to use ``muadamw`` as the optimizer for use with the mu-transfer method. Currently, only ``muadam`` and ``muadamw`` are supported. - -Similarly to the pre-training in Option 1, the model nemo file can be found at the result checkpoint directory after training is complete. 
- -Run NeMo RETRO Model Inference + python /examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.num_nodes=1 \ + trainer.devices=8 \ + trainer.precision=bf16 \ + trainer.accelerator=gpu \ + trainer.max_steps=750000 \ + trainer.val_check_interval=10 \ + exp_manager.exp_dir=/path/to/exp_dir \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.megatron_amp_O2=True \ + model.retro.num_layers=12 \ + model.retro.retro_encoder_num_layers=2 \ + model.retro.retro_num_retrieved_chunks=2 \ + model.retro.retro_project_dir=/path/to/retro_workdir \ + model.micro_batch_size=4 \ + model.data.num_workers=4 \ + model.data.data_prefix=["none"] \ + model.data.shuffle_documents=False \ + model.data.dataloader_type=single \ + model.data.splits_string=\'98,2,0\' \ + model.optim.lr=6.0e-4 \ + model.optim.weight_decay=0.1 \ + model.optim.sched.name=CosineAnnealing \ + model.optim.sched.min_lr=6.0e-5 \ + model.optim.sched.max_steps=650000 \ + model.optim.name=distributed_fused_adam + +During training, we can monitor the process with Weights and Biases (WandB) by setting ``exp_manager.create_wandb_logger=True`` and setting the relevant WandB arguments. +After training, the model's distributed checkpoint directory can be found in the result checkpoint directory. + +Run RETRO Model Inference ------------------------------- -Once the NeMo RETRO model has been trained, we can put it into inference mode and experiment with it. -During inference, we are not limited to the static Faiss index that we built earlier for KNN queries. -We can feed any external data to the model as retrieval context. NeMo RETRO implementation supports dynamic retrieval service, -allowing users to add, reset, and query new documents on the fly. - -We have built a simple web client that makes it easy for users to play around with the model. Here is an example script to launch the server: +Once the RETRO model has been trained, you can put it into inference mode and experiment with it. +During inference, you are not limited to the indexed corpus to retrieve relevant chunks, but can directly provide any relevant contexts to the prompt through the argument ``neighbors``. +When performing inference, the input for RETRO differs structurally from that used during training. Specifically, the model's input consists of only two chunks: one for the prompt and another for the answer to be generated. Unlike during training, these chunks do not necessarily have a fixed length of 64 tokens; instead, they match the length of the tokenized prompt. When context neighbors are supplied for a prompt, these neighbors correspond to the first chunk and are processed through the RETRO encoder to generate text for the second chunk. +The following example shows a RETRO inferencing script. The rest of the argument values are retrieved from ``examples/nlp/language_modeling/conf/megatron_retro_inference.yaml``. ..
code-block:: bash - python examples/nlp/language_modeling/megatron_retro_eval.py \ - trainer.devices=8 \ - trainer.num_nodes=1 \ - trainer.accelerator=gpu \ - trainer.precision=16 \ - retro_model_file=megatron_retro.nemo \ - tensor_model_parallel_size=8 \ - pipeline_model_parallel_size=1 \ - retrieval_service.sentence_bert.devices=\'0,1,2,3,4,5,6,7\' \ - retrieval_service.services.0.faiss_devices=\'0,1,2,3,4,5,6,7\' \ - retrieval_service.services.1.faiss_devices=\'0,1,2,3,4,5,6,7\' \ - retrieval_service.services.0.faiss_index=/result/pubmed_faiss_final.index \ - retrieval_service.services.0.retrieval_index=/result/pubmed_eval_text_document \ - retrieval_service.neighbors=2 \ - retrieval_service.pad_tokens=True \ - retrieval_service.store_retrieved=True \ - server=True \ - web_server=True \ - share=True \ - username=test \ - password=test123 - -Set the retro_model_file to use the nemo file generated in the pre-training step. After launching the server, copy-paste the URL from -the terminal into your browser. Use the specified username and password to log in and have fun experimenting with the RETRO model. - -References -************ - -.. bibliography:: ../../nlp_all.bib - :style: plain - :labelprefix: nlp-retro - :keyprefix: nlp-retro- + python /examples/nlp/language_modeling/megatron_retro_eval.py \ + checkpoint_dir=/path/to/checkpoints \ + checkpoint_name=/checkpoint_name \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.accelerator=gpu \ + trainer.precision=32 \ + megatron_amp_O2=False \ + inference.tokens_to_generate=10 \ + inference.greedy=False \ + inference.add_BOS=False \ + inference.temperature=1.0 \ + inference.retro_inference.retro_num_neighbors=2 \ + prompt="sample prompt" \ + neighbors=["sample neighbor 1","sample neighbor 2"] diff --git a/docs/source/nlp/nemo_megatron/retro/images/arch.png b/docs/source/nlp/nemo_megatron/retro_legacy/images/arch.png similarity index 100% rename from docs/source/nlp/nemo_megatron/retro/images/arch.png rename to docs/source/nlp/nemo_megatron/retro_legacy/images/arch.png diff --git a/docs/source/nlp/nemo_megatron/retro_legacy/retro_model_legacy.rst b/docs/source/nlp/nemo_megatron/retro_legacy/retro_model_legacy.rst new file mode 100644 index 000000000000..e490b70797d4 --- /dev/null +++ b/docs/source/nlp/nemo_megatron/retro_legacy/retro_model_legacy.rst @@ -0,0 +1,469 @@ +NeMo RETRO Model +================ + +The Retrieval-Enhanced Transformer (RETRO) model is an autoregressive language model that takes into account document chunks retrieved from a large +corpus when making predictions. The RETRO model has a similar architecture to the GPT model, but it includes an encoder that encodes the retrieved +context and cross-attention layers that integrate the context to improve the model's output. Below is a simple diagram of the RETRO model architecture. + +.. image:: images/arch.png + :align: center + :width: 800px + :alt: RETRO model architecture + +For more detailed information on the model, please refer to the `RETRO paper `_ :cite:`nlp-retro-borgeaud2021improving` by Deepmind. +The NeMo RETRO Model is an open-source implementation of the paper, and it has the following differences/features compared to Deepmind's proposed implementation: + +1. The NeMo RETRO Model is built on top of NeMo Megatron code, allowing for efficient training of large language models in a cluster environment. +2. The NeMo RETRO Model uses `Faiss `_ :cite:`nlp-retro-jegou2022faiss` as the K$N search library, which can be accelerated by GPUs. +3. 
The NeMo RETRO uses `RoPe relative positional encoding `_ :cite:`nlp-retro-su2021roformer`. +4. The NeMo RETRO uses `SentenceTransformers `_ :cite:`nlp-retro-reimers2019sentence` as the retriever encoder. +5. The NeMo RETRO supports `mu-Transfer `_ :cite:`nlp-retro-yang2022tensor`, allowing for scalable training of the RETRO model via Zero-Shot Hyperparameter Transfer. + +Quick start +************ +Steps below demonstrate training and evaluating a NeMo RETRO model + +Data pre-processing +------------------- + +Step 1: Collect training data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The RETRO model uses two types of data: training data, which typically consists of 64-token chunks, and retrieval data, which typically consists of 128-token chunks. +The training data is used to train the model, while the retrieval data is used to supplement the language model. +It's possible to use the same data for both training and retrieval, as long as duplicates are removed properly, as described below. +Both types of data are stored in a loose JSON format, with each line containing a single text sample. For example: + +.. code-block:: json + + {"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"} + {"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"} + +The name of the text field of the json can be changed by using the ``--json-key`` flag in ``preprocess_data_for_megatron.py``. The other metadata are optional and are not used in training. + +Step 2: Convert training data into memory map format +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The loose json is then processed into a binary format for training and retrieval. To convert the json into mmap, cached index file. +Set the ``--dataset-impl`` flag to `retmmap`, which is the memory map format dedicated for RETRO model. + +An example script to prepare data for RETRO training is: + +.. code-block:: bash + + python scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ + --input=/dataset/pubmed_train.jsonl \ + --json-keys=text \ + --tokenizer-library=megatron \ + --apply-ftfy \ + --dataset-impl=retmmap \ + --merge-file=/dataset/gpt2-merges.txt \ + --vocab-file=/dataset/gpt2-vocab.json \ + --tokenizer-type=GPT2BPETokenizer \ + --output-prefix=/result/pubmed_train \ + --need-pad-id \ + --append-eod \ + --retrieval-db \ + --chunk_size=64 \ + --workers=48 + +The RETRO model processes chunked documents using 64 tokens as the default chunk size. The RETRO memory map dataset will add padding +tokens to the end of each document to make it a multiple of 64. The ``--need-pad-id`` argument adds a padding token to the tokenizer +if it doesn't already have one. The ``--append-eod`` argument controls whether to add ``end-of-document`` tokens to the preprocessed +data, and the ``--retrieval-db`` argument indicates whether to create a retrieval database for the preprocessed data. If ``--retrieval-db`` +is used, it will add an additional 64 padding tokens at the end of the document. The ``--chunk_size`` and ``--workers`` arguments +control the size of the data chunks to be processed and the number of worker processes to use, respectively. + +Following is the retro memory map index data format: + +.. 
list-table:: + :widths: 25 25 25 25 25 25 + + * - 'MMIDRET\x00\x00' (header 9 bytes) + - 1 (version 8 byte) + - dtype code :sup:`1` (1 byte) + - sentence count (8 byte) + - chunk size (8 byte) + - chunk count (8 byte) + * - retrieved db :sup:`2` (1 byte) + - number of tokens for each of sentences ( int32 array) + - start of sentence address in byte (int64 array) + - start of chunk id (int64 array) + - chunk id address in byte (int64 array) + - + +:sup:`1` 1: np.uint8, 2: np.int8, 3: np.int16, 4: np.int32, 5: np.int64, 6: np.float64, 7: np.double, 8: np.uint16 + +:sup:`2` When building the indexed dataset, we pad each sentence to be a multiple of ``chunk_size`` with ``pad_id`` from the tokenizer. +The number of tokens for each sentence includes the padded token ids. For retrieval data, there is an extra ``chunk_size`` padding at +the end of each sentence, and the ``retrieved_db`` flag is set to True. However, the number of tokens for each sentence excludes this extra ``chunk_size`` padding. + +Following is the retro memory map binary data format: + +.. list-table:: + :widths: 65 + + * - token id array for sentence 0,1, 2 ... (dtype :sup:`3` array) + +:sup:`3` np.uint16 vocab_size < 65500 else np.int32 + +Step 3: Create Faiss index for retrieval data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +After creating the memory map retrieval data binary file and index files, we can build a Faiss index that can quickly find the K-nearest neighbors of a given +chunk ID based on a query embedding vector. Because the retrieval data is typically very large, we break this process down into three steps. + +Step 3.1: Train the Faiss index structure +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this step, it uses a subset of the retrieval data to train a empty Faiss index. An example script is: + +.. code-block:: bash + + python scripts/nlp_language_modeling/build_retrieval_index.py \ + --input_file=/result/pubmed_train_text_document \ + --tokenizer-library=megatron \ + --tokenizer-type=GPT2BPETokenizer \ + --merge-file=/dataset/gpt2-merges.txt \ + --vocab-file=/dataset/gpt2-vocab.json \ + --percent=1.0 \ + --sentence_transformer_model=all-mpnet-base-v2 \ + --batch_size=1024 \ + --train_index_size=2000000 \ + --workers=2 \ + --devices=0,1,2,3,4,5,6,7 \ + --stage=0 \ + --output_file=/result/pubmed_faiss_learn.index + +This command is used to build an empty Faiss index using the 2000000 training data in ``pubmed_train_text_document``. +The ``all-mpnet-base-v2`` sentence transformer model is used to encode the chunk tokens into an embedding vector. +The index will be saved in the result directory as ``pubmed_faiss_learn.index``. This command specifies using 8 GPUs to train the Faiss index. + +Step 3.2: Add retrieval data into sharding index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This step adds all the retrieval data to the empty Faiss index created in the previous step. An example script is: + +.. 
code-block:: bash + + python scripts/nlp_language_modeling/build_retrieval_index.py \ + --input_file=/result/pubmed_train_text_document \ + --tokenizer-library=megatron \ + --tokenizer-type=GPT2BPETokenizer \ + --merge-file=/dataset/gpt2-merges.txt \ + --vocab-file=/dataset/gpt2-vocab.json \ + --percent=1.0 \ + --sentence_transformer_model=all-mpnet-base-v2 \ + --batch_size=1024 \ + --shard_id=0 \ + --total_shards=10 \ + --workers=2 \ + --devices=0,1,2,3,4,5,6,7 \ + --stage=1 \ + --learned_index=/result/pubmed_faiss_learn.index \ + --output_file=/result/pubmed_faiss_shard0.save + +This command breaks the retrieval data into ``total_shards`` shards and adds the data in the shard specified by ``shard_id``. +The result is saved to a file specified by ``output_file``. In the example above, 10 sharding indexes are created. + +Step 3.3: Merge the sharding indexes into final Faiss index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This step merges all the sharding indexes created in the previous step into the final Faiss index. An example script is: + +.. code-block:: bash + + python scripts/nlp_language_modeling/build_retrieval_index.py \ + --stage=2 \ + --devices=0,1,2,3,4,5,6,7 \ + --learned_index=/result/pubmed_faiss_learn.index \ + --shard_index_input=/result/pubmed_faiss_shard \ + --output_file=/result/pubmed_faiss_final.index + +Step 4: Build KNN index +^^^^^^^^^^^^^^^^^^^^^^^ + +During training, it is inefficient to run a query to find the K-nearest neighbor chunk IDs for each training data point. +This can be pre-calculated by building a KNN index before training. The KNN index maps the training data chunk IDs to the K-nearest neighbor chunk IDs +in the retrieval data. As with building the Faiss index, this process is divided into two steps. + +Following is the KNN index data format: + +.. list-table:: + :widths: 25 25 25 25 45 + + * - 'KNNRETM\x00\x00' (header 9 bytes) + - 1 (version 8 byte) + - K number of neighbors (8 byte) + - Number chunks (8 byte) + - Map to K retrieval data chunk IDs, shape (number_chunks, K) ( int64 array) + +Step 4.1: Build KNN sharding index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The KNN index is built using the memory-mapped training data created by the ``preprocess_data_for_megatron.py`` script and the Faiss index +file for the retrieval data built by the ``build_retrieval_index.py`` script. + +An example script is: + +.. code-block:: bash + + python scripts/nlp_language_modeling/build_knn_map_index.py \ + --input_file=/result/pubmed_eval_text_document \ + --tokenizer-library=megatron \ + --tokenizer-type=GPT2BPETokenizer \ + --merge-file=/dataset/gpt2-merges.txt \ + --vocab-file=/dataset/gpt2-vocab.json \ + --process_chunk_size=10000 \ + --sentence_transformer_model=all-mpnet-base-v2 \ + --batch_size=1024 \ + --K_neighbors=50 \ + --workers=2 \ + --devices=0,1,2,3,4,5,6,7 \ + --remove_duplicate \ + --dedup_margin=70 \ + --nprobe=100 \ + --shard_id=0 \ + --total_shards=10 \ + --stage=1 \ + --output_file=/dataset/pubmed_knn_shard0.save \ + --faiss_index=/result/pubmed_faiss_final.index + +In this example, the training data is broken into ``total_shards`` shards, and the KNN index is calculated for the shard specified by ``shard_id``. +The result is saved to a file specified by ``output_file``. In the example above, 10 KNN sharding indexes are created. + +Use the ``remove_duplicate`` flag if the training data and retrieval data are the same to remove neighbors from the same document. 
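+
+For illustration only, the following sketch shows one way a KNN map file with the layout above could be inspected in Python. It assumes little-endian ``int64`` fields; the ``load_knn_map`` helper is hypothetical and not part of the NeMo scripts:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def load_knn_map(path):
+        # Header layout (see the KNN index data format table above): a 9-byte magic
+        # string, then version, K, and number of chunks as 8-byte integers, followed
+        # by an int64 array of shape (number_chunks, K) holding neighbor chunk IDs.
+        with open(path, "rb") as f:
+            assert f.read(9) == b"KNNRETM\x00\x00"
+            version, k, num_chunks = (int(v) for v in np.frombuffer(f.read(24), dtype=np.int64))
+            knn_map = np.frombuffer(f.read(num_chunks * k * 8), dtype=np.int64)
+        return version, knn_map.reshape(num_chunks, k)
+
+    # Example: look at the neighbors of the first training chunk in a sharding index.
+    # version, knn_map = load_knn_map("/dataset/pubmed_knn_shard0.save")
+    # print(knn_map[0])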
+ +Step 4.2: Merge KNN sharding index into final KNN index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +An example script is: + +.. code-block:: bash + + python scripts/nlp_language_modeling/build_knn_map_index.py \ + --stage=2 \ + --output_file=pubmed_knn_final.save \ + --shard_index_input=pubmed_knn_shard + + +Train NeMo RETRO Model +----------------------- + +Once the training data, retrieval data, KNN index, and Faiss index are prepared, we are ready to train the RETRO model. In the NeMo implementation, +the RETRO model can be pre-trained with or without the `mu-Transfer `_ :cite:`nlp-retro-yang2022tensor` feature. We will introduce both ways. + + +The table below lists some of the common parameters that can be configured for model pre-training. + ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| **Parameter** | **Default** | **Description** | ++==================================+=============+========================================================================================+ +| model.micro_batch_size | 4 | the micro batch size used for training | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.tensor_model_parallel_size | 1 | tensor model parallel size | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.encoder_seq_length | 2048 | token sequence length | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.chunk_size | 64 | the chunk size used to retrieve | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.enc_num_layers | 4 | total number of encoder layers | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.dec_num_layers | 6 | total number of decoder layers | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.enc_cross_attention | [3] | layer numbers for cross attention in encoder | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.dec_cross_attention | [3,4,5] | layer numbers for chunked cross attention in decoder | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.add_position_embedding | FALSE | whether to add the absolute position encoding | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.hidden_size | 768 | model hidden size | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.ffn_hidden_size | 3072 | model FFN hidden size. 
Usually 4 * hidden_size | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.num_attention_heads | 12 | number of attention heads | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.init_method_std | 0.02 | standard deviation of the zero mean normal distribution used for weight initialization | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.hidden_dropout | 0.1 | dropout probability for hidden state transformer | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.attention_dropout | 0.1 | dropout probability in the attention layer | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ +| model.ffn_dropout | 0 | dropout probability in the feed-forward layer | ++----------------------------------+-------------+----------------------------------------------------------------------------------------+ + + +Option 1: Train the NeMo RETRO model *without* mu-Transfer +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +An example RETRO pre-training script is: + +.. code-block:: bash + + python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.devices=8 \ + trainer.num_nodes=2 \ + trainer.accelerator=gpu \ + trainer.max_steps=800000 \ + trainer.precision=16 \ + exp_manager.exp_dir=/result/retro_model \ + model.apply_query_key_layer_scaling=False \ + model.tensor_model_parallel_size=8 \ + model.optim.name=adamw \ + model.enc_num_layers=2 \ + model.dec_num_layers=32 \ + model.enc_cross_attention=[0] \ + model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ + model.hidden_size=4096 \ + model.ffn_hidden_size=16384 \ + model.num_attention_heads=32 \ + model.tokenizer.merge_file=/dataset/gpt2-merges.txt \ + model.tokenizer.vocab_file=/dataset/gpt2-vocab.json \ + model.data.data_prefix=[/result/pubmed_eval_text_document] \ + model.data.knn_index=[dataset/pubmed_knn_final.save] \ + model.data.retrieval_prefix=/result/pubmed_eval_text_document \ + model.micro_batch_size=8 + +During the training, launch Tensorboard to monitor training like so: + +.. code-block:: bash + + tensorboard --logdir /result/retro_model --bind_all + +.. note:: Weights and Biases (WandB) is supported too. Add ``exp_manager.create_wandb_logger=True`` to the model training arguments to enable it. + +After the training, the model nemo file can be found at the result checkpoint directory. + +Option 2: Train the NeMo RETRO model *with* mu-Transfer +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +`mu-Transfer `_ :cite:`nlp-retro-yang2022tensor` paper proposed a method to zero-shot transfer hyperparameter to train a larger model. +This can be done in 3 steps in NeMo RETRO implementation. + + +Step 1. find optimal hyper parameter for a small base model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use the pre-training code in Option 1, either manually or automatically ind a set of optimal hyperparameter for a small base RETRO +model. This is can be done cheaply ans fast due to the small model size. + +Step 2. 
calculate the shape file that can be used to run mu-Transfer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The shape file determines which hyperparameters will be scaled up, allowing the model to adjust the learning rate, weight scaling factor, etc. + +Here is an example shape file calculation script: + + +.. code-block:: bash + + python examples/nlp/language_modeling/megatron_retro_cal_shape.py \ + trainer.devices=8 \ + trainer.num_nodes=1 \ + trainer.accelerator=gpu \ + exp_manager.exp_dir=/result/retro_model \ + base_model.enc_num_layers=2 \ + delta_model.enc_num_layers=2 \ + base_model.dec_num_layers=32 \ + delta_model.dec_num_layers=32 \ + base_model.tensor_model_parallel_size=8 \ + delta_model.tensor_model_parallel_size=8 \ + base_model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ + delta_model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ + base_model.enc_cross_attention=[0] \ + delta_model.enc_cross_attention=[0] \ + base_model.hidden_size=768 \ + base_model.ffn_hidden_size=3072 \ + delta_model.hidden_size=96 \ + delta_model.ffn_hidden_size=384 \ + base_model.num_attention_heads=16 \ + delta_model.num_attention_heads=16 \ + model.shape_file=tp8_32depth_o1_rel_shape_info.yaml + +In this example, the ``base_model`` refers to the small base model for which an optimal set of hyperparameters has been determined. +The ``delta_model`` refers to a model with certain hyperparameters that have been scaled up or down. In this case, +the ``hidden_size`` and ``ffn_hidden_size`` have been changed in the ``delta_model``, allowing these two parameters to be scaled freely later. + +Step 3. Pretrain mu-Transfer RETRO model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once the shape file is created, we can start training a RETRO model. The model training can be scale up freely using the hyperparameters +specified by the delta model and the shape file. + +An example mu-Transfer pre-training script is: + +.. code-block:: bash + + python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ + trainer.devices=8 \ + trainer.num_nodes=2 \ + trainer.accelerator=gpu \ + trainer.max_steps=500000 \ + trainer.precision=16 \ + exp_manager.exp_dir=/result/retro_model \ + model.apply_query_key_layer_scaling=False \ + model.tensor_model_parallel_size=8 \ + model.optim.name=muadamw \ + model.enc_num_layers=2 \ + model.dec_num_layers=32 \ + model.enc_cross_attention=[0] \ + model.dec_cross_attention=[8,11,14,17,20,23,26,29,31] \ + model.hidden_size=4096 \ + model.ffn_hidden_size=16384 \ + model.num_attention_heads=32 \ + model.tokenizer.merge_file=/dataset/gpt2-merges.txt \ + model.tokenizer.vocab_file=/dataset/gpt2-vocab.json \ + model.data.data_prefix=[/result/pubmed_eval_text_document] \ + model.data.knn_index=[dataset/pubmed_knn_final.save] \ + model.data.retrieval_prefix=/result/pubmed_eval_text_document \ + model.micro_batch_size=8 \ + model.shape_file=tp8_32depth_o1_rel_shape_info.yaml + +.. note:: We have chosen to use ``muadamw`` as the optimizer for use with the mu-transfer method. Currently, only ``muadam`` and ``muadamw`` are supported. + +Similarly to the pre-training in Option 1, the model nemo file can be found at the result checkpoint directory after training is complete. + +Run NeMo RETRO Model Inference +------------------------------- + +Once the NeMo RETRO model has been trained, we can put it into inference mode and experiment with it. +During inference, we are not limited to the static Faiss index that we built earlier for KNN queries. 
+We can feed any external data to the model as retrieval context. NeMo RETRO implementation supports dynamic retrieval service, +allowing users to add, reset, and query new documents on the fly. + +We have built a simple web client that makes it easy for users to play around with the model. Here is an example script to launch the server: + +.. code-block:: bash + + python examples/nlp/language_modeling/megatron_retro_eval.py \ + trainer.devices=8 \ + trainer.num_nodes=1 \ + trainer.accelerator=gpu \ + trainer.precision=16 \ + retro_model_file=megatron_retro.nemo \ + tensor_model_parallel_size=8 \ + pipeline_model_parallel_size=1 \ + retrieval_service.sentence_bert.devices=\'0,1,2,3,4,5,6,7\' \ + retrieval_service.services.0.faiss_devices=\'0,1,2,3,4,5,6,7\' \ + retrieval_service.services.1.faiss_devices=\'0,1,2,3,4,5,6,7\' \ + retrieval_service.services.0.faiss_index=/result/pubmed_faiss_final.index \ + retrieval_service.services.0.retrieval_index=/result/pubmed_eval_text_document \ + retrieval_service.neighbors=2 \ + retrieval_service.pad_tokens=True \ + retrieval_service.store_retrieved=True \ + server=True \ + web_server=True \ + share=True \ + username=test \ + password=test123 + +Set the retro_model_file to use the nemo file generated in the pre-training step. After launching the server, copy-paste the URL from +the terminal into your browser. Use the specified username and password to log in and have fun experimenting with the RETRO model. + +References +************ + +.. bibliography:: ../../nlp_all.bib + :style: plain + :labelprefix: nlp-retro + :keyprefix: nlp-retro- From 6d85e48afe009b4bf4a1a3c331dc74eab4798cbe Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 26 Apr 2024 12:13:13 -0700 Subject: [PATCH 16/30] [Nemo CICD] Run when labeled:Run CICD (#9044) * further specialize runners for more parallelism * run when labeled:Run CICD --- .github/workflows/cicd-main.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 091e18e58ebc..83016a738e71 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -16,6 +16,7 @@ name: "CICD NeMo" on: pull_request: branches: [ "main" ] + types: [ labeled ] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -24,6 +25,7 @@ concurrency: jobs: gpu-test: runs-on: self-hosted-azure + if: ${{ github.event.label.name == 'Run CICD' }} steps: - name: Run nvidia-smi test run: | @@ -32,6 +34,7 @@ jobs: cicd-cluster-clean: runs-on: self-hosted-azure-builder + if: ${{ github.event.label.name == 'Run CICD' }} steps: - name: Clean server from old files run: | @@ -54,6 +57,7 @@ jobs: cicd-test-container-setup: needs: [cicd-cluster-clean] runs-on: self-hosted-azure-builder + if: ${{ github.event.label.name == 'Run CICD' }} # uses: actions/cache@v2 #container: # image: nvcr.io/nvidia/pytorch:24.01-py3 From b8ad0a855b1a15c9c70d559396f147bc7a3cdc9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Fri, 26 Apr 2024 17:10:03 -0400 Subject: [PATCH 17/30] Improved random seed configuration for Lhotse dataloaders with docs (#9001) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Improving RNG seeding with Lhotse dataloading Signed-off-by: Piotr Żelasko * Fix Signed-off-by: Piotr Żelasko * Add documentation about random seeds Signed-off-by: Piotr Żelasko * Add doc about managing random seed Signed-off-by: Piotr Żelasko * Apply suggestions from code review 
Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> --- docs/source/asr/datasets.rst | 48 ++++++ .../asr/models/aed_multitask_models.py | 2 +- .../common/data/lhotse/dataloader.py | 31 ++-- .../common/test_lhotse_dataloading.py | 15 +- .../common/test_lhotse_multirank_rng.py | 162 ++++++++++++++++++ 5 files changed, 235 insertions(+), 23 deletions(-) create mode 100644 tests/collections/common/test_lhotse_multirank_rng.py diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst index 9732ba98e635..b4656eec3f3f 100644 --- a/docs/source/asr/datasets.rst +++ b/docs/source/asr/datasets.rst @@ -823,6 +823,54 @@ For multi-dataset setups, one may provide multiple manifests and even their weig bucket_duration_bins=[1.91,3.02,3.56,... + +Seeds and randomness +~~~~~~~~~~~~~~~~~~~~ + +In Lhotse dataloading configuration we have two parameters controlling randomness: ``seed`` and ``shard_seed``. +Both of them can be either set to a fixed number, or one of two string options ``"randomized"`` and ``"trng"``. +Their roles are: + +* ``seed`` is the base random seed, and is one of several factors used to initialize various RNGs participating in dataloading. + +* ``shard_seed`` controls the shard randomization strategy in distributed data parallel setups when using sharded tarred datasets. + +Below are the typical examples of configuration with an explanation of the expected outcome. + +Case 1 (default): ``seed=`` and ``shard_seed="trng"``: + +* The ``trng`` setting discards ``seed`` and causes the actual random seed to be drawn using OS's true RNG. Each node/GPU/dataloading worker draws its own unique random seed when it first needs it. + +* Each node/GPU/dataloading worker yields data in a different order (no mini-batch duplication). + +* On each training script run, the order of dataloader examples are **different**. + +* Since the random seed is unpredictable, the exact dataloading order is not replicable. + +Case 2: ``seed=`` and ``shard_seed="randomized"``: + +* The ``randomized`` setting uses ``seed`` along with DDP ``rank`` and dataloading ``worker_id`` to set a unique but deterministic random seed in each dataloading process across all GPUs. + +* Each node/GPU/dataloading worker yields data in a different order (no mini-batch duplication). + +* On each training script run, the order of dataloader examples are **identical** as long as ``seed`` is the same. + +* This setup guarantees 100% dataloading reproducibility. + +* Resuming training without changing of the ``seed`` value will cause the model to train on data it has already seen. For large data setups, not managing the ``seed`` may cause the model to never be trained on a majority of data. This is why this mode is not the default. + +* If you're combining DDP with model parallelism techniques (Tensor Parallel, Pipeline Parallel, etc.) you need to use ``shard_seed="randomized"``. Using ``"trng"`` will cause different model parallel ranks to desynchronize and cause a deadlock. + +* Generally the seed can be managed by the user by providing a different value each time the training script is launched. For example, for most models the option to override would be ``model.train_ds.seed=``. If you're launching multiple tasks queued one after another on a grid system, you can generate a different random seed for each task, e.g. 
on most Unix systems ``RSEED=$(od -An -N4 -tu4 < /dev/urandom | tr -d ' ')`` would generate a random uint32 number that can be provided as the seed. + +Other, more exotic configurations: + +* With ``shard_seed=``, all dataloading workers will yield the same results. This is only useful for unit testing and maybe debugging. + +* With ``seed="trng"``, the base random seed itself will be drawn using a TRNG. It will be different on each GPU training process. This setting is not recommended. + +* With ``seed="randomized"``, the base random seed is set to Python's global RNG seed. It might be different on each GPU training process. This setting is not recommended. + Preparing Text-Only Data for Hybrid ASR-TTS Models -------------------------------------------------- diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index 5cda453db45d..7e20d7a16559 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -40,7 +40,7 @@ from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis from nemo.collections.common import tokenizers -from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config +from nemo.collections.common.data.lhotse.dataloader import get_lhotse_dataloader_from_config from nemo.collections.common.metrics import GlobalAverageLossMetric from nemo.collections.common.parts import transformer_weights_init from nemo.collections.common.parts.preprocessing.manifest import get_full_path diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index b32f067c14a9..5bb3bf2988ea 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -29,14 +29,13 @@ IterableDatasetWrapper, make_worker_init_fn, ) +from lhotse.dataset.dataloading import resolve_seed from lhotse.dataset.sampling.base import SamplingConstraint, TimeConstraint, TokenConstraint from lhotse.lazy import LazyFlattener -from lhotse.utils import fastcopy +from lhotse.utils import fastcopy, fix_random_seed from omegaconf import DictConfig, OmegaConf -from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper from nemo.collections.common.data.lhotse.cutset import read_cutset_from_config -from nemo.collections.common.tokenizers import TokenizerSpec from nemo.utils import logging @@ -87,7 +86,7 @@ class LhotseDataLoadingConfig: sample_rate: int = 16000 min_duration: float | None = -1 max_duration: float | None = float("inf") - seed: int | str = "randomized" # int | "randomized" | "trng"; the latter two are lazily resolved by Lhotse in dloading worker processes + seed: int | str = 0 num_workers: int = 0 pin_memory: bool = False @@ -123,11 +122,7 @@ class LhotseDataLoadingConfig: def get_lhotse_dataloader_from_config( - config: DictConfig, - global_rank: int, - world_size: int, - dataset: torch.utils.data.Dataset, - tokenizer: TokenizerSpec | TokenizerWrapper = None, + config: DictConfig, global_rank: int, world_size: int, dataset: torch.utils.data.Dataset, tokenizer=None, ) -> torch.utils.data.DataLoader: """ Set up a Lhotse training dataloder. @@ -154,6 +149,10 @@ def get_lhotse_dataloader_from_config( config = make_structured_with_schema_warnings(config) + # First, resolve the random seed in case a string value was provided. 
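+ # Note (illustrative): per the "Seeds and randomness" docs added in this change,
+ # an integer seed is used as-is, "trng" draws a fresh seed from the OS's true RNG,
+ # and "randomized" resolves the seed from Python's global RNG state.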
+ seed = resolve_seed(config.seed) + fix_random_seed(seed) + # 1. Load a manifest as a Lhotse CutSet. cuts, is_tarred = read_cutset_from_config(config) @@ -167,6 +166,8 @@ def get_lhotse_dataloader_from_config( assert ( tokenizer is not None ), "You must pass a tokenizer to `get_lhotse_dataloader_from_config` in order to read text-only datasets (enabled via use_multimodal_dataloading)" + from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper + if not isinstance(tokenizer, TokenizerWrapper): tokenizer = TokenizerWrapper(tokenizer) # Note this code can also pre-tokenize the text in cuts, but for now we disable it with apply_fn. @@ -177,7 +178,11 @@ def get_lhotse_dataloader_from_config( if config.noise_path is not None: noise = CutSet.from_file(config.noise_path) cuts = cuts.mix( - cuts=noise, snr=config.noise_snr, mix_prob=config.noise_mix_prob, seed="trng", random_mix_offset=True + cuts=noise, + snr=config.noise_snr, + mix_prob=config.noise_mix_prob, + seed=config.shard_seed, + random_mix_offset=True, ) # 2.b. On-the-fly speed perturbation. @@ -235,7 +240,7 @@ def get_lhotse_dataloader_from_config( shuffle=config.shuffle, drop_last=config.drop_last, shuffle_buffer_size=config.shuffle_buffer_size, - seed=config.seed, + seed=config.shard_seed, num_buckets=config.num_buckets, duration_bins=config.bucket_duration_bins, num_cuts_for_bins_estimate=config.num_cuts_for_bins_estimate, @@ -257,7 +262,7 @@ def get_lhotse_dataloader_from_config( shuffle=config.shuffle, drop_last=config.drop_last, shuffle_buffer_size=config.shuffle_buffer_size, - seed=config.seed, + seed=config.shard_seed, rank=0 if is_tarred else global_rank, world_size=1 if is_tarred else world_size, ) @@ -289,7 +294,7 @@ def get_lhotse_dataloader_from_config( # This together with infinite datasets removes the need to split data across nodes/workers. 
dloader_kwargs = dict( dataset=IterableDatasetWrapper(dataset=dataset, sampler=sampler), - worker_init_fn=make_worker_init_fn(rank=global_rank, world_size=world_size), + worker_init_fn=make_worker_init_fn(rank=global_rank, world_size=world_size, seed=seed), persistent_workers=config.num_workers > 0, # helps Lhotse Shar maintain shuffling state ) else: diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index 791c5df1c018..d4b3ad03050e 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -339,6 +339,11 @@ def test_dataloader_from_nemo_manifest(nemo_manifest_path: Path): assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 1 +class _Identity: + def __getitem__(self, cuts): + return cuts + + def test_dataloader_from_nemo_manifest_has_custom_fields(nemo_manifest_path: Path): config = OmegaConf.create( { @@ -356,11 +361,7 @@ def test_dataloader_from_nemo_manifest_has_custom_fields(nemo_manifest_path: Pat } ) - class _IdentityDataset: - def __getitem__(self, cuts): - return cuts - - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=_IdentityDataset()) + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=_Identity()) batch = next(iter(dl)) for cut in batch: @@ -852,10 +853,6 @@ def test_lhotse_cuts_resolve_relative_paths(tmp_path: Path): {"cuts_path": cuts_path, "sample_rate": 16000, "use_lhotse": True, "num_workers": 0, "batch_size": 2,} ) - class _Identity(torch.utils.data.Dataset): - def __getitem__(self, x): - return x - dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=_Identity()) batches = [batch for batch in dl] diff --git a/tests/collections/common/test_lhotse_multirank_rng.py b/tests/collections/common/test_lhotse_multirank_rng.py new file mode 100644 index 000000000000..7fa828900e27 --- /dev/null +++ b/tests/collections/common/test_lhotse_multirank_rng.py @@ -0,0 +1,162 @@ +from io import BytesIO +from pathlib import Path + +import pytest +from lhotse import CutSet +from lhotse.serialization import load_jsonl, save_to_jsonl +from lhotse.shar.writers import JsonlShardWriter, TarWriter +from lhotse.testing.dummies import DummyManifest +from omegaconf import OmegaConf + +from nemo.collections.common.data.lhotse.dataloader import get_lhotse_dataloader_from_config + + +class _Identity: + def __getitem__(self, cuts): + return cuts + + +@pytest.fixture(scope="session") +def cutset_path(tmp_path_factory) -> Path: + """10 utterances of length 1s as a Lhotse CutSet.""" + cuts = DummyManifest(CutSet, begin_id=0, end_id=10, with_data=True) + for c in cuts: + c.features = None + c.custom = None + c.supervisions[0].custom = None + + tmp_path = tmp_path_factory.mktemp("data") + p = tmp_path / "cuts.jsonl.gz" + pa = tmp_path / "audio" + cuts.save_audios(pa).to_file(p) + return p + + +@pytest.fixture(scope="session") +def nemo_manifest_path(cutset_path: Path): + """10 utterances of length 1s as a NeMo manifest.""" + nemo = [] + for idx, c in enumerate(CutSet.from_file(cutset_path)): + nemo.append( + {"audio_filepath": c.recording.sources[0].source, "text": f"irrelevant-{idx}", "duration": c.duration,} + ) + p = cutset_path.parent / "nemo_manifest.json" + save_to_jsonl(nemo, p) + return p + + +@pytest.fixture(scope="session") +def nemo_tarred_manifest_path(nemo_manifest_path: Path) -> tuple[str, str]: + """5 shards, each with 2 
utterances.""" + root = nemo_manifest_path.parent / "nemo_tar" + root.mkdir(exist_ok=True) + with TarWriter(f"{root}/audios_%01d.tar", shard_size=2) as tar_writer, JsonlShardWriter( + f"{root}/manifest_%01d.jsonl", shard_size=2 + ) as mft_writer: + for idx, d in enumerate(load_jsonl(nemo_manifest_path)): + p = d["audio_filepath"] + name = Path(p).name + with open(p, "rb") as f: + tar_writer.write(name, BytesIO(f.read())) + mft_writer.write({**d, "audio_filepath": name, "shard_id": idx // 2}) + return f"{root}/manifest__OP_0..4_CL_.jsonl", f"{root}/audios__OP_0..4_CL_.tar" + + +def test_dataloader_multiple_ranks_deterministic_rng(nemo_tarred_manifest_path: tuple[str, str]): + json_mft, tar_mft = nemo_tarred_manifest_path + config = OmegaConf.create( + { + "manifest_filepath": json_mft, + "tarred_audio_filepaths": tar_mft, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 1, + # lhotse specific + "use_bucketing": True, + "num_buckets": 2, + "drop_last": False, + "batch_duration": 4.0, # seconds + "quadratic_duration": 15.0, # seconds + "shuffle_buffer_size": 10, + "bucket_buffer_size": 100, + "seed": 0, + "shard_seed": "randomized", + } + ) + + # Data parallel, rank 0 + dp0 = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=2, dataset=_Identity()) + + # Data parallel, rank 0 copy (is the iteration deterministic? -> yes) + dp0_cpy = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=2, dataset=_Identity(),) + + # Data parallel, rank 0, incremented seed (paranoia mode: does the iteration order change with the seed? -> yes) + config2 = config.copy() + config2["seed"] = config2["seed"] + 1 + dp0_incrseed = get_lhotse_dataloader_from_config(config=config2, global_rank=0, world_size=2, dataset=_Identity(),) + + # Data parallel, rank 1 (is data different on each DP rank? -> yes) + dp1 = get_lhotse_dataloader_from_config(config=config, global_rank=1, world_size=2, dataset=_Identity()) + + dloaders = zip(*[iter(dl) for dl in (dp0, dp0_cpy, dp0_incrseed, dp1)]) + + for i in range(5): + b0, b0_cpy, b0_incrseed, b1 = next(dloaders) + assert b0 == b0_cpy + assert b0 != b1 + assert b0_incrseed != b1 + assert b0 != b0_incrseed + + +def test_dataloader_multiple_ranks_trng(nemo_tarred_manifest_path: tuple[str, str]): + """ + This test is the same as ``test_dataloader_multiple_ranks_deterministic_rng``, + except that we set ``shard_seed="trng"`` which causes the seed to be lazily + resolved in subprocesses (resolved => being drawn using OS's TRNG). + Therefore, we don't expect any reproducibility. + """ + json_mft, tar_mft = nemo_tarred_manifest_path + config = OmegaConf.create( + { + "manifest_filepath": json_mft, + "tarred_audio_filepaths": tar_mft, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 1, + # lhotse specific + "use_bucketing": True, + "num_buckets": 2, + "drop_last": False, + "batch_duration": 4.0, # seconds + "quadratic_duration": 15.0, # seconds + "shuffle_buffer_size": 10, + "bucket_buffer_size": 100, + "seed": 0, + "shard_seed": "trng", + } + ) + + # Data parallel, rank 0 + dp0 = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=2, dataset=_Identity()) + + # Data parallel, rank 0 copy (is the iteration deterministic? -> no, trng) + dp0_cpy = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=2, dataset=_Identity(),) + + # Data parallel, rank 0, incremented seed (paranoia mode: does the iteration order change with the seed? 
-> yes) + config2 = config.copy() + config2["seed"] = config2["seed"] + 1 + dp0_incrseed = get_lhotse_dataloader_from_config(config=config2, global_rank=0, world_size=2, dataset=_Identity(),) + + # Data parallel, rank 1 (is data different on each DP rank? -> yes) + dp1 = get_lhotse_dataloader_from_config(config=config, global_rank=1, world_size=2, dataset=_Identity()) + + dloaders = zip(*[iter(dl) for dl in (dp0, dp0_cpy, dp0_incrseed, dp1)]) + + for i in range(5): + b0, b0_cpy, b0_incrseed, b1 = next(dloaders) + assert b0 != b0_cpy + assert b0 != b1 + assert b0_incrseed != b1 + assert b0 != b0_incrseed From a6dacf6dd791ddd1ba7054114a9528a5ba5b16c9 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Sat, 27 Apr 2024 04:30:07 +0200 Subject: [PATCH 18/30] [NeMo-UX] Adding GPTModel & MockDataModule (#9011) * Adding MegatronParallel * Move over _strategy_liMegatronCheckpointIO * Adding GPTModel & MockDataModule * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove merge errors Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert accidental commit Signed-off-by: Chen Cui * minor updates Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change lightning names to make tests work Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix circular import Signed-off-by: Chen Cui * fix type checking Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Chen Cui --- nemo/io/pl.py | 10 +- nemo/lightning/__init__.py | 24 ++ nemo/lightning/_strategy_lib.py | 2 +- nemo/lightning/base.py | 51 ++++ nemo/lightning/data.py | 281 ++++++++++++++++++ nemo/lightning/megatron_parallel.py | 2 +- nemo/lightning/pytorch/callbacks/__init__.py | 2 +- nemo/lightning/pytorch/plugins/__init__.py | 3 + .../lightning/pytorch/plugins/data_sampler.py | 135 +++++++++ nemo/lightning/pytorch/strategies.py | 46 +-- nemo/llm/__init__.py | 11 + nemo/llm/gpt/__init__.py | 0 nemo/llm/gpt/data/__init__.py | 3 + nemo/llm/gpt/data/mock.py | 137 +++++++++ nemo/llm/gpt/model/__init__.py | 3 + nemo/llm/gpt/model/base.py | 231 ++++++++++++++ 16 files changed, 910 insertions(+), 31 deletions(-) create mode 100644 nemo/lightning/base.py create mode 100644 nemo/lightning/data.py create mode 100644 nemo/lightning/pytorch/plugins/__init__.py create mode 100644 nemo/lightning/pytorch/plugins/data_sampler.py create mode 100644 nemo/llm/__init__.py create mode 100644 nemo/llm/gpt/__init__.py create mode 100644 nemo/llm/gpt/data/__init__.py create mode 100644 nemo/llm/gpt/data/mock.py create mode 100644 nemo/llm/gpt/model/__init__.py create mode 100644 nemo/llm/gpt/model/base.py diff --git a/nemo/io/pl.py b/nemo/io/pl.py index f6bf46557b43..659ef0d6621b 100644 --- a/nemo/io/pl.py +++ b/nemo/io/pl.py @@ -2,11 +2,11 @@ from pathlib import Path from typing import Any, Callable, Dict, Optional, TypeVar, Union -import lightning as L +import pytorch_lightning as pl import torch -from lightning.fabric.plugins.io.checkpoint_io import CheckpointIO -from lightning.fabric.utilities.cloud_io import get_filesystem -from lightning.fabric.utilities.types import _PATH +from lightning_fabric.plugins.io.checkpoint_io import CheckpointIO +from 
lightning_fabric.utilities.cloud_io import get_filesystem +from lightning_fabric.utilities.types import _PATH from torch import nn from typing_extensions import override @@ -14,7 +14,7 @@ log = logging.getLogger(__name__) -LightningModuleT = TypeVar("LightningModuleT", bound=L.LightningModule) +LightningModuleT = TypeVar("LightningModuleT", bound=pl.LightningModule) ModuleT = TypeVar("ModuleT", bound=nn.Module) diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index e69de29bb2d1..a508f29b9ace 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -0,0 +1,24 @@ +from typing import Union + +from lightning_fabric.plugins.environments import slurm +from pytorch_lightning import plugins as _pl_plugins + +from nemo.lightning.base import get_vocab_size, teardown +from nemo.lightning.pytorch.plugins import MegatronDataSampler +from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler +from nemo.lightning.pytorch.strategies import MegatronStrategy + + +# We monkey patch because nvidia uses a naming convention for SLURM jobs +def _is_slurm_interactive_mode(): + job_name = slurm.SLURMEnvironment.job_name() + return job_name is None or job_name.endswith("bash") or job_name.endswith("interactive") + + +slurm._is_slurm_interactive_mode = _is_slurm_interactive_mode # noqa: SLF001 + + +_pl_plugins._PLUGIN_INPUT = Union[_pl_plugins._PLUGIN_INPUT, _data_sampler.DataSampler] # noqa: SLF001 + + +__all__ = ["MegatronStrategy", "MegatronDataSampler", "get_vocab_size", "teardown"] diff --git a/nemo/lightning/_strategy_lib.py b/nemo/lightning/_strategy_lib.py index e3f5f146ff12..cd8e38af12f2 100644 --- a/nemo/lightning/_strategy_lib.py +++ b/nemo/lightning/_strategy_lib.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: - from lightning.fabric.utilities.types import Optimizable + from lightning_fabric.utilities.types import Optimizable from megatron.core.model_parallel_config import ModelParallelConfig diff --git a/nemo/lightning/base.py b/nemo/lightning/base.py new file mode 100644 index 000000000000..ab9fe40eb7a2 --- /dev/null +++ b/nemo/lightning/base.py @@ -0,0 +1,51 @@ +import gc +import os +from pathlib import Path +from typing import Optional + +import torch +import torch.distributed +from pytorch_lightning import Trainer +from torch import nn + +DEFAULT_NEMO_CACHE_HOME = Path.home() / ".cache" / "nemo" +NEMO_CACHE_HOME = Path(os.getenv("NEMO_HOME", DEFAULT_NEMO_CACHE_HOME)) +DEFAULT_NEMO_DATASETS_CACHE = NEMO_CACHE_HOME / "datasets" +NEMO_DATASETS_CACHE = Path(os.getenv("NEMO_DATASETS_CACHE", DEFAULT_NEMO_DATASETS_CACHE)) +DEFAULT_NEMO_MODELS_CACHE = NEMO_CACHE_HOME / "models" +NEMO_MODELS_CACHE = Path(os.getenv("NEMO_MODELS_CACHE", DEFAULT_NEMO_MODELS_CACHE)) + + +def get_vocab_size(config, vocab_size: int, make_vocab_size_divisible_by: int = 128,) -> int: + from nemo.utils import logging + + after = vocab_size + multiple = make_vocab_size_divisible_by * config.tensor_model_parallel_size + while (after % multiple) != 0: + after += 1 + logging.info( + f"Padded vocab_size: {after}, original vocab_size: {vocab_size}, dummy tokens:" f" {after - vocab_size}." 
+ ) + + return after + + +def teardown(trainer: Trainer, model: Optional[nn.Module] = None) -> None: + # Destroy torch distributed + if torch.distributed.is_initialized(): + from megatron.core import parallel_state + + parallel_state.destroy_model_parallel() + torch.distributed.destroy_process_group() + + trainer._teardown() # noqa: SLF001 + if model is not None: + for obj in gc.get_objects(): + if torch.is_tensor(obj) and obj.is_cuda: + del obj + + gc.collect() + torch.cuda.empty_cache() + + +__all__ = ["get_vocab_size", "teardown"] diff --git a/nemo/lightning/data.py b/nemo/lightning/data.py new file mode 100644 index 000000000000..794300db72f0 --- /dev/null +++ b/nemo/lightning/data.py @@ -0,0 +1,281 @@ +import abc +import logging +import os +from itertools import chain +from typing import List, Literal, Optional + +import torch +from torch.utils.data import DataLoader, Dataset + + +def create_dataloader( + dataset: "Dataset", drop_last: bool = True, pad_samples_to_global_batch_size=False, **kwargs +) -> DataLoader: + output = DataLoader(dataset, collate_fn=dataset.collate_fn, **kwargs) + + output._drop_last = drop_last # noqa: SLF001 + output._pad_samples_to_global_batch_size = pad_samples_to_global_batch_size # noqa: SLF001 + + return output + + +def setup_microbatch_calculator( + global_rank: int, micro_batch_size: int, global_batch_size: int, rampup_batch_size: Optional[List[int]] = None, +) -> None: + """ + Initializes the data for distributed training by setting up the microbatch calculator + based on the provided global rank and data configuration. + + This function checks if the microbatch calculator has already been initialized. If it has, + the function validates that the current configuration matches the initialized settings. If the + calculator has not been initialized, it sets up a new one with the provided configuration. + + Args: + global_rank (int): The global rank of the current process. + config (DataConfig): The data configuration object containing settings for global batch size, + micro batch size, data parallel size, and optional ramp-up batch size. + + Raises + ------ + Exception: If the microbatch calculator has already been initialized with different settings. 
+ + """ + from nemo.lightning._strategy_lib import NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE + + from nemo.utils import AppState + + app_state = AppState() + + if os.environ.get(NEMO_MEGATRON_MODEL_PARALLEL_APPSTATE_OVERRIDE, "false").lower() == "true": + init_global_rank = app_state.global_rank + else: + init_global_rank = global_rank + + from apex.transformer.microbatches import ConstantNumMicroBatches + from apex.transformer.pipeline_parallel.utils import ( + _GLOBAL_NUM_MICROBATCHES_CALCULATOR, + setup_microbatch_calculator, + ) + + if _GLOBAL_NUM_MICROBATCHES_CALCULATOR is None: + setup_microbatch_calculator( + rank=init_global_rank, + global_batch_size=global_batch_size, + micro_batch_size=micro_batch_size, + data_parallel_size=app_state.data_parallel_size, + rampup_batch_size=rampup_batch_size, + ) + else: + if isinstance(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, ConstantNumMicroBatches): + assert _GLOBAL_NUM_MICROBATCHES_CALCULATOR.current_global_batch_size == global_batch_size + assert _GLOBAL_NUM_MICROBATCHES_CALCULATOR.micro_batch_size == micro_batch_size + assert _GLOBAL_NUM_MICROBATCHES_CALCULATOR.num_micro_batches == global_batch_size // ( + micro_batch_size * app_state.data_parallel_size + ) + else: + raise Exception("Microbatch calculator already initialized.") + + +def add_megatron_sampler( + dataloader: DataLoader, + micro_batch_size: int, + global_batch_size: int, + rampup_batch_size: Optional[List[int]] = None, + consumed_samples: int = 0, + dataloader_type: Literal["single", "cyclic"] = "single", + # data_sharding: bool = False +) -> DataLoader: + from megatron.core import parallel_state + + if dataloader_type == 'single': + batch_sampler = MegatronPretrainingSampler( + total_samples=len(dataloader.dataset), + consumed_samples=consumed_samples, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + rampup_batch_size=rampup_batch_size, + data_parallel_rank=parallel_state.get_data_parallel_rank(), + data_parallel_size=parallel_state.get_data_parallel_world_size(), + drop_last=getattr(dataloader, "_drop_last", False), + pad_samples_to_global_batch_size=getattr(dataloader, "_pad_samples_to_global_batch_size", False), + ) + elif dataloader_type == 'cyclic': + batch_sampler = MegatronPretrainingRandomSampler( + dataloader.dataset, + total_samples=len(dataloader.dataset), + consumed_samples=consumed_samples, + micro_batch_size=micro_batch_size, + data_parallel_rank=parallel_state.get_data_parallel_rank(), + data_parallel_size=parallel_state.get_data_parallel_world_size(), + pad_samples_to_global_batch_size=getattr(dataloader, "_pad_samples_to_global_batch_size", False), + # data_sharding=data_sharding + ) + else: + raise Exception(f'{dataloader_type} dataloader type is not supported.') + + return DataLoader( + dataloader.dataset, + batch_sampler=batch_sampler, + num_workers=dataloader.num_workers, + pin_memory=dataloader.pin_memory, + persistent_workers=dataloader.persistent_workers, + collate_fn=dataloader.collate_fn, + ) + + +# TODO: Replace this with megatron.core.data.data_samplers after we upgrade +class BaseMegatronSampler: + def __init__( + self, + total_samples: int, + consumed_samples: int, + micro_batch_size: int, + data_parallel_rank: int, + data_parallel_size: int, + drop_last: bool = True, + global_batch_size: Optional[int] = None, + rampup_batch_size: Optional[list] = None, + pad_samples_to_global_batch_size: Optional[bool] = False, + ) -> None: + # Sanity checks. 
+ if total_samples <= 0: + raise RuntimeError(f"no samples to consume: {total_samples}") + if consumed_samples >= total_samples: + raise RuntimeError(f"no samples left to consume: {consumed_samples}, {total_samples}") + if micro_batch_size <= 0: + raise RuntimeError(f"micro_batch_size must be greater than 0, but got {micro_batch_size}") + if data_parallel_size <= 0: + raise RuntimeError(f"data parallel size must be greater than 0, but got {data_parallel_size}") + if data_parallel_rank >= data_parallel_size: + raise RuntimeError( + f"data_parallel_rank should be smaller than data parallel size, but {data_parallel_rank} >= {data_parallel_size}" + ) + if global_batch_size is not None and rampup_batch_size is None: + if global_batch_size % (micro_batch_size * data_parallel_size) != 0: + raise RuntimeError( + f"`global_batch_size` ({global_batch_size}) is not divisible by " + f"`micro_batch_size ({micro_batch_size}) x data_parallel_size " + f"({data_parallel_size})`" + ) + if pad_samples_to_global_batch_size and global_batch_size is None: + raise RuntimeError( + "`pad_samples_to_global_batch_size` can be `True` only when " + "`global_batch_size` is set to an integer value" + ) + + # Keep a copy of input params for later use. + self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.micro_batch_times_data_parallel_size = self.micro_batch_size * data_parallel_size + self.drop_last = drop_last + self.global_batch_size = global_batch_size + self.pad_samples_to_global_batch_size = pad_samples_to_global_batch_size + + logging.info( + f"Instantiating MegatronPretrainingSampler with total_samples: {total_samples} and" + f" consumed_samples: {consumed_samples}" + ) + + def __len__(self): + num_available_samples: int = self.total_samples - self.consumed_samples + if self.global_batch_size is not None: + if self.drop_last: + return num_available_samples // self.global_batch_size + else: + return (num_available_samples + self.global_batch_size - 1) // self.global_batch_size + else: + return (num_available_samples - 1) // self.micro_batch_times_data_parallel_size + 1 + + @abc.abstractmethod + def __iter__(self): + ...
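# Illustrative worked example (not part of the original patch): with total_samples=1000,
# consumed_samples=198, micro_batch_size=4, data_parallel_size=2 and global_batch_size=8,
# __len__ above sees num_available_samples = 802, so
#   drop_last=True  -> 802 // 8 = 100 full global batches
#   drop_last=False -> (802 + 8 - 1) // 8 = 101 (the trailing partial batch is kept)
# and without a global_batch_size it falls back to counting micro-batches per
# data-parallel step: (802 - 1) // (4 * 2) + 1 = 101.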
+ + +class MegatronPretrainingSampler(BaseMegatronSampler): + def get_start_end_idx(self): + start_idx = self.data_parallel_rank * self.micro_batch_size + end_idx = start_idx + self.micro_batch_size + return start_idx, end_idx + + def __iter__(self): + batch = [] + # Last batch will be dropped if drop_last is not set False + indices = range(self.consumed_samples, self.total_samples) + if (not self.drop_last) and self.pad_samples_to_global_batch_size: + pad_samples_num = -len(indices) % self.global_batch_size + pad_indices = range(-1, -pad_samples_num - 1, -1) + indices = chain(indices, pad_indices) + + for idx in indices: + batch.append(idx) + if len(batch) == self.micro_batch_times_data_parallel_size: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + batch = [] + + # Check the last partial batch and see drop_last is set + if len(batch) > 0 and not self.drop_last: + assert ( + not self.pad_samples_to_global_batch_size + ), "with pad_samples_to_global_batch_size all batches should be complete" + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + + +class MegatronPretrainingRandomSampler(BaseMegatronSampler): + def __init__( + self, + total_samples: int, + consumed_samples: int, + micro_batch_size: int, + data_parallel_rank: int, + data_parallel_size: int, + drop_last: bool = True, + global_batch_size: Optional[int] = None, + pad_samples_to_global_batch_size: Optional[bool] = False, + ) -> None: + super().__init__( + total_samples=total_samples, + consumed_samples=consumed_samples, + micro_batch_size=micro_batch_size, + data_parallel_rank=data_parallel_rank, + data_parallel_size=data_parallel_size, + drop_last=drop_last, + global_batch_size=global_batch_size, + pad_samples_to_global_batch_size=pad_samples_to_global_batch_size, + ) + assert ( + not pad_samples_to_global_batch_size + ), "`MegatronPretrainingRandomSampler` does not support sample padding" + self.last_batch_size = self.total_samples % self.micro_batch_times_data_parallel_size + + def __iter__(self): + active_total_samples = self.total_samples - self.last_batch_size + self.epoch = self.consumed_samples // active_total_samples + current_epoch_samples = self.consumed_samples % active_total_samples + assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 + + # data sharding and random sampling + bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) * self.micro_batch_size + bucket_offset = current_epoch_samples // self.data_parallel_size + start_idx = self.data_parallel_rank * bucket_size + + g = torch.Generator() + g.manual_seed(self.epoch) + random_idx = torch.randperm(bucket_size, generator=g).tolist() + idx_range = [start_idx + x for x in random_idx[bucket_offset:]] + + batch = [] + # Last batch if not complete will be dropped. 
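# Illustrative worked example (not part of the original patch): with total_samples=1000,
# micro_batch_size=4 and data_parallel_size=2, micro_batch_times_data_parallel_size is 8,
# last_batch_size is 1000 % 8 = 0 and bucket_size is (1000 // 8) * 4 = 500, so rank 0 draws
# its shuffled indices from [0, 500) and rank 1 from [500, 1000); bucket_offset skips the
# part of the current epoch that has already been consumed.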
+ for idx in idx_range: + batch.append(idx) + if len(batch) == self.micro_batch_size: + self.consumed_samples += self.micro_batch_times_data_parallel_size + yield batch + batch = [] + + # Check the last partial batch and see drop_last is set + if len(batch) > 0 and not self.drop_last: + yield batch diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index c18e685e38de..899f2fb2c06c 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -573,7 +573,7 @@ def add(self, *callbacks) -> "CallbackConnector": """ _pl_callback = None try: - import lightning.pytorch as pl + import pytorch_lightning as pl _pl_callback = pl.Callback except ImportError: diff --git a/nemo/lightning/pytorch/callbacks/__init__.py b/nemo/lightning/pytorch/callbacks/__init__.py index fcceedeb7090..5854c144885b 100644 --- a/nemo/lightning/pytorch/callbacks/__init__.py +++ b/nemo/lightning/pytorch/callbacks/__init__.py @@ -1,3 +1,3 @@ -from nemo_ext.lightning.pytorch.callbacks.progress import MegatronProgressBar +from nemo.lightning.pytorch.callbacks.progress import MegatronProgressBar __all__ = ["MegatronProgressBar"] diff --git a/nemo/lightning/pytorch/plugins/__init__.py b/nemo/lightning/pytorch/plugins/__init__.py new file mode 100644 index 000000000000..45f88a383681 --- /dev/null +++ b/nemo/lightning/pytorch/plugins/__init__.py @@ -0,0 +1,3 @@ +from nemo.lightning.pytorch.plugins.data_sampler import MegatronDataSampler + +__all__ = ["MegatronDataSampler"] diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py new file mode 100644 index 000000000000..1fca29ce01d3 --- /dev/null +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -0,0 +1,135 @@ +from typing import Any, Dict, List, Literal, Optional + +import pytorch_lightning as pl +from torch.utils.data import DataLoader + + +class DataSampler: + def connect(self, trainer: pl.Trainer): + self.trainer = trainer + + def setup(self, global_rank: int) -> None: + raise NotImplementedError() + + def transform_dataloader(self, dataloader: DataLoader, consumed_samples: int = 0) -> DataLoader: + raise NotImplementedError() + + +class MegatronDataSampler(DataSampler): + def __init__( + self, + seq_len: int, + micro_batch_size: int = 4, + global_batch_size: int = 8, + rampup_batch_size: Optional[List[int]] = None, + dataloader_type: Literal["single", "cyclic"] = "single", + ): + self.seq_len = seq_len + self.micro_batch_size = micro_batch_size + self.global_batch_size = global_batch_size + self.rampup_batch_size = rampup_batch_size + self.dataloader_type = dataloader_type + self.init_consumed_samples: int = 0 + self.prev_consumed_samples = 0 + self.if_first_step = 0 + self.prev_global_batch_size = None + + def setup(self, global_rank: int) -> None: + from nemo.lightning.data import setup_microbatch_calculator + + setup_microbatch_calculator(global_rank, self.micro_batch_size, self.global_batch_size, self.rampup_batch_size) + + def transform_dataloader(self, dataloader: DataLoader, consumed_samples: int = 0) -> DataLoader: + from nemo.lightning.data import add_megatron_sampler + + return add_megatron_sampler( + dataloader, + micro_batch_size=self.micro_batch_size, + global_batch_size=self.global_batch_size, + rampup_batch_size=self.rampup_batch_size, + consumed_samples=consumed_samples, + dataloader_type=self.dataloader_type, + ) + + def compute_consumed_samples(self, steps_since_resume=0) -> int: + from nemo.lightning.pytorch.strategies import 
MegatronStrategy + from nemo.utils import AppState + + if not isinstance(self.trainer.strategy, MegatronStrategy): + return 0 + + app_state = AppState() + + if self.rampup_batch_size is not None: + from apex.transformer.pipeline_parallel.utils import _GLOBAL_NUM_MICROBATCHES_CALCULATOR + + current_global_batch_size = getattr(_GLOBAL_NUM_MICROBATCHES_CALCULATOR, "current_global_batch_size", 1) + consumed_samples = self.prev_consumed_samples + self.if_first_step * current_global_batch_size + else: + consumed_samples = ( + self.init_consumed_samples + + steps_since_resume * app_state.data_parallel_size * self.micro_batch_size * self.num_microbatches + ) + + return int(consumed_samples) + + # Megatron callbacks + def on_megatron_step_start(self, trainer: pl.Trainer) -> None: + # do validation and save the checkpoint when gbs is changed + if ( + self.rampup_batch_size is not None + and self.prev_global_batch_size != self.current_global_batch_size + and self.prev_global_batch_size + ): + trainer.should_stop = True + + def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: + import apex.transformer.pipeline_parallel.utils + + if self.rampup_batch_size is None: + return + + self.prev_global_batch_size = self.current_global_batch_size + + # TODO: Add consumed samples + consumed_samples = self.compute_consumed_samples(trainer.global_step + 1 - self.init_global_step) + + self.prev_consumed_samples = consumed_samples + + num_microbatch_calculator = ( + apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR # noqa: SLF001 + ) + + num_microbatch_calculator.update( + consumed_samples=consumed_samples, consistency_check=False, + ) + current_global_batch_size = num_microbatch_calculator.current_global_batch_size + pl_module.log( + "global_batch_size", current_global_batch_size, prog_bar=True, rank_zero_only=True, batch_size=1, + ) + self.if_first_step = 1 + + @property + def megatron_data_kwargs(self) -> Dict[str, Any]: + return { + "seq_length": self.seq_len, + "micro_batch_size": self.micro_batch_size, + "num_microbatches": self.num_microbatches, + } + + @property + def num_microbatches(self) -> int: + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + + return get_num_microbatches() + + @property + def current_global_batch_size(self) -> int: + import apex.transformer.pipeline_parallel.utils + + num_microbatch_calculator = ( + apex.transformer.pipeline_parallel.utils._GLOBAL_NUM_MICROBATCHES_CALCULATOR # noqa: SLF001 + ) + current_global_batch_size = num_microbatch_calculator.current_global_batch_size + + return current_global_batch_size diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 0fa386cb45ef..89cbe98cf707 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -4,23 +4,23 @@ from collections import OrderedDict from contextlib import ExitStack from pathlib import Path -from typing import Any, ContextManager, Dict, List, Mapping, Optional, TypeVar, Union, cast +from typing import TYPE_CHECKING, Any, ContextManager, Dict, List, Mapping, Optional, TypeVar, Union, cast -import lightning.pytorch as pl +import pytorch_lightning as pl import torch import torch.distributed -from lightning.fabric.plugins import CheckpointIO, ClusterEnvironment -from lightning.fabric.utilities.optimizer import _optimizers_to_device -from lightning.pytorch.accelerators import CPUAccelerator -from lightning.pytorch.callbacks.progress import TQDMProgressBar 
-from lightning.pytorch.loops import _AutomaticOptimization, evaluation_loop, fit_loop, prediction_loop -from lightning.pytorch.loops.fetchers import _DataLoaderIterDataFetcher -from lightning.pytorch.overrides.distributed import _sync_module_states -from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO -from lightning.pytorch.strategies.ddp import DDPStrategy -from lightning.pytorch.trainer.states import RunningStage, TrainerFn -from lightning.pytorch.utilities.model_helpers import is_overridden -from lightning.pytorch.utilities.types import STEP_OUTPUT +from lightning_fabric.plugins import CheckpointIO, ClusterEnvironment +from lightning_fabric.utilities.optimizer import _optimizers_to_device +from pytorch_lightning.accelerators import CPUAccelerator +from pytorch_lightning.callbacks.progress import TQDMProgressBar +from pytorch_lightning.loops import _AutomaticOptimization, evaluation_loop, fit_loop, prediction_loop +from pytorch_lightning.loops.fetchers import _DataLoaderIterDataFetcher +from pytorch_lightning.overrides.distributed import _sync_module_states +from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO +from pytorch_lightning.strategies.ddp import DDPStrategy +from pytorch_lightning.trainer.states import RunningStage, TrainerFn +from pytorch_lightning.utilities.model_helpers import is_overridden +from pytorch_lightning.utilities.types import STEP_OUTPUT from torch import nn from torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks import noop_hook from torch.nn.parallel import DistributedDataParallel @@ -32,6 +32,9 @@ from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction from nemo.lightning.pytorch.callbacks import MegatronProgressBar +if TYPE_CHECKING: + from nemo.lightning.pytorch.plugins.data_sampler import DataSampler + ConfigT = TypeVar("ConfigT") @@ -51,7 +54,7 @@ def __init__( pipeline_model_parallel_size: int = 1, virtual_pipeline_model_parallel_size: Optional[int] = None, sequence_parallel: bool = False, - # data_sampler: Optional[DataSampler] = None, + data_sampler: Optional['DataSampler'] = None, parallel_devices: Optional[List[torch.device]] = None, cluster_environment=None, # TODO: Add type-hint checkpoint_io=None, # TODO: Add type-hint @@ -69,7 +72,7 @@ def __init__( ) self.no_ddp_communication_hook = no_ddp_communication_hook self.megatron_callbacks = CallbackConnector() - # self.data_sampler: Optional[DataSampler] = data_sampler + self.data_sampler: Optional['DataSampler'] = data_sampler self.tensor_model_parallel_size = tensor_model_parallel_size self.pipeline_model_parallel_size = pipeline_model_parallel_size self.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size @@ -371,10 +374,10 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None: def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = True) -> None: assert self.megatron_parallel is not None - from megatron.core import mpu + from megatron.core import parallel_state for index, module in enumerate(self.megatron_parallel): - if mpu.get_virtual_pipeline_model_parallel_world_size() is not None: + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: checkpoint_state_dict = checkpoint['state_dict'][f'model_{index}'] else: checkpoint_state_dict = checkpoint['state_dict'] @@ -402,9 +405,9 @@ def _get_data_step(self, step_type: str) -> Optional[_ModuleStepFunction]: return None def _get_forward_step(self, step_type: str) -> 
Optional[_ModuleStepFunction]: - from megatron.core import mpu + from megatron.core import parallel_state - if mpu.is_pipeline_last_stage(): + if parallel_state.is_pipeline_last_stage(): if not hasattr(self.lightning_module, f"{step_type}_step"): raise ValueError(f"LightningModule does not have {step_type}_step method") @@ -463,7 +466,6 @@ def parallelism(self): tensor_model_parallel_size=self.tensor_model_parallel_size, pipeline_model_parallel_size=self.pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=self.virtual_pipeline_model_parallel_size, - sequence_parallel=self.sequence_parallel, ) @@ -486,8 +488,6 @@ def wrapped(trainer: pl.Trainer, stage: RunningStage): if isinstance(trainer.strategy, MegatronStrategy): return _DataLoaderIterDataFetcher() - return fn(trainer, stage) - return wrapped diff --git a/nemo/llm/__init__.py b/nemo/llm/__init__.py new file mode 100644 index 000000000000..2dd39b3f170e --- /dev/null +++ b/nemo/llm/__init__.py @@ -0,0 +1,11 @@ +from nemo.llm.gpt.data import MockDataModule +from nemo.llm.gpt.model import GPTConfig, GPTModel, MaskedTokenLossReduction, gpt_data_step, gpt_forward_step + +__all__ = [ + "MockDataModule", + "GPTModel", + "GPTConfig", + "gpt_data_step", + "gpt_forward_step", + "MaskedTokenLossReduction", +] diff --git a/nemo/llm/gpt/__init__.py b/nemo/llm/gpt/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/llm/gpt/data/__init__.py b/nemo/llm/gpt/data/__init__.py new file mode 100644 index 000000000000..e9b7c07c16cc --- /dev/null +++ b/nemo/llm/gpt/data/__init__.py @@ -0,0 +1,3 @@ +from nemo.llm.gpt.data.mock import MockDataModule + +__all__ = ["MockDataModule"] diff --git a/nemo/llm/gpt/data/mock.py b/nemo/llm/gpt/data/mock.py new file mode 100644 index 000000000000..ff035a78453d --- /dev/null +++ b/nemo/llm/gpt/data/mock.py @@ -0,0 +1,137 @@ +from typing import TYPE_CHECKING, Dict, List, Optional + +import numpy as np +import pytorch_lightning as pl +import torch +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch.utils import data +from torch.utils.data import DataLoader, Dataset + +from nemo.lightning.pytorch.plugins import MegatronDataSampler + +if TYPE_CHECKING: + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +class MockDataModule(pl.LightningDataModule): + def __init__( + self, + seq_length: int = 2048, + tokenizer: Optional["TokenizerSpec"] = None, + micro_batch_size: int = 4, + global_batch_size: int = 8, + rampup_batch_size: Optional[List[int]] = None, + num_train_samples: int = 10_000, + num_val_samples: int = 10_000, + num_test_samples: int = 10_000, + num_workers: int = 8, + pin_memory: bool = True, + persistent_workers: bool = False, + ): + super().__init__() + self.seq_length = seq_length + self.num_train_samples = num_train_samples + self.num_val_samples = num_val_samples + self.num_test_samples = num_test_samples + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + + from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + + self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "GPT2BPETokenizer") + self.data_sampler = MegatronDataSampler( + seq_len=self.seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + rampup_batch_size=rampup_batch_size, + ) + + def setup(self, stage: str = "") -> None: + self._train_ds = _MockGPTDataset(self.tokenizer, "train", self.num_train_samples, 
self.seq_length) + self._validation_ds = _MockGPTDataset(self.tokenizer, "valid", self.num_val_samples, self.seq_length) + self._test_ds = _MockGPTDataset(self.tokenizer, "test", self.num_test_samples, self.seq_length) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + return self._create_dataloader(self._train_ds) + + def val_dataloader(self) -> EVAL_DATALOADERS: + return self._create_dataloader(self._validation_ds) + + def test_dataloader(self) -> EVAL_DATALOADERS: + return self._create_dataloader(self._test_ds) + + def _create_dataloader(self, dataset, **kwargs) -> DataLoader: + return DataLoader( + dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=dataset.collate_fn, + **kwargs, + ) + + +class _MockGPTDataset(Dataset): + def __init__( + self, tokenizer: "TokenizerSpec", name: str, num_samples: int, seq_length: int, seed: int = 42, + ) -> None: + super().__init__() + self.name = name + self.seq_length = seq_length + self.vocab_size = tokenizer.vocab_size + self.length = num_samples + self.seed = seed + + self.attention_mask = torch.tril(torch.ones((self.seq_length, self.seq_length))).unsqueeze(0) + self.attention_mask = self.attention_mask < 0.5 + self.loss_mask = torch.ones(self.seq_length, dtype=torch.float) + self.position_ids = torch.arange(self.seq_length, dtype=torch.int64) + + def __len__(self) -> int: + return self.length + + def _get_text(self, idx: int) -> np.ndarray: + np_gen = np.random.default_rng(seed=(self.seed + idx)) + return np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64) + + def __getitem__(self, idx) -> Dict[str, torch.Tensor]: + # Generate data of the expected size and datatype (based on GPTDataset). + np_gen = np.random.default_rng(seed=(self.seed + idx)) + tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64)) + labels = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length], dtype=np.int64)) + + return { + "tokens": tokens, + "labels": labels, + "attention_mask": self.attention_mask, + "loss_mask": self.loss_mask, + "position_ids": self.position_ids, + } + + def _collate_fn(self, batch): + """ + A default implementation of a collation function. + Users should override this method to define custom data loaders. + """ + return data.dataloader.default_collate(batch) + + def collate_fn(self, batch): + """Method that user pass as functor to DataLoader. + + The method optionally performs neural type checking and add types to the outputs. + + Please note, subclasses of Dataset should not implement `input_types`. + + # Usage: + dataloader = torch.utils.data.DataLoader( + ...., + collate_fn=dataset.collate_fn, + .... + ) + + Returns + ------- + Collated batch, with or without types. 
+ """ + return self._collate_fn(batch) diff --git a/nemo/llm/gpt/model/__init__.py b/nemo/llm/gpt/model/__init__.py new file mode 100644 index 000000000000..9481e75542ed --- /dev/null +++ b/nemo/llm/gpt/model/__init__.py @@ -0,0 +1,3 @@ +from nemo.llm.gpt.model.base import GPTConfig, GPTModel, MaskedTokenLossReduction, gpt_data_step, gpt_forward_step + +__all__ = ["GPTConfig", "GPTModel", "MaskedTokenLossReduction", "gpt_data_step", "gpt_forward_step"] diff --git a/nemo/llm/gpt/model/base.py b/nemo/llm/gpt/model/base.py new file mode 100644 index 000000000000..02588b494077 --- /dev/null +++ b/nemo/llm/gpt/model/base.py @@ -0,0 +1,231 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional + +import pytorch_lightning as L +import torch +import torch.distributed +from megatron.core.transformer.transformer_config import TransformerConfig +from torch.optim import Optimizer + +from nemo.lightning import get_vocab_size +from nemo.lightning.megatron_parallel import MaskedTokenLossReduction + +if TYPE_CHECKING: + from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel + + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +@dataclass +class GPTConfig(TransformerConfig): + # From megatron.core.models.gpt.gpt_model.GPTModel + fp16_lm_cross_entropy: bool = False + parallel_output: bool = True + share_embeddings_and_output_weights: bool = False + position_embedding_type: Literal["learned_absolute", "rope"] = "learned_absolute" + rotary_percent: float = 1.0 + seq_len_interpolation_factor: Optional[float] = None + seq_length: int = 1024 + + # TODO: Move this to better places? + get_attention_mask_from_fusion: bool = False + + optimizer_fn: Optional[Callable[["GPTModel"], Optimizer]] = None + + def configure_model(self, tokenizer) -> "MCoreGPTModel": + vp_size = self.virtual_pipeline_model_parallel_size + if vp_size: + p_size = self.pipeline_model_parallel_size + assert ( + self.num_layers // p_size + ) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages." 
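# Illustrative example (not part of the original patch): with num_layers=24,
# pipeline_model_parallel_size=4 and virtual_pipeline_model_parallel_size=2, each pipeline
# stage owns 24 // 4 = 6 layers split into 2 virtual chunks of 3 layers, so the assert above
# passes; virtual_pipeline_model_parallel_size=4 would fail because 6 % 4 != 0.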
+ + from megatron.core import parallel_state + from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec + from megatron.core.models.gpt.gpt_model import GPTModel as MCoreGPTModel + + return MCoreGPTModel( + self, + transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(), + vocab_size=get_vocab_size(self, tokenizer.vocab_size), + max_sequence_length=self.seq_length, + fp16_lm_cross_entropy=self.fp16_lm_cross_entropy, + parallel_output=self.parallel_output, + share_embeddings_and_output_weights=self.share_embeddings_and_output_weights, + position_embedding_type=self.position_embedding_type, + rotary_percent=self.rotary_percent, + seq_len_interpolation_factor=self.seq_len_interpolation_factor, + pre_process=parallel_state.is_pipeline_first_stage(), + post_process=parallel_state.is_pipeline_last_stage(), + ) + + +class GPTModel(L.LightningModule): + def __init__( + self, + config: GPTConfig, + # TODO: Add transformer_layer_spec when we update mcore + tokenizer: Optional["TokenizerSpec"] = None, + ): + super().__init__() + self.config = config + self.tokenizer = tokenizer + + def configure_model(self) -> None: + self.module = self.config.configure_model(self.tokenizer) + + def configure_optimizers(self) -> Optimizer: + if self.config.optimizer_fn is not None: + return self.config.optimizer_fn(self) + + return gpt_default_optimizer(self) + + def forward( + self, + input_ids: torch.Tensor, + position_ids: torch.Tensor, + attention_mask: torch.Tensor, + labels: Optional[torch.Tensor] = None, + decoder_input: Optional[torch.Tensor] = None, + inference_params=None, + ) -> torch.Tensor: + output_tensor = self.module( + input_ids, + position_ids, + attention_mask, + decoder_input=decoder_input, + labels=labels, + inference_params=inference_params, + ) + + return output_tensor + + def data_step(self, dataloader_iter) -> Dict[str, torch.Tensor]: + return gpt_data_step(dataloader_iter) + + def forward_step(self, batch) -> torch.Tensor: + return gpt_forward_step(self, batch) + + def training_step(self, batch, batch_idx=None) -> torch.Tensor: + # In mcore the loss-function is part of the forward-pass (when labels are provided) + + return self.forward_step(batch) + + def validation_step(self, batch, batch_idx=None) -> torch.Tensor: + # In mcore the loss-function is part of the forward-pass (when labels are provided) + + return self.forward_step(batch) + + def training_loss_reduction(self) -> MaskedTokenLossReduction: + return MaskedTokenLossReduction() + + def validation_loss_reduction(self) -> MaskedTokenLossReduction: + return MaskedTokenLossReduction(validation_step=True) + + def copy(self) -> "GPTModel": + return self.__class__(self.config, self.tokenizer) + + +def gpt_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: + from megatron.core import parallel_state + + # Based on: https://github.com/NVIDIA/Megatron-LM/blob/main/pretrain_gpt.py#L87 + # https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py#L828-L842 + + batch = next(dataloader_iter) + + _batch: dict + if isinstance(batch, tuple) and len(batch) == 3: + _batch = batch[0] + else: + _batch = batch + + required_keys = set() + required_keys.add("attention_mask") + if parallel_state.is_pipeline_first_stage(): + required_keys.update(("tokens", "position_ids")) + if parallel_state.is_pipeline_last_stage(): + required_keys.update(("labels", "loss_mask")) + # if self.get_attention_mask_from_fusion: + # required_keys.remove('attention_mask') + + 
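# Illustrative note (not part of the original patch): the dict comprehension below moves only
# the keys this pipeline stage needs to the GPU and sets the rest to None. Without pipeline
# parallelism a rank is both the first and the last stage, so tokens, position_ids, labels,
# loss_mask and attention_mask are all kept; an intermediate stage of a deep pipeline keeps
# only attention_mask.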
_batch = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in _batch.items()} + # slice batch along sequence dimension for context parallelism + output = get_batch_on_this_context_parallel_rank(_batch) + + return output + + +def gpt_forward_step(model, batch) -> torch.Tensor: + forward_args = { + "input_ids": batch["tokens"], + "position_ids": batch["position_ids"], + "attention_mask": batch["attention_mask"], + "labels": batch["labels"], + } + + if 'cu_seqlens' in batch: + forward_args['packed_seq_params'] = get_packed_seq_params(batch) + + return model(**forward_args) + + +def gpt_default_optimizer(module) -> Optimizer: + from apex.optimizers import FusedAdam + + return FusedAdam(module.parameters(), lr=1e-4) + + +def get_batch_on_this_context_parallel_rank(batch): + from megatron.core import parallel_state + + if (cp_size := parallel_state.get_context_parallel_world_size()) > 1: + num_valid_tokens_in_ub = None + if 'loss_mask' in batch and batch['loss_mask'] is not None: + num_valid_tokens_in_ub = batch['loss_mask'].sum() + + cp_rank = parallel_state.get_context_parallel_rank() + for key, val in batch.items(): + if val is not None: + seq_dim = 1 if key != 'attention_mask' else 2 + _val = val.view( + *val.shape[0:seq_dim], + 2 * cp_size, + val.shape[seq_dim] // (2 * cp_size), + *val.shape[(seq_dim + 1) :], + ) + index = torch.tensor([cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True).cuda( + non_blocking=True + ) + _val = _val.index_select(seq_dim, index) + _val = _val.view(*val.shape[0:seq_dim], -1, *_val.shape[(seq_dim + 2) :]) + batch[key] = _val + batch['num_valid_tokens_in_ub'] = num_valid_tokens_in_ub + return batch + + +def get_packed_seq_params(batch): + from megatron.core.packed_seq_params import PackedSeqParams + + cu_seqlens = batch['cu_seqlens'].squeeze() # remove batch size dimension (mbs=1) + # remove -1 "paddings" added in collate_fn + if (cu_seqlens_argmin := batch.get('cu_seqlens_argmin', None)) is not None: + # pre-compute cu_seqlens_argmin in dataset class for perf + cu_seqlens = cu_seqlens[: cu_seqlens_argmin.item()] + else: + cu_seqlens = cu_seqlens[: torch.argmin(cu_seqlens)] + + # pre-compute max_seqlens in dataset class for perf + max_seqlen = batch['max_seqlen'].squeeze() if 'max_seqlen' in batch else None + + # these args are passed eventually into TEDotProductAttention.forward() + return PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_kv=max_seqlen, + qkv_format='thd', + ) + + +__all__ = ["GPTModel", "GPTConfig", "gpt_data_step", "gpt_forward_step", "gpt_default_optimizer"] From 7c3e18dcaa2949ee8898dcf4a2af6207bda55448 Mon Sep 17 00:00:00 2001 From: huvunvidia <86480512+huvunvidia@users.noreply.github.com> Date: Sat, 27 Apr 2024 09:04:17 -0400 Subject: [PATCH 19/30] Adding unit test for mcore RETRO model (#9022) * runnable test_retro_model.py, with spm_tok_ende_4k/tokenizer.model tokenizer * cleaning code * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Huy Vu2 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- tests/collections/nlp/test_retro_model.py | 250 ++++++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 tests/collections/nlp/test_retro_model.py diff --git a/tests/collections/nlp/test_retro_model.py b/tests/collections/nlp/test_retro_model.py new file mode 100644 index
000000000000..ec100338a137 --- /dev/null +++ b/tests/collections/nlp/test_retro_model.py @@ -0,0 +1,250 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json + +import pytest +import torch +from omegaconf import DictConfig +from pytorch_lightning import Trainer + +from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel +from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy + +DEVICE_CAPABILITY = None +if torch.cuda.is_available(): + DEVICE_CAPABILITY = torch.cuda.get_device_capability() + + +@pytest.fixture() +def retro_workdir_path(test_data_dir): + + config_file = { + "retro_bert_tokenizer_type": "BertWordPieceLowerCase", + "retro_bert_vocab_file": "", + "retro_block_size": 1000, + "retro_gpt_chunk_length": 64, + "retro_gpt_data_cache_path": None, + "retro_gpt_data_path": "", + "retro_gpt_eval_interval": 2000, + "retro_gpt_eval_iters": 100, + "retro_gpt_global_batch_size": 8, + "retro_gpt_merge_file": None, + "retro_gpt_seed": 1234, + "retro_gpt_seq_length": 2048, + "retro_gpt_split": "98,2,0", + "retro_gpt_tokenizer_model": "spm_tok_ende_4k/tokenizer.model", + "retro_gpt_tokenizer_type": "GPTSentencePieceTokenizer", + "retro_gpt_train_samples": 5000, + "retro_gpt_valid_samples": 5000, + "retro_gpt_vocab_file": None, + "retro_neighbor_dirs": {"test": None, "train": None, "valid": None}, + } + + # save config to json file in retro_workdir_path + retro_workdir_path = test_data_dir + "/nlp" + config_file_path = retro_workdir_path + "/config.json" + out_file = open(config_file_path, 'w') + json.dump(config_file, out_file) + out_file.close() + + return retro_workdir_path + + +@pytest.fixture() +def model_cfg(test_data_dir, retro_workdir_path): + + # set model configs + model_cfg = { + 'mcore_gpt': True, + 'precision': '16', + 'micro_batch_size': 4, + 'global_batch_size': 8, + 'tensor_model_parallel_size': 1, + 'pipeline_model_parallel_size': 1, + 'resume_from_checkpoint': None, + 'encoder_seq_length': 2048, + 'max_position_embeddings': 2048, + 'num_layers': 12, + 'hidden_size': 768, + 'ffn_hidden_size': 3072, + 'num_attention_heads': 12, + 'init_method_std': 0.023, + 'hidden_dropout': 0.1, + 'kv_channels': 64, + # 'apply_query_key_layer_scaling': False, + 'apply_query_key_layer_scaling': True, + 'layernorm_epsilon': 1e-5, + 'make_vocab_size_divisible_by': 128, + 'pre_process': True, + 'post_process': True, + 'persist_layer_norm': True, + 'bias': True, + 'activation': 'gelu', + 'transformer_block_type': 'pre_ln', + 'retro': { + # 'retro_project_dir': os.path.join('tests/.data/test_data/nlp/retro_workdir_dummy'), + # 'retro_project_dir': os.path.join(test_data_dir, 'nlp/retro_workdir_dummy'), + # 'retro_project_dir': '/lustre/fsw/coreai_dlalgo_genai/huvu/data/retro/pretrain_data/micro-wiki-core-unittest', + 'retro_project_dir': retro_workdir_path, + 
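# Note (not part of the original patch): retro_workdir_path is the fixture defined above,
# which writes the RETRO preprocessing settings to <test_data_dir>/nlp/config.json; the
# model picks up its data and tokenizer configuration from that directory via
# retro_project_dir.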
'retro_encoder_num_layers': 2, + 'retro_encoder_hidden_dropout': 0.1, + 'retro_encoder_attention_dropout': 0.1, + 'retro_num_neighbors': 2, + 'retro_num_retrieved_chunks': 2, + 'retro_verify_neighbor_count': True, + }, + 'tokenizer': { + 'library': 'megatron', + 'type': None, + 'model': None, + 'vocab_file': None, + 'merge_file': None, + 'delimiter': None, + 'sentencepiece_legacy': False, + }, + 'native_amp_init_scale': 4294967296, + 'native_amp_growth_interval': 1000, + 'hysteresis': 2, + 'fp32_residual_connection': False, + 'fp16_lm_cross_entropy': False, + 'megatron_amp_O2': True, + 'seed': 1234, + 'use_cpu_initialization': False, + 'onnx_safe': False, + 'apex_transformer_log_level': 30, + 'activations_checkpoint_method': None, + 'activations_checkpoint_num_layers': None, + 'data': { + 'data_prefix': 'None', + 'index_mapping_dir': None, + 'data_impl': 'mmap', + 'splits_string': '98,2,0', + 'seq_length': 2048, + 'skip_warmup': True, + 'num_workers': 2, + 'dataloader_type': 'single', + 'reset_position_ids': False, + 'reset_attention_mask': False, + 'eod_mask_loss': False, + 'shuffle_documents': False, + 'retro_data': { + 'retro_block_size': 10000, + 'retro_chunk_length': 64, + 'retro_split_preprocessing': "98,2,0", + 'retro_neighbor_dirs': None, + }, + }, + 'optim': { + 'name': 'distributed_fused_adam', + 'lr': 6.0e-4, + 'weight_decay': 0.1, + 'betas': [0.9, 0.95], + 'sched': {'name': 'CosineAnnealing', 'warmup_steps': None, 'constant_steps': None, 'min_lr': '6.0e-5'}, + }, + } + return model_cfg + + +@pytest.fixture() +def trainer_cfg(): + + trainer_cfg = { + 'devices': 1, + 'num_nodes': 1, + 'accelerator': 'gpu', + 'precision': '16', + 'logger': False, + 'enable_checkpointing': False, + 'use_distributed_sampler': False, + 'max_epochs': -1, + 'max_steps': 750000, + 'log_every_n_steps': 10, + 'val_check_interval': 100, + 'limit_val_batches': 50, + 'limit_test_batches': 500, + 'accumulate_grad_batches': 1, + 'gradient_clip_val': 1.0, + } + + return trainer_cfg + + +@pytest.fixture() +def retro_model(model_cfg, trainer_cfg): + + strategy = NLPDDPStrategy() + + trainer = Trainer(strategy=strategy, **trainer_cfg) + + cfg = DictConfig(model_cfg) + + model = MegatronRetroModel(cfg=cfg, trainer=trainer) + + return model + + +@pytest.mark.run_only_on('GPU') +class TestRETROModel: + @pytest.mark.unit + def test_constructor(self, retro_model): + assert isinstance(retro_model, MegatronRetroModel) + + num_weights = retro_model.num_weights + # assert num_weights == 306868224 # using "tokenizer/mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model" tokenizer + assert num_weights == 113405952 # using "spm_tok_ende_4k/tokenizer.model" tokenizer + + @pytest.mark.unit + def test_forward(self, retro_model): + + # create dummy input + batch_size = 4 + neighbors = 2 + seq_length = 2048 + chunk_length = 64 + num_chunks = seq_length // chunk_length + retrieved_chunk_size = chunk_length * 2 + vocab_size = 2000 + eos_id = vocab_size - 2 + + # set input for forward + all_tokens = torch.randint(0, vocab_size, (batch_size, seq_length + 1)).cuda() + tokens = all_tokens[:, :-1] + labels = all_tokens[:, 1:] + attention_mask, _, text_position_ids = get_ltor_masks_and_position_ids(tokens, eos_id, False, False, False) + context_input_ids = torch.randint( + 0, vocab_size, (batch_size * num_chunks * neighbors, retrieved_chunk_size) + ).cuda() + _, _, context_position_ids = get_ltor_masks_and_position_ids( # neighbor_tokens is already a 2D array + context_input_ids, eos_id, False, False, False + ) + context_mask = 
None + + # set model to eval mode + retro_model.eval() + + # forward step + with torch.no_grad(): + out = retro_model( + tokens=tokens.cuda(), + text_position_ids=text_position_ids.cuda(), + attention_mask=attention_mask.cuda(), + labels=labels.cuda(), + context_input_ids=context_input_ids.cuda(), + context_position_ids=context_position_ids.cuda(), + context_mask=context_mask, + ) + + assert out.shape == torch.Size([batch_size, seq_length]) From d600c4c9a3da9587b2683493cad2f12f243c1bf2 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 27 Apr 2024 13:03:03 -0700 Subject: [PATCH 20/30] add tag/label for 1-gpu runner (#9046) --- .github/workflows/cicd-main.yml | 208 ++++++++++++++++---------------- 1 file changed, 104 insertions(+), 104 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 83016a738e71..5779a860fbbb 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -325,7 +325,7 @@ jobs: # L2: ASR dev run ASR_dev_run_Speech_to_Text: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -343,7 +343,7 @@ jobs: python examples/asr/asr_ctc/speech_to_text_ctc.py \ model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ exp_manager.exp_dir=examples/asr/speech_to_text_results @@ -353,7 +353,7 @@ jobs: ASR_dev_run_Speech_to_Text_WPE_-_CitriNet: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -374,7 +374,7 @@ jobs: model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ model.tokenizer.dir="/home/TestData/asr_tokenizers/an4_wpe_128/" \ model.tokenizer.type="wpe" \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ exp_manager.exp_dir=examples/asr/speech_to_text_wpe_results @@ -384,7 +384,7 @@ jobs: ASR_dev_run_Speech_Pre-training_-_CitriNet: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -403,7 +403,7 @@ jobs: --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \ model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ exp_manager.exp_dir=examples/asr/speech_pre_training_results @@ -413,7 +413,7 @@ jobs: ASR_dev_run_Speech_To_Text_Finetuning: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -434,7 +434,7 @@ jobs: model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ init_from_nemo_model=/home/TestData/asr/stt_en_fastconformer_transducer_large.nemo \ model.tokenizer.update_tokenizer=False \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ exp_manager.exp_dir=examples/asr/speech_finetuning_results @@ -444,7 +444,7 @@ jobs: 
ASR_dev_run_Speech_To_Text_HF_Finetuning: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -481,7 +481,7 @@ jobs: model.optim.sched.warmup_steps=0 \ +model.optim.sched.max_steps=3 \ trainer.max_epochs=null \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ exp_manager.exp_dir=examples/asr/speech_finetuning_results @@ -491,7 +491,7 @@ jobs: ASR_dev_run_Speech_to_Text_WPE_-_Conformer: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -514,7 +514,7 @@ jobs: model.tokenizer.type="wpe" \ model.train_ds.batch_size=4 \ model.validation_ds.batch_size=4 \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ exp_manager.exp_dir=examples/asr/speech_to_text_wpe_conformer_results @@ -525,7 +525,7 @@ jobs: # L2: ASR dev run - part two ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -549,7 +549,7 @@ jobs: model.encoder.d_model=144 \ model.train_ds.batch_size=4 \ model.validation_ds.batch_size=4 \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ exp_manager.exp_dir=examples/asr/speech_to_text_wpe_squeezeformer_results @@ -588,7 +588,7 @@ jobs: # L2_Speech_to_Text_AED: # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure + # runs-on: self-hosted-azure-gpus-1 # container: # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} # options: @@ -627,7 +627,7 @@ jobs: # model.tokenizer.langs.en.type=bpe \ # ++model.tokenizer.langs.es.dir=/home/TestData/asr_tokenizers/canary/es/tokenizer_spe_bpe_v1024_max_4 \ # ++model.tokenizer.langs.es.type=bpe \ - # trainer.devices=[0] \ + # trainer.devices=1 \ # trainer.accelerator="gpu" \ # +trainer.use_distributed_sampler=false \ # +trainer.fast_dev_run=True \ @@ -638,7 +638,7 @@ jobs: # L2: Speaker dev run L2_Speaker_dev_run_Speaker_Recognition: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -660,7 +660,7 @@ jobs: model.validation_ds.manifest_filepath=/home/TestData/an4_speaker/dev.json \ model.decoder.num_classes=2 \ trainer.max_epochs=10 \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ exp_manager.exp_dir=examples/speaker_tasks/recognition/speaker_recognition_results @@ -670,7 +670,7 @@ jobs: L2_Speaker_dev_run_Speaker_Diarization: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -693,7 +693,7 @@ jobs: model.validation_ds.emb_dir=examples/speaker_tasks/diarization/speaker_diarization_results \ model.train_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_train/msdd_data.50step.json \ model.validation_ds.manifest_filepath=/home/TestData/an4_diarizer/simulated_valid/msdd_data.50step.json \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ 
exp_manager.exp_dir=examples/speaker_tasks/diarization/speaker_diarization_results @@ -703,7 +703,7 @@ jobs: L2_Speaker_dev_run_Speech_to_Label: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -722,7 +722,7 @@ jobs: model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ model.validation_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ model.test_ds.manifest_filepath=/home/TestData/speech_commands/test_manifest.json \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ model.preprocessor._target_=nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor \ @@ -858,7 +858,7 @@ jobs: # L2: ASR Multi-dataloader dev run L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -876,7 +876,7 @@ jobs: python examples/asr/asr_ctc/speech_to_text_ctc.py \ model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ model.validation_ds.manifest_filepath=[/home/TestData/an4_dataset/an4_val.json,/home/TestData/an4_dataset/an4_val.json] \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ trainer.max_epochs=1 \ trainer.max_steps=1 \ @@ -888,7 +888,7 @@ jobs: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -906,7 +906,7 @@ jobs: python examples/asr/speech_classification/speech_to_label.py \ model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ model.validation_ds.manifest_filepath=[/home/TestData/speech_commands/test_manifest.json,/home/TestData/speech_commands/test_manifest.json] \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ trainer.max_epochs=1 \ trainer.max_steps=1 \ @@ -927,7 +927,7 @@ jobs: # L2: ASR Adapters L2_ASR_Adapters_Linear_Adapters: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -949,7 +949,7 @@ jobs: model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ trainer.max_steps=5 \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ exp_manager.exp_dir=examples/asr/speech_to_text_adapters_results @@ -959,7 +959,7 @@ jobs: L2_ASR_Adapters_RelPos_MHA_Adapters: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -982,7 +982,7 @@ jobs: model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ model.validation_ds.manifest_filepath=/home/TestData/an4_dataset/an4_val.json \ trainer.max_steps=5 \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=True \ exp_manager.exp_dir=examples/asr/speech_to_text_adapters_mha_results @@ -1109,7 +1109,7 @@ jobs: # L2: G2P Models 
L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1133,7 +1133,7 @@ jobs: model.tokenizer.dir=/home/TestData/g2p/tokenizer_spe_unigram_v512 \ trainer.max_epochs=1 \ model.max_source_len=64 \ - trainer.devices=[0] \ + trainer.devices=1 \ do_training=True \ do_testing=True \ exp_manager.exp_dir=${OUTPUT_DIR_CONFORMER} \ @@ -1158,7 +1158,7 @@ jobs: # model.test_ds.manifest_filepath=/home/TestData/g2p/g2p.json \ # trainer.max_epochs=1 \ # model.max_source_len=64 \ - # trainer.devices=[1] \ + # trainer.devices=1 \ # do_training=True \ # do_testing=True \ # exp_manager.exp_dir=${OUTPUT_DIR_T5} \ @@ -1217,7 +1217,7 @@ jobs: # TODO: pleasefixme # L2_Dialogue_Classification_Dialogue_Intent_and_slot_classification_using_GPT: # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure + # runs-on: self-hosted-azure-gpus-1 # container: # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} # options: @@ -1246,7 +1246,7 @@ jobs: # model.test_ds.batch_size=2 \ # model.nemo_path=null \ # trainer.val_check_interval=0.0 \ - # trainer.devices=[0] \ + # trainer.devices=1 \ # model.dataset.use_cache=false \ # model.tokenizer.special_tokens={pad_token:"endoftext"} \ # model.tokenizer.tokenizer_name=gpt2 \ @@ -1258,7 +1258,7 @@ jobs: L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1286,7 +1286,7 @@ jobs: model.dataset.num_tasks=6 \ model.nemo_path=null \ trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ + trainer.devices=1 \ model.dataset.use_cache=false \ model.language_model.pretrained_model_name=bert-base-cased \ trainer.accelerator=gpu \ @@ -1297,7 +1297,7 @@ jobs: L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1324,7 +1324,7 @@ jobs: model.test_ds.batch_size=2 \ model.nemo_path=null \ trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ + trainer.devices=1 \ model.dataset.use_cache=false \ model.language_model.pretrained_model_name=bert-base-uncased \ trainer.accelerator=gpu \ @@ -1335,7 +1335,7 @@ jobs: L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1365,7 +1365,7 @@ jobs: model.test_ds.batch_size=2 \ model.nemo_path=null \ trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ + trainer.devices=1 \ model.dataset.use_cache=false \ model.language_model.pretrained_model_name=bert-base-uncased \ trainer.accelerator=gpu \ @@ -1376,7 +1376,7 @@ jobs: L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1407,7 +1407,7 @@ jobs: model.test_ds.batch_size=2 \ model.nemo_path=null \ trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ + 
trainer.devices=1 \ model.dataset.use_cache=false \ model.language_model.pretrained_model_name=bert-base-uncased \ trainer.accelerator=gpu \ @@ -1418,7 +1418,7 @@ jobs: L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1442,7 +1442,7 @@ jobs: model.dataset.task=design \ model.dataset.prompt_template="This example is related to" \ model.library=huggingface \ - trainer.devices=[1] \ + trainer.devices=1 \ model.dataset.use_cache=false \ model.language_model.pretrained_model_name=bert-base-uncased \ trainer.accelerator=gpu \ @@ -1453,7 +1453,7 @@ jobs: L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1476,7 +1476,7 @@ jobs: model.dataset.task=design \ model.dataset.prompt_template="" \ model.library=huggingface \ - trainer.devices=[0] \ + trainer.devices=1 \ model.dataset.use_cache=false \ model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ trainer.accelerator=gpu \ @@ -1488,7 +1488,7 @@ jobs: # L2: Dialogue Generation L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1518,7 +1518,7 @@ jobs: model.test_ds.batch_size=2 \ model.nemo_path=null \ trainer.val_check_interval=0.0 \ - trainer.devices=[1] \ + trainer.devices=1 \ model.dataset.use_cache=false \ model.language_model.pretrained_model_name=facebook/bart-large \ trainer.accelerator=gpu \ @@ -1529,7 +1529,7 @@ jobs: L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1565,7 +1565,7 @@ jobs: model.test_ds.batch_size=2 \ model.nemo_path=null \ trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ + trainer.devices=1 \ model.language_model.pretrained_model_name=facebook/bart-large \ trainer.accelerator=gpu \ exp_manager=null && \ @@ -1593,7 +1593,7 @@ jobs: # model.dataset.task=ms_marco \ # model.dataset.debug_mode=True \ # trainer.val_check_interval=0.0 \ -# trainer.devices=[0] \ +# trainer.devices=1 \ # model.dataset.use_cache=false \ # model.language_model.pretrained_model_name=gpt2 \ # trainer.accelerator=gpu \ @@ -1607,7 +1607,7 @@ jobs: # L2: COPY L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1631,7 +1631,7 @@ jobs: model.dataset.task=ms_marco \ model.dataset.debug_mode=True \ trainer.val_check_interval=0.0 \ - trainer.devices=[0] \ + trainer.devices=1 \ model.dataset.use_cache=false \ model.language_model.pretrained_model_name=gpt2 \ trainer.accelerator=gpu \ @@ -1703,7 +1703,7 @@ jobs: # trainer.accelerator=gpu \ # trainer.strategy=ddp \ # trainer.precision=16 \ -# trainer.devices=[1] \ +# trainer.devices=1 \ # 
trainer.accelerator="gpu" \ # +trainer.fast_dev_run=true \ # exp_manager=null @@ -1713,7 +1713,7 @@ jobs: # L2: BERT Text Classification L2_BERT_Text_Classification_with_BERT_Test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1737,7 +1737,7 @@ jobs: model.train_ds.batch_size=10 \ model.dataset.max_seq_length=50 \ model.dataset.use_cache=false \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=true \ exp_manager=null @@ -1747,7 +1747,7 @@ jobs: # L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_BERT_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1780,7 +1780,7 @@ jobs: model.language_model.pretrained_model_name=bert-base-uncased \ model.dataset.version_2_with_negative=false \ trainer.precision=16 \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ exp_manager=null - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" @@ -1788,7 +1788,7 @@ jobs: L2_Parallel_BERT_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1818,7 +1818,7 @@ jobs: model.language_model.pretrained_model_name=bert-base-uncased \ model.dataset.version_2_with_negative=true \ trainer.precision=16 \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ exp_manager=null - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" @@ -1827,7 +1827,7 @@ jobs: # L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_BART_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1860,7 +1860,7 @@ jobs: model.language_model.pretrained_model_name=facebook/bart-base \ model.dataset.version_2_with_negative=false \ trainer.precision=16 \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ exp_manager=null - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" @@ -1868,7 +1868,7 @@ jobs: L2_Parallel_BART_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1898,7 +1898,7 @@ jobs: model.language_model.pretrained_model_name=facebook/bart-base \ model.dataset.version_2_with_negative=true \ trainer.precision=16 \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ exp_manager=null - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" @@ -1907,7 +1907,7 @@ jobs: # L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0 L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1940,7 +1940,7 @@ jobs: model.language_model.pretrained_model_name=gpt2 \ model.dataset.version_2_with_negative=false \ trainer.precision=16 \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ exp_manager=null - uses: 
"NVIDIA/NeMo/.github/actions/cancel-workflow@main" @@ -1948,7 +1948,7 @@ jobs: L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -1978,7 +1978,7 @@ jobs: model.language_model.pretrained_model_name=gpt2 \ model.dataset.version_2_with_negative=true \ trainer.precision=16 \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ exp_manager=null - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" @@ -1987,7 +1987,7 @@ jobs: # L2: Intent and Slot Classification Tasks L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -2007,7 +2007,7 @@ jobs: model.data_dir=/home/TestData/nlp/retail \ model.validation_ds.prefix=dev \ model.test_ds.prefix=dev \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=true \ exp_manager.exp_dir=checkpoints @@ -2017,7 +2017,7 @@ jobs: L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -2037,7 +2037,7 @@ jobs: model.data_dir=/home/TestData/nlp/new_multiatis \ model.validation_ds.prefix=dev \ model.test_ds.prefix=dev \ - trainer.devices=[0] \ + trainer.devices=1 \ +trainer.fast_dev_run=true \ exp_manager.exp_dir=checkpoints2 rm -rf checkpoints2 @@ -2153,7 +2153,7 @@ jobs: # L2: Parallel NLP Examples 2 L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -2174,7 +2174,7 @@ jobs: model.dataset.data_dir=/home/TestData/nlp/ner/ \ model.train_ds.batch_size=2 \ model.dataset.use_cache=false \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=true \ model.dataset.class_balancing="weighted_loss" \ @@ -2184,7 +2184,7 @@ jobs: L2_Parallel_NLP_Examples2_Punctuation_and_capitalization_finetuning_from_pretrained_test: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -2210,7 +2210,7 @@ jobs: +model.train_ds.use_cache=false \ +model.validation_ds.use_cache=false \ +model.test_ds.use_cache=false \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=true \ exp_manager.exp_dir=null && \ @@ -2220,7 +2220,7 @@ jobs: L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -2238,7 +2238,7 @@ jobs: cd examples/nlp/token_classification && \ python token_classification_train.py \ model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=true \ model.dataset.use_cache=false \ @@ -2586,7 +2586,7 @@ jobs: # L2: 
Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -2604,7 +2604,7 @@ jobs: cd examples/nlp/language_modeling && \ python bert_pretraining.py \ --config-name=bert_pretraining_from_text_config.yaml \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ trainer.precision=16 \ +trainer.fast_dev_run=true \ @@ -2628,7 +2628,7 @@ jobs: L2_Pretraining_BERT_from_Preprocessed: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -2646,7 +2646,7 @@ jobs: cd examples/nlp/language_modeling && \ python bert_pretraining.py \ --config-name=bert_pretraining_from_preprocessed_config.yaml \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ trainer.precision=16 \ +trainer.fast_dev_run=false \ @@ -2704,7 +2704,7 @@ jobs: # L2: NMT Attention is All You Need Training L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -2738,7 +2738,7 @@ jobs: model.decoder.hidden_size=64 \ model.decoder.inner_size=256 \ +model.optim.capturable=True \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.val_check_interval=2 \ +trainer.limit_val_batches=1 \ @@ -2766,7 +2766,7 @@ jobs: model.decoder.hidden_size=64 \ model.decoder.inner_size=256 \ +model.optim.capturable=True \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.val_check_interval=10 \ +trainer.limit_val_batches=1 \ @@ -2782,7 +2782,7 @@ jobs: L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -2812,7 +2812,7 @@ jobs: model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ model.encoder.pre_ln=true \ model.decoder.pre_ln=true \ - trainer.devices=[1] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=true \ +trainer.limit_test_batches=2 \ @@ -2822,7 +2822,7 @@ jobs: L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -2850,7 +2850,7 @@ jobs: model.test_ds.tgt_file_name=[/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref,/home/TestData/nlp/nmt/toy_data/wmt14-en-de.ref] \ model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/spm_4k_ende.model \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=true \ +trainer.limit_test_batches=2 \ @@ -2889,7 +2889,7 @@ jobs: # L2: NMT Attention is All You Need Finetuning L2_NMT_Attention_is_All_You_Need_Finetuning: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} 
options: @@ -2907,7 +2907,7 @@ jobs: cd examples/nlp/machine_translation && \ python enc_dec_nmt_finetune.py \ model_path=/home/TestData/nlp/nmt/toy_data/enes_v16k_s100k_6x6.nemo \ - trainer.devices=[0] \ + trainer.devices=1 \ ~trainer.max_epochs \ model.train_ds.src_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ model.train_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.ref \ @@ -2932,7 +2932,7 @@ jobs: # L2: NMT Tarred Dataset Creation L2_NMT_Tarred_Dataset_Creation_Auto_Tarred_Dataset_Creation: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -2964,7 +2964,7 @@ jobs: model.encoder_tokenizer.vocab_size=2000 \ model.decoder_tokenizer.vocab_size=2000 \ ~model.test_ds \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.fast_dev_run=true \ exp_manager=null \ @@ -3257,7 +3257,7 @@ jobs: # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt13-en-de.ref \ # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=[1] \ + # trainer.devices=1 \ # trainer.accelerator="gpu" \ # +trainer.fast_dev_run=true \ # +trainer.limit_test_batches=2 \ @@ -3303,7 +3303,7 @@ jobs: # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=[0] \ + # trainer.devices=1 \ # trainer.accelerator="gpu" \ # +trainer.fast_dev_run=true \ # +trainer.limit_test_batches=2 \ @@ -3338,7 +3338,7 @@ jobs: # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=[1] \ + # trainer.devices=1 \ # trainer.accelerator="gpu" \ # +trainer.fast_dev_run=true \ # +trainer.limit_test_batches=2 \ @@ -3384,7 +3384,7 @@ jobs: # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=[0] \ + # trainer.devices=1 \ # trainer.accelerator="gpu" \ # +trainer.fast_dev_run=true \ # +trainer.limit_test_batches=2 \ @@ -3419,7 +3419,7 @@ jobs: # model.test_ds.tgt_file_name=/home/TestData/nlp/nmt/toy_data/wmt14-de-en.src \ # model.encoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ # model.decoder_tokenizer.tokenizer_model=/home/TestData/nlp/nmt/toy_data/tt_tokenizer.BPE.4096.model \ - # trainer.devices=[1] \ + # trainer.devices=1 \ # trainer.accelerator="gpu" \ # +trainer.fast_dev_run=true \ # +trainer.limit_test_batches=2 \ @@ -4607,7 +4607,7 @@ jobs: L2_Megatron_GPT_Finetuning_StarCoder_PP1: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -5671,7 +5671,7 @@ jobs: # 
TODO(Oktai15): update it in 1.8.0 version L2_Megatron_T5_GLUE_RTE: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -5715,7 +5715,7 @@ jobs: L2_Megatron_T5_GLUE_XNLI: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -5886,7 +5886,7 @@ jobs: # L2: TTS Fast dev runs 1 L2_TTS_Fast_dev_runs_1_Tacotron_2: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-gpus-1 container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: @@ -5904,7 +5904,7 @@ jobs: python examples/tts/tacotron2.py \ train_dataset=/home/TestData/an4_dataset/an4_train.json \ validation_datasets=/home/TestData/an4_dataset/an4_val.json \ - trainer.devices=[0] \ + trainer.devices=1 \ trainer.accelerator="gpu" \ +trainer.limit_train_batches=1 +trainer.limit_val_batches=1 trainer.max_epochs=1 \ trainer.strategy=auto \ From 7c2f985a167c64458e743bb71989c868161d6281 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 27 Apr 2024 13:03:20 -0700 Subject: [PATCH 21/30] [Nemo CICD] checkout v4 (#9048) * add tag/label for 1-gpu runner * checkout v4 * checkout v4 --- .github/workflows/cicd-main.yml | 240 ++++++++++++++++---------------- 1 file changed, 121 insertions(+), 119 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 5779a860fbbb..fa20d86f387d 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -17,6 +17,8 @@ on: pull_request: branches: [ "main" ] types: [ labeled ] + push: + branches: [ "pagaray/nemo_cicd_part22" ] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -49,7 +51,7 @@ jobs: # - ${{ github.workspace }}:/workspace # steps: # - name: Checkout repository -# uses: actions/checkout@v2 +# uses: actions/checkout@v4 # with: # path: ${{ github.run_id }} @@ -70,7 +72,7 @@ jobs: # --env HYDRA_FULL_ERROR=1 steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: path: ${{ github.run_id }} @@ -190,7 +192,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: "L0: Unit Tests GPU" run: | NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads @@ -213,7 +215,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: "L0: Unit Tests CPU" run: | CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat @@ -240,7 +242,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \ --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf \ @@ -265,7 +267,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python scripts/checkpoint_converters/convert_starcoder_hf_to_nemo.py \ --input_name_or_path /home/TestData/nlp/megatron_gpt/starcoder-ci-hf \ @@ -289,7 +291,7 @@ jobs: --volume 
/mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python scripts/checkpoint_converters/convert_falcon_hf_to_nemo.py \ --input_name_or_path /home/TestData/nlp/megatron_gpt/falcon-ci-hf \ @@ -313,7 +315,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py \ --input_name_or_path=/home/TestData/nlp/megatron_gpt/Baichuan2-7B-Base \ @@ -338,7 +340,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/asr_ctc/speech_to_text_ctc.py \ model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ @@ -366,7 +368,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ --config-path="../conf/citrinet/" --config-name="config_bpe" \ @@ -397,7 +399,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/speech_pretraining/speech_pre_training.py \ --config-path="../conf/ssl/citrinet/" --config-name="citrinet_ssl_ci" \ @@ -426,7 +428,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/speech_to_text_finetune.py \ --config-path="conf/asr_finetune" --config-name="speech_to_text_finetune" \ @@ -457,7 +459,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/speech_to_text_finetune.py \ --config-path="conf/asr_finetune" --config-name="speech_to_text_hf_finetune" \ @@ -504,7 +506,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ --config-path="../conf/conformer" --config-name="conformer_ctc_bpe" \ @@ -538,7 +540,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/asr_ctc/speech_to_text_ctc_bpe.py \ --config-path="../conf/squeezeformer" --config-name="squeezeformer_ctc_bpe" \ @@ -572,7 +574,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/asr_ctc/speech_to_text_ctc.py \ model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ @@ -601,7 +603,7 @@ jobs: # --volume /mnt/datadrive/TestData:/home/TestData # steps: # - name: Checkout repository - # uses: actions/checkout@v2 + # uses: actions/checkout@v4 # - run: | # python examples/asr/speech_multitask/speech_to_text_aed.py \ # model.prompt_format=canary \ @@ -651,7 +653,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/speaker_tasks/recognition/speaker_reco.py \ model.train_ds.batch_size=10 
\ @@ -683,7 +685,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder.py \ model.diarizer.speaker_embeddings.model_path=titanet_large \ @@ -716,7 +718,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/speech_classification/speech_to_label.py \ model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ @@ -752,7 +754,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py \ diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ @@ -783,7 +785,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_infer.py \ diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ @@ -814,7 +816,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/speaker_tasks/diarization/neural_diarizer/multiscale_diar_decoder_infer.py \ diarizer.manifest_filepath=/home/TestData/an4_diarizer/an4_manifest.json \ @@ -841,7 +843,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python tools/speech_data_simulator/multispeaker_simulator.py \ --config-path=conf --config-name=data_simulator.yaml \ @@ -871,7 +873,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/asr_ctc/speech_to_text_ctc.py \ model.train_ds.manifest_filepath=/home/TestData/an4_dataset/an4_train.json \ @@ -901,7 +903,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/speech_classification/speech_to_label.py \ model.train_ds.manifest_filepath=/home/TestData/speech_commands/train_manifest.json \ @@ -940,7 +942,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/asr_adapters/train_asr_adapter.py \ model.pretrained_model="stt_en_conformer_ctc_small" \ @@ -972,7 +974,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/asr_adapters/train_asr_adapter.py \ model.pretrained_model="stt_en_conformer_ctc_small" \ @@ -1007,7 +1009,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/asr/transcribe_speech.py \ pretrained_name="QuartzNet15x5Base-En" \ @@ -1034,7 +1036,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + 
uses: actions/checkout@v4 - run: | pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" @@ -1056,7 +1058,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd tools/ctc_segmentation && \ TIME=`date +"%Y-%m-%d-%T"` && \ @@ -1088,7 +1090,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd tools/ctc_segmentation && \ TIME=`date +"%Y-%m-%d-%T"` && \ @@ -1122,7 +1124,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/tts/g2p && \ TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \ @@ -1188,7 +1190,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/tts/g2p && \ TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \ @@ -1230,7 +1232,7 @@ jobs: # --volume /mnt/datadrive/TestData:/home/TestData # steps: # - name: Checkout repository - # uses: actions/checkout@v2 + # uses: actions/checkout@v4 # - run: | # cd examples/nlp/dialogue && \ # python dialogue.py \ @@ -1271,7 +1273,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/dialogue && \ python dialogue.py \ @@ -1310,7 +1312,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/dialogue && \ python dialogue.py \ @@ -1348,7 +1350,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/dialogue && \ python dialogue.py \ @@ -1389,7 +1391,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/dialogue && \ python dialogue.py \ @@ -1431,7 +1433,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/dialogue && \ python dialogue.py \ @@ -1466,7 +1468,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/dialogue && \ python dialogue.py \ @@ -1501,7 +1503,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/dialogue && \ python dialogue.py \ @@ -1542,7 +1544,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/dialogue && \ python dialogue.py \ @@ -1620,7 +1622,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/dialogue && \ python dialogue.py \ @@ -1656,7 +1658,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData 
steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/duplex_text_normalization && \ python duplex_text_normalization_train.py \ @@ -1726,7 +1728,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/text_classification && \ python text_classification_with_bert.py \ @@ -1760,7 +1762,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | # Cannot do fast_dev_run because squad needs whole dev dataset cd examples/nlp/question_answering && \ @@ -1801,7 +1803,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | # Cannot do fast_dev_run because squad needs whole dev dataset cd examples/nlp/question_answering && \ @@ -1840,7 +1842,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/question_answering && \ python question_answering.py \ @@ -1881,7 +1883,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/question_answering && \ python question_answering.py \ @@ -1920,7 +1922,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/question_answering && \ python question_answering.py \ @@ -1961,7 +1963,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/question_answering && \ python question_answering.py \ @@ -2000,7 +2002,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/intent_slot_classification && \ python intent_slot_classification.py \ @@ -2030,7 +2032,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/intent_slot_classification && \ python multi_label_intent_slot_classification.py \ @@ -2166,7 +2168,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/token_classification && \ python token_classification_train.py \ @@ -2197,7 +2199,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/token_classification && \ data_dir="$(mktemp -d -p "$(pwd)")" && \ @@ -2233,7 +2235,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/token_classification && \ python token_classification_train.py \ @@ -2262,7 +2264,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python 
examples/nlp/token_classification/token_classification_evaluate.py \ model.dataset.data_dir=/home/TestData/nlp/ner/ \ @@ -2286,7 +2288,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | data_dir="$(mktemp -d -p "$(pwd)")" && \ cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}"/ && \ @@ -2317,7 +2319,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/token_classification && \ output_dir="$(mktemp -d -p "$(pwd)")" && \ @@ -2377,7 +2379,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | data_dir="$(mktemp -d -p "$(pwd)")" && \ cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ @@ -2442,7 +2444,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/token_classification && \ work_dir="$(mktemp -d -p "$(pwd)")" && \ @@ -2505,7 +2507,7 @@ jobs: # --volume /mnt/datadrive/TestData:/home/TestData # steps: # - name: Checkout repository - # uses: actions/checkout@v2 + # uses: actions/checkout@v4 # - run: | # cd examples/nlp/token_classification && \ # work_dir="$(mktemp -d -p "$(pwd)")" && \ @@ -2568,7 +2570,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | output_dir="$(mktemp -d -p "$(pwd)")" && \ python examples/nlp/token_classification/punctuate_capitalize_infer.py \ @@ -2599,7 +2601,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/language_modeling && \ python bert_pretraining.py \ @@ -2641,7 +2643,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/language_modeling && \ python bert_pretraining.py \ @@ -2683,7 +2685,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/entity_linking && \ python self_alignment_pretraining.py \ @@ -2717,7 +2719,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/machine_translation/enc_dec_nmt.py \ --config-path=conf \ @@ -2795,7 +2797,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/machine_translation && \ python enc_dec_nmt.py \ @@ -2835,7 +2837,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/machine_translation && \ python enc_dec_nmt.py \ @@ -2874,7 +2876,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/machine_translation && \ python nmt_transformer_infer.py \ @@ -2902,7 +2904,7 @@ jobs: --volume 
/mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/machine_translation && \ python enc_dec_nmt_finetune.py \ @@ -2945,7 +2947,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/machine_translation && \ python enc_dec_nmt.py \ @@ -2986,7 +2988,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | cd examples/nlp/machine_translation && \ python create_tarred_parallel_dataset.py \ @@ -3017,7 +3019,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/machine_translation/megatron_nmt_training.py \ trainer.devices=2 \ @@ -3126,7 +3128,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_bart_pretraining.py \ trainer.devices=2 \ @@ -3444,7 +3446,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ @@ -3525,7 +3527,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ @@ -3607,7 +3609,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ @@ -3691,7 +3693,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_retro_pretraining.py \ trainer.num_nodes=1 \ @@ -3760,7 +3762,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ trainer.devices=2 \ @@ -3843,7 +3845,7 @@ jobs: # --volume /mnt/datadrive/TestData:/home/TestData # steps: # - name: Checkout repository - # uses: actions/checkout@v2 + # uses: actions/checkout@v4 # - run: | # python examples/nlp/language_modeling/megatron_retro_mutransfer_pretrain.py \ # trainer.devices=2 \ @@ -3937,7 +3939,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/token_classification/token_classification_train.py \ exp_manager.exp_dir=examples/nlp/language_modeling/token_classification_results \ @@ -3964,7 +3966,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.devices=2 \ @@ -4057,7 +4059,7 @@ jobs: --volume 
/mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.devices=2 \ @@ -4247,7 +4249,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.devices=2 \ @@ -4343,7 +4345,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.devices=2 \ @@ -4439,7 +4441,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.devices=2 \ @@ -4539,7 +4541,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ trainer.devices=2 \ @@ -4620,7 +4622,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/tuning/megatron_gpt_sft.py \ trainer.devices=1 \ @@ -4664,7 +4666,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | rm -rf examples/nlp/language_modeling/gpt_peft_lora_results_pp2 @@ -4710,7 +4712,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | rm -rf /home/TestData/nlp/lora_tuning_tp2 @@ -4772,7 +4774,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_gpt_eval.py \ gpt_model_file=/home/TestData/nlp/megatron_gpt/125M/megatron_gpt.nemo \ @@ -4798,7 +4800,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_gpt_eval.py \ gpt_model_file=/home/TestData/nlp/megatron_gpt/PP2/gpt_pp2_tp1.nemo \ @@ -4826,7 +4828,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/tuning/megatron_gpt_generate.py \ model.restore_from_path=/home/TestData/nlp/megatron_gpt_sft/megatron_gpt_rope_sft.nemo \ @@ -4889,7 +4891,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_change_num_partitions.py \ --model_file /home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ @@ -4918,7 +4920,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_change_num_partitions.py \ --model_file 
/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ @@ -4947,7 +4949,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_t5_pretraining.py \ trainer.devices=2 \ @@ -5054,7 +5056,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_t5_pretraining.py \ trainer.devices=2 \ @@ -5161,7 +5163,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_t5_pretraining.py \ trainer.devices=2 \ @@ -5268,7 +5270,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_t5_pretraining.py \ trainer.devices=2 \ @@ -5349,7 +5351,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_t5_pretraining.py \ trainer.devices=2 \ @@ -5403,7 +5405,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_t5_pretraining.py -cn megatron_ul2_config \ trainer.devices=2 \ @@ -5494,7 +5496,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_t5_eval.py \ --model_file /home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ @@ -5518,7 +5520,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_bart_pretraining.py \ trainer.devices=2 \ @@ -5598,7 +5600,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_bart_pretraining.py \ trainer.devices=2 \ @@ -5684,7 +5686,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ trainer.devices=1 \ @@ -5728,7 +5730,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ -cn megatron_t5_config_finetune_glue_xnli \ @@ -5777,7 +5779,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | rm -rf /home/TestData/nlp/t5_lora_tuning_tp2 @@ -5842,7 +5844,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ trainer.max_steps=10 \ 
@@ -5869,7 +5871,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/nlp/language_modeling/megatron_t5_pretraining.py \ trainer.max_steps=10 \ @@ -5899,7 +5901,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/tts/tacotron2.py \ train_dataset=/home/TestData/an4_dataset/an4_train.json \ @@ -5937,7 +5939,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/tts/waveglow.py \ train_dataset=/home/TestData/an4_dataset/an4_train.json \ @@ -5971,7 +5973,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/tts/fastpitch.py \ --config-name fastpitch_align_v1.05 \ @@ -6015,7 +6017,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/tts/radtts.py \ train_dataset=/home/TestData/an4_dataset/an4_train.json \ @@ -6056,7 +6058,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/tts/mixer_tts.py \ train_dataset=/home/TestData/an4_dataset/an4_train.json \ @@ -6094,7 +6096,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | python examples/tts/hifigan.py \ train_dataset=/home/TestData/an4_dataset/an4_train.json \ @@ -6130,7 +6132,7 @@ jobs: # --volume /mnt/datadrive/TestData:/home/TestData # steps: # - name: Checkout repository - # uses: actions/checkout@v2 + # uses: actions/checkout@v4 # - run: | # python examples/multimodal/text_to_image/nerf/main.py \ # trainer.num_nodes=1 \ @@ -6158,7 +6160,7 @@ jobs: --volume /mnt/datadrive/TestData:/home/TestData steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - run: | CUDA_VISIBLE_DEVICES=0 python examples/asr/speech_to_text_eval.py \ pretrained_name=QuartzNet15x5Base-En \ From b401fdec55da8ff49b14fc7920eedfe6ac52daa4 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 27 Apr 2024 14:22:46 -0700 Subject: [PATCH 22/30] Remove temp test change (#9049) --- .github/workflows/cicd-main.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index fa20d86f387d..e6e8fb808943 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -17,8 +17,6 @@ on: pull_request: branches: [ "main" ] types: [ labeled ] - push: - branches: [ "pagaray/nemo_cicd_part22" ] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} From 8c8c667405cc046c15e180682fbd47de3195b8af Mon Sep 17 00:00:00 2001 From: Adi Renduchintala Date: Mon, 29 Apr 2024 09:48:34 -0700 Subject: [PATCH 23/30] docs and simplification of cmd args (#8979) * docs and simplification of cmd args Signed-off-by: arendu * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * added cicd test Signed-off-by: arendu * added cicd test is needs Signed-off-by: arendu * Update 
information_retrieval.rst Signed-off-by: Adi Renduchintala * updated to fix wrong file paths Signed-off-by: arendu * update Signed-off-by: arendu * Update cicd-main.yml Signed-off-by: Adi Renduchintala --------- Signed-off-by: arendu Signed-off-by: Adi Renduchintala Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay --- .github/workflows/cicd-main.yml | 55 +++++++++ docs/source/nlp/information_retrieval.rst | 104 ++++++++++++++++++ ...megatron_gpt_embedder_generate_config.yaml | 14 ++- .../megatron_gpt_embedder_tuning_config.yaml | 37 ++++--- .../megatron_gpt_embedding_model.py | 9 +- .../megatron_gpt_sft_model.py | 3 +- 6 files changed, 200 insertions(+), 22 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index e6e8fb808943..a13284521b3c 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4648,6 +4648,60 @@ jobs: rm -rf examples/nlp/language_modeling/gpt_sft_results - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" if: "failure()" + + L2_Megatron_GPT_Embedding: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v2 + - run: | + rm -rf /home/TestData/nlp/megatron_ir/working_dir + + python examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ + exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.max_epochs=null \ + trainer.max_steps=20 \ + trainer.val_check_interval=10 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.validation_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ + model.data.validation_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] \ + model.data.validation_ds.write_embeddings_to_file=True \ + model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] + + + python examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \ + model.peft.restore_from_path='/home/TestData/nlp/megatron_ir/working_dir/megatron_gpt_peft_lora_tuning/checkpoints/megatron_gpt_peft_lora_tuning.nemo' \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + model.peft.lora_tuning.adapter_dim=8 \ + model.data.test_ds.write_embeddings_to_file=True \ + model.data.test_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/test_embs' \ + model.data.test_ds.query_file_names=[/home/TestData/nlp/megatron_ir/test_query.jsonl] \ + model.data.test_ds.doc_file_names=[/home/TestData/nlp/megatron_ir/test_doc.jsonl] + + rm -rf /home/TestData/nlp/megatron_ir/working_dir + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_GPT_PEFT_Lora_PP2: needs: [cicd-test-container-setup] @@ -6256,6 +6310,7 @@ jobs: - 
L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2 - L2_Megatron_GPT_Finetuning_PP2 - L2_Megatron_GPT_Finetuning_StarCoder_PP1 + - L2_Megatron_GPT_Embedding - L2_Megatron_GPT_PEFT_Lora_PP2 - L2_Megatron_GPT_PEFT_Lora_TP2 - L2_Megatron_GPT_Eval diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst index a283c845b11d..fa9157e45b59 100644 --- a/docs/source/nlp/information_retrieval.rst +++ b/docs/source/nlp/information_retrieval.rst @@ -102,3 +102,107 @@ Then you can fine-tune the sentence-BERT model using the following script: exp_manager.wandb_logger_kwargs.name=${NAME} \ exp_manager.wandb_logger_kwargs.project=${PROJECT} +GPT Embedding Models +===================== + +Recent work has also shown that it is possible to use Decoder-Only (GPT Style) models to train embedding models. +`Improving Text Embeddings with +Large Language Models `__ is one such recent papers which served as inspiration to implement Decoder-only embedding training in Nemo. + +Training a GPT Embedding Model +------------------------------- + +To train GPT Embedding models we follow a format very similar to the SBERT Embedding training. However, there are a couple of differences. GPT Embedding model training expects a `jsonl` file in which each line is a json object. Here is a truncated example of data jsonl file:: + +{"query": "What did ... 1952-2002 period?", "pos_doc": "Morning (2008) ... has changed little.", "neg_doc": "Even though ... sapiens.", "query_id": "q103151", "doc_id": "d14755"} +{"query": "What type of ... passions?", "pos_doc": "Burke was a leading ... upper classes.", "neg_doc": "Writing to a friend ... Government.", "query_id": "q77959", "doc_id": "d11263"} +{"query": "Since 1999, ... progressed at?", "pos_doc": "Commercial solar water ... as of 2007.", "neg_doc": "The potential solar ... acquire.", "query_id": "q16545", "doc_id": "d1883"} + + +As visible the json object should contain the following fields ``query``, ``pos_doc``, ``neg_doc``, ``query_id`` and ``doc_id``. The ``query_id`` and ``doc_id`` can be any alphanumeric string that uniquely maps to the ``query`` string and ``pos_doc`` string. + +During training, the GPT Embedding model employs LoRA (by default) to learn embeddings for the queries and documents, such that similarity of the ``query``-to-``pos_doc`` are maximized while simultaneously minimizing ``query``-to-``neg_doc`` similarity. LoRA allows us to fine-tune large LLMs such as Mistral 7B model with a relatively small number of training parameters. + +An example command to launch a training job is + +.. code-block:: console + + python3 /NeMo/examples/nlp/information_retrieval/megatron_gpt_embedding_finetuning.py \ + exp_manager.exp_dir="PATH_TO_SAVE_LORA_WEIGHTS" \ + model.global_batch_size=4 \ # exact choice for global batch size is data dependent typical values are in the range of 32 to 128. + model.micro_batch_size=4 \ # exact choice for micro batch size is GPU memory dependent 2 to 8 are reasonable values. + trainer.devices=1 \ # indicates how many GPUs to use during training per node. + trainer.num_nodes=1 \ # indicates how many nodes to use if multi-node cluster is available + trainer.max_steps=20 \ # how many training steps to run. + model.restore_from_path="PATH_TO_BASE_NEMO_MODEL" \ + model.peft.lora_tuning.adapter_dim=16 \ # the low-rank size for lora weights. 
+ model.data.train_ds.file_names=["train.jsonl"] + +The full list of possible run arguments is configurable in ``/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml``. By default a trained model file should be generated in here ``PATH_TO_SAVE_LORA_WEIGHTS/megatron_gpt_peft_lora_tuning/checkpoints/`` typically with the extension ``.nemo``. + + +Inference using a GPT Embedding Model +------------------------------------- + +Once trained, the GPT Embedding Model can be used to generate embeddings for queries and corpus documents. We can launch inference using the following command: + +.. code-block:: console + + python3 /NeMo/examples/nlp/information_retrieval/megatron_gpt_embedding_generate.py \ + model.global_batch_size=4 \ + model.micro_batch_size=4 \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + model.restore_from_path="PATH_TO_BASE_NEMO_MODEL" \ # Same base model used at training time. + model.peft.restore_from_path="PATH_TO_SAVE_LORA_WEIGHTS/megatron_gpt_peft_lora_tuning/checkpoints//megatron_gpt_peft_lora_tuning.nemo" \ + model.data.test_ds.query_file_names=["test_query.jsonl"] \ + model.data.test_ds.doc_file_names=\["test_docs.jsonl"] \ + model.data.test_ds.write_embeddings_to_file=True \ + model.data.test_ds.output_file_path_prefix="PATH_TO_SAVE_EMEBDDINGS" + +The contents of ``test_queries.jsonl`` is expected to be in the following format:: + +{"query": "What do ... quantities?","query_id": "q11600", "doc_id": "d1172"} +{"query": "What are ... subsectors?", "query_id": "q5831", "doc_id": "d577"} +{"query": "Which article ... Government?", "query_id": "q3037", "doc_id": "d336"} + +Here, the ``doc_id`` field is expected to be the id of the document/passage which is the correct passage for the query. Note that since we are in inference mode, we don't require query-doc pairs. + +The contents of ``test_docs.jsonl`` is expected to be in the following format:: + +{"pos_doc": "Hormones ... vitamin D.", "doc_id": "d823"} +{"pos_doc": "Historically, Victoria ... October 2016.", "doc_id": "d159"} +{"pos_doc": "Exceptional examples ... Warsaw.", "doc_id": "d1084"} + +Once again, we show 3 examples form each file. Typically the ``test_docs.jsonl`` will contain more items than queries in the ``test_queries.jsonl``. + +The inference command will result in two folders + +* ``PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_queries`` +* ``PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_docs`` + +The ``X`` in the folder ``consumed_samplesX`` is a number denoted number of batches consumed, this is not crucial at test time, but it is useful in training which we will see in the next section. First, let's take a look at the ``test_queries``. + +.. code-block:: console + + $> ls PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_queries + query.ids query.npy + $>head -n3 PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_queries/query.ids + q11600 + q5831 + q3037 + +``query.npy`` is a numpy pickled array containing rows of query embeddings and the ``query.ids`` text file list the id of each embedding in the same order. + +Similarly let's look into the ``test_docs`` folder + +.. code-block:: console + + $> ls PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_doc/ + doc.ids doc.npy + $> head -n3 PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_doc/doc.ids + d823 + d159 + d1084 + +We can see that ``test_doc`` has a similar structure to ``test_queries`` but with ids and embeddings of the documents from the ``test_docs.josnl`` file. 
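A minimal, illustrative sketch of how these saved artifacts might be consumed downstream — this is not part of the patch; it assumes only ``numpy`` plus the ``*.npy``/``*.ids`` layout shown above, and the paths are the same placeholders used earlier (adjust them to the actual ``consumed_samplesX`` directory):

.. code-block:: python

    # Load the .npy/.ids pairs written at inference time and rank documents per query.
    import numpy as np

    def load(prefix):
        with open(prefix + ".ids") as f:
            ids = [line.strip() for line in f]
        return ids, np.load(prefix + ".npy")

    query_ids, q = load("PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_queries/query")
    doc_ids, d = load("PATH_TO_SAVE_EMBEDDINGS/consumed_samplesX/test_doc/doc")

    # Cosine similarity: L2-normalize, then one matrix product gives a [num_queries, num_docs] score matrix.
    q = q / np.linalg.norm(q, axis=1, keepdims=True)
    d = d / np.linalg.norm(d, axis=1, keepdims=True)
    scores = q @ d.T

    # For each query, list the document ids from most to least similar.
    for qi, qid in enumerate(query_ids):
        ranked = [doc_ids[j] for j in np.argsort(-scores[qi])]
        print(qid, ranked[:3])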
With this setup it is possible to evaluate the performance using metrics like MRR or NDCG. diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml index 778dc937efdc..1a81d21dd9a8 100644 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml +++ b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_generate_config.yaml @@ -4,7 +4,7 @@ trainer: devices: 1 accelerator: gpu num_nodes: 1 - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -66,8 +66,14 @@ model: hidden_dropout: 0.0 attention_dropout: 0.0 ffn_dropout: 0.0 - temperature: 0.8 + temperature: 0.02 num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only + use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only + post_process: False # should be False. + transformer_engine: True # required to be True for newer versions of Megatron-LM based models + mcore_gpt: True # required to be True for newer versions of Megatron-LM based models + use_flash_attention: True + precision: bf16 peft: peft_scheme: "lora" # can be either adapter,ia3, or ptuning @@ -119,8 +125,8 @@ model: query_file_names: ??? # Path to a list of JSONL files corresponding to the query data. Data format is identical to validation_ds. doc_file_names: ??? # Path to a list of JSONL files corresponding to the doc data. Data format is identical to validation_ds. names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. - global_batch_size: 1 - micro_batch_size: 1 + global_batch_size: ${global_batch_size} + micro_batch_size: ${micro_batch_size} shuffle: False num_workers: 0 pin_memory: True diff --git a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml index efd5271884ed..315bffd8a1ff 100644 --- a/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml +++ b/examples/nlp/information_retrieval/conf/megatron_gpt_embedder_tuning_config.yaml @@ -4,15 +4,16 @@ trainer: devices: 1 accelerator: gpu num_nodes: 1 - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False - max_epochs: 9999 + max_epochs: null max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 # frequency with which training steps are logged - val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch - gradient_clip_val: 1.0 + val_check_interval: ${trainer.max_steps} # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch + gradient_clip_val: null + num_sanity_val_steps: 0 exp_manager: explicit_log_dir: null @@ -34,7 +35,7 @@ exp_manager: model_parallel_size: ${model.tensor_model_parallel_size} always_save_nemo: False save_best_model: True - create_early_stopping_callback: True + create_early_stopping_callback: False early_stopping_callback_params: monitor: "val_loss" mode: "min" @@ -54,7 +55,7 @@ model: resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. sync_batch_comm: False - megatron_amp_O2: False + megatron_amp_O2: True ## Sequence Parallelism # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially @@ -62,8 +63,8 @@ model: sequence_parallel: False ## Activation Checkpoint - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + activations_checkpoint_granularity: selective # 'selective' or 'full' + activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' # 'uniform' divides the total number of transformer layers and checkpoints the input activation # of each chunk at the specified granularity # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity @@ -74,9 +75,14 @@ model: hidden_dropout: 0.0 attention_dropout: 0.0 ffn_dropout: 0.0 - temperature: 0.8 + temperature: 0.02 num_soft_negatives: 0 # Number of soft negatives to use for contrastive loss,it should be max(batch_size - 1), 0 means use hard negatives only use_all_possible_negatives: False # If True, use all possible negatives for contrastive loss, otherwise use num_soft_negatives, if num_soft_negatives is 0, use hard negatives only + post_process: False # should be False. + transformer_engine: True # required to be True for newer versions of Megatron-LM based models + mcore_gpt: True # required to be True for newer versions of Megatron-LM based models + use_flash_attention: True + precision: bf16 peft: peft_scheme: "lora" # can be either adapter,ia3, or ptuning @@ -135,7 +141,7 @@ model: num_workers: 0 memmap_workers: 2 pin_memory: True - max_seq_length: 2048 + max_seq_length: 512 # Even if the base model can handle longer sequences, 512 is generally a good choice for training efficiency. min_seq_length: 1 drop_last: True # Example of how to specify concat_sampling_probabilities @@ -143,15 +149,16 @@ model: # - 0.5 # - 0.25 # - 0.25 - concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + concat_sampling_probabilities: + - 1.0 label_key: 'output' add_eos: True add_bos: False index_mapping_dir: null # Path to a directory to write index mapping files. truncation_method: 'right' # Truncation from which position, Options: ['left', 'right'] validation_ds: - query_file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - doc_file_names: ??? # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + query_file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. 
+ doc_file_names: null # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. names: ["queries", "doc"] # Names of the corresponding datasets used to log metrics. global_batch_size: ${model.global_batch_size} micro_batch_size: ${model.micro_batch_size} @@ -159,7 +166,7 @@ model: num_workers: 0 memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True - max_seq_length: 2048 + max_seq_length: ${model.data.train_ds.max_seq_length} min_seq_length: 1 drop_last: False label_key: ${model.data.train_ds.label_key} @@ -182,7 +189,7 @@ model: num_workers: 0 memmap_workers: ${model.data.train_ds.memmap_workers} pin_memory: True - max_seq_length: 2048 + max_seq_length: ${model.data.train_ds.max_seq_length} min_seq_length: 1 drop_last: False add_eos: ${model.data.train_ds.add_eos} diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py index 110e59494b52..4cdeba1d67e2 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_gpt_embedding_model.py @@ -123,8 +123,10 @@ def _build_dataset(self, data_cfg, is_train=True): _, _, num_train_samples_per_dataset = get_datasets_weights_and_num_samples(data_prefix, num_train_samples) num_train_samples_after_blend = sum([x[0] for x in num_train_samples_per_dataset]) else: - num_query_samples_per_dataset = [[None]] * len(data_cfg.query_file_names) - num_doc_samples_per_dataset = [[None]] * len(data_cfg.doc_file_names) + num_query_files = len(data_cfg.query_file_names) if data_cfg.query_file_names is not None else 0 + num_doc_files = len(data_cfg.doc_file_names) if data_cfg.doc_file_names is not None else 0 + num_query_samples_per_dataset = [[None]] * num_query_files + num_doc_samples_per_dataset = [[None]] * num_doc_files # Check dataset max_seq_legnth and max_position_embeddings size if ( @@ -174,6 +176,9 @@ def _build_dataset(self, data_cfg, is_train=True): ) return dataset else: + if data_cfg.query_file_names is None or data_cfg.doc_file_names is None: + return [] + query_dataset = GPTEmbeddingDataset( file_path=data_cfg.query_file_names[0], tokenizer=self.tokenizer, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py index 448f912c44d6..892a87189880 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_sft_model.py @@ -804,7 +804,8 @@ def build_train_valid_test_datasets(self, stage): logging.info('Building GPT SFT validation datasets.') # Wrap this in a list since the general finetuning parent class supports multi-validation. 
self._validation_ds = self._build_dataset(self.cfg.data.validation_ds, is_train=False) - logging.info(f'Length of val dataset: {len(self._validation_ds[0])}') + if self._validation_ds: + logging.info(f'Length of val dataset: {len(self._validation_ds[0])}') if stage != 'validate': self.maybe_build_test() From 43afd943507ec583072271c607341ea63c574496 Mon Sep 17 00:00:00 2001 From: Ming <111467530+Victor49152@users.noreply.github.com> Date: Mon, 29 Apr 2024 16:51:31 -0700 Subject: [PATCH 24/30] remove in-place addition for dreambooth train with text encoder (#8825) * remove in-place addition for dreambooth train with text encoder Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Signed-off-by: Ming <111467530+Victor49152@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../multimodal/models/text_to_image/dreambooth/dreambooth.py | 3 ++- .../multimodal/modules/stable_diffusion/encoders/modules.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py index 317cdf5d6364..0b830ac7319b 100644 --- a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py +++ b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py @@ -20,7 +20,6 @@ from torch._inductor import config as inductor_config from nemo.collections.multimodal.data.dreambooth.dreambooth_dataset import DreamBoothDataset -from nemo.collections.multimodal.modules.stable_diffusion.attention import LinearWrapper from nemo.collections.multimodal.modules.stable_diffusion.distributions.distributions import ( DiagonalGaussianDistribution, ) @@ -647,6 +646,8 @@ def load_from_checkpoint( return checkpoint def _check_and_add_adapter(self, name, module, peft_name, peft_cfg, name_key_to_mcore_mixins=None): + from nemo.collections.multimodal.modules.stable_diffusion.attention import LinearWrapper + if isinstance(module, AdapterModuleMixin): if isinstance(module, LinearWrapper): peft_cfg.in_features, peft_cfg.out_features = module.in_features, module.out_features diff --git a/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py b/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py index 446b81ab11b6..bff579bbca4f 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py @@ -718,7 +718,7 @@ def forward(self, text): def encode_with_transformer(self, text): x = self.model.language_model.embedding.word_embeddings(text) - x += self.model.language_model.embedding.position_embeddings + x = x + self.model.language_model.embedding.position_embeddings x = x.permute(1, 0, 2) # NLD -> LND x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask) x = self.model.language_model.encoder.final_layernorm(x) From 3735b5c9a953021e7f4d3843009b8f4636057026 Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Tue, 30 Apr 2024 02:33:24 +0200 Subject: [PATCH 25/30] [NeMo-UX] Add checkpoint-io to MegatronStrategy (#9057) * Adding MegatronParallel * Move over _strategy_liMegatronCheckpointIO * Adding GPTModel & MockDataModule * Add nemo.io to MegatronStrategy * Move to cloudpickle * [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci * fix tests Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Chen Cui --- nemo/io/__init__.py | 14 +++ nemo/io/api.py | 62 +++++++++++ nemo/io/capture.py | 98 +++++++++++++++++ nemo/io/mixin.py | 139 ++++++++++++++++++++++++ nemo/io/pl.py | 49 ++++++++- nemo/lightning/__init__.py | 3 +- nemo/lightning/pytorch/strategies.py | 11 +- nemo/lightning/pytorch/trainer.py | 15 +++ nemo/llm/gpt/model/base.py | 5 +- requirements/requirements.txt | 1 + requirements/requirements_lightning.txt | 1 + tests/io/__init__.py | 0 tests/io/test_api.py | 18 +++ tests/io/test_mixin.py | 16 +++ 14 files changed, 422 insertions(+), 10 deletions(-) create mode 100644 nemo/io/api.py create mode 100644 nemo/io/capture.py create mode 100644 nemo/io/mixin.py create mode 100644 nemo/lightning/pytorch/trainer.py create mode 100644 tests/io/__init__.py create mode 100644 tests/io/test_api.py create mode 100644 tests/io/test_mixin.py diff --git a/nemo/io/__init__.py b/nemo/io/__init__.py index e69de29bb2d1..5b1d48768848 100644 --- a/nemo/io/__init__.py +++ b/nemo/io/__init__.py @@ -0,0 +1,14 @@ +from nemo.io.api import load, load_ckpt +from nemo.io.capture import reinit +from nemo.io.mixin import IOMixin +from nemo.io.pl import TrainerCheckpoint, is_distributed_ckpt + + +__all__ = [ + "IOMixin", + "is_distributed_ckpt", + "load", + "load_ckpt", + 'reinit', + "TrainerCheckpoint", +] diff --git a/nemo/io/api.py b/nemo/io/api.py new file mode 100644 index 000000000000..f7de36cb9545 --- /dev/null +++ b/nemo/io/api.py @@ -0,0 +1,62 @@ +import pickle +from pathlib import Path +from typing import Any, Type, TypeVar + +import fiddle as fdl + +from nemo.io.pl import TrainerCheckpoint + +CkptType = TypeVar("CkptType") + + +def load(path: Path, output_type: Type[CkptType] = Any) -> CkptType: + """ + Loads a configuration from a pickle file and constructs an object of the specified type. + + Args: + path (Path): The path to the pickle file or directory containing 'io.pkl'. + output_type (Type[CkptType]): The type of the object to be constructed from the loaded data. + + Returns + ------- + CkptType: An instance of the specified type constructed from the loaded configuration. + + Raises + ------ + FileNotFoundError: If the specified file does not exist. + + Example: + loaded_model = load("/path/to/model", output_type=MyModel) + """ + del output_type # Just for type-hint + + _path = Path(path) + if hasattr(_path, 'is_dir') and _path.is_dir(): + _path = Path(_path) / "io.pkl" + elif hasattr(_path, 'isdir') and _path.isdir: + _path = Path(_path) / "io.pkl" + + if not _path.is_file(): + raise FileNotFoundError(f"No such file: '{_path}'") + + with open(_path, "rb") as f: + config = pickle.load(f) + + return fdl.build(config) + + +def load_ckpt(path: Path) -> TrainerCheckpoint: + """ + Loads a TrainerCheckpoint from a pickle file or directory. + + Args: + path (Path): The path to the pickle file or directory containing 'io.pkl'. + + Returns + ------- + TrainerCheckpoint: The loaded TrainerCheckpoint instance. 
+ + Example: + checkpoint: TrainerCheckpoint = load_ckpt("/path/to/checkpoint") + """ + return load(path, output_type=TrainerCheckpoint) diff --git a/nemo/io/capture.py b/nemo/io/capture.py new file mode 100644 index 000000000000..2a65d18c15e3 --- /dev/null +++ b/nemo/io/capture.py @@ -0,0 +1,98 @@ +import functools +import logging +from typing import Callable, Generic, Optional, Protocol, TypeVar, runtime_checkable + +import fiddle as fdl + +log = logging.getLogger(__name__) + + +def capture(to_capture: Optional[Callable] = None): + if to_capture is None: + return lambda f: capture(f) + + @functools.wraps(to_capture) + def wrapper(*args, **kwargs): + if isinstance(to_capture, IOProtocol): + return to_capture(*args, **kwargs) + + output = to_capture(*args, **kwargs) + if not hasattr(output, '__dict__'): + try: + if isinstance(output, (int, float, str, tuple)): + new_output = type_factory(type(output), base_value=output) + else: + NewType = type_factory(type(output)) + new_output = NewType(output) + new_output.__io__ = fdl.Partial(to_capture, *args, **kwargs) + output = new_output + except Exception as e: + logging.error(f"Error creating configurable type: {e}") + else: + output.__io__ = fdl.Partial(to_capture, *args, **kwargs) + + return output + + return wrapper + + +SelfT = TypeVar("SelfT", covariant=True) + + +@runtime_checkable +class IOProtocol(Protocol, Generic[SelfT]): + @property + def __io__(self) -> fdl.Config[SelfT]: + ... + + +@runtime_checkable +class ReInitProtocol(Protocol, Generic[SelfT]): + def reinit(self) -> SelfT: + ... + + +def reinit(configurable: IOProtocol[SelfT]) -> SelfT: + if isinstance(configurable, ReInitProtocol): + return configurable.reinit() + + if not hasattr(configurable, '__io__'): + raise ValueError(f"Cannot reinit {configurable} because it does not have a __io__ attribute") + + return fdl.build(configurable.__io__) + + +# Global cache for dynamically created types +type_cache = {} + + +def type_factory(original_type, base_value=None): + """ + Factory function to create or retrieve from cache a new type that can have additional attributes, + even if the original type is immutable. + + Args: + original_type: The type of the original output value. + base_value: The base value to use for immutable types, if applicable. + + Returns + ------- + A new type that inherits from the original type and can have additional attributes, + or an instance of this new type if base_value is provided. 
+ """ + type_name = f"Configurable{original_type.__name__}" + if type_name in type_cache: + NewType = type_cache[type_name] + else: + NewType = type(f"Configurable{original_type.__name__}", (original_type,), {}) + type_cache[type_name] = NewType + + if base_value is not None: + try: + instance = NewType(base_value) + except TypeError: + logging.warning(f"Could not instantiate type {NewType.__name__} with base value.") + instance = NewType() + return instance + + return NewType diff --git a/nemo/io/mixin.py b/nemo/io/mixin.py new file mode 100644 index 000000000000..d09c456f7957 --- /dev/null +++ b/nemo/io/mixin.py @@ -0,0 +1,139 @@ +import functools +import inspect +from dataclasses import is_dataclass +from pathlib import Path +from typing import Any, Dict + +import fiddle as fdl +from cloudpickle import dump +from typing_extensions import Self + +from nemo.io.capture import IOProtocol + + +class IOMixin: + """ + A mixin class designed to capture the arguments passed to the `__init__` method, + facilitating the re-creation of the object through `io.reinit` method using stored configurations. + + This class intercepts the initialization of an object to store the arguments in a configuration + object, which can be serialized and later used to reinitialize the object to its original state. + It utilizes `fdl.Config` from the Fiddle library to create a structured configuration object + that holds the initialization parameters. This configuration object is crucial for enabling + serialization and deserialization of the parameters, thus allowing the object to be reconstructed + at a later time with the same initial state. + + Attributes + ---------- + __io__ (fdl.Config[Self]): A configuration object that stores the captured initialization + parameters in a structured format. This object is an instance of `fdl.Config`, which allows + for the serialization and deserialization of the parameters, enabling the object to be + reconstructed at a later time with the same initial state. + + Examples + -------- + from nemo import io + + class ExampleClass(io.IOMixin): + def __init__(self, param1, param2): + super().__init__() + self.param1 = param1 + self.param2 = param2 + + # Creating an instance of ExampleClass + example = ExampleClass('value1', 'value2') + example_copy = io.reinit(example) + + + Note: + For more information on `fdl.Config`, refer to the Fiddle library documentation at + [Fiddle Config Documentation](https://fiddle.readthedocs.io/en/latest/api_reference/core.html#config). + + """ + + __io__ = fdl.Config[Self] + + def __new__(cls, *args, **kwargs): + """ + Overrides the default object creation process to wrap the `__init__` method, allowing + initialization arguments to be captured and stored in the `__io__` attribute. + + Args: + *args: Variable length argument list for the `__init__` method. + **kwargs: Arbitrary keyword arguments for the `__init__` method. + + Returns + ------- + The newly created object instance. 
+ """ + original_init = cls.__init__ + + @functools.wraps(original_init) + def wrapped_init(self, *args, **kwargs): + cfg_kwargs = self.io_transform_args(original_init, *args, **kwargs) + self.__io__ = self.io_init(**cfg_kwargs) + original_init(self, *args, **kwargs) + + cls.__init__ = wrapped_init + output = object().__new__(cls) + + return output + + def io_transform_args(self, init_fn, *args, **kwargs) -> Dict[str, Any]: + """ + Transforms and captures the arguments passed to the `__init__` method, filtering out + any arguments that are instances of `IOProtocol` or are dataclass fields with default + factories. + + Args: + init_fn (Callable): The original `__init__` method of the class. + *args: Variable length argument list for the `__init__` method. + **kwargs: Arbitrary keyword arguments for the `__init__` method. + + Returns + ------- + Dict[str, Any]: A dictionary of the captured and transformed arguments. + """ + sig = inspect.signature(init_fn) + bound_args = sig.bind_partial(self, *args, **kwargs) + bound_args.apply_defaults() + config_kwargs = {k: v for k, v in bound_args.arguments.items() if k != "self"} + + to_del = [] + for key in config_kwargs: + if isinstance(config_kwargs[key], IOProtocol): + config_kwargs[key] = config_kwargs[key].__io__ + if is_dataclass(self): + # Check if the arg is a factory (dataclasses.field) + if config_kwargs[key].__class__.__name__ == "_HAS_DEFAULT_FACTORY_CLASS": + to_del.append(key) + + for key in to_del: + del config_kwargs[key] + + return config_kwargs + + def io_init(self, **kwargs) -> fdl.Config[Self]: + """ + Initializes the configuration object (`__io__`) with the captured arguments. + + Args: + **kwargs: A dictionary of arguments that were captured during object initialization. + + Returns + ------- + fdl.Config[Self]: The initialized configuration object. + """ + return fdl.Config(type(self), **kwargs) + + def io_dump(self, output: Path): + """ + Serializes the configuration object (`__io__`) to a file, allowing the object state to be + saved and later restored. + + Args: + output (Path): The path to the file where the configuration object will be serialized. 
+ """ + config_path = Path(output) / "io.pkl" + with open(config_path, "wb") as f: + dump(self.__io__, f) diff --git a/nemo/io/pl.py b/nemo/io/pl.py index 659ef0d6621b..ba9b5be72cab 100644 --- a/nemo/io/pl.py +++ b/nemo/io/pl.py @@ -1,6 +1,7 @@ import logging +from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Callable, Dict, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Generic, Optional, Protocol, TypeVar, Union import pytorch_lightning as pl import torch @@ -8,8 +9,13 @@ from lightning_fabric.utilities.cloud_io import get_filesystem from lightning_fabric.utilities.types import _PATH from torch import nn -from typing_extensions import override +from typing_extensions import Self, override +from nemo.io.capture import IOProtocol +from nemo.io.mixin import IOMixin + +if TYPE_CHECKING: + from nemo.lightning.pytorch.strategies import MegatronStrategy log = logging.getLogger(__name__) @@ -18,6 +24,42 @@ ModuleT = TypeVar("ModuleT", bound=nn.Module) +@dataclass +class TrainerCheckpoint(IOMixin, Generic[LightningModuleT]): + model: LightningModuleT + trainer: pl.Trainer + extra: Dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_strategy(cls, strategy: "MegatronStrategy") -> Self: + if not isinstance(strategy.trainer, IOProtocol): + raise ValueError(f"Trainer must be an instance of {IOProtocol}. Please use the Trainer from nemo.") + + if not isinstance(strategy.lightning_module, IOProtocol): + raise ValueError("LightningModule must extend IOMixin.") + + return cls(trainer=strategy.trainer, model=strategy.lightning_module, extra=cls.construct_extra(strategy)) + + @classmethod + def construct_extra(cls, strategy: "MegatronStrategy") -> Dict[str, Any]: + extra = {} + if hasattr(strategy.trainer, "datamodule") and isinstance(strategy.trainer.datamodule, IOProtocol): + extra["datamodule"] = strategy.trainer.datamodule.__io__ + + # TODO: Add optimizer to extra + + return extra + + +class TrainerCkptProtocol(Protocol): + @classmethod + def from_strategy(cls, strategy: "MegatronStrategy") -> Self: + ... + + def io_dump(self, output: Path): + ... + + class MegatronCheckpointIO(CheckpointIO): """CheckpointIO that utilizes :func:`torch.save` and :func:`torch.load` to save and load checkpoints respectively, common for most use cases. @@ -54,7 +96,6 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio if fs.isdir(checkpoint_dir) and dist_checkpointing.check_is_distributed_checkpoint(checkpoint_dir): logging.info(f'Distributed checkpoint at path {checkpoint_dir} already exists, skipping saving') return - fs.makedirs(checkpoint_dir, exist_ok=True) dist_checkpointing.save(sharded_state_dict=checkpoint, checkpoint_dir=str(checkpoint_dir)) @@ -113,7 +154,6 @@ def _fix_tensors_device(ckpt: Dict) -> Dict: """Ensure checkpoint tensors are on the correct device.""" assert torch.cuda.is_initialized(), (torch.cuda.is_available(), torch.cuda.is_initialized()) cur_dev = torch.device("cuda", index=torch.cuda.current_device()) - from megatron.core.dist_checkpointing.dict_utils import dict_list_map_outplace def _fix_device(t): @@ -130,7 +170,6 @@ def ckpt_to_dir(filepath: Union[str, Path]) -> Path: to be used as a directory for distributed checkpoints. 
""" filepath = Path(filepath) - if not filepath.suffix == ".ckpt": filepath = filepath.with_suffix(filepath.suffix + ".ckpt") diff --git a/nemo/lightning/__init__.py b/nemo/lightning/__init__.py index a508f29b9ace..f900345f96eb 100644 --- a/nemo/lightning/__init__.py +++ b/nemo/lightning/__init__.py @@ -7,6 +7,7 @@ from nemo.lightning.pytorch.plugins import MegatronDataSampler from nemo.lightning.pytorch.plugins import data_sampler as _data_sampler from nemo.lightning.pytorch.strategies import MegatronStrategy +from nemo.lightning.pytorch.trainer import Trainer # We monkey patch because nvidia uses a naming convention for SLURM jobs @@ -21,4 +22,4 @@ def _is_slurm_interactive_mode(): _pl_plugins._PLUGIN_INPUT = Union[_pl_plugins._PLUGIN_INPUT, _data_sampler.DataSampler] # noqa: SLF001 -__all__ = ["MegatronStrategy", "MegatronDataSampler", "get_vocab_size", "teardown"] +__all__ = ["MegatronStrategy", "MegatronDataSampler", "Trainer", "get_vocab_size", "teardown"] diff --git a/nemo/lightning/pytorch/strategies.py b/nemo/lightning/pytorch/strategies.py index 89cbe98cf707..65986b2a4855 100644 --- a/nemo/lightning/pytorch/strategies.py +++ b/nemo/lightning/pytorch/strategies.py @@ -27,7 +27,8 @@ from torch.utils.data import DataLoader from typing_extensions import override -from nemo.io.pl import MegatronCheckpointIO +from nemo import io +from nemo.io.pl import MegatronCheckpointIO, TrainerCheckpoint, TrainerCkptProtocol from nemo.lightning import _strategy_lib from nemo.lightning.megatron_parallel import CallbackConnector, MegatronParallel, _ModuleStepFunction from nemo.lightning.pytorch.callbacks import MegatronProgressBar @@ -38,7 +39,7 @@ ConfigT = TypeVar("ConfigT") -class MegatronStrategy(DDPStrategy): +class MegatronStrategy(DDPStrategy, io.IOMixin): """Megatron plugin for Pytorch Lightning. 
Args: @@ -60,6 +61,8 @@ def __init__( checkpoint_io=None, # TODO: Add type-hint no_ddp_communication_hook: bool = True, find_unused_parameters: bool = False, + enable_nemo_ckpt_io: bool = True, + ckpt_type: TrainerCkptProtocol = TrainerCheckpoint, lazy_init: bool = False, **kwargs, ) -> None: @@ -77,6 +80,8 @@ def __init__( self.pipeline_model_parallel_size = pipeline_model_parallel_size self.virtual_pipeline_model_parallel_size = virtual_pipeline_model_parallel_size self.sequence_parallel = sequence_parallel + self.enable_nemo_ckpt_io = enable_nemo_ckpt_io + self.ckpt_type = ckpt_type self.lazy_init = lazy_init # used in NVIDIA NGC PyTorch containers @@ -346,6 +351,8 @@ def save_checkpoint( checkpoint['optimizer_states'] = [self.optimizer_sharded_state_dict()] self.checkpoint_io.save_checkpoint(checkpoint, filepath, storage_options=storage_options) + if self.enable_nemo_ckpt_io and self.is_global_zero and self.ckpt_type: + self.ckpt_type.from_strategy(self).io_dump(ckpt_to_dir(filepath)) @override def load_checkpoint(self, checkpoint_path: Union[str, Path]) -> Dict[str, Any]: diff --git a/nemo/lightning/pytorch/trainer.py b/nemo/lightning/pytorch/trainer.py new file mode 100644 index 000000000000..da04a93eef05 --- /dev/null +++ b/nemo/lightning/pytorch/trainer.py @@ -0,0 +1,15 @@ +from copy import deepcopy + +import fiddle as fdl +import pytorch_lightning as pl +from typing_extensions import Self + +from nemo.io.mixin import IOMixin + + +class Trainer(pl.Trainer, IOMixin): + def io_init(self, **kwargs) -> fdl.Config[Self]: + # Each argument of the trainer can be stateful so we copy them + cfg_kwargs = {k: deepcopy(v) for k, v in kwargs.items()} + + return fdl.Config(type(self), **cfg_kwargs) diff --git a/nemo/llm/gpt/model/base.py b/nemo/llm/gpt/model/base.py index 02588b494077..93186a7e7e08 100644 --- a/nemo/llm/gpt/model/base.py +++ b/nemo/llm/gpt/model/base.py @@ -7,6 +7,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig from torch.optim import Optimizer +from nemo import io from nemo.lightning import get_vocab_size from nemo.lightning.megatron_parallel import MaskedTokenLossReduction @@ -17,7 +18,7 @@ @dataclass -class GPTConfig(TransformerConfig): +class GPTConfig(TransformerConfig, io.IOMixin): # From megatron.core.models.gpt.gpt_model.GPTModel fp16_lm_cross_entropy: bool = False parallel_output: bool = True @@ -60,7 +61,7 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": ) -class GPTModel(L.LightningModule): +class GPTModel(L.LightningModule, io.IOMixin): def __init__( self, config: GPTConfig, diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 20efa2b22013..e2a558929146 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,3 +1,4 @@ +fiddle huggingface_hub>=0.20.3 numba numpy>=1.22 diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 6bd43cdfc9c7..6acfddad9189 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -1,3 +1,4 @@ +fiddle hydra-core>1.3,<=1.3.2 omegaconf<=2.3 pytorch-lightning>=2.2.1 diff --git a/tests/io/__init__.py b/tests/io/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/io/test_api.py b/tests/io/test_api.py new file mode 100644 index 000000000000..d4c317bf2e9f --- /dev/null +++ b/tests/io/test_api.py @@ -0,0 +1,18 @@ +from nemo import io +from nemo import lightning as nl +from nemo import llm + + +class TestLoad: + def 
test_reload_ckpt(self, tmpdir): + trainer = nl.Trainer(devices=1, accelerator="cpu", strategy=nl.MegatronStrategy()) + # model = llm.Mistral7BModel() + model = llm.GPTModel( + llm.GPTConfig(num_layers=2, hidden_size=1024, ffn_hidden_size=4096, num_attention_heads=8,) + ) + + ckpt = io.TrainerCheckpoint(model, trainer) + ckpt.io_dump(tmpdir) + loaded = io.load_ckpt(tmpdir) + + assert loaded.model.config.seq_length == ckpt.model.config.seq_length diff --git a/tests/io/test_mixin.py b/tests/io/test_mixin.py new file mode 100644 index 000000000000..ed898d435609 --- /dev/null +++ b/tests/io/test_mixin.py @@ -0,0 +1,16 @@ +from nemo import io + + +class DummyClass(io.IOMixin): + def __init__(self, a: int, b: int): + self.a = a + self.b = b + + +class TestIOMixin: + def test_reinit(self): + dummy = DummyClass(5, 5) + copied = io.reinit(dummy) + assert copied is not dummy + assert copied.a == dummy.a + assert copied.b == dummy.b From be59e0814835c2575e66887d9b6fb32bbdcf94df Mon Sep 17 00:00:00 2001 From: Ming <111467530+Victor49152@users.noreply.github.com> Date: Mon, 29 Apr 2024 19:32:05 -0700 Subject: [PATCH 26/30] Mingyuanm/sdxl quantization notebook (#9042) * Move cached embedding devices and dtype for onnx export consistency Signed-off-by: Mingyuan Ma * Add old trt export/inference script, currently not working in latest container. Signed-off-by: Mingyuan Ma * Update intro and why nemo in dev doc * Categorize tutorials * Add NeMo TRT inference pipeline and quatization workflow Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add guards to avoid undefined variables Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * minor fix Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update tutorials link * update index * Restructure * Restructure * Restructure * Restructure * Restructure * Restructure * Add conversion script from hf sdxl to nemo sdxl Signed-off-by: Mingyuan Ma * Update quantize pipeline to adapt to variable image dimension Signed-off-by: Mingyuan Ma * update sdxl pipeline to be aware of additional emb channels Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add guards for potential local var Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * copyright header Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Restructure * Restructure * Update flash attention * Update flash attention * Update file paths Signed-off-by: Mingyuan Ma * Fix few structure issue * Fix migration * Fix structure * Fix structure * Few updates * Add few more scripts * Fix scripts * Fix few things * Fix tutorial table * Restructure * Rename * Add notebook Signed-off-by: Mingyuan Ma * WIP Signed-off-by: Mingyuan Ma * Documentation Signed-off-by: Mingyuan Ma * Few fixes and moves * Move sections * Fix bib * Refactor files * Fixes * Update quantization script Signed-off-by: Mingyuan Ma * Add tutorial and docs Signed-off-by: Mingyuan Ma * Add images Signed-off-by: Mingyuan Ma * Fix * Fix few issues * remove scripts * Update comments Signed-off-by: Mingyuan Ma * Update docs Signed-off-by: Mingyuan Ma * Add links to sdxl quantization 
tutorial Signed-off-by: Mingyuan Ma * Add link to new tutorial Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove unused import Signed-off-by: Mingyuan Ma * Using links to images Signed-off-by: Mingyuan Ma * remove unused imports Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay --- docs/source/multimodal/text2img/intro.rst | 1 + .../multimodal/text2img/sdxl_quantization.rst | 156 ++++ docs/source/starthere/tutorials.rst | 3 + .../stable_diffusion/conf/sd_xl_quantize.yaml | 32 +- .../stable_diffusion/sd_xl_quantize.py | 45 + .../stable_diffusion/sd_xl_trt_inference.py | 5 +- nemo/utils/trt_utils.py | 4 + tutorials/multimodal/SDXL Quantization.ipynb | 851 ++++++++++++++++++ 8 files changed, 1084 insertions(+), 13 deletions(-) create mode 100644 docs/source/multimodal/text2img/sdxl_quantization.rst create mode 100644 tutorials/multimodal/SDXL Quantization.ipynb diff --git a/docs/source/multimodal/text2img/intro.rst b/docs/source/multimodal/text2img/intro.rst index 9ec793d246fa..3c3c17768679 100644 --- a/docs/source/multimodal/text2img/intro.rst +++ b/docs/source/multimodal/text2img/intro.rst @@ -13,3 +13,4 @@ NeMo multimodal provides implementations of multiple image-to-text models, inclu imagen dreambooth controlnet + sdxl_quantization diff --git a/docs/source/multimodal/text2img/sdxl_quantization.rst b/docs/source/multimodal/text2img/sdxl_quantization.rst new file mode 100644 index 000000000000..78403e9c402c --- /dev/null +++ b/docs/source/multimodal/text2img/sdxl_quantization.rst @@ -0,0 +1,156 @@ +Stable Diffusion XL Int8 Quantization +======================================= + +This example shows how to use Ammo to calibrate and quantize the UNet part of the SDXL. The UNet part typically consumes +>95% of the e2e Stable Diffusion latency. + +We also provide instructions on deploying and running E2E SDXL pipeline +with Ammo quantized int8 UNet to generate images and measure latency on target GPUs. + +To get started, it is required to have a pretrained SDXL checkpoint in `nemo` format. The example training configs are provided in NeMo, +which is located in `NeMo/examples/multimodal/text2img/stable_diffusion`. + +Calibration +--------------- +The first step is to run quantization script with default config, and finally the script will export the quantized unet to onnx file. +Here is the default config for `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_quantize.py`. + + +.. code-block:: yaml + quantize + exp_name: nemo + n_steps: 20 # number of inference steps + format: 'int8' # only int8 quantization is supported now + percentile: 1.0 # Control quantization scaling factors (amax) collecting range, meaning that we will collect the minimum amax in the range of `(n_steps * percentile)` steps. Recommendation: 1.0 + batch_size: 1 # batch size calling sdxl inference pipeline during calibration + calib_size: 32 # For SDXL, we recommend 32, 64 or 128 + quant_level: 2.5 #Which layers to be quantized, 1: `CNNs`, 2: `CNN + FFN`, 2.5: `CNN + FFN + QKV`, 3: `CNN + Linear`. Recommendation: 2, 2.5 and 3, depending on the requirements for image quality & speedup. + alpha: 0.8 # A parameter in SmoothQuant, used for linear layers only. 
Recommendation: 0.8 for SDXL + + + +Important Parameters +^^^^^^^^^^^^^^^^^^^^ +- percentile: Control quantization scaling factors (amax) collecting range, meaning that we will collect the minimum amax in the range of (n_steps * percentile) steps. Recommendation: 1.0 +- alpha: A parameter in SmoothQuant, used for linear layers only. Recommendation: 0.8 for SDXL, 1.0 for SD 1.5 +- quant-level: Which layers to be quantized, 1: CNNs, 2: CNN + FFN, 2.5: CNN + FFN + QKV, 3: CNN + Linear. Recommendation: 2, 2.5 and 3, depending on the requirements for image quality & speedup. +- calib-size: For SDXL, we recommend 32, 64 or 128, for SD 1.5, set to 512 or 1024. + + +Build the TRT engine for the Quantized ONNX UNet +------------------------------------------------------------ + +.. code-block:: bash + trtexec --onnx=./nemo_onnx/unet.onnx --shapes=x:8x4x128x128,timesteps:8,context:8x80x2048,y:8x2816 --fp16 --int8 --builderOptimizationLevel=4 --saveEngine=nemo_unet_xl.plan + + +Important Parameters +^^^^^^^^^^^^^^^^^^^^ +Input shape has to be provided here when building TRT engine. +- x: Input image latent shape (B * C * H * W) +- context: Input text conditioning (B * S * hidden_dimention) +- y: Additional embedding (B * adm_in_channels) + +Build End-to-end Stable Diffusion XL Pipeline with NeMo +----------------------------------------------------------- + +We provide a script to build end to end TRT inference pipeline with NeMo backend, which is located at `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_export.py`. + +.. code-block:: yaml + infer: + out_path: sdxl_export + width: 1024 + height: 1024 + batch_size: 2 + + trt: + static_batch: False + min_batch_size: 1 + max_batch_size: 8 + +Important Parameters +^^^^^^^^^^^^^^^^^^^^ +- out_path: Directory to save onnx file and TRT engine files +- width and height: Image resolution of inference output +- batch_size: Only used for dummy input generation and onnx sanity check +- {min,max}_batch_size: The input batch size of TRT engine along its dynamic axis + + +Run End-to-end Stable Diffusion XL TRT Pipeline +----------------------------------------------------------- + +The inference script can be found at `NeMo/examples/multimodal/text2img/stable_diffusion/sd_xl_trt_inference.py`. + +.. code-block:: yaml + unet_xl: sdxl_export/plan/unet_xl.plan + vae: sdxl_export/plan/vae.plan + clip1: sdxl_export/plan/clip1.plan + clip2: sdxl_export/plan/clip2.plan + + out_path: trt_output + + +Please specify unet_xl as the quantized Unet engine to run the quantized solution. The system will load the original engine file by default. 
+ +Inference Speedup +------------------- +TRT version 9.3.0 +GPU: H100 + +TRT int8 vs Framework fp16 +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++---------------------+------------+-------------+----------------+------------+---------+------------+ +| Pipeline | Batch Size | Latency (ms)| Pipeline | Batch Size | Latency | Speedup | ++=====================+============+=============+================+============+=========+============+ +| Framework fp16 base | 1 | 3056.01 | Ammo TRT Int8 | 1 | 1406.68 | 2.172498365| ++---------------------+------------+-------------+----------------+------------+---------+------------+ +| Framework fp16 base | 2 | 4832.24 | Ammo TRT Int8 | 2 | 2403.29 | 2.01067703 | ++---------------------+------------+-------------+----------------+------------+---------+------------+ +| Framework fp16 base | 4 | 8433.71 | Ammo TRT Int8 | 4 | 4252.6 | 1.983189108| ++---------------------+------------+-------------+----------------+------------+---------+------------+ + + + +TRT int8 vs TRT fp16 +^^^^^^^^^^^^^^^^^^^^^^^ + + ++-------------+------------+--------------+-----------+------------+------------+-------------+ +| Pipeline | Batch Size | Latency (ms) | Precision | Batch Size | Latency | Speedup | ++=============+============+==============+===========+============+============+=============+ +| fp16 base | 1 | 1723.97 | Ammo Int8 | 1 | 1406.68 | 1.225559473 | ++-------------+------------+--------------+-----------+------------+------------+-------------+ +| fp16 base | 2 | 3004.47 | Ammo Int8 | 2 | 2403.29 | 1.250148754 | ++-------------+------------+--------------+-----------+------------+------------+-------------+ +| fp16 base | 4 | 5657.19 | Ammo Int8 | 4 | 4252.6 | 1.330289705 | ++-------------+------------+--------------+-----------+------------+------------+-------------+ + + +FP16 inference vs Int8 inference +---------------------------------- + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_fp16_1.png + :width: 50% +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_int8_1.png + :width: 50% +Prompt: A photo of a Shiba Inu dog with a backpack riding a bike. It is wearing sunglasses and a beach hat. (FP16 upper vs Int8 lower) + + + + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_fp16_2.png + :width: 50% +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_int8_2.png + :width: 50% +Prompt: A cute corgi lives in a house made out of sushi. (FP16 upper vs Int8 lower) + + + + +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_fp16_3.png + :width: 50% +.. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.23.0/asset-githubio-home-sdxl_trt_int8_3.png + :width: 50% +Prompt: A high contrast portrait of a very happy fuzzy panda dressed as a chef in a high end kitchen making dough. There is a painting of flowers on the wall behind him. 
(FP16 upper vs Int8 lower) + diff --git a/docs/source/starthere/tutorials.rst b/docs/source/starthere/tutorials.rst index 5ca48904ed9b..0298dbdf6d4b 100644 --- a/docs/source/starthere/tutorials.rst +++ b/docs/source/starthere/tutorials.rst @@ -63,6 +63,9 @@ Tutorial Overview * - Multimodal - Preparations and Advanced Applications: DreamBooth Tutorial - `DreamBooth Tutorial `_ + * - Multimodal + - Preparations and Advanced Applications: Stable Diffusion XL Quantization Tutorial + - `DreamBooth Tutorial `_ .. list-table:: **Automatic Speech Recognition (ASR) Tutorials** :widths: 15 30 55 diff --git a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_quantize.yaml b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_quantize.yaml index 000416f7996b..ecb75953829e 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_quantize.yaml +++ b/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_quantize.yaml @@ -2,7 +2,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 32 + precision: 16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -17,7 +17,7 @@ trainer: infer: - num_samples: 1 + num_samples: 4 prompt: - "A professional photograph of an astronaut riding a pig" - 'A photo of a Shiba Inu dog with a backpack riding a bike. It is wearing sunglasses and a beach hat.' @@ -59,25 +59,33 @@ model: quantize: exp_name: nemo_test - n_steps: 20 - format: 'int8' - percentile: 1.0 - batch_size: 1 - calib_size: 32 - quant_level: 2.5 - alpha: 0.8 + n_steps: 20 # number of inference steps + format: 'int8' # only int8 quantization is supported now + percentile: 1.0 # Control quantization scaling factors (amax) collecting range, meaning that we will collect the minimum amax in the range of `(n_steps * percentile)` steps. Recommendation: 1.0 + batch_size: 1 # batch size calling sdxl inference pipeline during calibration + calib_size: 32 # For SDXL, we recommend 32, 64 or 128 + quant_level: 2.5 #Which layers to be quantized, 1: `CNNs`, 2: `CNN + FFN`, 2.5: `CNN + FFN + QKV`, 3: `CNN + Linear`. Recommendation: 2, 2.5 and 3, depending on the requirements for image quality & speedup. + alpha: 0.8 # A parameter in SmoothQuant, used for linear layers only. 
Recommendation: 0.8 for SDXL quantized_ckpt: nemo.unet.state_dict.${quantize.exp_name}.pt onnx_export: - onnx_dir: nemo_onnx - pretrained_base: ${model.restore_from_path} - quantized_ckpt: ${quantize.quantized_ckpt} + onnx_dir: nemo_onnx # Path to save onnx files + pretrained_base: ${model.restore_from_path} # Path to nemo checkpoint for sdxl + quantized_ckpt: ${quantize.quantized_ckpt} # Path to save quantized unet checkpoint format: int8 +trt_export: + static_batch: False # static batch engines have better latency + min_batch_size: 1 # minimum batch size when using dynamic batch, has to be the same with max_batch_size and infer.num_samples when using static batch + max_batch_size: 8 # maximum batch size when using dynamic batch, has to be the same with min_batch_size and infer.num_samples when using static batch + int8: True # Allow engine builder recognize int8 precision + builder_optimization_level: 4 # set to 1-5, higher optimization level means better latency but longer compiling time + trt_engine: int8_unet_xl.plan # path to save trt engine use_refiner: False use_fp16: False # use fp16 model weights out_path: ./output run_quantization: True run_onnx_export: True +run_trt_export: True diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_quantize.py b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_quantize.py index 5c5e1dd94a09..89bfcd294ae4 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_quantize.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_quantize.py @@ -34,6 +34,7 @@ from nemo.collections.multimodal.parts.stable_diffusion.sdxl_pipeline import SamplingPipeline from nemo.collections.multimodal.parts.utils import setup_trainer_and_model_for_inference from nemo.core.config import hydra_runner +from nemo.utils.trt_utils import build_engine def do_calibrate(base, calibration_prompts, **kwargs): @@ -49,6 +50,26 @@ def do_calibrate(base, calibration_prompts, **kwargs): ) +def get_input_profile_unet( + batch_size, static_batch=False, min_batch_size=1, max_batch_size=8, latent_dim=32, adm_in_channels=1280 +): + assert batch_size >= min_batch_size and batch_size <= max_batch_size + if static_batch: + min_batch_size = batch_size if static_batch else min_batch_size + max_batch_size = batch_size if static_batch else max_batch_size + input_profile = {} + dummy_input = generate_dummy_inputs( + sd_version="nemo", device='cuda', latent_dim=latent_dim, adm_in_channels=adm_in_channels + ) + for key, value in dummy_input.items(): + input_profile[key] = [ + (min_batch_size, *(value.shape[1:])), + (batch_size, *(value.shape[1:])), + (max_batch_size, *(value.shape[1:])), + ] + return input_profile + + @hydra_runner(config_path='conf', config_name='sd_xl_quantize') def main(cfg): def model_cfg_modifier(model_cfg): @@ -147,6 +168,30 @@ def forward_loop(): opset_version=opset_version, ) + if cfg.run_trt_export: + torch.cuda.empty_cache() + batch_size = cfg.infer.get('num_samples', 1) + min_batch_size = cfg.trt_export.min_batch_size + max_batch_size = cfg.trt_export.max_batch_size + static_batch = cfg.trt_export.static_batch + fp16 = cfg.trainer.precision in ['16', '16-mixed', 16] + build_engine( + f"{cfg.onnx_export.onnx_dir}/unet.onnx", + f"{cfg.trt_export.trt_engine}", + fp16=fp16, + input_profile=get_input_profile_unet( + batch_size, + static_batch=static_batch, + min_batch_size=min_batch_size, + max_batch_size=max_batch_size, + latent_dim=cfg.sampling.base.height // 8, + 
adm_in_channels=base.model.model.diffusion_model.adm_in_channels, + ), + timing_cache=None, + int8=cfg.trt_export.int8, + builder_optimization_level=cfg.trt_export.builder_optimization_level, + ) + if __name__ == "__main__": main() diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py index 04fc7bd91315..14c64a58a8af 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py @@ -22,6 +22,7 @@ from cuda import cudart from transformers import CLIPTokenizer +from nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser import DiscreteDenoiser from nemo.collections.multimodal.modules.stable_diffusion.encoders.modules import ConcatTimestepEmbedderND from nemo.collections.multimodal.modules.stable_diffusion.quantization_utils.trt_engine import TRT_LOGGER, Engine from nemo.collections.multimodal.parts.stable_diffusion.sdxl_helpers import perform_save_locally @@ -176,6 +177,7 @@ def run(self, prompt, negative_prompt, image_height, image_width, num_samples, a with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER): torch.cuda.synchronize() + e2e_tic = time.perf_counter() c, uc = self.encode_prompt(prompt, negative_prompt) @@ -198,8 +200,9 @@ def denoiser(input, sigma, c): samples_z = self.sampler(denoiser, randn, cond=c, uc=uc) samples_x = self.decode_images(samples_z) + e2e_tic = time.perf_counter() - e2e_tic samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0) - + print(f'This batch takes {e2e_tic}s') perform_save_locally(self.cfg.out_path, samples) diff --git a/nemo/utils/trt_utils.py b/nemo/utils/trt_utils.py index 73e899532691..a355a8e9e77e 100644 --- a/nemo/utils/trt_utils.py +++ b/nemo/utils/trt_utils.py @@ -32,6 +32,8 @@ def build_engine( enable_preview=False, timing_cache=None, workspace_size=0, + int8=False, + builder_optimization_level=None, ): print(f"Building TensorRT engine for {onnx_path}: {output_path}") p = Profile() @@ -53,6 +55,8 @@ def build_engine( profiles=[p], preview_features=preview_features, load_timing_cache=timing_cache, + int8=int8, + builder_optimization_level=builder_optimization_level, **config_kwargs, ), save_timing_cache=timing_cache, diff --git a/tutorials/multimodal/SDXL Quantization.ipynb b/tutorials/multimodal/SDXL Quantization.ipynb new file mode 100644 index 000000000000..1562a9c756ee --- /dev/null +++ b/tutorials/multimodal/SDXL Quantization.ipynb @@ -0,0 +1,851 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b32d3842", + "metadata": {}, + "source": [ + "# SDXL Int8 Quantization Solution by Ammo\n", + "\n", + "### Note:\n", + "This notebook requires nvidia-ammo > 0.9.x, which comes with NeMo framework container > 23.05. An example command to launch the container:\n", + "\n", + "```\n", + "docker run --gpus all -it --rm -v :/opt/NeMo --shm-size=8g \\\n", + " -p 8888:8888 --ulimit memlock=-1 --ulimit \\\n", + " stack=67108864 \n", + "```\n", + "\n", + "This tutorial shows how to use Ammo to calibrate and quantize the UNet part of the SDXL within NeMo framework. \n", + "\n", + "Please note that NeMo provides users with an end-to-end training framework for SDXL, and this quantization pipeline is supposed to work with a `.nemo` checkpoint trained from their own text-image dataset. In this tutorial, a open-source checkpoint is converted to `.nemo` format for illustration purpose." 
+ ] + }, + { + "cell_type": "markdown", + "id": "2f8320ca", + "metadata": {}, + "source": [ + "### Download SDXL checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd436eab", + "metadata": {}, + "outputs": [], + "source": [ + "## Download Unet checkpoint\n", + "! mkdir -p /sdxl_ckpts/stable-diffusion-xl-base-1.0/unet && wget -P /sdxl_ckpts/stable-diffusion-xl-base-1.0/unet https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/unet/diffusion_pytorch_model.safetensors\n", + "## Download Vae checkpoint \n", + "! mkdir -p /sdxl_ckpts/stable-diffusion-xl-base-1.0/vae && wget -P /sdxl_ckpts/stable-diffusion-xl-base-1.0/vae https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/vae/diffusion_pytorch_model.safetensors" + ] + }, + { + "cell_type": "markdown", + "id": "70164e82", + "metadata": {}, + "source": [ + "### Convert downloaded checkpoint into `.nemo` format" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c9649553", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FlashAttention Installed\n", + "[NeMo I 2024-04-24 22:13:11 distributed:42] Initializing torch.distributed with local_rank: 0, rank: 0, world_size: 1\n", + "GPU available: True (cuda), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "IPU available: False, using: 0 IPUs\n", + "HPU available: False, using: 0 HPUs\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1183] hidden_size not found in {'precision': 'bf16-mixed', 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': True, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': '/sdxl_ckpts/stable-diffusion-xl-base-1.0/unet/diffusion_pytorch_model.safetensors'}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': '/sdxl_ckpts/stable-diffusion-xl-base-1.0/vae/diffusion_pytorch_model.safetensors', 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 
'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0'}. Set this in model_parallel_config if using pipeline parallelism.\n", + "[rank0]:[W init.cpp:767] Warning: nvfuser is no longer supported in torch script, use _jit_set_nvfuser_enabled is deprecated and a no-op (function operator())\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:253] Rank 0 has data parallel group : [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:259] Rank 0 has combined group of data parallel and context parallel : [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:264] All data parallel group ranks with context parallel combined: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:267] Ranks 0 has data parallel rank: 0\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:284] Rank 0 has context parallel group: [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:287] All context parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:288] Ranks 0 has context parallel rank: 0\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:299] Rank 0 has model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:300] All model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:310] Rank 0 has tensor model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:314] All tensor model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:315] Rank 0 has tensor model parallel rank: 0\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:344] Rank 0 has pipeline model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:356] Rank 0 has embedding group: [0]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:362] All pipeline model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:363] Rank 0 has pipeline model parallel rank 0\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:364] All embedding group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:13:12 megatron_init:365] Rank 0 has embedding rank: 0\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:13:12 megatron_base_model:1183] hidden_size not found in {'precision': 'bf16-mixed', 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': True, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': '/sdxl_ckpts/stable-diffusion-xl-base-1.0/unet/diffusion_pytorch_model.safetensors'}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': '/sdxl_ckpts/stable-diffusion-xl-base-1.0/vae/diffusion_pytorch_model.safetensors', 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 
'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0'}. Set this in model_parallel_config if using pipeline parallelism.\n", + "[NeMo I 2024-04-24 22:13:12 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:13:12 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:12 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:13:12 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:12 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:12 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:14 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:14 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:16 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:16 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:18 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:18 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. 
Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:20 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:20 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:21 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:13:21 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:23 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:13:23 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:24 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:13:24 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:24 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:13:24 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:13:25 utils:108] Getting module=, cls=\n", + "open_clip_pytorch_model.bin: 100%|██████████| 10.2G/10.2G [01:36<00:00, 106MB/s]\n", + "Initialized embedder #0: FrozenCLIPEmbedder with 123060480 params. Trainable: False\n", + "Initialized embedder #1: FrozenOpenCLIPEmbedder2 with 694659841 params. Trainable: False\n", + "Initialized embedder #2: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #3: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #4: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "Working with z of shape (1, 4, 32, 32) = 4096 dimensions.\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "AutoencoderKLInferenceWrapper: Following keys are missing during loading VAE weights, which may lead to compromised image quality for a resumed training. 
+ Please check the checkpoint you provided.\n",
+ "Missing:['encoder.mid.attn_1.proj_out.bias', 'decoder.mid.attn_1.v.weight', 'encoder.mid.attn_1.proj_out.weight', 'decoder.mid.attn_1.proj_out.bias', 'decoder.mid.attn_1.q.weight', 'decoder.mid.attn_1.q.bias', 'encoder.mid.attn_1.q.weight', 'encoder.mid.attn_1.k.weight', 'encoder.mid.attn_1.v.bias', 'decoder.mid.attn_1.k.weight', 'decoder.mid.attn_1.v.bias', 'decoder.mid.attn_1.proj_out.weight', 'encoder.mid.attn_1.q.bias', 'encoder.mid.attn_1.v.weight', 'encoder.mid.attn_1.k.bias', 'decoder.mid.attn_1.k.bias']\n",
+ "Unexpected:['encoder.mid.attentions.0.to_k.weight', 'decoder.mid.attentions.0.to_out.0.weight', 'encoder.mid.attentions.0.to_v.bias', 'decoder.mid.attentions.0.to_q.bias', 'encoder.mid.attentions.0.to_q.weight', 'encoder.mid.attentions.0.to_v.weight', 'decoder.mid.attentions.0.to_k.weight', 'decoder.mid.attentions.0.to_v.bias', 'encoder.mid.attentions.0.to_k.bias', 'encoder.mid.attentions.0.to_out.0.bias', 'decoder.mid.attentions.0.to_out.0.bias', 'encoder.mid.attentions.0.to_out.0.weight', 'decoder.mid.attentions.0.to_k.bias', 'decoder.mid.attentions.0.to_v.weight', 'decoder.mid.attentions.0.to_q.weight', 'encoder.mid.attentions.0.to_q.bias']\n",
+ "[NeMo I 2024-04-24 22:15:42 convert_hf_ckpt_to_nemo:226] NeMo model saved to: /quantization/sdxl_base.nemo\n"
+ ]
+ }
+ ],
+ "source": [
+ "WORKDIR = '/quantization'\n",
+ "! torchrun /opt/NeMo/examples/multimodal/text_to_image/convert_hf_ckpt_to_nemo.py \\\n",
+ " --model_type sdxl \\\n",
+ " --ckpt_path /sdxl_ckpts/stable-diffusion-xl-base-1.0/unet/diffusion_pytorch_model.safetensors \\\n",
+ " --hparams_file /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base_train.yaml \\\n",
+ " --nemo_file_path $WORKDIR/sdxl_base.nemo"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "25817b85",
+ "metadata": {},
+ "source": [
+ "### Run the quantization script with the default config; the script then exports the quantized U-Net to an ONNX file.\n",
+ "\n",
+ "##### Quantization config\n",
+ "\n",
+ "```yaml\n",
+ "quantize:\n",
+ " exp_name: nemo_test\n",
+ " n_steps: 20 # number of inference steps\n",
+ " format: 'int8' # only int8 quantization is supported now\n",
+ " percentile: 1.0 # Controls the collection range for the quantization scaling factors (amax): the minimum amax over the first `(n_steps * percentile)` steps is used. Recommendation: 1.0\n",
+ " batch_size: 1 # batch size used when calling the SDXL inference pipeline during calibration\n",
+ " calib_size: 32 # For SDXL, we recommend 32, 64 or 128\n",
+ " quant_level: 2.5 # Which layers to quantize, 1: `CNNs`, 2: `CNN + FFN`, 2.5: `CNN + FFN + QKV`, 3: `CNN + Linear`. Recommendation: 2, 2.5 or 3, depending on the required trade-off between image quality and speedup.\n",
+ " alpha: 0.8 # A parameter in SmoothQuant, used for linear layers only. Recommendation: 0.8 for SDXL\n",
+ "```\n",
+ "\n",
+ "##### Onnx export config\n",
+ "\n",
+ "```yaml\n",
+ "onnx_export:\n",
+ " onnx_dir: nemo_onnx # Path to save onnx files\n",
+ " pretrained_base: ${model.restore_from_path} # Path to nemo checkpoint for sdxl\n",
+ " quantized_ckpt: nemo.unet.state_dict.${quantize.exp_name}.pt # Path to save quantized unet checkpoint\n",
+ " format: int8\n",
+ "```\n",
+ "\n",
+ "##### TRT export config\n",
+ "\n",
+ "```yaml\n",
+ "trt_export:\n",
+ " static_batch: False # static batch engines have better latency\n",
+ " min_batch_size: 1 # minimum batch size when using dynamic batch; must match max_batch_size and infer.num_samples when using a static batch\n",
+ " max_batch_size: 1 # maximum batch size when using dynamic batch; must match min_batch_size and infer.num_samples when using a static batch\n",
+ " int8: True # Allow the engine builder to use int8 precision\n",
+ " builder_optimization_level: 4 # set to 1-5; a higher optimization level gives better latency but longer build time\n",
+ " trt_engine: int8_unet_xl.plan # path to save trt engine\n",
+ "```\n",
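+ "\n",
+ "As an illustration of the `percentile` option in the quantization config above, the sketch below (library-agnostic, with a hypothetical helper name, not the NeMo / TensorRT Model Optimizer API) computes a per-tensor scaling factor as the minimum amax over the first `n_steps * percentile` calibration steps.\n",
+ "\n",
+ "```python\n",
+ "import torch\n",
+ "\n",
+ "def calibration_amax(per_step_amax, percentile):\n",
+ "    # per_step_amax: one per-tensor amax recorded at each denoising step\n",
+ "    n_keep = max(1, int(len(per_step_amax) * percentile))\n",
+ "    return torch.stack(per_step_amax[:n_keep]).amin(dim=0)\n",
+ "\n",
+ "# Example: n_steps = 20, percentile = 1.0 -> minimum amax over all 20 steps\n",
+ "per_step_amax = [torch.rand(4) + 0.5 for _ in range(20)]\n",
+ "print(calibration_amax(per_step_amax, percentile=1.0))\n",
+ "```\n",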
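+ "\n",
+ "For reference, a rough sketch of how the `trt_export` options above map onto the TensorRT Python builder API (an assumption for illustration, not the code the script runs; input names and profile shapes are taken from the engine-build log further below):\n",
+ "\n",
+ "```python\n",
+ "import tensorrt as trt\n",
+ "\n",
+ "logger = trt.Logger(trt.Logger.WARNING)\n",
+ "builder = trt.Builder(logger)\n",
+ "network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))\n",
+ "parser = trt.OnnxParser(network, logger)\n",
+ "with open('nemo_onnx/unet.onnx', 'rb') as f:\n",
+ "    parser.parse(f.read())\n",
+ "\n",
+ "config = builder.create_builder_config()\n",
+ "config.set_flag(trt.BuilderFlag.INT8)  # int8: True (ONNX already contains Q/DQ nodes)\n",
+ "config.builder_optimization_level = 4  # builder_optimization_level: 4\n",
+ "\n",
+ "# Dynamic-batch optimization profile (static_batch: False), shapes as printed in the build log\n",
+ "profile = builder.create_optimization_profile()\n",
+ "profile.set_shape('x', (1, 4, 128, 128), (4, 4, 128, 128), (8, 4, 128, 128))\n",
+ "profile.set_shape('y', (1, 2816), (4, 2816), (8, 2816))\n",
+ "profile.set_shape('timesteps', (1,), (4,), (8,))\n",
+ "profile.set_shape('context', (1, 80, 2048), (4, 80, 2048), (8, 80, 2048))\n",
+ "config.add_optimization_profile(profile)\n",
+ "\n",
+ "engine = builder.build_serialized_network(network, config)\n",
+ "with open('int8_unet_xl.plan', 'wb') as f:\n",
+ "    f.write(engine)\n",
+ "```\n",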
+ "\n",
+ "The following command restores the pre-trained SDXL model from `$WORKDIR/sdxl_base.nemo` produced in the step above.\n",
+ "The quantized U-Net checkpoint is saved to `quantize.quantized_ckpt`, the converted ONNX file is saved to `onnx_export.onnx_dir`, and the TRT engine is saved to `trt_export.trt_engine`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "d955f6c3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "FlashAttention Installed\n",
+ "[NeMo W 2024-04-24 19:42:59 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n",
+ " See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n",
+ " ret = run_job(\n",
+ " \n",
+ "[NeMo W 2024-04-24 19:42:59 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!\n",
+ " \n",
+ "Using 16bit Automatic Mixed Precision (AMP)\n",
+ "GPU available: True (cuda), used: True\n",
+ "TPU available: False, using: 0 TPU cores\n",
+ "IPU available: False, using: 0 IPUs\n",
+ "HPU available: False, using: 0 HPUs\n",
+ "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n",
+ "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n",
+ "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n",
+ "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1183] hidden_size not found in {'precision': 16, 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': False, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': None}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': None, 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 
'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0', 'ckpt_path': None}. 
Set this in model_parallel_config if using pipeline parallelism.\n", + "[W init.cpp:767] Warning: nvfuser is no longer supported in torch script, use _jit_set_nvfuser_enabled is deprecated and a no-op (function operator())\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:253] Rank 0 has data parallel group : [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:259] Rank 0 has combined group of data parallel and context parallel : [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:264] All data parallel group ranks with context parallel combined: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:267] Ranks 0 has data parallel rank: 0\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:284] Rank 0 has context parallel group: [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:287] All context parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:288] Ranks 0 has context parallel rank: 0\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:299] Rank 0 has model parallel group: [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:300] All model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:310] Rank 0 has tensor model parallel group: [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:314] All tensor model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:315] Rank 0 has tensor model parallel rank: 0\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:344] Rank 0 has pipeline model parallel group: [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:356] Rank 0 has embedding group: [0]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:362] All pipeline model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:363] Rank 0 has pipeline model parallel rank 0\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:364] All embedding group ranks: [[0]]\n", + "[NeMo I 2024-04-24 19:43:09 megatron_init:365] Rank 0 has embedding rank: 0\n", + "24-04-24 19:43:09 - PID:1361 - rank:(0, 0, 0, 0) - microbatches.py:39 - INFO - setting number of micro-batches to constant 1\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 19:43:09 megatron_base_model:1183] hidden_size not found in {'precision': 16, 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': False, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': None}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': None, 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 
'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0', 'ckpt_path': None}. Set this in model_parallel_config if using pipeline parallelism.\n", + "[NeMo I 2024-04-24 19:43:09 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 19:43:09 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:09 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 19:43:09 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:10 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:10 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:11 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:11 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. 
Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:13 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:13 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:15 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:15 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:17 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:17 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:19 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 19:43:19 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:20 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 19:43:20 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:21 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 19:43:21 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:21 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 19:43:21 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 19:43:21 utils:108] Getting module=, cls=\n", + "Loaded ViT-bigG-14 model config.\n", + "Loading pretrained ViT-bigG-14 weights (laion2b_s39b_b160k).\n", + "Initialized embedder #0: FrozenCLIPEmbedder with 123060480 params. Trainable: False\n", + "Initialized embedder #1: FrozenOpenCLIPEmbedder2 with 694659841 params. Trainable: False\n", + "Initialized embedder #2: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #3: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #4: ConcatTimestepEmbedderND with 0 params. 
Trainable: False\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "Working with z of shape (1, 4, 32, 32) = 4096 dimensions.\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "[NeMo I 2024-04-24 19:43:53 nlp_overrides:1155] Model MegatronDiffusionEngine was successfully restored from /quantization/sdxl_base.nemo.\n", + "Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", + "----------------------------------------------------------------------------------------------------\n", + "distributed_backend=nccl\n", + "All distributed processes registered. Starting with 1 processes\n", + "----------------------------------------------------------------------------------------------------\n", + "\n", + "Building TensorRT engine for /quantization/nemo_onnx/unet.onnx: /quantization/int8_unet_xl.plan\n", + "[I] Configuring with profiles:[\n", + " Profile 0:\n", + " {x [min=(1, 4, 128, 128), opt=(4, 4, 128, 128), max=(8, 4, 128, 128)],\n", + " y [min=(1, 2816), opt=(4, 2816), max=(8, 2816)],\n", + " timesteps [min=(1,), opt=(4,), max=(8,)],\n", + " context [min=(1, 80, 2048), opt=(4, 80, 2048), max=(8, 80, 2048)]}\n", + " ]\n", + "\u001B[38;5;11m[W] It looks like some layers in the network have compute precision set, but precision constraints were not enabled. \n", + " Precision constraints must be set to 'prefer' or 'obey' for layer compute precision to take effect. \n", + " Note: Layers and their requested precisions were: {'/input_blocks.0/input_blocks.0.0/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.0/input_blocks.0.0/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.0/input_blocks.0.0/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.0/input_blocks.0.0/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.1/input_blocks.1.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.1/input_blocks.1.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.1/input_blocks.1.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.1/input_blocks.1.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.1/input_blocks.1.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.1/input_blocks.1.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.1/input_blocks.1.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.1/input_blocks.1.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.2/input_blocks.2.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.2/input_blocks.2.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.2/input_blocks.2.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.2/input_blocks.2.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.2/input_blocks.2.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.2/input_blocks.2.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.2/input_blocks.2.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.2/input_blocks.2.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.3/input_blocks.3.0/op/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.3/input_blocks.3.0/op/input_quantizer/DequantizeLinear': 'INT8', 
… [remaining per-layer precision entries elided: every */input_quantizer and */weight_quantizer QuantizeLinear node is requested as 'FLOAT' and every matching DequantizeLinear as 'INT8', with the same pattern repeating over the op, skip_connection, in_layers and out_layers convolutions and every attn1/attn2/ff projection in input_blocks.3 through input_blocks.8] …
'/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/input_blocks.8/input_blocks.8.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', 
'/middle_block/middle_block.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', 
'/middle_block/middle_block.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/middle_block/middle_block.1/transformer_blocks.2/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', 
'/middle_block/middle_block.1/transformer_blocks.3/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', 
'/middle_block/middle_block.1/transformer_blocks.4/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', 
'/middle_block/middle_block.1/transformer_blocks.5/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/middle_block/middle_block.1/transformer_blocks.6/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', 
'/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', 
'/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.2/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.2/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.2/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.2/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.2/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.2/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/middle_block/middle_block.2/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/middle_block/middle_block.2/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.0/output_blocks.0.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.1/output_blocks.1.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.2/output_blocks.2.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.2/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.3/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.4/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.5/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.5/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.6/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.7/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.7/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.8/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.1/transformer_blocks.9/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.2/conv/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.2/conv/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.2/output_blocks.2.2/conv/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.2/output_blocks.2.2/conv/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.3/output_blocks.3.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.4/output_blocks.4.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.4/output_blocks.4.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.4/output_blocks.4.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.4/output_blocks.4.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_q/input_quantizer/QuantizeLinear': 
'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.0/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn1/to_v/weight_quantizer/DequantizeLinear': 'INT8', 
'/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_q/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_q/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_q/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_q/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_k/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_k/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_k/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_k/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_v/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_v/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_v/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/attn2/to_v/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/ff/net/net.0/proj/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/ff/net/net.0/proj/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/ff/net/net.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/ff/net/net.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.1/transformer_blocks.1/ff/net/net.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.2/conv/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.2/conv/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.5/output_blocks.5.2/conv/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.5/output_blocks.5.2/conv/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.6/output_blocks.6.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.6/output_blocks.6.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.6/output_blocks.6.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.6/output_blocks.6.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.6/output_blocks.6.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.6/output_blocks.6.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.6/output_blocks.6.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.6/output_blocks.6.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.6/output_blocks.6.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', 
'/output_blocks.6/output_blocks.6.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.6/output_blocks.6.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.6/output_blocks.6.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.7/output_blocks.7.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.7/output_blocks.7.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/in_layers/in_layers.1/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/in_layers/in_layers.1/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/in_layers/in_layers.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/in_layers/in_layers.1/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/out_layers/out_layers.2/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/out_layers/out_layers.2/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/out_layers/out_layers.2/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/out_layers/out_layers.2/weight_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/skip_connection/input_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/skip_connection/input_quantizer/DequantizeLinear': 'INT8', '/output_blocks.8/output_blocks.8.0/skip_connection/weight_quantizer/QuantizeLinear': 'FLOAT', '/output_blocks.8/output_blocks.8.0/skip_connection/weight_quantizer/DequantizeLinear': 'INT8', '/out/out.1/input_quantizer/QuantizeLinear': 'FLOAT', '/out/out.1/input_quantizer/DequantizeLinear': 'INT8', '/out/out.1/weight_quantizer/QuantizeLinear': 'FLOAT', '/out/out.1/weight_quantizer/DequantizeLinear': 'INT8'}\u001B[0m\n", + "\u001B[38;5;14m[I] Building engine with configuration:\n", + " Flags | [FP16, INT8]\n", + " Engine Capability | EngineCapability.DEFAULT\n", + " Memory Pools | [WORKSPACE: 48685.38 MiB, TACTIC_DRAM: 48685.38 MiB]\n", + " Tactic Sources | [CUBLAS, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]\n", + " Profiling Verbosity | ProfilingVerbosity.DETAILED\n", + " Preview Features | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\u001B[0m\n", + "\u001B[38;5;10m[I] Finished engine building in 881.973 seconds\u001B[0m\n", + "[I] Saving engine to /quantization/int8_unet_xl.plan\n" + ] + } + ], + "source": [ + "! 
torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_quantize.py model.restore_from_path=$WORKDIR/sdxl_base.nemo onnx_export.onnx_dir=$WORKDIR/nemo_onnx quantize.quantized_ckpt=$WORKDIR/nemo.unet.state_dict.nemo.pt trt_export.trt_engine=$WORKDIR/int8_unet_xl.plan\n" +   ] +  }, +  { +   "cell_type": "markdown", +   "id": "f97d6bfa", +   "metadata": {}, +   "source": [ +    "### Build end-to-end TRT inference pipeline\n", +    "In order to run end-to-end inference with the quantized U-Net engine, we need to export and build engines for the other components in SDXL, which include the VAE and the two CLIP encoders. The following script restores SDXL from the `nemo` checkpoint and saves the corresponding engine files to `infer.out_path`." +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 2, +   "id": "2e8b7742", +   "metadata": {}, +   "outputs": [ +    { +     "name": "stdout", +     "output_type": "stream", +     "text": [ +      "FlashAttention Installed\n", +      "[NeMo W 2024-04-24 22:17:42 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n", +      " See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n", +      " ret = run_job(\n", +      " \n", +      "[NeMo W 2024-04-24 22:17:42 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!\n", +      " \n", +      "Using 16bit Automatic Mixed Precision (AMP)\n", +      "GPU available: True (cuda), used: True\n", +      "TPU available: False, using: 0 TPU cores\n", +      "IPU available: False, using: 0 IPUs\n", +      "HPU available: False, using: 0 HPUs\n", +      "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", +      "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", +      "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", +      "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", +      "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", +      "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", +      "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1183] hidden_size not found in {'precision': 16, 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': False, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': None}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': None, 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 
'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0', 'ckpt_path': None}. Set this in model_parallel_config if using pipeline parallelism.\n", + "[W init.cpp:767] Warning: nvfuser is no longer supported in torch script, use _jit_set_nvfuser_enabled is deprecated and a no-op (function operator())\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:253] Rank 0 has data parallel group : [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:259] Rank 0 has combined group of data parallel and context parallel : [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:264] All data parallel group ranks with context parallel combined: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:267] Ranks 0 has data parallel rank: 0\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:284] Rank 0 has context parallel group: [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:287] All context parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:288] Ranks 0 has context parallel rank: 0\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:299] Rank 0 has model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:300] All model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:310] Rank 0 has tensor model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:314] All tensor model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:315] Rank 0 has tensor model parallel rank: 0\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:344] Rank 0 has pipeline model parallel group: [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:356] Rank 0 has embedding group: [0]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:362] All pipeline model parallel group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:363] Rank 0 has pipeline model parallel rank 0\n", + "[NeMo I 2024-04-24 22:17:50 
megatron_init:364] All embedding group ranks: [[0]]\n", + "[NeMo I 2024-04-24 22:17:50 megatron_init:365] Rank 0 has embedding rank: 0\n", + "24-04-24 22:17:50 - PID:703 - rank:(0, 0, 0, 0) - microbatches.py:39 - INFO - setting number of micro-batches to constant 1\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tensor_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: context_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: virtual_pipeline_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: sequence_parallel in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: expert_model_parallel_size in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: use_cpu_initialization in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: gradient_accumulation_fusion in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_ag in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_split_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_atomic_rs in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_wgrad in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: tp_comm_bulk_dgrad in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: finalize_model_grads_func in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: overlap_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: batch_p2p_comm in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: pipeline_model_parallel_split_rank in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_num_layers in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: _cpu_offloading_context in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_activations in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: cpu_offloading_weights in its cfg. Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1172] The model: MegatronDiffusionEngine() does not have field.name: barrier_with_L1_time in its cfg. 
Add this key to cfg or config_mapping to make to make it configurable.\n", + "[NeMo W 2024-04-24 22:17:50 megatron_base_model:1183] hidden_size not found in {'precision': 16, 'micro_batch_size': 1, 'global_batch_size': 1, 'scale_factor': 0.13025, 'disable_first_stage_autocast': True, 'is_legacy': False, 'inductor': False, 'capture_cudagraph_iters': -1, 'scale_by_std': False, 'channels_last': False, 'fsdp': False, 'fsdp_set_buffer_dtype': None, 'precache_mode': None, 'loss_fn_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.loss.StandardDiffusionLoss', 'sigma_sampler': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.sigma_sampling.DiscreteSampling', 'num_idx': 1000, 'discretization': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}}, 'denoiser_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser.DiscreteDenoiser', 'num_idx': 1000, 'weighting_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_weighting.EpsWeighting'}, 'scaling_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser_scaling.EpsScaling'}, 'discretization_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.discretizer.LegacyDDPMDiscretization'}}, 'unet_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.openaimodel.UNetModel', 'from_NeMo': False, 'adm_in_channels': 2816, 'num_classes': 'sequential', 'use_checkpoint': False, 'in_channels': 4, 'out_channels': 4, 'model_channels': 320, 'attention_resolutions': [4, 2], 'num_res_blocks': 2, 'channel_mult': [1, 2, 4], 'num_head_channels': 64, 'use_spatial_transformer': True, 'use_linear_in_transformer': True, 'transformer_depth': [1, 2, 10], 'context_dim': 2048, 'image_size': 64, 'legacy': False, 'use_flash_attention': True, 'from_pretrained': None}, 'first_stage_config': {'_target_': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper', 'from_pretrained': None, 'embed_dim': 4, 'monitor': 'val/rec_loss', 'ddconfig': {'attn_type': 'vanilla', 'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}, 'lossconfig': {'target': 'torch.nn.Identity'}}, 'conditioner_config': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner', 'emb_models': [{'is_trainable': False, 'input_key': 'captions', 'ucg_rate': 0.1, 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder', 'layer': 'hidden', 'layer_idx': 11}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'captions', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2', 'arch': 'ViT-bigG-14', 'version': 'laion2b_s39b_b160k', 'freeze': True, 'layer': 'penultimate', 'always_return_pooled': True, 'legacy': False}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'original_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'crop_coords_top_left', 
'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}, {'is_trainable': False, 'ucg_rate': 0.1, 'input_key': 'target_size_as_tuple', 'emb_model': {'_target_': 'nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.ConcatTimestepEmbedderND', 'outdim': 256}}]}, 'data': {'num_workers': 16, 'train': {'dataset_path': ['/datasets/coyo/test.pkl'], 'augmentations': {'resize_smallest_side': 256, 'horizontal_flip': False}, 'filterings': None}, 'webdataset': {'infinite_sampler': False, 'local_root_path': '/datasets/coyo'}}, 'seed': 1234, 'resume_from_checkpoint': None, 'apex_transformer_log_level': 30, 'gradient_as_bucket_view': True, 'optim': {'name': 'fused_adam', 'lr': 0.0001, 'weight_decay': 0.0, 'betas': [0.9, 0.999], 'sched': {'name': 'WarmupHoldPolicy', 'warmup_steps': 10000, 'hold_steps': 10000000000000}}, 'nsys_profile': {'enabled': False, 'start_step': 10, 'end_step': 10, 'ranks': [0], 'gen_shape': False}, 'target': 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine.MegatronDiffusionEngine', 'nemo_version': '1.23.0rc0', 'ckpt_path': None}. Set this in model_parallel_config if using pipeline parallelism.\n", + "[NeMo I 2024-04-24 22:17:50 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:17:50 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:50 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:17:50 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:51 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:17:51 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:53 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:17:53 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:54 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:17:54 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:56 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:17:56 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. 
Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:17:58 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:17:58 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:18:00 attention:436] constructing SpatialTransformer of depth 10 w/ 1280 channels and 20 heads\n", + "[NeMo I 2024-04-24 22:18:00 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 10. Setting context_dim to [2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:18:02 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:18:02 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:18:02 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:18:02 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:18:02 attention:436] constructing SpatialTransformer of depth 2 w/ 640 channels and 10 heads\n", + "[NeMo I 2024-04-24 22:18:02 attention:445] WARNING: SpatialTransformer: Found context dims [2048] of depth 1, which does not match the specified 'depth' of 2. Setting context_dim to [2048, 2048] now.\n", + "[NeMo I 2024-04-24 22:18:02 utils:108] Getting module=, cls=\n", + "Loaded ViT-bigG-14 model config.\n", + "Loading pretrained ViT-bigG-14 weights (laion2b_s39b_b160k).\n", + "Initialized embedder #0: FrozenCLIPEmbedder with 123060480 params. Trainable: False\n", + "Initialized embedder #1: FrozenOpenCLIPEmbedder2 with 694659841 params. Trainable: False\n", + "Initialized embedder #2: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #3: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "Initialized embedder #4: ConcatTimestepEmbedderND with 0 params. Trainable: False\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "Working with z of shape (1, 4, 32, 32) = 4096 dimensions.\n", + "making attention of type 'vanilla' with 512 in_channels\n", + "[NeMo I 2024-04-24 22:18:35 nlp_overrides:1155] Model MegatronDiffusionEngine was successfully restored from /quantization/sdxl_base.nemo.\n", + "Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", + "----------------------------------------------------------------------------------------------------\n", + "distributed_backend=nccl\n", + "All distributed processes registered. Starting with 1 processes\n", + "----------------------------------------------------------------------------------------------------\n", + "\n", + "[NeMo W 2024-04-24 22:18:36 nemo_logging:349] /opt/NeMo/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py:1184: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", + " assert y.shape[0] == x.shape[0]\n", + " \n", + "[NeMo W 2024-04-24 22:18:36 nemo_logging:349] /opt/NeMo/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py:209: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " assert x.shape[1] == self.channels\n", + " \n", + "[NeMo W 2024-04-24 22:18:37 nemo_logging:349] /opt/NeMo/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py:145: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " assert x.shape[1] == self.channels\n", + " \n", + "[NeMo W 2024-04-24 22:22:17 nemo_logging:349] /opt/NeMo/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/model.py:172: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " w_ = w_ * (int(c) ** (-0.5))\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/torch/onnx/utils.py:2095: UserWarning: Provided key z_pooled for dynamic axes is not a valid input/output name\n", + " warnings.warn(\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/transformers/modeling_attn_mask_utils.py:86: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if input_shape[-1] > 1 or self.sliding_window is not None:\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/transformers/modeling_attn_mask_utils.py:162: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if past_key_values_length > 0:\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/transformers/models/clip/modeling_clip.py:281: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/transformers/models/clip/modeling_clip.py:289: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", + " if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):\n", + " \n", + "[NeMo W 2024-04-24 22:22:19 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/transformers/models/clip/modeling_clip.py:321: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):\n", + " \n", + "[NeMo W 2024-04-24 22:22:27 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/torch/onnx/symbolic_opset9.py:5859: UserWarning: Exporting aten::index operator of advanced indexing in opset 14 is achieved by combination of multiple ONNX operators, including Reshape, Transpose, Concat, and Gather. If indices include negative values, the exported graph will produce incorrect results.\n", + " warnings.warn(\n", + " \n", + "Building TensorRT engine for /quantization/onnx/unet_xl/unet_xl.onnx: /quantization/plan/unet_xl.plan\n", + "[I] Configuring with profiles:[\n", + " Profile 0:\n", + " {x [min=(1, 4, 128, 128), opt=(2, 4, 128, 128), max=(8, 4, 128, 128)],\n", + " y [min=(1, 2816), opt=(2, 2816), max=(8, 2816)],\n", + " timesteps [min=(1,), opt=(2,), max=(8,)],\n", + " context [min=(1, 80, 2048), opt=(2, 80, 2048), max=(8, 80, 2048)]}\n", + " ]\n", + "\u001B[38;5;14m[I] Building engine with configuration:\n", + " Flags | [FP16]\n", + " Engine Capability | EngineCapability.DEFAULT\n", + " Memory Pools | [WORKSPACE: 48685.38 MiB, TACTIC_DRAM: 48685.38 MiB]\n", + " Tactic Sources | [CUBLAS, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]\n", + " Profiling Verbosity | ProfilingVerbosity.DETAILED\n", + " Preview Features | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\u001B[0m\n", + "\u001B[38;5;11m[W] Detected layernorm nodes in FP16.\u001B[0m\n", + "\u001B[38;5;11m[W] Running layernorm after self-attention in FP16 may cause overflow. 
Exporting the model to the latest available ONNX opset (later than opset 17) to use the INormalizationLayer, or forcing layernorm layers to run in FP32 precision can help with preserving accuracy.\u001B[0m\n", + "\u001B[38;5;10m[I] Finished engine building in 553.937 seconds\u001B[0m\n", + "[I] Saving engine to /quantization/plan/unet_xl.plan\n", + "Building TensorRT engine for /quantization/onnx/vae/vae.onnx: /quantization/plan/vae.plan\n", + "[I] Configuring with profiles:[\n", + " Profile 0:\n", + " {z [min=(1, 4, 128, 128), opt=(2, 4, 128, 128), max=(8, 4, 128, 128)]}\n", + " ]\n", + "\u001B[38;5;14m[I] Building engine with configuration:\n", + " Flags | []\n", + " Engine Capability | EngineCapability.DEFAULT\n", + " Memory Pools | [WORKSPACE: 48685.38 MiB, TACTIC_DRAM: 48685.38 MiB]\n", + " Tactic Sources | [CUBLAS, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]\n", + " Profiling Verbosity | ProfilingVerbosity.DETAILED\n", + " Preview Features | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\u001B[0m\n", + "\u001B[38;5;10m[I] Finished engine building in 266.743 seconds\u001B[0m\n", + "[I] Saving engine to /quantization/plan/vae.plan\n", + "Building TensorRT engine for /quantization/onnx/clip1/clip1.onnx: /quantization/plan/clip1.plan\n", + "\u001B[38;5;11m[W] ModelImporter.cpp:409: Make sure input input_ids has Int64 binding.\u001B[0m\n", + "[I] Configuring with profiles:[\n", + " Profile 0:\n", + " {input_ids [min=(1, 77), opt=(2, 77), max=(8, 77)]}\n", + " ]\n", + "\u001B[38;5;14m[I] Building engine with configuration:\n", + " Flags | [FP16]\n", + " Engine Capability | EngineCapability.DEFAULT\n", + " Memory Pools | [WORKSPACE: 48685.38 MiB, TACTIC_DRAM: 48685.38 MiB]\n", + " Tactic Sources | [CUBLAS, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]\n", + " Profiling Verbosity | ProfilingVerbosity.DETAILED\n", + " Preview Features | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\u001B[0m\n", + "\u001B[38;5;10m[I] Finished engine building in 16.988 seconds\u001B[0m\n", + "[I] Saving engine to /quantization/plan/clip1.plan\n", + "Building TensorRT engine for /quantization/onnx/clip2/clip2.onnx: /quantization/plan/clip2.plan\n", + "[I] Configuring with profiles:[\n", + " Profile 0:\n", + " {input_ids [min=(1, 77), opt=(2, 77), max=(8, 77)]}\n", + " ]\n", + "\u001B[38;5;14m[I] Building engine with configuration:\n", + " Flags | [FP16]\n", + " Engine Capability | EngineCapability.DEFAULT\n", + " Memory Pools | [WORKSPACE: 48685.38 MiB, TACTIC_DRAM: 48685.38 MiB]\n", + " Tactic Sources | [CUBLAS, CUDNN, EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]\n", + " Profiling Verbosity | ProfilingVerbosity.DETAILED\n", + " Preview Features | [FASTER_DYNAMIC_SHAPES_0805, DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]\u001B[0m\n", + "\u001B[38;5;10m[I] Finished engine building in 72.535 seconds\u001B[0m\n", + "[I] Saving engine to /quantization/plan/clip2.plan\n" + ] + } + ], + "source": [ + "! 
torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_export.py model.restore_from_path=$WORKDIR/sdxl_base.nemo infer.out_path=$WORKDIR" + ] + }, + { + "cell_type": "markdown", + "id": "e7eb2d03", + "metadata": {}, + "source": [ + "### Run TRT inference pipeline with original engines" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25737be2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FlashAttention Installed\n", + "[NeMo W 2024-04-24 22:46:11 nemo_logging:349] /usr/local/lib/python3.10/dist-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.\n", + " See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.\n", + " ret = run_job(\n", + " \n", + "Loading TensorRT engine: /quantization/plan/unet_xl.plan\n", + "[I] Loading bytes from /quantization/plan/unet_xl.plan\n", + "unet_xl trt engine loaded successfully\n", + "Loading TensorRT engine: /quantization/plan/vae.plan\n", + "[I] Loading bytes from /quantization/plan/vae.plan\n", + "vae trt engine loaded successfully\n", + "Loading TensorRT engine: /quantization/plan/clip1.plan\n", + "[I] Loading bytes from /quantization/plan/clip1.plan\n", + "clip1 trt engine loaded successfully\n", + "Loading TensorRT engine: /quantization/plan/clip2.plan\n", + "[I] Loading bytes from /quantization/plan/clip2.plan\n", + "clip2 trt engine loaded successfully\n", + "[NeMo I 2024-04-24 22:46:17 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:46:17 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:46:17 utils:108] Getting module=, cls=\n", + "############################## Sampling setting ##############################\n", + "Sampler: EulerEDMSampler\n", + "Discretization: LegacyDDPMDiscretization\n", + "Guider: VanillaCFG\n", + "Sampling with EulerEDMSampler for 41 steps: 98%|▉| 40/41 [00:24<00:00, 1.60it/\n", + "This batch takes 27.204587490297854s\n", + "[NeMo I 2024-04-24 22:46:45 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:46:45 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:46:45 utils:108] Getting module=, cls=\n", + "############################## Sampling setting ##############################\n", + "Sampler: EulerEDMSampler\n", + "Discretization: LegacyDDPMDiscretization\n", + "Guider: VanillaCFG\n", + "Sampling with EulerEDMSampler for 41 steps: 98%|▉| 40/41 [00:25<00:00, 1.57it/\n", + "This batch takes 25.58329666685313s\n", + "[NeMo I 2024-04-24 22:47:14 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:47:14 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:47:14 utils:108] Getting module=, cls=\n", + "############################## Sampling setting ##############################\n", + "Sampler: EulerEDMSampler\n", + "Discretization: LegacyDDPMDiscretization\n", + "Guider: VanillaCFG\n", + "Sampling with EulerEDMSampler for 41 steps: 98%|▉| 40/41 [00:25<00:00, 1.55it/\n", + "This batch takes 25.87396944500506s\n", + "[NeMo I 2024-04-24 22:47:44 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:47:44 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:47:44 utils:108] Getting module=, cls=\n", + "############################## Sampling setting ##############################\n", + "Sampler: EulerEDMSampler\n", + "Discretization: LegacyDDPMDiscretization\n", + "Guider: VanillaCFG\n", + "Sampling with EulerEDMSampler 
for 41 steps: 98%|▉| 40/41 [00:25<00:00, 1.54it/\n", + "This batch takes 26.03419069480151s\n", + "[NeMo I 2024-04-24 22:48:13 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:48:13 utils:108] Getting module=, cls=\n", + "[NeMo I 2024-04-24 22:48:13 utils:108] Getting module=, cls=\n", + "############################## Sampling setting ##############################\n", + "Sampler: EulerEDMSampler\n", + "Discretization: LegacyDDPMDiscretization\n", + "Guider: VanillaCFG\n", + "Sampling with EulerEDMSampler for 41 steps: 71%|▋| 29/41 [00:18<00:07, 1.52it/" + ] + } + ], + "source": [ + "! torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py \\\n", + " out_path=$WORKDIR/trt_output_fp16 \\\n", + " unet_xl=$WORKDIR/plan/unet_xl.plan \\\n", + " vae=$WORKDIR/plan/vae.plan \\\n", + " clip1=$WORKDIR/plan/clip1.plan \\\n", + " clip2=$WORKDIR/plan/clip2.plan\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "d11bbe7d", + "metadata": {}, + "source": [ + "### Run TRT inference pipeline with quantized U-Net engine" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3f2263b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "^C\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 3109, in _dep_map\n", + "[2024-04-24 19:42:46,104] torch.distributed.elastic.agent.server.api: [WARNING] Received Signals.SIGINT death signal, shutting down workers\n", + "[2024-04-24 19:42:46,104] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 1300 closing signal SIGINT\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 2902, in __getattr__\n", + " raise AttributeError(attr)\n", + "AttributeError: _DistInfoDistribution__dep_map\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py\", line 25, in \n", + " from nemo.collections.multimodal.modules.stable_diffusion.diffusionmodules.denoiser import DiscreteDenoiser\n", + " File \"/opt/NeMo/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/denoiser.py\", line 17, in \n", + " from nemo.collections.multimodal.parts.stable_diffusion.utils import append_dims, instantiate_from_config\n", + " File \"/opt/NeMo/nemo/collections/multimodal/parts/stable_diffusion/utils.py\", line 25, in \n", + " from nemo.utils import logging\n", + " File \"/opt/NeMo/nemo/utils/__init__.py\", line 31, in \n", + " from nemo.utils.lightning_logger_patch import add_memory_handlers_to_pl_logger\n", + " File \"/opt/NeMo/nemo/utils/lightning_logger_patch.py\", line 18, in \n", + " import pytorch_lightning as pl\n", + " File \"/usr/local/lib/python3.10/dist-packages/pytorch_lightning/__init__.py\", line 27, in \n", + " from pytorch_lightning.callbacks import Callback # noqa: E402\n", + " File \"/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/__init__.py\", line 14, in \n", + " from pytorch_lightning.callbacks.batch_size_finder import BatchSizeFinder\n", + " File \"/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/batch_size_finder.py\", line 26, in \n", + " from pytorch_lightning.callbacks.callback import Callback\n", + " File \"/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/callback.py\", line 22, 
in \n", + " from pytorch_lightning.utilities.types import STEP_OUTPUT\n", + " File \"/usr/local/lib/python3.10/dist-packages/pytorch_lightning/utilities/types.py\", line 41, in \n", + " from torchmetrics import Metric\n", + " File \"/usr/local/lib/python3.10/dist-packages/torchmetrics/__init__.py\", line 22, in \n", + " from torchmetrics import functional # noqa: E402\n", + " File \"/usr/local/lib/python3.10/dist-packages/torchmetrics/functional/__init__.py\", line 121, in \n", + " from torchmetrics.functional.text._deprecated import _bleu_score as bleu_score\n", + " File \"/usr/local/lib/python3.10/dist-packages/torchmetrics/functional/text/__init__.py\", line 49, in \n", + " if _TRANSFORMERS_GREATER_EQUAL_4_4:\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning_utilities/core/imports.py\", line 164, in __bool__\n", + " self._check_available()\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning_utilities/core/imports.py\", line 158, in _check_available\n", + " self._check_requirement()\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning_utilities/core/imports.py\", line 132, in _check_requirement\n", + " pkg_resources.require(self.requirement)\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 968, in require\n", + " needed = self.resolve(parse_requirements(requirements))\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 834, in resolve\n", + " new_requirements = dist.requires(req.extras)[::-1]\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 2822, in requires\n", + " dm = self._dep_map\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 3111, in _dep_map\n", + " self.__dep_map = self._compute_dependencies()\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 3133, in _compute_dependencies\n", + " dm[s_extra] = [r for r in reqs_for_extra(extra) if r not in common]\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 3133, in \n", + " dm[s_extra] = [r for r in reqs_for_extra(extra) if r not in common]\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/__init__.py\", line 3125, in reqs_for_extra\n", + " if not req.marker or req.marker.evaluate({'extra': extra}):\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/_vendor/packaging/markers.py\", line 252, in evaluate\n", + " return _evaluate_markers(self._markers, current_environment)\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/_vendor/packaging/markers.py\", line 164, in _evaluate_markers\n", + " return any(all(item) for item in groups)\n", + " File \"/usr/local/lib/python3.10/dist-packages/pkg_resources/_vendor/packaging/markers.py\", line 164, in \n", + " return any(all(item) for item in groups)\n", + "KeyboardInterrupt\n" + ] + } + ], + "source": [ + "! 
torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_trt_inference.py \\\n", + " out_path=$WORKDIR/trt_output_int8 \\\n", + " unet_xl=$WORKDIR/int8_unet_xl.plan \\\n", + " vae=$WORKDIR/plan/vae.plan \\\n", + " clip1=$WORKDIR/plan/clip1.plan \\\n", + " clip2=$WORKDIR/plan/clip2.plan" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c48c21dd", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file From a9dac0edabba2297fd6e1d521cb297b1318a8df8 Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Tue, 30 Apr 2024 11:10:25 -0700 Subject: [PATCH 27/30] Fix #8948, allow preprocessor to be stream captured to a cuda graph when doing per_feature normalization (#8964) * Do feature normalization in parallel, rather than via a for loop. At large batch sizes, this becomes a bottleneck, taking about 9 ms at batch size 16, for example. See issue #8948. Signed-off-by: Daniel Galvez * Remove all instances of cudaStreamSynchronize() in the featurizer when doing "per_feature" normalization. With this change, we can now do stream capture to a cuda graph on the preprocessor. This is bound to increase performance significantly. Even at batch size 16, the GPU is idle about 50% of the time because these kernels finish so fast. Signed-off-by: Daniel Galvez * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix crash in CPU mode. Signed-off-by: Daniel Galvez * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Daniel Galvez Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../asr/parts/preprocessing/features.py | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/nemo/collections/asr/parts/preprocessing/features.py b/nemo/collections/asr/parts/preprocessing/features.py index 67813f3e66d2..dccc81b1816c 100644 --- a/nemo/collections/asr/parts/preprocessing/features.py +++ b/nemo/collections/asr/parts/preprocessing/features.py @@ -60,17 +60,33 @@ def normalize_batch(x, seq_len, normalize_type): x_mean = None x_std = None if normalize_type == "per_feature": - x_mean = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device) - x_std = torch.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype, device=x.device) - for i in range(x.shape[0]): - if x[i, :, : seq_len[i]].shape[1] == 1: - raise ValueError( - "normalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result " - "in torch.std() returning nan. Make sure your audio length has enough samples for a single " - "feature (ex. at least `hop_length` for Mel Spectrograms)." - ) - x_mean[i, :] = x[i, :, : seq_len[i]].mean(dim=1) - x_std[i, :] = x[i, :, : seq_len[i]].std(dim=1) + batch_size = x.shape[0] + max_time = x.shape[2] + + # When doing stream capture to a graph, item() is not allowed + # becuase it calls cudaStreamSynchronize(). Therefore, we are + # sacrificing some error checking when running with cuda graphs. 
+ if ( + torch.cuda.is_available() + and not torch.cuda.is_current_stream_capturing() + and torch.any(seq_len == 1).item() + ): + raise ValueError( + "normalize_batch with `per_feature` normalize_type received a tensor of length 1. This will result " + "in torch.std() returning nan. Make sure your audio length has enough samples for a single " + "feature (ex. at least `hop_length` for Mel Spectrograms)." + ) + time_steps = torch.arange(max_time, device=x.device).unsqueeze(0).expand(batch_size, max_time) + valid_mask = time_steps < seq_len.unsqueeze(1) + x_mean_numerator = torch.where(valid_mask.unsqueeze(1), x, 0.0).sum(axis=2) + x_mean_denominator = valid_mask.sum(axis=1) + x_mean = x_mean_numerator / x_mean_denominator.unsqueeze(1) + + # Subtract 1 in the denominator to correct for the bias. + x_std = torch.sqrt( + torch.sum(torch.where(valid_mask.unsqueeze(1), x - x_mean.unsqueeze(2), 0.0) ** 2, axis=2) + / (x_mean_denominator.unsqueeze(1) - 1.0) + ) # make sure x_std is not zero x_std += CONSTANT return (x - x_mean.unsqueeze(2)) / x_std.unsqueeze(2), x_mean, x_std @@ -457,7 +473,7 @@ def forward(self, x, seq_len, linear_spec=False): # mask to zero any values beyond seq_len in batch, pad to multiple of `pad_to` (for efficiency) max_len = x.size(-1) - mask = torch.arange(max_len).to(x.device) + mask = torch.arange(max_len, device=x.device) mask = mask.repeat(x.size(0), 1) >= seq_len.unsqueeze(1) x = x.masked_fill(mask.unsqueeze(1).type(torch.bool).to(device=x.device), self.pad_value) del mask From fe4b291175bdc2bc2c1f3b919ddd16e1233e9252 Mon Sep 17 00:00:00 2001 From: anteju <108555623+anteju@users.noreply.github.com> Date: Tue, 30 Apr 2024 13:08:15 -0700 Subject: [PATCH 28/30] [ASR] Support for transcription of multi-channel audio for AED models (#9007) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Propagate channel selector for AED model + add channel selector to get_lhotse_dataloader_from config Signed-off-by: Ante Jukić * Included comments Signed-off-by: Ante Jukić * Added unit test Signed-off-by: Ante Jukić --------- Signed-off-by: Ante Jukić --- .../asr/models/aed_multitask_models.py | 1 + .../common/data/lhotse/dataloader.py | 28 +++++ .../common/test_lhotse_dataloading.py | 100 ++++++++++++++++++ 3 files changed, 129 insertions(+) diff --git a/nemo/collections/asr/models/aed_multitask_models.py b/nemo/collections/asr/models/aed_multitask_models.py index 7e20d7a16559..f9413a4dd738 100644 --- a/nemo/collections/asr/models/aed_multitask_models.py +++ b/nemo/collections/asr/models/aed_multitask_models.py @@ -875,6 +875,7 @@ def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLo 'drop_last': False, 'text_field': config.get('text_field', 'answer'), 'lang_field': config.get('lang_field', 'target_lang'), + 'channel_selector': config.get('channel_selector', None), } temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config), inference=True) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 5bb3bf2988ea..eabc3da5d11b 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -89,6 +89,7 @@ class LhotseDataLoadingConfig: seed: int | str = 0 num_workers: int = 0 pin_memory: bool = False + channel_selector: int | str | None = None # 4. Optional Lhotse data augmentation. # a. On-the-fly noise/audio mixing. 
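Illustration (not part of the patch): the per_feature branch added in PATCH 27/30 above replaces the per-utterance Python loop with masked tensor ops, so no .item()/cudaStreamSynchronize() call is needed and the preprocessor can be stream-captured into a CUDA graph. The standalone sketch below reproduces the same masking idea outside of NeMo; the toy shapes and the 1e-5 epsilon are assumptions for illustration, not values taken from the diff.

import torch

# x: (batch, features, time) spectrogram batch; seq_len: number of valid frames per utterance.
# These sizes are hypothetical and only exercise the masking logic.
x = torch.randn(4, 80, 100)
seq_len = torch.tensor([100, 80, 60, 40])

time_steps = torch.arange(x.shape[2]).unsqueeze(0)        # (1, time)
valid_mask = time_steps < seq_len.unsqueeze(1)            # (batch, time), True for real frames
denom = valid_mask.sum(dim=1, keepdim=True)               # valid frames per utterance
mean = torch.where(valid_mask.unsqueeze(1), x, 0.0).sum(dim=2) / denom
var = torch.where(valid_mask.unsqueeze(1), x - mean.unsqueeze(2), 0.0).pow(2).sum(dim=2) / (denom - 1)
std = var.sqrt() + 1e-5                                    # small epsilon keeps std away from zero
x_norm = (x - mean.unsqueeze(2)) / std.unsqueeze(2)        # masked per-feature normalization, no host sync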
@@ -156,6 +157,11 @@ def get_lhotse_dataloader_from_config( # 1. Load a manifest as a Lhotse CutSet. cuts, is_tarred = read_cutset_from_config(config) + # Apply channel selector + if config.channel_selector is not None: + logging.info('Using channel selector %s.', config.channel_selector) + cuts = cuts.map(partial(_select_channel, channel_selector=config.channel_selector)) + # Resample as a safeguard; it's a no-op when SR is already OK cuts = cuts.resample(config.sample_rate) @@ -443,3 +449,25 @@ def _flatten_alt_text(cut) -> list: text_instance.custom = {"text": data.pop("text"), "lang": data.pop("lang"), **data} ans.append(text_instance) return ans + + +def _select_channel(cut, channel_selector: int | str) -> list: + if isinstance(channel_selector, int): + channel_idx = channel_selector + elif isinstance(channel_selector, str): + if channel_selector in cut.custom: + channel_idx = cut.custom[channel_selector] + else: + raise ValueError(f"Channel selector {channel_selector} not found in cut.custom") + + if channel_idx >= cut.num_channels: + raise ValueError( + f"Channel index {channel_idx} is larger than the actual number of channels {cut.num_channels}" + ) + + if cut.num_channels == 1: + # one channel available and channel_idx==0 + return cut + else: + # with_channels only defined on MultiCut + return cut.with_channels(channel_idx) diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index d4b3ad03050e..8eaebb2af68a 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -104,6 +104,51 @@ def nemo_manifest_path(cutset_path: Path): return p +@pytest.fixture(scope="session") +def mc_cutset_path(tmp_path_factory) -> Path: + """10 two-channel utterances of length 1s as a Lhotse CutSet.""" + from lhotse import CutSet, MultiCut + from lhotse.testing.dummies import DummyManifest + + num_examples = 10 # number of examples + num_channels = 2 # number of channels per example + + # create a dummy manifest with single-channel examples + sc_cuts = DummyManifest(CutSet, begin_id=0, end_id=num_examples * num_channels, with_data=True) + mc_cuts = [] + + for n in range(num_examples): + # sources for individual channels + mc_sources = [] + for channel in range(num_channels): + source = sc_cuts[n * num_channels + channel].recording.sources[0] + source.channels = [channel] + mc_sources.append(source) + + # merge recordings + rec = Recording( + sources=mc_sources, + id=f'mc-dummy-recording-{n:02d}', + num_samples=sc_cuts[0].num_samples, + duration=sc_cuts[0].duration, + sampling_rate=sc_cuts[0].sampling_rate, + ) + + # multi-channel cut + cut = MultiCut( + recording=rec, id=f'mc-dummy-cut-{n:02d}', start=0, duration=1.0, channel=list(range(num_channels)) + ) + mc_cuts.append(cut) + + mc_cuts = CutSet.from_cuts(mc_cuts) + + tmp_path = tmp_path_factory.mktemp("data") + p = tmp_path / "mc_cuts.jsonl.gz" + pa = tmp_path / "mc_audio" + mc_cuts.save_audios(pa).to_file(p) + return p + + @pytest.fixture(scope="session") def nemo_tarred_manifest_path(nemo_manifest_path: Path) -> Tuple[str, str]: """10 utterances of length 1s as a NeMo tarred manifest.""" @@ -247,6 +292,61 @@ def test_dataloader_from_lhotse_cuts_cut_into_windows(cutset_path: Path): # exactly 20 cuts were used because we cut 10x 1s cuts into 20x 0.5s cuts +def test_dataloader_from_lhotse_cuts_channel_selector(mc_cutset_path: Path): + # Dataloader without channel selector + config = OmegaConf.create( + { + "cuts_path": 
mc_cutset_path, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config( + config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() + ) + batches = [b for b in dl] + assert len(batches) == 3 + + # 1.0s = 16000 samples, two channels, note the constant duration and batch size + assert batches[0]["audio"].shape == (4, 2, 16000) + assert batches[1]["audio"].shape == (4, 2, 16000) + assert batches[2]["audio"].shape == (2, 2, 16000) + # exactly 10 cuts were used + + # Apply channel selector + for channel_selector in [None, 0, 1]: + + config_cs = OmegaConf.create( + { + "cuts_path": mc_cutset_path, + "channel_selector": channel_selector, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + } + ) + + dl_cs = get_lhotse_dataloader_from_config( + config=config_cs, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() + ) + + for n, b_cs in enumerate(dl_cs): + if channel_selector is None: + # no channel selector, needs to match the original dataset + assert torch.equal(b_cs["audio"], batches[n]["audio"]) + else: + # channel selector, needs to match the selected channel + assert torch.equal(b_cs["audio"], batches[n]["audio"][:, channel_selector, :]) + + @requires_torchaudio def test_dataloader_from_lhotse_shar_cuts(cutset_shar_path: Path): config = OmegaConf.create( From 33494f566e07f4387a35cac06461d12f12f2ac41 Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Tue, 30 Apr 2024 13:12:33 -0700 Subject: [PATCH 29/30] Enable Sequence Packing and Pipeline Parallel in NeVA (#8957) * temp save Signed-off-by: yaoyu-33 * temp save 2 Signed-off-by: yaoyu-33 * update code Signed-off-by: yaoyu-33 * enable seq packing Signed-off-by: yaoyu-33 * fix neva and clip Signed-off-by: yaoyu-33 * Enable parallel seq packing algo and few other fixes Signed-off-by: yaoyu-33 * Pipeline parallel support Signed-off-by: yaoyu-33 * Update data preprocess Signed-off-by: yaoyu-33 * fix few pp issues Signed-off-by: yaoyu-33 * enable sequence packing w/ PP Signed-off-by: yaoyu-33 * Fix cu_seqlens in inputs Signed-off-by: yaoyu-33 * add assert Signed-off-by: yaoyu-33 * Depend on PP to decide whether do padding Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add docstring Signed-off-by: yaoyu-33 * Fix few evaluation issues Signed-off-by: yaoyu-33 * Fix few PP evaluation issues Signed-off-by: yaoyu-33 * Address comments Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * address comments Signed-off-by: yaoyu-33 * Fix license Signed-off-by: yaoyu-33 * Few neva bugs Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Few neva bugs Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../multimodal_llm/neva/conf/neva_config.yaml | 1 + .../multimodal_llm/neva/eval/vqa_science.py | 42 ++- .../multimodal_llm/neva/neva_evaluation.py | 38 +- .../sequence_packing/preprocess_dataset.py | 354 ++++++++++++++++++ .../multimodal/data/neva/neva_dataset.py | 74 +++- .../models/multimodal_llm/neva/neva_model.py | 169 +++++++-- nemo/collections/multimodal/parts/utils.py | 3 +- 
.../language_modeling/megatron_gpt_model.py | 6 +- .../modules/common/text_generation_utils.py | 4 + .../vision/data/megatron/data_samplers.py | 4 +- 10 files changed, 627 insertions(+), 68 deletions(-) create mode 100644 examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml index b41f15c384a8..0caf4beb6a12 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_config.yaml @@ -181,6 +181,7 @@ model: additional_special_tokens: null # ["", "", "", "", "", ""] data: + packed_sequence: False num_workers: 8 dataloader_type: cyclic data_path: diff --git a/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py b/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py index 8ea267ac8116..62d8788067bb 100644 --- a/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py +++ b/examples/multimodal/multimodal_llm/neva/eval/vqa_science.py @@ -79,7 +79,8 @@ def eval_model(args): cfg.base_model_file = args.model_base cfg.inference.images_base_path = args.image_folder cfg.tensor_model_parallel_size = args.tp - cfg.trainer.devices = args.tp + cfg.pipeline_model_parallel_size = args.pp + cfg.trainer.devices = args.tp * args.pp model, image_processor = create_neva_model_and_processor(cfg) length_params: LengthParam = { @@ -102,7 +103,8 @@ def eval_model(args): questions = get_chunk(questions, args.num_chunks, args.chunk_idx) answers_file = os.path.expanduser(args.answers_file) os.makedirs(os.path.dirname(answers_file), exist_ok=True) - ans_file = open(answers_file, "w") + if is_global_rank_zero(): + ans_file = open(answers_file, "w") for i, line in enumerate(tqdm(questions, disable=(not is_global_rank_zero()))): idx = line["id"] question = line['conversations'][0] @@ -123,7 +125,8 @@ def eval_model(args): sampling_params=sampling_params, inference_config=cfg, ) - # import pdb; pdb.set_trace() + if responses is None: + continue outputs = responses[0]["clean_response"] # prompt for answer @@ -139,22 +142,24 @@ def eval_model(args): outputs = responses[0]["clean_response"] outputs = outputs_reasoning + '\n The answer is ' + outputs - ans_id = shortuuid.uuid() - ans_file.write( - json.dumps( - { - "question_id": idx, - "prompt": cur_prompt, - "text": outputs, - "answer_id": ans_id, - "model_id": args.model_path, - "metadata": {}, - } + if is_global_rank_zero(): + ans_id = shortuuid.uuid() + ans_file.write( + json.dumps( + { + "question_id": idx, + "prompt": cur_prompt, + "text": outputs, + "answer_id": ans_id, + "model_id": args.model_path, + "metadata": {}, + } + ) + + "\n" ) - + "\n" - ) - ans_file.flush() - ans_file.close() + ans_file.flush() + if is_global_rank_zero(): + ans_file.close() if __name__ == "__main__": @@ -166,6 +171,7 @@ def eval_model(args): parser.add_argument("--answers-file", type=str, default="answer.jsonl") parser.add_argument("--conv-mode", type=str, default="llava_v0") parser.add_argument("--tp", type=int, default=1) + parser.add_argument("--pp", type=int, default=1) parser.add_argument("--num-chunks", type=int, default=1) parser.add_argument("--chunk-idx", type=int, default=0) parser.add_argument("--temperature", type=float, default=0.2) diff --git a/examples/multimodal/multimodal_llm/neva/neva_evaluation.py b/examples/multimodal/multimodal_llm/neva/neva_evaluation.py index bd3f975e4d54..d9d9a71db757 100644 --- 
a/examples/multimodal/multimodal_llm/neva/neva_evaluation.py +++ b/examples/multimodal/multimodal_llm/neva/neva_evaluation.py @@ -20,6 +20,7 @@ from nemo.collections.multimodal.parts.utils import create_neva_model_and_processor from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam from nemo.core.config import hydra_runner +from nemo.utils.get_rank import is_global_rank_zero try: @@ -121,22 +122,27 @@ def forward_loop(): ) # ============== Quantization End ========================= - results = [] - for response, prompt in zip(responses, final_prompts): - prompt['full_text'] = response["clean_text"] - prompt['text'] = response["clean_response"] - prompt['model_id'] = cfg.neva_model_file - if 'image_path' in prompt: - prompt['image'] = prompt.pop('image_path') - if 'answer_id' not in prompt: - prompt['answer_id'] = 0 - if 'metadata' not in prompt: - prompt['metadata'] = {} - results.append(prompt) - - with open(cfg.output_file, 'w') as f: - for result in results: - f.write(json.dumps(result) + '\n') + # PP middle stages do not yield any responses + if responses is None: + return + + if is_global_rank_zero(): + results = [] + for response, prompt in zip(responses, final_prompts): + prompt['full_text'] = response["clean_text"] + prompt['text'] = response["clean_response"] + prompt['model_id'] = cfg.neva_model_file + if 'image_path' in prompt: + prompt['image'] = prompt.pop('image_path') + if 'answer_id' not in prompt: + prompt['answer_id'] = 0 + if 'metadata' not in prompt: + prompt['metadata'] = {} + results.append(prompt) + + with open(cfg.output_file, 'w') as f: + for result in results: + f.write(json.dumps(result) + '\n') if __name__ == '__main__': diff --git a/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py b/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py new file mode 100644 index 000000000000..ee96ff6489d3 --- /dev/null +++ b/examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py @@ -0,0 +1,354 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Example Usage: +-------------- +This script preprocesses a dataset for the NeMo Multimodal Learning framework. It requires specifying paths for data, images, and the tokenizer model, among other parameters. + +Command: +python examples/multimodal/multimodal_llm/neva/sequence_packing/preprocess_dataset.py \ + --data_path=/path/to/LLaVA-Instruct-150K/llava_v1_5_mix665k_filtered.json \ + --image_folder=/path/to/LLaVA-Instruct-150K/images \ + --tokenizer_path=/path/to/checkpoints/tokenizer_add_special.model \ + --output_dir=/path/to/LLaVA-Instruct-150K/packed_seq_4096_336_v1 \ + --max_seq_length=12288 \ + --packing_algorithm=first_fit_shuffle \ + --hf_vision_encoder=openai/clip-vit-large-patch14-336 \ + --conv_template=v1 \ + --image_aspect_ratio=pad \ + --seed=42 + +Parameters: +----------- +--data_path: Path to the dataset file in JSON format. 
+--image_folder: Directory containing the images referenced in the dataset. +--tokenizer_path: Path to the tokenizer model. +--output_dir: Directory where the processed dataset will be stored. +--max_seq_length: The maximum sequence length of the model. +--packing_algorithm: Algorithm used for packing sequences. Defaults to 'first_fit_shuffle'. +--hf_vision_encoder: The Hugging Face vision encoder to use. Default is 'openai/clip-vit-large-patch14-336'. +--conv_template: Template for data conversion. Default is 'plain', with 'v1' as an alternative. +--image_aspect_ratio: The aspect ratio for processing images. Defaults to 'square', 'pad' for padding to maintain aspect ratio. +--seed: Seed for random operations in 'first_fit_shuffle'. +--hparams_file: Optional path to a YAML file containing additional hyperparameters. +""" + +import collections +import os +import random +import re +from argparse import ArgumentParser +from concurrent.futures import ThreadPoolExecutor, as_completed + +import numpy as np +import torch +from megatron.core.datasets.indexed_dataset import IndexedDataset, IndexedDatasetBuilder, get_bin_path, get_idx_path +from omegaconf import OmegaConf +from torch.utils.data import DataLoader +from tqdm import tqdm + +from nemo.collections.multimodal.data.neva.neva_dataset import make_supervised_data_module +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.utils import logging + +PACKING_ALGOS = ['first_fit_decreasing', 'first_fit_shuffle', 'shuffle_and_pack'] + + +def first_fit(seq_lens, max_seq_length): + """ + Assigns sequences to bins using the First Fit algorithm, by integrating the search + and assignment within the same function. It moves bins that can no longer fit the minimum sequence length + to a completed bins list, avoiding direct modification of the bins list during iteration. + + Parameters: + - seq_lens: List of sequence lengths. + - max_seq_length: Maximum capacity of each bin. + + Returns: + - List of bins with assigned sequence lengths. + """ + min_seq_len = min(seq_lens) # Find the minimum sequence length + completed_bins = [] # Initialize the completed bins list + bins = [] # Initialize the bins list to store active bins + + for s in tqdm(seq_lens): # Iterate through each sequence length + found_bin = False + for i, abin in enumerate(bins[:]): # Iterate over a shallow copy of bins + if sum(abin) + min_seq_len > max_seq_length: + completed_bins.append(abin) # Add to completed bins + bins[i] = 'TO_REMOVE' # Mark this bin for removal + continue + if sum(abin) + s <= max_seq_length: # Check if the bin can fit the sequence + bins[i].append(s) # If so, add the sequence to this bin + found_bin = True + break + + if not found_bin: # If no existing bin can fit the sequence + bins.append([s]) # Open a new bin for this sequence + + # Clean up bins marked 'TO_REMOVE' + bins = [bin for bin in bins if bin != 'TO_REMOVE'] + + # Combine completed bins with any remaining active bins + all_bins = completed_bins + bins + return all_bins + + +def chunkify(lst, n): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i : i + n] + + +def parallel_first_fit(seq_lens, max_seq_length, chunk_size, num_workers): + """ + Assigns sequences to bins in parallel using the First Fit algorithm. + + Parameters: + - seq_lens: List of sequence lengths. + - max_seq_length: Maximum capacity of each bin. + - chunk_size: Size of chunks to divide seq_lens into for parallel processing. 
+ - num_workers: Number of worker threads to use in the ThreadPoolExecutor. + + Returns: + - List of bins with assigned sequence lengths. + """ + # Split the sequence lengths into chunks + chunks = list(chunkify(seq_lens, chunk_size)) + + # Function to process each chunk + def process_chunk(chunk): + return first_fit(chunk, max_seq_length) + + bins = [] # This will hold the final bins + with ThreadPoolExecutor(max_workers=num_workers) as executor: + # Submit each chunk to the executor + futures = [executor.submit(process_chunk, chunk) for chunk in chunks] + + # As each future completes, combine its bins with the final bins + for future in as_completed(futures): + bins.extend(future.result()) + + return bins + + +def first_fit_decreasing(seq_lens, max_seq_length): + """ + Assigns sequences to bins using the First Fit Decreasing algorithm. + + Parameters: + - seq_lens: List of sequence lengths. + - max_seq_length: Maximum capacity of each bin. + + Returns: + - List of bins with assigned sequence lengths. + """ + sorted_seq_lens = sorted(seq_lens, reverse=True) + return first_fit(sorted_seq_lens, max_seq_length) + + +def first_fit_shuffle(seq_lens, max_seq_length): + """ + Assigns sequences to bins using a shuffled version of the First Fit algorithm. + + Parameters: + - seq_lens: List of sequence lengths. + - max_seq_length: Maximum capacity of each bin. + + Returns: + - List of bins with assigned sequence lengths. + """ + shuffled_seq_lens = seq_lens[:] + np.random.shuffle(shuffled_seq_lens) + return parallel_first_fit(shuffled_seq_lens, max_seq_length, 20000, 32) + + +def shuffle_and_pack(seq_lens, max_seq_length): + """ + Assigns sequences to bins with shuffling, trying to maximize the packing efficiency. + After shuffling the sequences, they will be added to one bin in order. Once the bin cannot + take more sequences, we will move on to the next bin. + + Parameters: + - seq_lens: List of sequence lengths. + - max_seq_length: Maximum capacity of each bin. + + Returns: + - List of bins with assigned sequence lengths. + """ + shuffled_seq_lens = np.array(seq_lens) + np.random.shuffle(shuffled_seq_lens) + bins = [[]] + cur_bin_total = 0 + for s in tqdm(shuffled_seq_lens): + if cur_bin_total + s <= max_seq_length: + bins[-1].append(s) + cur_bin_total += s + else: + bins.append([s]) + cur_bin_total = s + return bins + + +def get_args(): + parser = ArgumentParser() + parser.add_argument("--data_path", type=str) + parser.add_argument("--image_folder", type=str) + parser.add_argument("--tokenizer_path", type=str) + parser.add_argument('--output_dir', required=True, type=str) + parser.add_argument("--max_seq_length", default=4096, type=int) + parser.add_argument('--packing_algorithm', default='first_fit_shuffle', choices=PACKING_ALGOS, type=str) + parser.add_argument("--hf_vision_encoder", default='openai/clip-vit-large-patch14-336', type=str) + parser.add_argument("--conv_template", default='plain', type=str) + parser.add_argument("--image_aspect_ratio", default='square', type=str) + parser.add_argument('--seed', default=0, type=int, help="Seed for shuffling, used with first_fit_shuffle.") + parser.add_argument( + "--hparams_file", + type=str, + default=os.path.join(os.path.dirname(__file__), '../conf/llava_config.yaml'), + required=False, + help="Path to the hparams file.", + ) + return parser.parse_args() + + +def pack_sequence(args, seq_lens): + """ + Packs sequences according to the specified algorithm in args. + + Parameters: + - args: Command line arguments. 
+ - seq_lens: List of sequence lengths. + + Returns: + - List of bins with assigned sequence lengths. + """ + np.random.seed(args.seed) + random.seed(args.seed) + + packing_fn = globals()[args.packing_algorithm] + bins = packing_fn(seq_lens, args.max_seq_length) + return bins + + +def main(): + torch.multiprocessing.set_sharing_strategy('file_system') + + args = get_args() + nemo_config = OmegaConf.load(args.hparams_file) + nemo_config.model.mm_cfg.vision_encoder.from_pretrained = args.hf_vision_encoder + nemo_config.model.data.data_path = args.data_path + nemo_config.model.data.image_folder = args.image_folder + nemo_config.model.data.conv_template = args.conv_template + nemo_config.model.data.image_aspect_ratio = args.image_aspect_ratio + + tokenizer = get_nmt_tokenizer(library="sentencepiece", tokenizer_model=args.tokenizer_path,) + train_ds = make_supervised_data_module(tokenizer=tokenizer, model_cfg=nemo_config.model)["train_dataset"] + train_dl = DataLoader(train_ds, num_workers=32, collate_fn=None, shuffle=False) + # Example shape: {'tokens': torch.Size([1, 344]), 'labels': torch.Size([1, 344]), 'image': torch.Size([1, 1, 3, 224, 224])} + + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + logging.info(f"Output directory: {output_dir}") + + prefix_path = f"{output_dir}/packed_seq_dataset" + # Original Datasets to Sequence Lengths Files + builders = {} + for item_dict in tqdm(train_dl, desc="Building indexed datasets"): + item_dict = {k: v[0] for k, v in item_dict.items()} + seq_len = len(item_dict['tokens']) + if seq_len in builders: + builder = builders[seq_len] + else: + builder_path = get_bin_path(f"{prefix_path}/seqlen_{seq_len}") + logging.info(f"Creating builder for sequence length {seq_len} at {builder_path}") + builder = IndexedDatasetBuilder(builder_path, dtype=np.float32, multimodal=True) + builders[seq_len] = builder + builder.add_item(item_dict['tokens']) + builder.add_item(item_dict['labels']) + builder.add_item(item_dict['image'], 1) + builder.end_document() + del item_dict + + for seq_len, builder in builders.items(): + idx_path = get_idx_path(f"{prefix_path}/seqlen_{seq_len}") + logging.info(f"Finalizing builder for sequence length {seq_len} at {idx_path}") + builder.finalize(idx_path) + + # Packing Sequences into Bins + files = os.listdir(f"{output_dir}/packed_seq_dataset") + pattern = rf"seqlen_(\d+).bin" + seq_len_list = [] + for file in files: + match = re.match(pattern, file) + if match: + seq_len = int(match.group(1)) + seq_len_list.append(seq_len) + + aggregated_seq_lens = [] + doc_pop_order = {} + indexed_datasets = {} + for seq_len in seq_len_list: + dataset_path = f"{prefix_path}/seqlen_{seq_len}" + dataset = IndexedDataset(dataset_path, multimodal=True) + aggregated_seq_lens.extend([seq_len] * (len(dataset.document_indices) - 1)) + doc_pop_order[seq_len] = list(np.random.permutation(len(dataset.document_indices) - 1)) + indexed_datasets[seq_len] = dataset + + logging.info("Getting bins") + bins = pack_sequence(args, aggregated_seq_lens) + logging.info("Finished getting bins") + + num_bins = len(bins) + avg_bins_len = sum([len(x) for x in bins]) / num_bins + avg_bins_sum = sum([sum(x) for x in bins]) / num_bins + logging.info(f"Number of bins: {num_bins}, Average bin length: {avg_bins_len}, Average bin sum: {avg_bins_sum}") + + # Reading Sequence Lengths and Packing into New Files + final_builder_path = get_bin_path(f"{prefix_path}") + logging.info(f"Creating final builder at {final_builder_path}") + final_builder = 
IndexedDatasetBuilder(final_builder_path, dtype=np.float32, multimodal=True) + + for assignment in tqdm(bins, desc="Building final dataset"): + packed_items = collections.defaultdict(list) + packed_items["seq_indices"] = [0] + for seq_len in assignment: + doc_index = doc_pop_order[seq_len].pop() + doc_start = indexed_datasets[seq_len].document_indices[doc_index] + doc_end = indexed_datasets[seq_len].document_indices[doc_index + 1] + item_dict = { + "tokens": torch.tensor((indexed_datasets[seq_len][doc_start:doc_end][0])[0]), + "labels": torch.tensor((indexed_datasets[seq_len][doc_start:doc_end][0])[1]), + "image": torch.tensor((indexed_datasets[seq_len][doc_start:doc_end][0])[2]), + } + for key in ["tokens", "labels", "image"]: + packed_items[key].append(item_dict[key]) + packed_items["seq_indices"].append(packed_items["seq_indices"][-1] + seq_len) + + for key in ["seq_indices", "tokens", "labels", "image"]: + final_builder.add_item( + torch.tensor(packed_items[key]) if key == "seq_indices" else torch.cat(packed_items[key], dim=0), + 1 if key == "image" else 0, + ) + final_builder.end_document() + + idx_path = get_idx_path(f"{prefix_path}") + logging.info(f"Finalizing final builder at {idx_path}") + final_builder.finalize(idx_path) + logging.info(f"Number of bins: {num_bins}, Average bin length: {avg_bins_len}, Average bin sum: {avg_bins_sum}") + + +if __name__ == "__main__": + main() diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 71d9bda12de1..ddd409e928b2 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -18,7 +18,7 @@ import re import tarfile from dataclasses import dataclass -from typing import Any, Dict, List, Sequence, Union +from typing import Any, Dict, List, Sequence, Tuple, Union import torch import torch.nn.functional as F @@ -49,6 +49,15 @@ MAX_NUM_IMAGES = 1 IGNORE_INDEX = -1 +try: + from megatron.core.datasets.indexed_dataset import IndexedDataset + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + class TarOrFolderImageLoader: """ @@ -781,12 +790,27 @@ class DataCollatorForSupervisedDataset(object): tokenizer: transformers.PreTrainedTokenizer def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: + packed_sequence = "cu_seqlens" in instances[0] max_len = max(instance['tokens'].shape[0] for instance in instances) max_len = (max_len - 1) // 64 * 64 + 64 for instance in instances: pad_len = max_len - instance['tokens'].shape[0] instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0) instance['labels'] = F.pad(instance['labels'], (0, pad_len), 'constant', -1) + if packed_sequence and instance["cu_seqlens"][-1] != max_len: + instance["cu_seqlens"] = torch.cat((instance["cu_seqlens"], torch.IntTensor([max_len])), 0) + + if packed_sequence: + max_len_cu = max(instance['cu_seqlens'].shape[0] for instance in instances) + max_len_image = max(instance['image'].shape[0] for instance in instances) + for instance in instances: + pad_len_cu = max_len_cu - instance['cu_seqlens'].shape[0] + instance['cu_seqlens'] = F.pad(instance['cu_seqlens'], (0, pad_len_cu), 'constant', max_len) + + x = instance['image'] + num_pad = max_len_image - x.shape[0] + pad_tensor = torch.zeros(num_pad, *x.shape[1:], dtype=x.dtype, device=x.device) + instance['image'] = torch.cat((x, pad_tensor), dim=0) batch = default_collate(instances) tokenizer = 
self.tokenizer @@ -796,13 +820,25 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: labels = batch['labels'] media = batch.get('image') - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - data=tokens, - eod_token=tokenizer.eos_id, - eod_mask_loss=model_cfg.data.get("eod_mask_loss", False), - reset_attention_mask=False, - reset_position_ids=False, - ) + if packed_sequence: + cu_seqlens = batch["cu_seqlens"] + position_ids = [] + for cu_seqlen in cu_seqlens: + position_ids.append([]) + for ind in range(0, len(cu_seqlen) - 1): + seqlen = cu_seqlen[ind + 1] - cu_seqlen[ind] + position_ids[-1].extend(list(range(seqlen))) + position_ids = torch.LongTensor(position_ids) + loss_mask = torch.ones(tokens.size(), dtype=torch.float, device=tokens.device) + attention_mask = torch.ones(tokens.size(), dtype=torch.long, device=tokens.device) + else: + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + data=tokens, + eod_token=tokenizer.eos_id, + eod_mask_loss=model_cfg.data.get("eod_mask_loss", False), + reset_attention_mask=False, + reset_position_ids=False, + ) loss_mask[labels == -1] = 0.0 tokens[tokens == -1] = 0 @@ -821,6 +857,8 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: 'position_ids': position_ids, 'media': media, } + if packed_sequence: + batch["cu_seqlens"] = cu_seqlens return batch @@ -859,3 +897,23 @@ def make_supervised_data_module(tokenizer, model_cfg) -> Dict: ) return dict(train_dataset=train_dataset, eval_dataset=train_dataset) + + +class NevaPackedSeqDatatset(Dataset): + def __init__(self, data_path: str, crop_size: Tuple[int, int] = (224, 224)): + self.ds = IndexedDataset(data_path) + self.crop_size = crop_size + + def __len__(self): + return len(self.ds.document_indices) - 1 + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + doc_start = self.ds.document_indices[i] + batch = { + "cu_seqlens": torch.IntTensor(self.ds[doc_start]), + "tokens": torch.LongTensor(self.ds[doc_start + 1]), + "labels": torch.LongTensor(self.ds[doc_start + 2]), + "image": torch.FloatTensor(self.ds[doc_start + 3]).reshape(-1, 3, *self.crop_size), + } + + return batch diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index cff8ab1a7b5f..5b50a8340b06 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -21,6 +21,7 @@ import torch.nn.functional as F from einops import rearrange, repeat from omegaconf.dictconfig import DictConfig +from pkg_resources import packaging from pytorch_lightning.trainer.trainer import Trainer from transformers import CLIPVisionModel @@ -28,6 +29,7 @@ from nemo.collections.multimodal.data.neva.conversation import DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN from nemo.collections.multimodal.data.neva.neva_dataset import ( DataCollatorForSupervisedDataset, + NevaPackedSeqDatatset, make_supervised_data_module, ) from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import ( @@ -43,7 +45,10 @@ AdapterName, MultimodalProjectorAdapterConfig, ) -from nemo.collections.nlp.modules.common.megatron.utils import average_losses_across_data_parallel_group +from nemo.collections.nlp.modules.common.megatron.utils import ( + average_losses_across_data_parallel_group, + get_iterator_k_split, +) from 
nemo.collections.nlp.modules.common.text_generation_utils import ( generate, get_computeprob_response, @@ -61,6 +66,7 @@ try: import apex.transformer.pipeline_parallel.utils + from apex.transformer.pipeline_parallel.utils import get_num_microbatches HAVE_APEX = True @@ -71,6 +77,7 @@ try: from megatron.core import InferenceParams, dist_checkpointing, parallel_state from megatron.core.models.gpt import GPTModel as MCoreGPTModel + from megatron.core.pipeline_parallel.schedules import get_forward_backward_func HAVE_MEGATRON_CORE = True @@ -385,14 +392,24 @@ def __init__( NevaBaseModel.__init__(self, mm_cfg, media_start_id, media_end_id, mcore_gpt, **kwargs) def freeze_llm(self, mm_cfg): - for param in chain(self.embedding.parameters(), self.decoder.parameters(), self.output_layer.parameters(),): + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + embedding_parameters = self.embedding.parameters() + else: + embedding_parameters = {} + if parallel_state.is_pipeline_last_stage(ignore_virtual=True): + output_layer_parameters = self.output_layer.parameters() + else: + output_layer_parameters = {} + + for param in chain(embedding_parameters, self.decoder.parameters(), output_layer_parameters,): param.requires_grad = False def forward( self, *args, **kwargs, ): media = kwargs.pop('media', None) - self.embedding.word_embeddings.set_media(media) + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + self.embedding.word_embeddings.set_media(media) return MCoreGPTModel.forward(self, *args, **kwargs) @@ -418,7 +435,8 @@ def forward( self, *args, **kwargs, ): media = kwargs.pop('media', None) - self.embedding.word_embeddings.set_media(media) + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + self.embedding.word_embeddings.set_media(media) return GPTModel.forward(self, *args, **kwargs) @@ -611,7 +629,73 @@ def forward(self, tokens, text_position_ids, attention_mask, labels, media=None) return output_tensor def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): - return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step) + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + return MegatronGPTModel.fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step) + else: + batch, _, _ = next(dataloader_iter) + _, seq_length = batch['tokens'].shape + batch_iter = get_iterator_k_split(batch, get_num_microbatches()) + + # handle asynchronous grad reduction + no_sync_func = None + grad_sync_func = None + param_sync_func = None + if not forward_only and self.with_distributed_adam: + no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + grad_sync_func = self.reduce_overlap_gradients + param_sync_func = self.sync_overlap_parameters + + # pipeline schedules will get these from self.model.config + for module in self.get_model_module_list(): + module.config.no_sync_func = no_sync_func + module.config.grad_sync_func = grad_sync_func + module.config.param_sync_func = param_sync_func + + # run forward and backwards passes for an entire global batch + # we do this inside training_step to support pipeline parallelism + fwd_bwd_function = get_forward_backward_func() + # print(f"{torch.distributed.get_rank()}: {parallel_state.is_pipeline_last_stage()} {fwd_bwd_function}") + + # TODO @akhattar: add num_micro_batches_with_partial_activation_checkpoints when ready + losses_reduced_per_micro_batch = fwd_bwd_function( + 
forward_step_func=self.get_forward_output_and_loss_func(forward_only), + data_iterator=self._make_data_iterator_list(batch_iter), + model=self.model, + num_microbatches=get_num_microbatches(), + forward_only=forward_only, + seq_length=seq_length, + micro_batch_size=self.cfg.micro_batch_size, + first_val_step=first_val_step, + ) + + # only the last stages of the pipeline return losses + if losses_reduced_per_micro_batch: + if (not forward_only) or self.cfg.data.get('validation_drop_last', True): + # average loss across micro batches + loss_tensors_list = [loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch] + loss_tensor = torch.concat(loss_tensors_list) + loss_mean = loss_tensor.mean() + else: + # Get the total loss since micro batches sizes are not uniform + loss_sum_tensors_list = [ + loss_sum['loss_sum_and_ub_size'] + for loss_sum in losses_reduced_per_micro_batch + if loss_sum['loss_sum_and_ub_size'][1] > 0 + ] + loss_sum = ( + torch.vstack(loss_sum_tensors_list).sum(axis=0) + if len(loss_sum_tensors_list) > 0 + else torch.tensor([0.0, 0.0]).cuda() + ) + return loss_sum + else: + # we're not on the last pipeline stage so no losses + if forward_only: + loss_mean = [] + else: + loss_mean = torch.tensor(0.0).cuda() + + return loss_mean def training_step(self, dataloader_iter): """ @@ -631,7 +715,9 @@ def loss_func(output_tensor, loss_mask): return loss_for_ub, dict(avg=reduced_loss[0].unsqueeze(0)) def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): - batch, _, _ = next(dataloader_iter) + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] if parallel_state.get_pipeline_model_parallel_world_size() == 1: for k in batch.keys(): if self.get_attention_mask_from_fusion: @@ -644,28 +730,36 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ for k in batch.keys(): if self.get_attention_mask_from_fusion: batch[k] = ( - batch[k].cuda(non_blocking=True) if k in ['tokens', 'position_ids', 'media'] else None + batch[k].cuda(non_blocking=True) + if k in ['tokens', 'position_ids', 'media', 'cu_seqlens'] + else None ) else: batch[k] = ( batch[k].cuda(non_blocking=True) - if k in ['tokens', 'position_ids', 'attention_mask', 'media'] + if k in ['tokens', 'position_ids', 'attention_mask', 'media', 'cu_seqlens'] else None ) elif parallel_state.is_pipeline_last_stage(): # Last pipeline stage needs the labels, loss_mask, and attention_mask for k in batch.keys(): if self.get_attention_mask_from_fusion: - batch[k] = batch[k].cuda(non_blocking=True) if k in ['labels', 'loss_mask'] else None + batch[k] = ( + batch[k].cuda(non_blocking=True) + if k in ['labels', 'loss_mask', 'cu_seqlens'] + else None + ) else: batch[k] = ( batch[k].cuda(non_blocking=True) - if k in ['labels', 'loss_mask', 'attention_mask'] + if k in ['labels', 'loss_mask', 'attention_mask', 'cu_seqlens'] else None ) else: # Intermediate pipeline stage doesn't need any inputs - batch = {k: None for k in ['tokens', 'position_ids', 'attention_mask', 'labels', 'media']} + batch = { + k: None for k in ['tokens', 'position_ids', 'attention_mask', 'labels', 'media', 'loss_mask'] + } forward_args = { 'input_ids': batch['tokens'], @@ -678,16 +772,40 @@ def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_ if self.use_loss_mask: forward_args['loss_mask'] = batch['loss_mask'] forward_args['checkpoint_activations_all_layers'] = checkpoint_activations_all_layers + else: + if 'cu_seqlens' in batch: # packed sequence 
+ # these args are passed eventually into TEDotProductAttention.forward() + cu_seqlens = batch['cu_seqlens'].squeeze() # remove batch size dimension (mbs=1) + max_seqlen = batch['max_seqlen'].squeeze() if 'max_seqlen' in batch else None + + try: + from megatron.core.packed_seq_params import PackedSeqParams + except (ImportError, ModuleNotFoundError) as e: + mcore_version = packaging.version.Version(version('megatron-core')) + logging.error( + f"megatron-core v{mcore_version} does not support training with packed sequence. " + "Please use megatron-core >= 0.5.0, or set model.data.train_ds.packed_sequence=False" + ) + raise e + forward_args['packed_seq_params'] = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_kv=max_seqlen, + qkv_format='thd', + ) output_tensor = model(**forward_args) - return output_tensor, partial(loss_func, loss_mask=batch['loss_mask']) + return output_tensor, partial(loss_func, loss_mask=batch.get('loss_mask')) return fwd_output_and_loss_func def get_forward_output_only_func(self): def fwd_output_only_func(dataloader_iter, model): - batch, _, _ = next(dataloader_iter) + batch = next(dataloader_iter) + if isinstance(batch, tuple): + batch = batch[0] extra_arg = {} ( tokens, @@ -859,9 +977,14 @@ def setup(self, stage=None): def build_train_valid_test_datasets(self): logging.info('Building Neva datasets.') - ds_dict = make_supervised_data_module(tokenizer=self.tokenizer, model_cfg=self.cfg,) - self._train_ds = ds_dict["train_dataset"] - self._validation_ds = ds_dict["eval_dataset"] + if self.cfg.data.get("packed_sequence", False): + assert self.cfg.micro_batch_size == 1, "Micro batch size must be 1 if using packed sequence" + self._train_ds = NevaPackedSeqDatatset(self.cfg.data.data_prefix, self.cfg.data.get("crop_size")) + self._validation_ds = NevaPackedSeqDatatset(self.cfg.data.data_prefix, self.cfg.data.get("crop_size")) + else: + ds_dict = make_supervised_data_module(tokenizer=self.tokenizer, model_cfg=self.cfg,) + self._train_ds = ds_dict["train_dataset"] + self._validation_ds = ds_dict["eval_dataset"] return self._train_ds, self._validation_ds @@ -872,12 +995,17 @@ def build_pretraining_data_loader( logging.info(f'Building dataloader with consumed samples: {consumed_samples}') # Megatron sampler + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + micro_batch_size = self.cfg.micro_batch_size + else: + micro_batch_size = self.cfg.global_batch_size // parallel_state.get_data_parallel_world_size() + if hasattr(self.cfg.data, 'dataloader_type') and self.cfg.data.dataloader_type is not None: if self.cfg.data.dataloader_type == 'single': batch_sampler = MegatronPretrainingSampler( total_samples=len(dataset), consumed_samples=consumed_samples, - micro_batch_size=self.cfg.micro_batch_size, + micro_batch_size=micro_batch_size, data_parallel_rank=parallel_state.get_data_parallel_rank(), data_parallel_size=parallel_state.get_data_parallel_world_size(), drop_last=drop_last, @@ -889,7 +1017,7 @@ def build_pretraining_data_loader( dataset=dataset, total_samples=len(dataset), consumed_samples=consumed_samples, - micro_batch_size=self.cfg.micro_batch_size, + micro_batch_size=micro_batch_size, data_parallel_rank=parallel_state.get_data_parallel_rank(), data_parallel_size=parallel_state.get_data_parallel_world_size(), drop_last=self.cfg.get('drop_last', True), @@ -953,14 +1081,9 @@ def load_state_dict(self, state_dict, strict=False): def on_load_checkpoint(self, checkpoint) -> None: pass - # if 
self.mcore_gpt:
-        #     state_dict = checkpoint["state_dict"]
-        #     self.load_state_dict(state_dict)
 
     def sharded_state_dict(self, prefix: str = ''):
         return None
-        # sharded_state_dict = MegatronGPTModel.sharded_state_dict(self, prefix)
-        # return sharded_state_dict
 
     def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None) -> Any:
         inference_config = self.get_inference_config()
diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py
index 723e965eb8a8..71c28cf00855 100644
--- a/nemo/collections/multimodal/parts/utils.py
+++ b/nemo/collections/multimodal/parts/utils.py
@@ -320,7 +320,7 @@ def dummy():
 
 
 def create_neva_model_and_processor(cfg):
-    from nemo.collections.multimodal.models.neva.neva_model import MegatronNevaModel
+    from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import MegatronNevaModel
 
     plugins = []
     if cfg.get('cluster_type', None) == 'BCP':
@@ -366,6 +366,7 @@ def create_neva_model_and_processor(cfg):
         neva_cfg.precision = trainer.precision
         neva_cfg.mm_cfg.llm.from_pretrained = cfg.get('base_model_file', None)
         neva_cfg.apply_rope_fusion = False
+        neva_cfg.fp8 = False
         #    neva_cfg.mm_cfg.vision_encoder.from_pretrained = None
 
     model = MegatronNevaModel.restore_from(
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index c2e1f0ed48b7..7a2f3459470c 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -784,7 +784,11 @@ def training_step(self, dataloader_iter):
                 self._optimizer._finish_bucket_grad_sync()
         elif self.megatron_amp_O2:
             # when using pipeline parallelism grads must be all-reduced after the pipeline (not asynchronously)
-            if self.cfg.get('pipeline_model_parallel_size', 1) > 1 or self.cfg.get('sequence_parallel', False):
+            if (
+                self.cfg.get('pipeline_model_parallel_size', 1) > 1
+                or self.cfg.get('sequence_parallel', False)
+                or not self.cfg.get('async_grad_allreduce', True)
+            ):
                 # main grads are stored in the MainParamsOptimizer wrapper
                 self._optimizer.allreduce_main_grads()
         else:
diff --git a/nemo/collections/nlp/modules/common/text_generation_utils.py b/nemo/collections/nlp/modules/common/text_generation_utils.py
index d130322404b6..b50c9de682f7 100644
--- a/nemo/collections/nlp/modules/common/text_generation_utils.py
+++ b/nemo/collections/nlp/modules/common/text_generation_utils.py
@@ -173,6 +173,10 @@ def megatron_neva_generate(model, prompt_dict_list, length_params, sampling_para
             **strategy_args,
         )
 
+        # Middle stages of PP will return None
+        if response is None:
+            continue
+
         # Regular expression pattern to match the sequence
         pattern = re.compile(rf'{DEFAULT_IM_START_TOKEN}( ⁇ )+{DEFAULT_IM_END_TOKEN}')
         pattern_nvgpt = re.compile(rf'{DEFAULT_IM_START_TOKEN}({DEFAULT_IMAGE_PATCH_TOKEN})+{DEFAULT_IM_END_TOKEN}')
diff --git a/nemo/collections/vision/data/megatron/data_samplers.py b/nemo/collections/vision/data/megatron/data_samplers.py
index 82fc49990c49..2f63e675731b 100644
--- a/nemo/collections/vision/data/megatron/data_samplers.py
+++ b/nemo/collections/vision/data/megatron/data_samplers.py
@@ -67,7 +67,9 @@ def __iter__(self):
                 random_idx = torch.randperm(bucket_size, generator=g).tolist()
                 idx_range = [start_idx + x for x in random_idx[bucket_offset:]]
             else:
-                full_bucket_size = (self.total_samples // self.micro_batch_size) * self.micro_batch_size
+                full_bucket_size = (
+                    self.total_samples // self.micro_batch_times_data_parallel_size
+                ) * self.micro_batch_times_data_parallel_size
                 full_bucket_offset = current_epoch_samples
                 g = torch.Generator()
                 g.manual_seed(self.epoch)

From 43ccc1d6bd82ec788d970f90c3ed7192882651b3 Mon Sep 17 00:00:00 2001
From: Pablo Garay
Date: Tue, 30 Apr 2024 13:55:24 -0700
Subject: [PATCH 30/30] [Nemo CICD] Trigger on comment issued (#9062)

* match pytorch

* match pytorch
---
 .github/workflows/cicd-main.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index a13284521b3c..de250596da62 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -44,7 +44,7 @@ jobs:
 #  checkout-repository:
 #    runs-on: self-hosted-azure
 #    container:
-#      image: nvcr.io/nvidia/pytorch:24.01-py3
+#      image: nvcr.io/nvidia/pytorch:24.02-py3
 #      volumes:
 #        - ${{ github.workspace }}:/workspace
 #    steps:
@@ -60,7 +60,7 @@ jobs:
     if: ${{ github.event.label.name == 'Run CICD' }}
     # uses: actions/cache@v2
     #container:
-    #  image: nvcr.io/nvidia/pytorch:24.01-py3
+    #  image: nvcr.io/nvidia/pytorch:24.02-py3
     #  options:
     #    # --user 0:128
     #    --device=/dev/nvidia0
@@ -78,7 +78,7 @@ jobs:
       run: |
         # Pull base PyTorch container
         docker pull nvcr.io/nvidia/pytorch:24.02-py3
-        docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.01-py3 /bin/bash -c '
+        docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.02-py3 /bin/bash -c '
         set -x
         # PyTorch version
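
Note on the packed-sequence patches above: the collator, dataset, and forward-step changes all revolve around a `cu_seqlens` boundary tensor. Sub-sequence lengths are recovered from consecutive boundaries, position ids restart at zero for every packed document, and the same boundaries are eventually handed to the attention kernel (via `PackedSeqParams` with the 'thd' layout). The sketch below is only an illustration of that bookkeeping under my own naming; `positions_from_cu_seqlens` and the standalone example are not part of the patches themselves.

import torch


def positions_from_cu_seqlens(cu_seqlens: torch.Tensor):
    """Derive per-token position ids and the longest sub-sequence length
    from a cumulative-length tensor such as [0, 5, 8, 16] (illustrative only)."""
    seq_lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
    position_ids = []
    for seqlen in seq_lens:
        # positions restart at 0 for every packed document
        position_ids.extend(range(seqlen))
    return torch.tensor(position_ids, dtype=torch.long), max(seq_lens)


# Three documents of length 5, 3 and 8 packed into one 16-token row.
cu = torch.tensor([0, 5, 8, 16], dtype=torch.int32)
pos, max_seqlen = positions_from_cu_seqlens(cu)
print(pos.tolist())   # [0, 1, 2, 3, 4, 0, 1, 2, 0, 1, 2, 3, 4, 5, 6, 7]
print(max_seqlen)     # 8

The collator in the neva_dataset.py diff applies the same idea per batch row, and additionally extends the last boundary to the padded length so that every token, including padding, falls inside exactly one interval.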