From b2debd8d83ab88dbfbd6a1d9e7949320c4a7e8e2 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 9 Apr 2024 16:03:12 -0700 Subject: [PATCH 01/39] Akoumparouli/fix get params for weight decay optimization (#8841) * fix get_params_for_weight_decay_optimization Signed-off-by: Alexandros Koumparoulis * filter returned values by presence of parameters Signed-off-by: Alexandros Koumparoulis * use module_._parameters.items instead of .named_parameters to avoid duplicate params Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis --- .../nlp/modules/common/megatron/utils.py | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/utils.py b/nemo/collections/nlp/modules/common/megatron/utils.py index 42d14592c363..97022ab5e459 100644 --- a/nemo/collections/nlp/modules/common/megatron/utils.py +++ b/nemo/collections/nlp/modules/common/megatron/utils.py @@ -366,17 +366,25 @@ def get_params_for_weight_decay_optimization( is_expert = lambda param: not getattr(param, 'allreduce', True) # Do the actual param classification for module in modules: - for name, param in module.named_parameters(): - if param is None: - continue - if name.endswith('.bias'): - no_weight_decay_params['params'].extend([param]) + for module_ in module.modules(): + if isinstance(module_, (FusedLayerNorm, FastLayerNorm, MixedFusedRMSNorm)): + no_weight_decay_params['params'].extend( + list(filter(lambda p: p is not None, module_._parameters.values())) + ) else: - if is_expert(param): - weight_decay_expert_params['params'].extend([param]) - else: - weight_decay_params['params'].extend([param]) - return weight_decay_params, weight_decay_expert_params, no_weight_decay_params + for name, param in module_._parameters.items(): + if param is None: + continue + if name.endswith('bias'): + no_weight_decay_params['params'].extend([param]) + else: + if is_expert(param): + weight_decay_expert_params['params'].extend([param]) + else: + weight_decay_params['params'].extend([param]) + + param_groups = [weight_decay_params, weight_decay_expert_params, no_weight_decay_params] + return tuple(filter(lambda g: len(g['params']) > 0, param_groups)) def get_all_params_for_weight_decay_optimization( @@ -394,7 +402,8 @@ def get_all_params_for_weight_decay_optimization( weight_decay_params['params'] += list(filter(lambda x: not is_expert(x), module.parameters())) weight_decay_expert_params['params'] += list(filter(is_expert, module.parameters())) - return weight_decay_params, weight_decay_expert_params + param_groups = [weight_decay_params, weight_decay_expert_params] + return tuple(filter(lambda g: len(g['params']) > 0, param_groups)) def get_iterator_k_split(batch: List[torch.Tensor], num_microbatches: int) -> Iterator: From 3f3df1c34aa73cfc890b93af678e3c3a63760fe5 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:07:27 -0700 Subject: [PATCH 02/39] Akoumparouli/peft fix (#8823) * Move precision restoration inside megtron_trainer_builder Signed-off-by: Alexandros Koumparoulis * Don't enforce O1 in eval Signed-off-by: Alexandros Koumparoulis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * safer prefix replacer Signed-off-by: Alexandros Koumparoulis * comment Signed-off-by: Alexandros Koumparoulis * drop conf resolve Signed-off-by: Alexandros Koumparoulis * typo 
Signed-off-by: Alexandros Koumparoulis * fix Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../tuning/megatron_gpt_finetuning.py | 5 ----- .../nlp/parts/megatron_trainer_builder.py | 7 ++++++- .../nlp/parts/mixins/nlp_adapter_mixins.py | 18 +++++++++++++++--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py b/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py index 1e6f680fad7e..aaa087a46623 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py @@ -56,12 +56,7 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - # cfg.trainer.precision becomes None in TrainerBuilder if precision_plugins exist since both precision plugins and precision - # can't exist in PTL >= 2.1, hence storing precision value from cfg.trainer.precision as its used for future steps like in merge_cfg_with func. - precision = cfg.trainer.precision trainer = MegatronLMPPTrainerBuilder(cfg).create_trainer() - # Restore the precision value after Trainer is built. - cfg.trainer.precision = precision exp_manager(trainer, cfg.exp_manager) model_cfg = MegatronGPTSFTModel.merge_cfg_with(cfg.model.restore_from_path, cfg) diff --git a/nemo/collections/nlp/parts/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py index 77d306c17da0..b25ce249d09d 100644 --- a/nemo/collections/nlp/parts/megatron_trainer_builder.py +++ b/nemo/collections/nlp/parts/megatron_trainer_builder.py @@ -129,12 +129,17 @@ def _plugins(self) -> list: return plugins def create_trainer(self, callbacks=None) -> Trainer: + # cfg.trainer.precision becomes None in Trainer if precision_plugins exist since both precision plugins and precision + precision = self.cfg.trainer.precision strategy = self._training_strategy() plugins = self._plugins() # enable_progress_bar is True by default. If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks if 'enable_progress_bar' not in self.cfg.trainer or self.cfg.trainer.enable_progress_bar: callbacks = [CustomProgressBar()] - return Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer, callbacks=callbacks) + trainer = Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer, callbacks=callbacks) + # Restore the precision value after Trainer is built. + self.cfg.trainer.precision = precision + return trainer class MegatronBertTrainerBuilder(MegatronTrainerBuilder): diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 3797ec909737..123f0f06a33d 100644 --- a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -47,6 +47,14 @@ HAVE_MEGATRON_CORE = False +def replace_prefix(name, old_prefix, new_prefix): + if name.startswith(new_prefix): + return name + if not name.startswith(old_prefix): + return name + return name.replace(old_prefix, new_prefix, 1) + + class NLPAdapterModelMixin: """ NLP Adapter Mixin that can augment any transformer-based model with Adapter module support. 
This mixin class should be used only with a top level ModelPT subclass, that includes either a `model` or an `enc_dec_model` submodule. @@ -268,7 +276,7 @@ def load_adapters( """ Utility method that restores only the adapter module(s), and not the entire model itself. This allows the sharing of adapters which are often just a fraction of the size of the full model, - enabling easier deliver. + enabling easier delivery. .. note:: @@ -299,6 +307,8 @@ def load_adapters( '.nemo' ), "Inferring peft scheme is only supported for .nemo checkpoints. Please supply the `peft_cfgs` argument." peft_cfgs = [PEFT_CONFIG_MAP[conf.peft.peft_scheme](conf)] + if self.cfg.megatron_amp_O2: + state_dict = {replace_prefix(k, 'model.', 'model.module.'): v for k, v in state_dict.items()} self.add_adapter(peft_cfgs) if not self.ptuning_only_and_non_first_stage: assert set(state_dict.keys()) == self.adapter_keys.union(self.tunable_base_param_keys) @@ -506,17 +516,19 @@ def merge_inference_cfg(cls, path: str, cfg: DictConfig) -> DictConfig: with open_dict(peft_cfg): # update the model config of the trained model with params we want to set at inference time. - peft_cfg.precision = cfg.trainer.precision for key, val in cfg.model.items(): if key != 'data': peft_cfg[key] = val + if cfg.get("trainer", None) and cfg.trainer.get("precision"): + peft_cfg.precision = cfg.trainer.precision peft_cfg.data.test_ds = cfg.model.data.test_ds with open_dict(cfg): cfg.inference.add_BOS = peft_cfg.data.test_ds.add_bos cfg.inference.tokens_to_generate = peft_cfg.data.test_ds.get("tokens_to_generate", 1) - peft_cfg.megatron_amp_O2 = False # always evaluate with O1 + if cfg.model.get('megatron_amp_O2', None) is not None: + peft_cfg.megatron_amp_O2 = cfg.model.megatron_amp_O2 return peft_cfg def freeze(self, training: bool = False) -> None: From 4d0ae36dc8b6b720a66a43c5bb9eb6ef6e27fec1 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Tue, 9 Apr 2024 21:28:04 -0400 Subject: [PATCH 03/39] Add deploy triton and query scripts (#8852) * Add deploy triton and query scripts Signed-off-by: Onur Yilmaz * Update scripts based on reviews Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz --- scripts/deploy/nlp/deploy_triton.py | 274 ++++++++++++++++++++++++++++ scripts/deploy/nlp/query.py | 247 +++++++++++++++++++++++++ 2 files changed, 521 insertions(+) create mode 100755 scripts/deploy/nlp/deploy_triton.py create mode 100644 scripts/deploy/nlp/query.py diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py new file mode 100755 index 000000000000..aa896e924584 --- /dev/null +++ b/scripts/deploy/nlp/deploy_triton.py @@ -0,0 +1,274 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
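+
+# Example usage (illustrative only; the checkpoint path, model type, and Triton model
+# name below are placeholders, not values from this PR; see get_args() for all flags):
+#
+#   python scripts/deploy/nlp/deploy_triton.py \
+#       --nemo_checkpoint /path/to/model.nemo \
+#       --model_type llama \
+#       --triton_model_name my_model \
+#       --num_gpus 1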
+ +import argparse +import logging +import os +import sys +from pathlib import Path + +from nemo.deploy import DeployPyTriton +from nemo.export import TensorRTLLM + + +LOGGER = logging.getLogger("NeMo") + + +def get_args(argv): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=f"Deploy nemo models to Triton", + ) + parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file") + parser.add_argument( + "-ptnc", + "--ptuning_nemo_checkpoint", + nargs='+', + type=str, + required=False, + help="Source .nemo file for prompt embeddings table", + ) + parser.add_argument( + '-ti', '--task_ids', nargs='+', type=str, required=False, help='Unique task names for the prompt embedding.' + ) + parser.add_argument( + "-mt", + "--model_type", + type=str, + required=False, + choices=["gptnext", "gpt", "llama", "falcon", "starcoder", "mixtral", "gemma"], + help="Type of the model. gptnext, gpt, llama, falcon, and starcoder are only supported." + " gptnext and gpt are the same and keeping it for backward compatibility", + ) + parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service") + parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service") + parser.add_argument( + "-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests" + ) + parser.add_argument( + "-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server" + ) + parser.add_argument( + "-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the trt-llm conversion" + ) + parser.add_argument("-ng", "--num_gpus", default=1, type=int, help="Number of GPUs for the deployment") + parser.add_argument( + "-dt", + "--dtype", + choices=["bfloat16", "float16", "fp8", "int8"], + default="bfloat16", + type=str, + help="dtype of the model on TensorRT-LLM", + ) + parser.add_argument("-mil", "--max_input_len", default=256, type=int, help="Max input length of the model") + parser.add_argument("-mol", "--max_output_len", default=256, type=int, help="Max output length of the model") + parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model") + parser.add_argument( + "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" + ) + parser.add_argument( + "-upkc", "--use_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." + ) + parser.add_argument( + "-dcf", + "--disable_context_fmha", + default=False, + action='store_true', + help="Disable fused Context MultiHeadedAttention (required for V100 support).", + ) + parser.add_argument( + "-mbm", + '--multi_block_mode', + default=False, + action='store_true', + help='Split long kv sequence into multiple blocks (applied to generation MHA kernels). \ + It is beneifical when batchxnum_heads cannot fully utilize GPU.', + ) + parser.add_argument( + "-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences." 
+ ) + parser.add_argument( + '--use_lora_plugin', + nargs='?', + const=None, + default=False, + choices=['float16', 'float32', 'bfloat16'], + help="Activates the lora plugin which enables embedding sharing.", + ) + parser.add_argument( + '--lora_target_modules', + nargs='+', + default=None, + choices=["attn_qkv", "attn_q", "attn_k", "attn_v", "attn_dense", "mlp_h_to_4h", "mlp_gate", "mlp_4h_to_h",], + help="Add lora in which modules. Only be activated when use_lora_plugin is enabled.", + ) + parser.add_argument( + '--max_lora_rank', + type=int, + default=64, + help='maximum lora rank for different lora modules. ' + 'It is used to compute the workspace size of lora plugin.', + ) + parser.add_argument( + "-lc", "--lora_ckpt", default=None, type=str, nargs="+", help="The checkpoint list of LoRA weights" + ) + parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") + + args = parser.parse_args(argv) + return args + + +def nemo_deploy(argv): + args = get_args(argv) + + if args.debug_mode: + loglevel = logging.DEBUG + else: + loglevel = logging.INFO + + LOGGER.setLevel(loglevel) + LOGGER.info("Logging level set to {}".format(loglevel)) + LOGGER.info(args) + + if args.triton_model_repository is None: + trt_llm_path = "/tmp/trt_llm_model_dir/" + LOGGER.info( + "/tmp/trt_llm_model_dir/ path will be used as the TensorRT LLM folder. " + "Please set this parameter if you'd like to use a path that has already " + "included the TensorRT LLM model files." + ) + Path(trt_llm_path).mkdir(parents=True, exist_ok=True) + else: + trt_llm_path = args.triton_model_repository + + if args.nemo_checkpoint is None and args.triton_model_repository is None: + LOGGER.error( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint." + ) + return + + if args.nemo_checkpoint is None and not os.path.isdir(args.triton_model_repository): + LOGGER.error( + "The provided model repository is not a valid TensorRT-LLM model " + "directory. Please provide a --nemo_checkpoint." + ) + return + + if args.nemo_checkpoint is not None and args.model_type is None: + LOGGER.error("Model type is required to be defined if a nemo checkpoint is provided.") + return + + ptuning_tables_files = [] + if not args.ptuning_nemo_checkpoint is None: + if args.max_prompt_embedding_table_size is None: + LOGGER.error("max_prompt_embedding_table_size parameter is needed for the prompt tuning table(s).") + return + + for pt_checkpoint in args.ptuning_nemo_checkpoint: + ptuning_nemo_checkpoint_path = Path(pt_checkpoint) + if ptuning_nemo_checkpoint_path.exists(): + if ptuning_nemo_checkpoint_path.is_file(): + ptuning_tables_files.append(pt_checkpoint) + else: + LOGGER.error("Could not read the prompt tuning tables from {0}".format(pt_checkpoint)) + return + else: + LOGGER.error("File or directory {0} does not exist.".format(pt_checkpoint)) + return + + if args.task_ids is not None: + if len(ptuning_tables_files) != len(args.task_ids): + LOGGER.error( + "Number of task ids and prompt embedding tables have to match. 
" + "There are {0} tables and {1} task ids.".format(len(ptuning_tables_files), len(args.task_ids)) + ) + return + + trt_llm_exporter = TensorRTLLM(model_dir=trt_llm_path, lora_ckpt_list=args.lora_ckpt) + + if args.nemo_checkpoint is not None: + try: + LOGGER.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.") + trt_llm_exporter.export( + nemo_checkpoint_path=args.nemo_checkpoint, + model_type=args.model_type, + n_gpus=args.num_gpus, + tensor_parallel_size=args.num_gpus, + pipeline_parallel_size=1, + max_input_token=args.max_input_len, + max_output_token=args.max_output_len, + max_batch_size=args.max_batch_size, + max_prompt_embedding_table_size=args.max_prompt_embedding_table_size, + paged_kv_cache=args.use_paged_kv_cache, + enable_context_fmha=not args.disable_context_fmha, + dtype=args.dtype, + enable_multi_block_mode=args.multi_block_mode, + use_lora_plugin=args.use_lora_plugin, + lora_target_modules=args.lora_target_modules, + max_lora_rank=args.max_lora_rank, + save_nemo_model_config=True, + ) + except Exception as error: + LOGGER.error("An error has occurred during the model export. Error message: " + str(error)) + return + + try: + for i, prompt_embeddings_checkpoint_path in enumerate(ptuning_tables_files): + if args.task_ids is not None: + task_id = args.task_ids[i] + else: + task_id = i + + LOGGER.info( + "Adding prompt embedding table: {0} with task id: {1}.".format( + prompt_embeddings_checkpoint_path, task_id + ) + ) + trt_llm_exporter.add_prompt_table( + task_name=str(task_id), prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, + ) + except Exception as error: + LOGGER.error("An error has occurred during adding the prompt embedding table(s). Error message: " + str(error)) + return + + try: + nm = DeployPyTriton( + model=trt_llm_exporter, + triton_model_name=args.triton_model_name, + triton_model_version=args.triton_model_version, + max_batch_size=args.max_batch_size, + port=args.triton_port, + address=args.triton_http_address, + streaming=args.enable_streaming, + ) + + LOGGER.info("Triton deploy function will be called.") + nm.deploy() + except Exception as error: + LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) + return + + try: + LOGGER.info("Model serving on Triton is will be started.") + nm.serve() + except Exception as error: + LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error)) + return + + LOGGER.info("Model serving will be stopped.") + nm.stop() + + +if __name__ == '__main__': + nemo_deploy(sys.argv[1:]) diff --git a/scripts/deploy/nlp/query.py b/scripts/deploy/nlp/query.py new file mode 100644 index 000000000000..20f3d587a1cc --- /dev/null +++ b/scripts/deploy/nlp/query.py @@ -0,0 +1,247 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import sys +import typing + +import numpy as np +from pytriton.client import DecoupledModelClient, ModelClient + + +def get_args(argv): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Exports nemo models stored in nemo checkpoints to TensorRT-LLM", + ) + parser.add_argument("-u", "--url", default="0.0.0.0", type=str, help="url for the triton server") + parser.add_argument("-mn", "--model_name", required=True, type=str, help="Name of the triton model") + prompt_group = parser.add_mutually_exclusive_group(required=True) + prompt_group.add_argument("-p", "--prompt", required=False, type=str, help="Prompt") + prompt_group.add_argument("-pf", "--prompt_file", required=False, type=str, help="File to read the prompt from") + parser.add_argument("-swl", "--stop_words_list", type=str, help="Stop words list") + parser.add_argument("-bwl", "--bad_words_list", type=str, help="Bad words list") + parser.add_argument("-nrns", "--no_repeat_ngram_size", type=int, help="No repeat ngram size") + parser.add_argument("-mot", "--max_output_token", default=128, type=int, help="Max output token length") + parser.add_argument("-tk", "--top_k", default=1, type=int, help="top_k") + parser.add_argument("-tpp", "--top_p", default=0.0, type=float, help="top_p") + parser.add_argument("-t", "--temperature", default=1.0, type=float, help="temperature") + parser.add_argument("-ti", "--task_id", type=str, help="Task id for the prompt embedding tables") + parser.add_argument( + "-lt", + "--lora_task_uids", + default=None, + type=str, + nargs="+", + help="The list of LoRA task uids; use -1 to disable the LoRA module", + ) + parser.add_argument( + "-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences." 
+ ) + parser.add_argument("-it", "--init_timeout", default=60.0, type=float, help="init timeout for the triton server") + + args = parser.parse_args(argv) + return args + + +def str_list2numpy(str_list: typing.List[str]) -> np.ndarray: + str_ndarray = np.array(str_list)[..., np.newaxis] + return np.char.encode(str_ndarray, "utf-8") + + +def query_llm( + url, + model_name, + prompts, + stop_words_list=None, + bad_words_list=None, + no_repeat_ngram_size=None, + max_output_token=128, + top_k=1, + top_p=0.0, + temperature=1.0, + random_seed=None, + task_id=None, + lora_uids=None, + init_timeout=60.0, +): + prompts = str_list2numpy(prompts) + inputs = {"prompts": prompts} + + if max_output_token is not None: + inputs["max_output_token"] = np.full(prompts.shape, max_output_token, dtype=np.int_) + + if top_k is not None: + inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_) + + if top_p is not None: + inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single) + + if temperature is not None: + inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single) + + if random_seed is not None: + inputs["random_seed"] = np.full(prompts.shape, random_seed, dtype=np.single) + + if stop_words_list is not None: + stop_words_list = np.char.encode(stop_words_list, "utf-8") + inputs["stop_words_list"] = np.full((prompts.shape[0], len(stop_words_list)), stop_words_list) + + if bad_words_list is not None: + bad_words_list = np.char.encode(bad_words_list, "utf-8") + inputs["bad_words_list"] = np.full((prompts.shape[0], len(bad_words_list)), bad_words_list) + + if no_repeat_ngram_size is not None: + inputs["no_repeat_ngram_size"] = np.full(prompts.shape, no_repeat_ngram_size, dtype=np.single) + + if task_id is not None: + task_id = np.char.encode(task_id, "utf-8") + inputs["task_id"] = np.full((prompts.shape[0], len([task_id])), task_id) + + if lora_uids is not None: + lora_uids = np.char.encode(lora_uids, "utf-8") + inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids) + + with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: + result_dict = client.infer_batch(**inputs) + output_type = client.model_config.outputs[0].dtype + + if output_type == np.bytes_: + sentences = np.char.decode(result_dict["outputs"].astype("bytes"), "utf-8") + return sentences + else: + return result_dict["outputs"] + + +def query_llm_streaming( + url, + model_name, + prompts, + stop_words_list=None, + bad_words_list=None, + no_repeat_ngram_size=None, + max_output_token=512, + top_k=1, + top_p=0.0, + temperature=1.0, + random_seed=None, + task_id=None, + lora_uids=None, + init_timeout=60.0, +): + prompts = str_list2numpy(prompts) + inputs = {"prompts": prompts} + + if max_output_token is not None: + inputs["max_output_token"] = np.full(prompts.shape, max_output_token, dtype=np.int_) + + if top_k is not None: + inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_) + + if top_p is not None: + inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single) + + if temperature is not None: + inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single) + + if random_seed is not None: + inputs["random_seed"] = np.full(prompts.shape, random_seed, dtype=np.int_) + + if stop_words_list is not None: + stop_words_list = np.char.encode(stop_words_list, "utf-8") + inputs["stop_words_list"] = np.full((prompts.shape[0], len(stop_words_list)), stop_words_list) + + if bad_words_list is not None: + bad_words_list = np.char.encode(bad_words_list, "utf-8") + 
inputs["bad_words_list"] = np.full((prompts.shape[0], len(bad_words_list)), bad_words_list) + + if no_repeat_ngram_size is not None: + inputs["no_repeat_ngram_size"] = np.full(prompts.shape, no_repeat_ngram_size, dtype=np.single) + + if task_id is not None: + task_id = np.char.encode(task_id, "utf-8") + inputs["task_id"] = np.full((prompts.shape[0], len([task_id])), task_id) + + if lora_uids is not None: + lora_uids = np.char.encode(lora_uids, "utf-8") + inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids) + + with DecoupledModelClient(url, model_name, init_timeout_s=init_timeout) as client: + for partial_result_dict in client.infer_batch(**inputs): + output_type = client.model_config.outputs[0].dtype + if output_type == np.bytes_: + sentences = np.char.decode(partial_result_dict["outputs"].astype("bytes"), "utf-8") + yield sentences + else: + yield partial_result_dict["outputs"] + + +def query(argv): + args = get_args(argv) + + if args.prompt_file is not None: + with open(args.prompt_file, "r") as f: + args.prompt = f.read() + + if args.enable_streaming: + output_generator = query_llm_streaming( + url=args.url, + model_name=args.model_name, + prompts=[args.prompt], + stop_words_list=None if args.stop_words_list is None else [args.stop_words_list], + bad_words_list=None if args.bad_words_list is None else [args.bad_words_list], + no_repeat_ngram_size=args.no_repeat_ngram_size, + max_output_token=args.max_output_token, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + task_id=args.task_id, + lora_uids=args.lora_task_uids, + init_timeout=args.init_timeout, + ) + # The query returns a generator that yields one array per model step, + # with the partial generated text in the last dimension. Print that partial text + # incrementally and compare it with all the text generated so far. 
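+        # Note: this script sends a single prompt, so output[0][0] below selects the
+        # partial text for that one prompt at each streaming step.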
+ prev_output = '' + for output in output_generator: + cur_output = output[0][0] + if prev_output == '' or cur_output.startswith(prev_output): + print(cur_output[len(prev_output) :], end='', flush=True) + else: + print("WARN: Partial output mismatch, restarting output...") + print(cur_output, end='', flush=True) + prev_output = cur_output + print() + + else: + outputs = query_llm( + url=args.url, + model_name=args.model_name, + prompts=[args.prompt], + stop_words_list=None if args.stop_words_list is None else [args.stop_words_list], + bad_words_list=None if args.bad_words_list is None else [args.bad_words_list], + no_repeat_ngram_size=args.no_repeat_ngram_size, + max_output_token=args.max_output_token, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + task_id=args.task_id, + lora_uids=args.lora_task_uids, + init_timeout=args.init_timeout, + ) + print(outputs[0][0]) + + +if __name__ == '__main__': + query(sys.argv[1:]) From a6db8dbabec50ee151d94e1352df8c078874fbfb Mon Sep 17 00:00:00 2001 From: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Date: Tue, 9 Apr 2024 21:30:10 -0400 Subject: [PATCH 04/39] add check if pos embed (#8857) Signed-off-by: jiemingz Co-authored-by: jiemingz Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> --- .../nlp/models/language_modeling/megatron_base_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index baa6e30af81d..854c5ee02e31 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -775,7 +775,8 @@ def get_config_arg(key: str, default_value: Optional[Any] = None) -> Any: if parallel_state.is_pipeline_first_stage(ignore_virtual=True): if self.mcore_gpt: fp32_params.append(modules[0].shared_embedding_or_output_weight()) - fp32_params.append(modules[0].embedding.position_embeddings.weight) + if modules[0].embedding.add_position_embedding: + fp32_params.append(modules[0].embedding.position_embeddings.weight) else: fp32_params.append(modules[0].word_embeddings_weight()) fp32_params.append(modules[0].position_embeddings_weight()) From 0ea94f78b39eec76cfa9bf9df3126328a93337c4 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Tue, 9 Apr 2024 18:36:22 -0700 Subject: [PATCH 05/39] Enable DGRAD RS overlap (#8840) * Enable DGRAD RS overlap Signed-off-by: Jaemin Choi * Support cases where TE version is new but NeMo/MCore is not Signed-off-by: Jaemin Choi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clean up syntax Signed-off-by: Jaemin Choi * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jaemin Choi Co-authored-by: Jaemin Choi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../gpt_full_te_layer_autocast_spec.py | 30 ++++++++++++++++--- .../modules/common/megatron/transformer.py | 30 ++++++++++++++++--- 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index f89cbedf9f5d..a6d422a3f2d4 100644 --- 
a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -123,8 +123,18 @@ def __init__( } te_version = packaging.version.Version(version("transformer-engine")) if te_version > packaging.version.Version("1.5.0"): - transformer_layer_args["ub_overlap_ag"] = kwargs.get("ub_overlap_ag", True) - transformer_layer_args["ub_overlap_rs"] = kwargs.get("ub_overlap_rs", True) + for comm in ["ag", "rs"]: + ub_overlap_flag = "ub_overlap_" + comm + split_gemm_flag = "ub_split_" + comm + atomic_gemm_flag = "ub_atomic_gemm_" + comm + # Use old overlap flags if they were supplied instead + if ub_overlap_flag in kwargs: + transformer_layer_args[ub_overlap_flag] = kwargs[ub_overlap_flag] + else: + transformer_layer_args[ub_overlap_flag] = kwargs.get(split_gemm_flag, True) or kwargs.get( + atomic_gemm_flag, False + ) + transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) else: transformer_layer_args["ub_split_ag"] = kwargs.get("ub_split_ag", True) transformer_layer_args["ub_split_rs"] = kwargs.get("ub_split_rs", True) @@ -204,8 +214,20 @@ def __init__(self, config, layer_number=1, hidden_dropout=None): } te_version = packaging.version.Version(version("transformer-engine")) if te_version > packaging.version.Version("1.5.0"): - transformer_layer_args["ub_overlap_ag"] = config.tp_comm_overlap_ag - transformer_layer_args["ub_overlap_rs"] = config.tp_comm_overlap_rs + # Use old overlap flags if they were supplied instead + transformer_layer_args["ub_overlap_ag"] = ( + config.tp_comm_overlap_ag + if hasattr(config, "tp_comm_overlap_ag") + else config.tp_comm_split_ag or config.tp_comm_atomic_ag + ) + transformer_layer_args["ub_overlap_rs"] = ( + config.tp_comm_overlap_rs + if hasattr(config, "tp_comm_overlap_rs") + else config.tp_comm_split_rs or config.tp_comm_atomic_rs + ) + transformer_layer_args["ub_overlap_rs_dgrad"] = ( + config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False + ) else: transformer_layer_args["ub_split_ag"] = config.tp_comm_split_ag transformer_layer_args["ub_split_rs"] = config.tp_comm_split_rs diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index d37c1e75d341..b33a996b7987 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -839,8 +839,18 @@ def __init__( } te_version = packaging.version.Version(version("transformer-engine")) if te_version > packaging.version.Version("1.5.0"): - transformer_layer_args["ub_overlap_ag"] = kwargs.get("ub_overlap_ag", True) - transformer_layer_args["ub_overlap_rs"] = kwargs.get("ub_overlap_rs", True) + for comm in ["ag", "rs"]: + ub_overlap_flag = "ub_overlap_" + comm + split_gemm_flag = "ub_split_" + comm + atomic_gemm_flag = "ub_atomic_gemm_" + comm + # Use old overlap flags if they were supplied instead + if ub_overlap_flag in kwargs: + transformer_layer_args[ub_overlap_flag] = kwargs[ub_overlap_flag] + else: + transformer_layer_args[ub_overlap_flag] = kwargs.get(split_gemm_flag, True) or kwargs.get( + atomic_gemm_flag, False + ) + transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) else: transformer_layer_args["ub_split_ag"] = kwargs.get("ub_split_ag", True) transformer_layer_args["ub_split_rs"] = kwargs.get("ub_split_rs", True) @@ -1099,8 
+1109,20 @@ def build_layer(layer_number): } te_version = packaging.version.Version(version("transformer-engine")) if te_version > packaging.version.Version("1.5.0"): - transformer_layer_args["ub_overlap_ag"] = config.tp_comm_overlap_ag - transformer_layer_args["ub_overlap_rs"] = config.tp_comm_overlap_rs + # Use old overlap flags if they were supplied instead + transformer_layer_args["ub_overlap_ag"] = ( + config.tp_comm_overlap_ag + if hasattr(config, "tp_comm_overlap_ag") + else config.tp_comm_split_ag or config.tp_comm_atomic_ag + ) + transformer_layer_args["ub_overlap_rs"] = ( + config.tp_comm_overlap_rs + if hasattr(config, "tp_comm_overlap_rs") + else config.tp_comm_split_rs or config.tp_comm_atomic_rs + ) + transformer_layer_args["ub_overlap_rs_dgrad"] = ( + config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False + ) else: transformer_layer_args["ub_split_ag"] = config.tp_comm_split_ag transformer_layer_args["ub_split_rs"] = config.tp_comm_split_rs From 6b660e74439f96f987034ae1ecbf9e837dbff02f Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Wed, 10 Apr 2024 08:02:38 -0400 Subject: [PATCH 06/39] fix precision of output model in conversion scripts (#8855) Signed-off-by: Chen Cui --- scripts/checkpoint_converters/convert_bert_hf_to_nemo.py | 3 +++ scripts/checkpoint_converters/convert_gemma_hf_to_nemo.py | 3 +++ scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py | 3 +++ scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py | 3 +++ 4 files changed, 12 insertions(+) diff --git a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py index 24294cfdfb85..278f7b879b28 100644 --- a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py @@ -31,6 +31,7 @@ from transformers import AutoModel from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import logging @@ -238,6 +239,8 @@ def convert(args): nemo_state_dict['model.language_model.embedding.word_embeddings.weight'] = padded_embedding model.load_state_dict(nemo_state_dict, strict=True) + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) model.save_to(args.output_path) logging.info(f'NeMo model saved to: {args.output_path}') diff --git a/scripts/checkpoint_converters/convert_gemma_hf_to_nemo.py b/scripts/checkpoint_converters/convert_gemma_hf_to_nemo.py index 9e2eb5e3a797..de12aefd1844 100644 --- a/scripts/checkpoint_converters/convert_gemma_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_gemma_hf_to_nemo.py @@ -30,6 +30,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import logging @@ -259,6 +260,8 @@ def convert(args): ), f'prediction mismatch: {hf_tokenizer.decode(hf_next_token)} != {hf_tokenizer.decode(next_token)}' logging.info(f'=' * 100) + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) model.save_to(args.output_path) logging.info(f'NeMo model saved 
to: {args.output_path}') diff --git a/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py b/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py index eeefbd215a1a..c35906dc78c1 100644 --- a/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py +++ b/scripts/checkpoint_converters/convert_gemma_jax_to_nemo.py @@ -33,6 +33,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import logging @@ -224,6 +225,8 @@ def convert(args): nemo_state_dict = adjust_tensor_shapes(model, new_state_dict) model.load_state_dict(nemo_state_dict, strict=False) + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) model.save_to(args.output_path) logging.info(f'NeMo model saved to: {args.output_path}') diff --git a/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py index b0dddcc60233..583ee7893c0f 100644 --- a/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py +++ b/scripts/checkpoint_converters/convert_gemma_pyt_to_nemo.py @@ -33,6 +33,7 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder +from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision from nemo.utils import logging PAD_TOKEN_ID = -1 @@ -303,6 +304,8 @@ def convert(args): ) assert torch.argmax(nemo_outputs[0, -1], dim=-1) == pyt_outputs, "Predicted next token not match." + dtype = torch_dtype_from_precision(args.precision) + model = model.to(dtype=dtype) model.save_to(args.output_path) logging.info(f'NeMo model saved to: {args.output_path}') From 95a0a3e2b6b72f8fd6941a7de176029821d5cc3e Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Wed, 10 Apr 2024 19:01:38 +0300 Subject: [PATCH 07/39] NeMo upgrade to ToT mcore & ToT TE (#8755) * add mcore dataset updates Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix mcore import Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * update mcore installation Signed-off-by: dimapihtar * update mcore installation Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update apex, TE & PyT Signed-off-by: dimapihtar * setup pythonpath for mcore Signed-off-by: dimapihtar * add mcore to python path Signed-off-by: dimapihtar * add mcore to pythonpath Signed-off-by: dimapihtar * update pythonpath for mcore Signed-off-by: dimapihtar * change pythonpath for mcore Signed-off-by: dimapihtar * update mcore pythonpath Signed-off-by: dimapihtar * update mcore pythonpath Signed-off-by: dimapihtar * revert mcore ds changes Signed-off-by: dimapihtar * revert config Signed-off-by: dimapihtar * add qk_layernorm support for Falcon self attn submodule Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * code style changes Signed-off-by: dimapihtar * add nemo implementation for get_gpt_layer_ammo_spec Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more 
information, see https://pre-commit.ci * fix typo Signed-off-by: dimapihtar * skip Llama2 - INT8 SQ test Signed-off-by: dimapihtar * skip Llama2 - INT8 SQ test Signed-off-by: dimapihtar * comment out NeMo PTQ test Signed-off-by: dimapihtar * bert mcore updates Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add qk_layernorm support for bert's self attention submodule Signed-off-by: dimapihtar * add qk_layernorm support for bert's self attn submodule Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change mcore commit Signed-off-by: dimapihtar * switch back to mcore original Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * bugfix Signed-off-by: dimapihtar * update TE Signed-off-by: dimapihtar * change legacy model to mcore based model for lora Signed-off-by: dimapihtar * remove unnecessary files Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * uncomment PTQ tests Signed-off-by: dimapihtar * remove sbert Signed-off-by: dimapihtar * switch back to mcore main Signed-off-by: dimapihtar * remove unused variable Signed-off-by: dimapihtar * comment out CUDA Graph test Signed-off-by: dimapihtar --------- Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: Pablo Garay --- Jenkinsfile | 111 ++++++++---------- .../megatron/gpt_fim_dataset.py | 5 - .../megatron_bert_embedding_model.py | 4 +- .../megatron/bert/bert_model.py | 19 +-- .../megatron/bert/bert_spec.py | 9 +- .../megatron/falcon/falcon_spec.py | 5 +- .../gpt_full_te_layer_autocast_spec.py | 2 +- .../megatron/gpt_layer_ammo_spec.py | 77 ++++++++++++ .../language_modeling/megatron_bert_model.py | 2 +- .../language_modeling/megatron_gpt_model.py | 28 ++--- 10 files changed, 168 insertions(+), 94 deletions(-) create mode 100644 nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_ammo_spec.py diff --git a/Jenkinsfile b/Jenkinsfile index 14f9a38a9c17..431bc24907ed 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,7 +1,7 @@ pipeline { agent { docker { - image 'nvcr.io/nvidia/pytorch:24.01-py3' + image 'nvcr.io/nvidia/pytorch:24.02-py3' args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1' } } @@ -63,44 +63,35 @@ pipeline { } } - // Transformer Engine 1.2.0 stage('Transformer Engine installation') { steps { sh 'git clone https://github.com/NVIDIA/TransformerEngine.git && \ cd TransformerEngine && \ - git fetch origin 8c9abbb80dba196f086b8b602a7cf1bce0040a6a && \ + git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \ git checkout FETCH_HEAD && \ git submodule init && git submodule update && \ NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .' 
} } - // Apex bugfix for PyTorch 23.11 container: https://github.com/NVIDIA/apex/pull/1760 stage('Apex installation') { steps { sh 'git clone https://github.com/NVIDIA/apex.git && \ cd apex && \ - git checkout c07a4cf67102b9cd3f97d1ba36690f985bae4227 && \ + git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \ cp -R apex /usr/local/lib/python3.10/dist-packages' } } - stage('Pytorch lightning installation') { - steps { - sh 'git clone -b bug_fix https://github.com/athitten/pytorch-lightning.git && \ - cd pytorch-lightning && \ - PACKAGE_NAME=pytorch pip install -e .' - } - } - - // pip package should be working with main, if not we can update the commit here - // until the pip package is updated stage('Megatron Core installation') { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout a5415fcfacef2a37416259bd38b7c4b673583675 && \ - pip install .' + git checkout 7fe863f3d94f7b64a927b04b85f5c9339d3fb784 && \ + pip install . && \ + cd megatron/core/datasets && \ + make' + sh 'export PYTHONPATH="${PYTHONPATH}:/mnt/D3/JenkinsWorkDir/workspace/NeMo-multibranch_${GIT_BRANCH}/Megatron-LM"' } } @@ -217,48 +208,48 @@ pipeline { sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" } } - stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') { - when { - anyOf { - branch 'main' - changeRequest target: 'main' - } - } - failFast true - steps { - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ - trainer.precision=16 \ - trainer.num_nodes=1 \ - trainer.devices=1 \ - ++exp_manager.max_time_per_run=00:00:03:00 \ - exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \ - trainer.max_steps=20 \ - model.micro_batch_size=1 \ - model.global_batch_size=1 \ - model.data.synthetic_data=True \ - model.first_stage_key=images_moments \ - model.cond_stage_key=clip_encoded \ - model.optim.name=megatron_fused_adam \ - +model.optim.capturable=True \ - exp_manager.ema.enable=False \ - model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ - ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ - ++model.cond_stage_config.max_length=77 \ - model.inductor=False \ - ~model.cond_stage_config.restore_from_path \ - ~model.cond_stage_config.freeze \ - ~model.cond_stage_config.layer \ - model.first_stage_config.from_pretrained=null \ - model.ddp_overlap=False \ - model.capture_cudagraph_iters=15 \ - model.unet_config.use_flash_attention=False \ - model.unet_config.attention_resolutions=[1] \ - model.unet_config.channel_mult=[1] \ - " - sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - } - } + //stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') { + // when { + // anyOf { + // branch 'main' + // changeRequest target: 'main' + // } + // } + // failFast true + // steps { + // sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" + // sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ + // trainer.precision=16 \ + // trainer.num_nodes=1 \ + // trainer.devices=1 \ + // ++exp_manager.max_time_per_run=00:00:03:00 \ + // exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \ + // trainer.max_steps=20 \ + // model.micro_batch_size=1 \ + // model.global_batch_size=1 \ + // model.data.synthetic_data=True \ + // 
model.first_stage_key=images_moments \ + // model.cond_stage_key=clip_encoded \ + // model.optim.name=megatron_fused_adam \ + // +model.optim.capturable=True \ + // exp_manager.ema.enable=False \ + // model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ + // ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ + // ++model.cond_stage_config.max_length=77 \ + // model.inductor=False \ + // ~model.cond_stage_config.restore_from_path \ + // ~model.cond_stage_config.freeze \ + // ~model.cond_stage_config.layer \ + // model.first_stage_config.from_pretrained=null \ + // model.ddp_overlap=False \ + // model.capture_cudagraph_iters=15 \ + // model.unet_config.use_flash_attention=False \ + // model.unet_config.attention_resolutions=[1] \ + // model.unet_config.channel_mult=[1] \ + // " + // sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" + // } + //} // stage('L2: Multimodal ControlNet Train') { // when { // anyOf { @@ -4654,7 +4645,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' model.pipeline_model_parallel_size=1 \ model.tensor_model_parallel_size=2 \ model.sequence_parallel=true \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \ model.peft.peft_scheme='lora' \ model.answer_only_loss=True \ model.micro_batch_size=1 \ diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py index 8862b52ee84b..474761c41d67 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_fim_dataset.py @@ -33,11 +33,6 @@ IMPORT_ERROR = e -# is_dataset_built_on_rank function is needed for mcore GPTDatasetConfig -def is_dataset_built_on_rank(): - return True - - class GPTFIMDatasetConfig(GPTDatasetConfig): """Configuration object for Megatron Core GPT FIM datasets diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py index 5d8ff1d305bd..849438d408a5 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py @@ -227,7 +227,7 @@ def setup(self, stage=None): for i, module in enumerate(self.model): parallel_state.set_virtual_pipeline_model_parallel_rank(i) sync_embeddings = ( - module.initialize_last_stage_with_word_embeddings + module.setup_embeddings_and_output_layer if self.mcore_bert else module.sync_initial_word_embeddings ) @@ -235,7 +235,7 @@ def setup(self, stage=None): parallel_state.set_virtual_pipeline_model_parallel_rank(0) else: sync_embeddings = ( - self.model.initialize_last_stage_with_word_embeddings + self.model.setup_embeddings_and_output_layer if self.mcore_bert else self.model.sync_initial_word_embeddings ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py index 0fed19dd7718..749d960b9729 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py @@ -347,17 +347,19 @@ def __init__(self, 
transformer_block_type='pre-ln', add_pooler=True, *args, **kw # Output if self.post_process: # TODO: Make sure you are passing in the mpu_vocab_size properly - self.lm_head = MCoreBertLMHead( + self.lm_head = MCoreBertLMHead(self.config.hidden_size, self.config,) + + self.output_layer = tensor_parallel.ColumnParallelLinear( self.config.hidden_size, - self.config, - self.parallel_output, self.vocab_size, - self.pre_process, - self.share_embeddings_and_output_weights, + config=self.config, + init_method=self.config.init_method, + bias=True, + skip_bias_add=False, + gather_output=not self.parallel_output, + skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights, ) - self.output_layer = self.lm_head.output_layer - self.binary_head = None if self.add_binary_head: # TODO: Shoudl switch this to TE ? @@ -412,7 +414,8 @@ def forward( if self.share_embeddings_and_output_weights: output_weight = self.shared_embedding_or_output_weight() - logits = self.lm_head(hidden_states=hidden_states, word_embeddings_weight=output_weight) + hidden_states_after_lm_head = self.lm_head(hidden_states=hidden_states) + logits, _ = self.output_layer(hidden_states_after_lm_head, weight=output_weight) binary_logits = None if self.binary_head is not None and self.add_pooler: diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py index 31fd62126c15..58ea9c26fbcf 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_spec.py @@ -26,6 +26,7 @@ ) from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec @@ -51,6 +52,8 @@ linear_qkv=TEColumnParallelLinear, core_attention=TEDotProductAttention, linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, ), ), self_attn_bda=get_bias_dropout_add, @@ -71,7 +74,11 @@ module=SelfAttention, params={"attn_mask_type": AttnMaskType.padding}, submodules=SelfAttentionSubmodules( - linear_qkv=ColumnParallelLinear, core_attention=DotProductAttention, linear_proj=RowParallelLinear, + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, ), ), self_attn_bda=get_bias_dropout_add, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py index 924e5f4321e6..cf0c4c4d99ef 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/falcon/falcon_spec.py @@ -24,9 +24,9 @@ TERowParallelLinear, ) from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules from megatron.core.transformer.spec_utils import ModuleSpec - from megatron.core.transformer.transformer_layer import TransformerLayerSubmodules HAVE_MEGATRON_CORE = True @@ -39,6 +39,7 @@ from .falcon_decoder_layer import FalconTransformerLayer + # Use this spec for an implementation using modules in TE def 
get_falcon_layer_spec() -> ModuleSpec: if not HAVE_MEGATRON_CORE: @@ -54,6 +55,8 @@ def get_falcon_layer_spec() -> ModuleSpec: linear_qkv=TEColumnParallelLinear, core_attention=TEDotProductAttention, linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, ), ), self_attn_bda=get_bias_dropout_add, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index a6d422a3f2d4..19766e4a34ca 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -287,7 +287,7 @@ def _get_layer_offset(self): return offset - def sharded_state_dict(self, prefix: str = '', sharded_offsets: tuple = ()): + def sharded_state_dict(self, prefix: str = '', sharded_offsets: tuple = (), metadata=None): TENSOR_PARALLEL_LAYERS_AXIS_MAP = { 'self_attention.layernorm_qkv.weight': 0, 'self_attention.layernorm_qkv.bias': 0, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_ammo_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_ammo_spec.py new file mode 100644 index 000000000000..e51ecaba463a --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_ammo_spec.py @@ -0,0 +1,77 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add + from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear + from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules + from megatron.core.transformer.custom_layers.transformer_engine import TENorm + from megatron.core.transformer.dot_product_attention import DotProductAttention + from megatron.core.transformer.enums import AttnMaskType + from megatron.core.transformer.identity_op import IdentityOp + from megatron.core.transformer.mlp import MLP, MLPSubmodules + from megatron.core.transformer.spec_utils import ModuleSpec + from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError) as e: + + TransformerLayer = TransformerLayerSubmodules = ApexGuardDefaults + MLP = MLPSubmodules = ModuleSpec = IdentityOp = ApexGuardDefaults + AttnMaskType = DotProductAttention = TENorm = ApexGuardDefaults + ColumnParallelLinear = RowParallelLinear = SelfAttention = SelfAttentionSubmodules = ApexGuardDefaults + + HAVE_MEGATRON_CORE = False + IMPORT_ERROR = e + +# Use this spec for AMMO PTQ and TensorRT-LLM export +def get_gpt_layer_ammo_spec() -> ModuleSpec: + """Mix the native spec with TENorm. + + This is essentially the native local spec except for the layernorm implementation + is using TENorm from Transformer-Engine. 
This TENorm supports both FusedLayerNorm and RMSNorm and + prevents the apex dependency. + """ + if not HAVE_MEGATRON_CORE: + raise Exception(IMPORT_ERROR) + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + input_layernorm=TENorm, + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.causal}, + submodules=SelfAttentionSubmodules( + linear_qkv=ColumnParallelLinear, + core_attention=DotProductAttention, + linear_proj=RowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), + ), + self_attn_bda=get_bias_dropout_add, + pre_mlp_layernorm=TENorm, + mlp=ModuleSpec( + module=MLP, submodules=MLPSubmodules(linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear,), + ), + mlp_bda=get_bias_dropout_add, + # Map TE-layernorm-fusion keys back + sharded_state_dict_keys_map={ + 'input_layernorm.': 'self_attention.linear_qkv.layer_norm_', + 'pre_mlp_layernorm.': 'mlp.linear_fc1.layer_norm_', + }, + ), + ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index fb02223112d6..82b2b1a96ff4 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -806,7 +806,7 @@ def setup(self, stage=None): if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: parallel_state.set_virtual_pipeline_model_parallel_rank(index) sync_embeddings = ( - module.initialize_last_stage_with_word_embeddings + module.setup_embeddings_and_output_layer if self.mcore_bert else module.sync_initial_word_embeddings ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index ae33cc6761e9..6648abac8ee0 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -36,15 +36,12 @@ MegatronPretrainingSampler, ) from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import build_train_valid_test_datasets -from nemo.collections.nlp.data.language_modeling.megatron.gpt_fim_dataset import ( - GPTFIMDataset, - GPTFIMDatasetConfig, - is_dataset_built_on_rank, -) +from nemo.collections.nlp.data.language_modeling.megatron.gpt_fim_dataset import GPTFIMDataset, GPTFIMDatasetConfig from nemo.collections.nlp.models.language_modeling.megatron.falcon.falcon_spec import get_falcon_layer_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_full_te_layer_autocast_spec import ( get_gpt_full_te_layer_autocast_spec, ) +from nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_ammo_spec import get_gpt_layer_ammo_spec from nemo.collections.nlp.models.language_modeling.megatron.gpt_model import GPTModel from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel from nemo.collections.nlp.modules.common.megatron.build_model import build_model @@ -92,7 +89,9 @@ from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset - from megatron.core.deploy.gpt.model_specs import get_gpt_layer_ammo_spec + + # NeMo's implementation of the get_gpt_layer_ammo_spec function is temporarily 
used + # from megatron.core.inference.gpt.model_specs import get_gpt_layer_ammo_spec from megatron.core.models.gpt import GPTModel as MCoreGPTModel from megatron.core.models.gpt.gpt_layer_specs import ( get_gpt_layer_local_spec, @@ -1375,9 +1374,11 @@ def build_train_valid_test_datasets(self): tokenizer=self.tokenizer, ) else: + # Function needed for mcore GPTDataset + is_dataset_built_on_rank = lambda: True + mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False kwargs = { - "is_built_on_rank": is_dataset_built_on_rank, "random_seed": self.cfg.seed, "sequence_length": self.cfg.data.seq_length, "path_to_cache": self.cfg.data.index_mapping_dir, @@ -1399,17 +1400,14 @@ def build_train_valid_test_datasets(self): if self.cfg.data.get('add_fim', False): dataset_config = GPTFIMDatasetConfig(self.cfg.data.fim, **kwargs) - - self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( - GPTFIMDataset, train_valid_test_num_samples, dataset_config, - ).build() + dataset_type = GPTFIMDataset else: dataset_config = GPTDatasetConfig(**kwargs) dataset_type = MockGPTDataset if mock_dataset else GPTDataset - self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( - dataset_type, train_valid_test_num_samples, dataset_config, - ).build() + self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( + dataset_type, train_valid_test_num_samples, is_dataset_built_on_rank, dataset_config, + ).build() if self._train_ds is not None: logging.info(f'Length of train dataset: {len(self._train_ds)}') @@ -1746,7 +1744,7 @@ def initialize_last_rank_embeddings(self): if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: parallel_state.set_virtual_pipeline_model_parallel_rank(index) sync_embeddings = ( - module.initialize_last_stage_with_word_embeddings + module.setup_embeddings_and_output_layer if self.mcore_gpt else module.sync_initial_word_embeddings ) From b33af25bdd1425eb42b96dc3aa06211c830b5278 Mon Sep 17 00:00:00 2001 From: Vladimir Bataev Date: Wed, 10 Apr 2024 21:03:16 +0400 Subject: [PATCH 08/39] Use Label-Looping algorithm for RNN-T decoding by default (#8831) * Use Label-Looping algorithm for RNN-T decoding by default * Fix loop labels + stateless decoding --------- Signed-off-by: Vladimir Bataev --- nemo/collections/asr/modules/rnnt.py | 4 +- .../asr/parts/submodules/rnnt_decoding.py | 2 +- .../parts/submodules/rnnt_greedy_decoding.py | 8 +- .../submodules/rnnt_loop_labels_computer.py | 15 +- .../submodules/tdt_loop_labels_computer.py | 15 +- .../test_asr_hybrid_rnnt_ctc_model_char.py | 63 ++++++-- .../asr/test_asr_rnnt_encdec_model.py | 139 ++++++++++++++---- 7 files changed, 173 insertions(+), 73 deletions(-) diff --git a/nemo/collections/asr/modules/rnnt.py b/nemo/collections/asr/modules/rnnt.py index 948760e68b30..5a7457f6379d 100644 --- a/nemo/collections/asr/modules/rnnt.py +++ b/nemo/collections/asr/modules/rnnt.py @@ -310,7 +310,9 @@ def score_hypothesis( def initialize_state(self, y: torch.Tensor) -> List[torch.Tensor]: batch = y.size(0) - state = [torch.ones([batch, self.context_size], dtype=torch.long, device=y.device) * self.blank_idx] + # state contains context_size - 1 elements for each utterance in batch, + # consistent with the state returned from StatelessNet.forward + state = [torch.ones([batch, self.context_size - 1], dtype=torch.long, device=y.device) * self.blank_idx] return state def batch_initialize_states(self, batch_states: List[torch.Tensor], 
decoder_states: List[List[torch.Tensor]]): diff --git a/nemo/collections/asr/parts/submodules/rnnt_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_decoding.py index ad71e5371f01..7a260f3c6c89 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_decoding.py @@ -319,7 +319,7 @@ def __init__(self, decoding_cfg, decoder, joint, blank_id: int): preserve_alignments=self.preserve_alignments, preserve_frame_confidence=self.preserve_frame_confidence, confidence_method_cfg=self.confidence_method_cfg, - loop_labels=self.cfg.greedy.get('loop_labels', False), + loop_labels=self.cfg.greedy.get('loop_labels', True), use_cuda_graph_decoder=self.cfg.greedy.get('use_cuda_graph_decoder', False), ) else: diff --git a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py index d69ed1c41049..464dc46e358c 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py +++ b/nemo/collections/asr/parts/submodules/rnnt_greedy_decoding.py @@ -568,9 +568,9 @@ class GreedyBatchedRNNTInfer(_GreedyRNNTInfer): - 'lin' for using the linear mapping. - 'exp' for using exponential mapping with linear shift. loop_labels: Switching between decoding algorithms. Both algorithms produce equivalent results. - loop_labels=True algorithm is faster (especially for large batches) but can use a bit more memory + loop_labels=True (default) algorithm is faster (especially for large batches) but can use a bit more memory (negligible overhead compared to the amount of memory used by the encoder). - loop_labels=False (default) is an implementation of a traditional decoding algorithm, which iterates over + loop_labels=False is an implementation of a traditional decoding algorithm, which iterates over frames (encoder output vectors), and in the inner loop, decodes labels for the current frame one by one, stopping when is found. 
loop_labels=True iterates over labels, on each step finding the next non-blank label @@ -588,7 +588,7 @@ def __init__( preserve_alignments: bool = False, preserve_frame_confidence: bool = False, confidence_method_cfg: Optional[DictConfig] = None, - loop_labels: bool = False, + loop_labels: bool = True, use_cuda_graph_decoder: bool = False, ): super().__init__( @@ -2299,7 +2299,7 @@ class GreedyBatchedRNNTInferConfig: preserve_alignments: bool = False preserve_frame_confidence: bool = False confidence_method_cfg: Optional[ConfidenceMethodConfig] = field(default_factory=lambda: ConfidenceMethodConfig()) - loop_labels: bool = False + loop_labels: bool = True use_cuda_graph_decoder: bool = False def __post_init__(self): diff --git a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py index 89b474e0f8ba..92cb8a36aeb5 100644 --- a/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/rnnt_loop_labels_computer.py @@ -283,21 +283,12 @@ def loop_labels_torch( became_inactive_mask = torch.empty_like(active_mask) # loop while there are active utterances - first_step = True while active_mask.any(): active_mask_prev.copy_(active_mask, non_blocking=True) # stage 1: get decoder (prediction network) output - if first_step: - # start of the loop, SOS symbol is passed into prediction network, state is None - # we need to separate this for torch.jit - decoder_output, state, *_ = self.decoder.predict( - labels.unsqueeze(1), None, add_sos=False, batch_size=batch_size - ) - first_step = False - else: - decoder_output, state, *_ = self.decoder.predict( - labels.unsqueeze(1), state, add_sos=False, batch_size=batch_size - ) + decoder_output, state, *_ = self.decoder.predict( + labels.unsqueeze(1), state, add_sos=False, batch_size=batch_size + ) decoder_output = self.joint.project_prednet(decoder_output) # do not recalculate joint projection # stage 2: get joint output, iteratively seeking for non-blank labels diff --git a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py index e95ea48d15fe..c289ce06cdfa 100644 --- a/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py +++ b/nemo/collections/asr/parts/submodules/tdt_loop_labels_computer.py @@ -294,21 +294,12 @@ def loop_labels_torch( became_inactive_mask = torch.empty_like(active_mask) # loop while there are active utterances - first_step = True while active_mask.any(): active_mask_prev.copy_(active_mask, non_blocking=True) # stage 1: get decoder (prediction network) output - if first_step: - # start of the loop, SOS symbol is passed into prediction network, state is None - # we need to separate this for torch.jit - decoder_output, state, *_ = self.decoder.predict( - labels.unsqueeze(1), None, add_sos=False, batch_size=batch_size - ) - first_step = False - else: - decoder_output, state, *_ = self.decoder.predict( - labels.unsqueeze(1), state, add_sos=False, batch_size=batch_size - ) + decoder_output, state, *_ = self.decoder.predict( + labels.unsqueeze(1), state, add_sos=False, batch_size=batch_size + ) decoder_output = self.joint.project_prednet(decoder_output) # do not recalculate joint projection # stage 2: get joint output, iteratively seeking for non-blank labels diff --git a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py index 
60f807dc7b3e..85156bf9e2c5 100644 --- a/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py +++ b/tests/collections/asr/test_asr_hybrid_rnnt_ctc_model_char.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy +from typing import Optional import pytest import torch @@ -309,9 +310,14 @@ def test_BeamRNNTInferConfig(self): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding(self, greedy_class): + def test_greedy_decoding(self, greedy_class, loop_labels: Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -330,7 +336,10 @@ def test_greedy_decoding(self, greedy_class): decoder = RNNTDecoder(prednet_cfg, vocab_size) joint_net = RNNTJoint(jointnet_cfg, vocab_size, vocabulary=token_list) - greedy = greedy_class(decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} + greedy = greedy_class( + decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5, **additional_decoding_kwargs + ) # (B, D, T) enc_out = torch.randn(1, encoder_output_size, 30) @@ -381,9 +390,15 @@ def test_greedy_multi_decoding(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_stateless_decoder(self, greedy_class): + @pytest.mark.parametrize("context_size", [1, 2]) + def test_greedy_decoding_stateless_decoder(self, greedy_class, loop_labels: Optional[bool], context_size: int): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -391,7 +406,7 @@ def test_greedy_decoding_stateless_decoder(self, greedy_class): decoder_output_size = 4 joint_output_shape = 4 - prednet_cfg = {'pred_hidden': decoder_output_size, 'pred_rnn_layers': 1} + prednet_cfg = {'pred_hidden': decoder_output_size, 'pred_rnn_layers': 1, 'context_size': context_size} jointnet_cfg = { 'encoder_hidden': encoder_output_size, 'pred_hidden': decoder_output_size, @@ -402,7 +417,10 @@ def test_greedy_decoding_stateless_decoder(self, greedy_class): decoder = StatelessTransducerDecoder(prednet_cfg, vocab_size) joint_net = RNNTJoint(jointnet_cfg, vocab_size, vocabulary=token_list) - greedy = greedy_class(decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} + greedy = greedy_class( + decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5, **additional_decoding_kwargs + ) # (B, D, T) enc_out = torch.randn(1, encoder_output_size, 30) @@ -453,9 +471,14 @@ def test_greedy_multi_decoding_stateless_decoder(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + 
(greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_preserve_alignment(self, greedy_class): + def test_greedy_decoding_preserve_alignment(self, greedy_class, loop_labels: Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -474,8 +497,14 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class): decoder = RNNTDecoder(prednet_cfg, vocab_size) joint_net = RNNTJoint(jointnet_cfg, vocab_size, vocabulary=token_list) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} greedy = greedy_class( - decoder, joint_net, blank_index=len(token_list) - 1, preserve_alignments=True, max_symbols_per_step=5 + decoder, + joint_net, + blank_index=len(token_list) - 1, + preserve_alignments=True, + max_symbols_per_step=5, + **additional_decoding_kwargs, ) # (B, D, T) @@ -591,9 +620,14 @@ def test_beam_decoding_preserve_alignments(self, beam_config): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_SampledRNNTJoint(self, greedy_class): + def test_greedy_decoding_SampledRNNTJoint(self, greedy_class, loop_labels: Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -612,7 +646,10 @@ def test_greedy_decoding_SampledRNNTJoint(self, greedy_class): decoder = RNNTDecoder(prednet_cfg, vocab_size) joint_net = SampledRNNTJoint(jointnet_cfg, vocab_size, n_samples=2, vocabulary=token_list) - greedy = greedy_class(decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} + greedy = greedy_class( + decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5, **additional_decoding_kwargs + ) # (B, D, T) enc_out = torch.randn(1, encoder_output_size, 30) diff --git a/tests/collections/asr/test_asr_rnnt_encdec_model.py b/tests/collections/asr/test_asr_rnnt_encdec_model.py index d7c47adce1ad..d5ab0054ff87 100644 --- a/tests/collections/asr/test_asr_rnnt_encdec_model.py +++ b/tests/collections/asr/test_asr_rnnt_encdec_model.py @@ -73,7 +73,7 @@ def predict( return ( output, [ - torch.tensor([0] * self.vocab_size + [1], dtype=torch.float32)[None, None, :].exand( + torch.tensor([0] * self.vocab_size + [1], dtype=torch.float32)[None, None, :].expand( [1, batch_size, -1] ) ], @@ -90,22 +90,25 @@ def predict( ], ) - def initialize_state(self, y: torch.Tensor) -> Optional[List[torch.Tensor]]: - return None + def initialize_state(self, y: torch.Tensor) -> List[torch.Tensor]: + batch_size = y.shape[0] + # NB: .clone is necessary after .expand, since the decoding algorithm manipulates the state + # (replacing elements), and this requires the state to be a real full tensor + # (not an expanded view, in which different elements can refer to the same memory location) + return [ + torch.tensor([0] * self.vocab_size + [1], dtype=torch.float32)[None, None, :] + .expand([1, batch_size, -1]) + .clone() + ] def score_hypothesis( self, hypothesis: Hypothesis, cache: Dict[Tuple[int], Any] ) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]: return torch.tensor(), [torch.tensor()], torch.tensor() - def batch_select_state( - self, batch_states: Optional[List[torch.Tensor]], idx: int - ) -> 
Optional[List[List[torch.Tensor]]]: - if batch_states is not None: - states = [batch_states[0][:, idx]] - return [states] - else: - return None + def batch_select_state(self, batch_states: List[torch.Tensor], idx: int) -> Optional[List[List[torch.Tensor]]]: + states = [batch_states[0][:, idx]] + return [states] def batch_copy_states( self, @@ -126,6 +129,22 @@ def mask_select_states( return None return [states[0][:, mask]] + @classmethod + def batch_replace_states_mask( + cls, src_states: list[torch.Tensor], dst_states: list[torch.Tensor], mask: torch.Tensor, + ): + """Replace states in dst_states with states from src_states using the mask""" + for src_substate, dst_substate in zip(src_states, dst_states): + torch.where(mask.unsqueeze(0).unsqueeze(-1), src_substate, dst_substate, out=dst_substate) + + @classmethod + def batch_split_states(cls, batch_states: list[torch.Tensor]) -> list[list[torch.Tensor]]: + """ + Split states into a list of states. + Useful for splitting the final state for converting results of the decoding algorithm to Hypothesis class. + """ + return [sub_state.split(1, dim=1) for sub_state in batch_states] + class DummyRNNTJoint(AbstractRNNTJoint): def __init__(self, num_outputs: int): super().__init__() @@ -621,9 +640,15 @@ def test_greedy_multi_decoding(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_stateless_decoder(self, greedy_class): + @pytest.mark.parametrize("context_size", [1, 2]) + def test_greedy_decoding_stateless_decoder(self, greedy_class, loop_labels: Optional[bool], context_size: int): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -631,7 +656,7 @@ def test_greedy_decoding_stateless_decoder(self, greedy_class): decoder_output_size = 4 joint_output_shape = 4 - prednet_cfg = {'pred_hidden': decoder_output_size, 'pred_rnn_layers': 1} + prednet_cfg = {'pred_hidden': decoder_output_size, 'pred_rnn_layers': 1, 'context_size': context_size} jointnet_cfg = { 'encoder_hidden': encoder_output_size, 'pred_hidden': decoder_output_size, @@ -642,8 +667,14 @@ def test_greedy_decoding_stateless_decoder(self, greedy_class): decoder = StatelessTransducerDecoder(prednet_cfg, vocab_size) for joint_type in [RNNTJoint, HATJoint]: joint_net = joint_type(jointnet_cfg, vocab_size, vocabulary=token_list) - - greedy = greedy_class(decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} + greedy = greedy_class( + decoder, + joint_net, + blank_index=len(token_list) - 1, + max_symbols_per_step=5, + **additional_decoding_kwargs, + ) # (B, D, T) enc_out = torch.randn(1, encoder_output_size, 30) @@ -696,9 +727,14 @@ def test_greedy_multi_decoding_stateless_decoder(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_preserve_alignment(self, greedy_class): + def test_greedy_decoding_preserve_alignment(self, greedy_class, loop_labels: 
Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -719,13 +755,14 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class): max_symbols_per_step = 5 for joint_type in [RNNTJoint, HATJoint]: joint_net = joint_type(jointnet_cfg, vocab_size, vocabulary=token_list) - + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} greedy = greedy_class( decoder, joint_net, blank_index=len(token_list), preserve_alignments=True, max_symbols_per_step=max_symbols_per_step, + **additional_decoding_kwargs, ) # (B, D, T) @@ -760,9 +797,14 @@ def test_greedy_decoding_preserve_alignment(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_preserve_frame_confidence(self, greedy_class): + def test_greedy_decoding_preserve_frame_confidence(self, greedy_class, loop_labels: Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -784,12 +826,14 @@ def test_greedy_decoding_preserve_frame_confidence(self, greedy_class): for joint_type in [RNNTJoint, HATJoint]: joint_net = joint_type(jointnet_cfg, vocab_size, vocabulary=token_list) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} greedy = greedy_class( decoder, joint_net, blank_index=len(token_list), preserve_frame_confidence=True, max_symbols_per_step=max_symbols_per_step, + **additional_decoding_kwargs, ) # (B, D, T) @@ -827,10 +871,17 @@ def test_greedy_decoding_preserve_frame_confidence(self, greedy_class): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) @pytest.mark.parametrize("max_symbols_per_step", [1, 5]) - def test_greedy_decoding_max_symbols_alignment(self, max_symbols_setup, greedy_class, max_symbols_per_step): + def test_greedy_decoding_max_symbols_alignment( + self, max_symbols_setup, greedy_class, max_symbols_per_step: int, loop_labels: Optional[bool] + ): decoders = [max_symbols_setup["decoder"]] if greedy_class is greedy_decode.GreedyBatchedRNNTInfer: decoders.append(max_symbols_setup["decoder_masked"]) @@ -839,12 +890,14 @@ def test_greedy_decoding_max_symbols_alignment(self, max_symbols_setup, greedy_c encoded_lengths = max_symbols_setup["encoded_lengths"] for decoder in decoders: + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} greedy = greedy_class( decoder_model=decoder, joint_model=joint, blank_index=decoder.blank_idx, max_symbols_per_step=max_symbols_per_step, preserve_alignments=True, + **additional_decoding_kwargs, ) with torch.no_grad(): @@ -869,10 +922,17 @@ def test_greedy_decoding_max_symbols_alignment(self, max_symbols_setup, greedy_c ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) 
@pytest.mark.parametrize("max_symbols_per_step", [-1, 0]) - def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_class, max_symbols_per_step): + def test_greedy_decoding_max_symbols_confidence_incorrect_max_symbols( + self, max_symbols_setup, greedy_class, max_symbols_per_step: int, loop_labels: Optional[bool] + ): """Test ValueError for max_symbols_per_step <= 0""" decoders = [max_symbols_setup["decoder"]] if greedy_class is greedy_decode.GreedyBatchedRNNTInfer: @@ -880,6 +940,7 @@ def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_ joint = max_symbols_setup["joint"] for decoder in decoders: + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} with pytest.raises(ValueError): _ = greedy_class( decoder_model=decoder, @@ -887,6 +948,7 @@ def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_ blank_index=decoder.blank_idx, max_symbols_per_step=max_symbols_per_step, preserve_frame_confidence=True, + **additional_decoding_kwargs, ) @pytest.mark.skipif( @@ -894,10 +956,17 @@ def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_ ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) @pytest.mark.parametrize("max_symbols_per_step", [1, 5]) - def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_class, max_symbols_per_step): + def test_greedy_decoding_max_symbols_confidence( + self, max_symbols_setup, greedy_class, max_symbols_per_step: int, loop_labels: Optional[bool] + ): decoders = [max_symbols_setup["decoder"]] if greedy_class is greedy_decode.GreedyBatchedRNNTInfer: decoders.append(max_symbols_setup["decoder_masked"]) @@ -906,12 +975,14 @@ def test_greedy_decoding_max_symbols_confidence(self, max_symbols_setup, greedy_ encoded_lengths = max_symbols_setup["encoded_lengths"] for decoder in decoders: + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} greedy = greedy_class( decoder_model=decoder, joint_model=joint, blank_index=decoder.blank_idx, max_symbols_per_step=max_symbols_per_step, preserve_frame_confidence=True, + **additional_decoding_kwargs, ) with torch.no_grad(): @@ -1035,9 +1106,14 @@ def test_beam_decoding_preserve_alignments(self, beam_config): ) @pytest.mark.unit @pytest.mark.parametrize( - "greedy_class", [greedy_decode.GreedyRNNTInfer, greedy_decode.GreedyBatchedRNNTInfer], + ("greedy_class", "loop_labels"), + [ + (greedy_decode.GreedyRNNTInfer, None), + (greedy_decode.GreedyBatchedRNNTInfer, True), + (greedy_decode.GreedyBatchedRNNTInfer, False), + ], ) - def test_greedy_decoding_SampledRNNTJoint(self, greedy_class): + def test_greedy_decoding_SampledRNNTJoint(self, greedy_class, loop_labels: Optional[bool]): token_list = [" ", "a", "b", "c"] vocab_size = len(token_list) @@ -1056,7 +1132,10 @@ def test_greedy_decoding_SampledRNNTJoint(self, greedy_class): decoder = RNNTDecoder(prednet_cfg, vocab_size) joint_net = SampledRNNTJoint(jointnet_cfg, vocab_size, n_samples=2, vocabulary=token_list) - greedy = greedy_class(decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5) + additional_decoding_kwargs = {} if loop_labels is None else {"loop_labels": loop_labels} + greedy = greedy_class( + 
decoder, joint_net, blank_index=len(token_list) - 1, max_symbols_per_step=5, **additional_decoding_kwargs + ) # (B, D, T) enc_out = torch.randn(1, encoder_output_size, 30) From dd75285b295c5d5e71ea27bf9ddd74dbbd99c87a Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 10 Apr 2024 11:19:20 -0700 Subject: [PATCH 09/39] Cancel old runs for PR commit update (#8874) --- .github/workflows/cicd-main.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 309e7936ee3b..550defff7814 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -15,9 +15,12 @@ name: "CICD NeMo" on: pull_request: - types: [opened, reopened, ready_for_review] branches: [ "main" ] +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: gpu-test: runs-on: self-hosted-azure From 7c07a8de3743c4c03fb6727567cf1c3de3e7d193 Mon Sep 17 00:00:00 2001 From: Chen Cui Date: Wed, 10 Apr 2024 16:36:02 -0400 Subject: [PATCH 10/39] Fix packed seq doc math rendering issue (#8832) * Fix packed seq doc math rendering issue Signed-off-by: Chen Cui * Fix packed seq doc math rendering issue Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui --- docs/source/nlp/nemo_megatron/packed_sequence.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/nlp/nemo_megatron/packed_sequence.rst b/docs/source/nlp/nemo_megatron/packed_sequence.rst index 23c8976d4f5e..e31444fe1e60 100644 --- a/docs/source/nlp/nemo_megatron/packed_sequence.rst +++ b/docs/source/nlp/nemo_megatron/packed_sequence.rst @@ -123,7 +123,7 @@ To train with packed sequences, you need to change four items in the SFT/PEFT co preprocessing step. You can increase the ``pack_size`` to achieve the same purpose of increasing micro batch size. - Global batch size has to be adjusted so that the training recipe is maintained. Because each pack contains multiple sequences now, global batch size needs to be reduced by the average number of sequences per pack ``n``, - where :math:`n = \frac{# sequences in dataset}{# packs}`. This ensures that each gradient iteration sees (on + where ``n = num_sequences_in_dataset / num_packs``. This ensures that each gradient iteration sees (on average) the same number of tokens. The value of ``n`` is printed out when the script is run. .. 
code-block:: bash From f7941cbd41697291cbee714c2182a18b70b85755 Mon Sep 17 00:00:00 2001 From: Tim Moon <4406448+timmoon10@users.noreply.github.com> Date: Wed, 10 Apr 2024 16:06:37 -0500 Subject: [PATCH 11/39] Move logic for distopt FP32 grads to models (#8867) * Move logic for FP32 embedding grads to models Signed-off-by: Tim Moon * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Tim Moon Signed-off-by: Tim Moon <4406448+timmoon10@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- .../language_modeling/megatron_base_model.py | 27 ------- .../language_modeling/megatron_gpt_model.py | 72 ++++++++++--------- .../megatron_lm_encoder_decoder_model.py | 5 ++ 3 files changed, 44 insertions(+), 60 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 854c5ee02e31..980ea8f9f76d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -768,33 +768,6 @@ def get_config_arg(key: str, default_value: Optional[Any] = None) -> Any: optim_dtype = str_to_dtype(get_config_arg('dtype', torch.float32)) optim_kwargs['dtype'] = optim_dtype - # Make sure embedding grad reductions are in FP32 - if optim_dtype == torch.float32: - fp32_params = [] - modules = self.get_model_module_list() - if parallel_state.is_pipeline_first_stage(ignore_virtual=True): - if self.mcore_gpt: - fp32_params.append(modules[0].shared_embedding_or_output_weight()) - if modules[0].embedding.add_position_embedding: - fp32_params.append(modules[0].embedding.position_embeddings.weight) - else: - fp32_params.append(modules[0].word_embeddings_weight()) - fp32_params.append(modules[0].position_embeddings_weight()) - if parallel_state.is_pipeline_last_stage(ignore_virtual=True): - share_embeddings_and_output_weights = ( - modules[-1].share_embeddings_and_output_weights - if self.mcore_gpt - else modules[-1].share_token_embeddings - ) - if share_embeddings_and_output_weights: - if self.mcore_gpt: - fp32_params.append(modules[-1].shared_embedding_or_output_weight()) - else: - fp32_params.append(modules[-1].word_embeddings_weight()) - for param in fp32_params: - if param is not None: - param._with_fp32_optimizer = True - # Match param allgather with model dtype model_dtype = torch.float32 if self.megatron_amp_O2 and hasattr(self, 'autocast_dtype'): diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 6648abac8ee0..8d1d428a9989 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -102,9 +102,6 @@ from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import drain_embedding_wgrad_compute, init_method_normal, scaled_init_method_normal - # TODO @tmoon: Use once available in Megatron-LM - # from megatron.core.pipeline_parallel.schedules import DataIteratorList - HAVE_MEGATRON_CORE = True except (ImportError, ModuleNotFoundError): @@ -494,36 +491,45 @@ def configure_optimizers(self): if self.with_distributed_adam: - # Disable overlapped grad sync for embedding grad when - # pipeline parallelism is enabled - if 
parallel_state.get_pipeline_model_parallel_world_size() > 1: - modules = self.get_model_module_list() - if parallel_state.is_pipeline_first_stage(ignore_virtual=True): - if len(modules) > 1: - module = modules[0] # only the first virtual rank has the embeddings - else: - module = modules[0] - if self.cfg.get('share_embeddings_and_output_weights', True): - param = ( - module.shared_embedding_or_output_weight() - if self.mcore_gpt - else module.word_embeddings_weight() - ) - param._disable_greedy_grad_copy = not self.megatron_amp_O2 - param._disable_overlap_grad_sync = True - if parallel_state.is_pipeline_last_stage(ignore_virtual=True): - if len(modules) > 1: - module = modules[-1] # only the last virtual rank has the embeddings - else: - module = modules[0] - if self.cfg.get('share_embeddings_and_output_weights', True): - param = ( - module.shared_embedding_or_output_weight() - if self.mcore_gpt - else module.word_embeddings_weight() - ) - param._disable_greedy_grad_copy = not self.megatron_amp_O2 - param._disable_overlap_grad_sync = True + # Special handling for embedding grads + modules = self.get_model_module_list() + if parallel_state.is_pipeline_first_stage(ignore_virtual=True): + module = modules[0] # first virtual rank has the embeddings + + # Word embeddings: use FP32 grads and disable + # overlapped grad sync with pipeline parallelism + word_embeddings = ( + module.shared_embedding_or_output_weight() if self.mcore_gpt else module.word_embeddings_weight() + ) + word_embeddings._with_fp32_optimizer = True + if parallel_state.get_pipeline_model_parallel_world_size() > 1 and self.cfg.get( + 'share_embeddings_and_output_weights', True + ): + word_embeddings._disable_greedy_grad_copy = not self.megatron_amp_O2 + word_embeddings._disable_overlap_grad_sync = True + + # Position embeddings: use FP32 grads + position_embeddings = None + if self.mcore_gpt: + if module.embedding.add_position_embedding: + position_embeddings = module.embedding.position_embeddings.weight + else: + position_embeddings = module.position_embeddings_weight() + if position_embeddings is not None: + position_embeddings._with_fp32_optimizer = True + + # Handle case where embeddings are used in output layer + if parallel_state.is_pipeline_last_stage(ignore_virtual=True) and self.cfg.get( + 'share_embeddings_and_output_weights', True + ): + module = modules[-1] # last virtual rank has the embeddings + word_embeddings = ( + module.shared_embedding_or_output_weight() if self.mcore_gpt else module.word_embeddings_weight() + ) + word_embeddings._with_fp32_optimizer = True + if parallel_state.get_pipeline_model_parallel_world_size() > 1: + word_embeddings._disable_greedy_grad_copy = not self.megatron_amp_O2 + word_embeddings._disable_overlap_grad_sync = True # Disable overlapped grad sync for layer norm grads when # sequence parallelism is enabled diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 651034c91520..3a7ad3d6714c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -189,6 +189,11 @@ def configure_optimizers(self): param._disable_greedy_grad_copy = not self.megatron_amp_O2 param._disable_overlap_grad_sync = True + # Make sure embedding grads are reduced in FP32 + for name, param in self.named_parameters(): + if 'word_embedding' in name or 
'position_embedding' in name or 'output_layer' in name: + param._with_fp32_optimizer = True + return super().configure_optimizers() def _handle_bias_activation_fusion_args(self, cfg): From 6e2398a896313c8806766129832423b334c8d876 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Wed, 10 Apr 2024 14:35:47 -0700 Subject: [PATCH 12/39] Fix transcription utils function for duration check (#8862) * add none check Signed-off-by: Nithin Rao Koluguri * add for restore func Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- nemo/collections/asr/parts/utils/transcribe_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 980500e9ef00..8465406224e7 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -298,7 +298,7 @@ def prepare_audio_data(cfg: DictConfig) -> Tuple[List[str], bool]: def read_and_maybe_sort_manifest(path: str, try_sort: bool = False) -> List[dict]: """Sorts the manifest if duration key is available for every utterance.""" items = manifest_utils.read_manifest(path) - if try_sort and all("duration" in item for item in items): + if try_sort and all("duration" in item and item["duration"] is not None for item in items): items = sorted(items, reverse=True, key=lambda item: item["duration"]) return items @@ -306,7 +306,7 @@ def read_and_maybe_sort_manifest(path: str, try_sort: bool = False) -> List[dict def restore_transcription_order(manifest_path: str, transcriptions: list) -> list: with open(manifest_path) as f: items = [(idx, json.loads(l)) for idx, l in enumerate(f)] - if not all("duration" in item[1] for item in items): + if not all("duration" in item[1] and item[1]["duration"] is not None for item in items): return transcriptions new2old = [item[0] for item in sorted(items, reverse=True, key=lambda it: it[1]["duration"])] del items # free up some memory From 2c6e65e7dd42751f74fdaa47e8c2bc060e8f29f1 Mon Sep 17 00:00:00 2001 From: Danial Mohseni Taheri <49656670+DanialTaheri@users.noreply.github.com> Date: Wed, 10 Apr 2024 16:29:15 -0700 Subject: [PATCH 13/39] Add clip conv layer (#8838) * Replace einops with ConvLayer Signed-off-by: Danial * Modify the layers Signed-off-by: Danial * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: Danial * Fix version and arch Signed-off-by: Danial * Fix a bug in openclip conversion Signed-off-by: Danial --------- Signed-off-by: Danial Co-authored-by: Danial Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../clip/convert_external_clip_to_nemo.py | 17 +++++------------ .../vision/modules/vit/vit_backbone.py | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py b/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py index 4ac99a951f0d..631b3faa2f47 100644 --- a/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py +++ b/examples/multimodal/vision_language_foundation/clip/convert_external_clip_to_nemo.py @@ -55,8 +55,8 @@ def get_args(): parser = ArgumentParser() - parser.add_argument("--arch", type=str, default="ViT-H-14") - parser.add_argument("--version", type=str, 
default="laion2b_s32b_b79k") + parser.add_argument("--arch", type=str, default="openai/clip-vit-base-patch32") + parser.add_argument("--version", type=str, default="huggingface") parser.add_argument( "--hparams_file", @@ -112,7 +112,6 @@ def mapping_openclip_state_dict(open_model): ".positional_embedding": ".position_embeddings", ".backbone.proj": ".head.weight", ".class_embedding": ".cls_token", - ".backbone.conv1.weight": ".backbone.linear_encoder.weight", } nemo_state_dict = {} @@ -139,9 +138,6 @@ def mapping_openclip_state_dict(open_model): nemo_state_dict["vision_encoder.backbone.cls_token"] = nemo_state_dict[ "vision_encoder.backbone.cls_token" ].reshape(1, 1, -1) - w = nemo_state_dict["vision_encoder.backbone.linear_encoder.weight"] - nemo_state_dict["vision_encoder.backbone.linear_encoder.weight"] = einops.rearrange(w, "b c p1 p2 -> b (p1 p2 c)",) - nemo_state_dict["vision_encoder.backbone.linear_encoder.bias"] = torch.zeros(w.shape[0]) return nemo_state_dict @@ -168,10 +164,10 @@ def mapping_hf_state_dict(hf_model): ".pre_layrnorm.bias": ".preprocess_layernorm.bias", ".post_layernorm.weight": ".transformer.final_layernorm.weight", ".post_layernorm.bias": ".transformer.final_layernorm.bias", - ".backbone.embeddings.position_embedding.weight": ".backbone.position_embeddings", - ".language_model.embeddings.position_embedding.weight": ".language_model.embedding.position_embeddings", + ".backbone.embeddings.position_embedding.weight": ".backbone.position_embeddings.weight", + ".language_model.embeddings.position_embedding.weight": ".language_model.embedding.position_embeddings.weight", ".embeddings.class_embedding": ".cls_token", - ".backbone.embeddings.patch_embedding.weight": ".backbone.linear_encoder.weight", + ".backbone.embeddings.patch_embedding.weight": ".backbone.conv1.weight", ".final_layer_norm.weight": ".encoder.final_layernorm.weight", ".final_layer_norm.bias": ".encoder.final_layernorm.bias", ".embeddings.token_embedding.weight": ".embedding.word_embeddings.weight", @@ -208,9 +204,6 @@ def mapping_hf_state_dict(hf_model): nemo_state_dict["vision_encoder.backbone.cls_token"] = nemo_state_dict[ "vision_encoder.backbone.cls_token" ].reshape(1, 1, -1) - w = nemo_state_dict["vision_encoder.backbone.linear_encoder.weight"] - nemo_state_dict["vision_encoder.backbone.linear_encoder.weight"] = einops.rearrange(w, "b c p1 p2 -> b (p1 p2 c)",) - nemo_state_dict["vision_encoder.backbone.linear_encoder.bias"] = torch.zeros(w.shape[0]) return nemo_state_dict diff --git a/nemo/collections/vision/modules/vit/vit_backbone.py b/nemo/collections/vision/modules/vit/vit_backbone.py index ebd7e0da3e5c..67989f0f5496 100644 --- a/nemo/collections/vision/modules/vit/vit_backbone.py +++ b/nemo/collections/vision/modules/vit/vit_backbone.py @@ -227,8 +227,14 @@ def __init__( torch.nn.init.zeros_(self.cls_token) self.position_ids = torch.arange(self.seq_length).expand(1, -1).cuda() - # Linear encoder - self.linear_encoder = torch.nn.Linear(self.flatten_dim, self.hidden_size) + # Convolution layer + self.conv1 = torch.nn.Conv2d( + in_channels=model_cfg.num_channels, # Number of input channels + out_channels=self.hidden_size, # Number of output channels + kernel_size=(self.patch_dim, self.patch_dim), # Kernel size (height, width) + stride=(self.patch_dim, self.patch_dim), # Stride (height, width) + bias=False, + ) # Disable bias # embedding self.position_embedding_type = model_cfg.get("position_embedding_type", "learned_absolute") @@ -332,12 +338,9 @@ def interpolate_pos_encoding( def forward(self, 
input): if self.pre_process: - rearranged_input = einops.rearrange( - input, "b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1=self.patch_dim, p2=self.patch_dim, - ) - - # [b num_patch patch_dim*patch_dim*c] -> [b, s, h]; s:=num_patch, h:=hidden - encoder_output = self.linear_encoder(rearranged_input) + rearranged_input = self.conv1(input) + rearranged_input = rearranged_input.reshape(rearranged_input.shape[0], rearranged_input.shape[1], -1) + encoder_output = rearranged_input.permute(0, 2, 1) concatenated_tokens = encoder_output if self.class_token: From 1809b61efa95e0440ca7e35c62148c8b4fcc2e9d Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 10 Apr 2024 16:47:53 -0700 Subject: [PATCH 14/39] [Nemo CICD] Update dependencies for container build (#8878) * Cancel old runs for PR commit update * update dependencies for container build * temp for test * update back * Revert "temp for test" This reverts commit 9f9221155412393d05b2c862880f9128a93b26a4. --- .github/workflows/cicd-main.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 550defff7814..5cc990902953 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -73,7 +73,7 @@ jobs: - name: Container setup run: | # Pull base PyTorch container - docker pull nvcr.io/nvidia/pytorch:24.01-py3 + docker pull nvcr.io/nvidia/pytorch:24.02-py3 docker run --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume ${{ github.workspace }}/${{ github.run_id }}:/workspace --volume /mnt/datadrive/TestData:/home/TestData nvcr.io/nvidia/pytorch:24.01-py3 /bin/bash -c ' set -x @@ -93,21 +93,19 @@ jobs: # NeMo Installation ./reinstall.sh release - # Transformer Engine 1.2.0 # Transformer Engine installation git clone https://github.com/NVIDIA/TransformerEngine.git && \ pushd TransformerEngine && \ - git fetch origin 9b2fed514ea419141146f843ab2c84b22b86bfd7 && \ + git fetch origin bfe21c3d68b0a9951e5716fb520045db53419c5e && \ git checkout FETCH_HEAD && \ git submodule init && git submodule update && \ NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . && \ popd - # Apex bugfix for PyTorch 23.11 container: https://github.com/NVIDIA/apex/pull/1760 # Apex installation git clone https://github.com/NVIDIA/apex.git && \ pushd apex && \ - git checkout b496d85fb88a801d8e680872a12822de310951fd && \ + git checkout 810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c && \ cp -R apex /usr/local/lib/python3.10/dist-packages && \ popd @@ -116,12 +114,13 @@ jobs: # Megatron Core installation git clone https://github.com/NVIDIA/Megatron-LM.git && \ pushd Megatron-LM && \ - git checkout 43792028f003ed25a3ee8c5a0d4cad82317d81b5 && \ + git checkout 7fe863f3d94f7b64a927b04b85f5c9339d3fb784 && \ pip install . 
&& \ pushd megatron/core/datasets && \ make && \ popd && \ popd + export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" # Install only for test: L2: Segmentation Tool pushd tools/ctc_segmentation && \ From 2890b3338f18c972246b26487d0d4a18795248fd Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Wed, 10 Apr 2024 20:17:11 -0700 Subject: [PATCH 15/39] Akoumparouli/fix sd train (#8876) * hardcode autocast Signed-off-by: Alexandros Koumparoulis * uncomment sd_train Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis --- Jenkinsfile | 86 +++++++++---------- .../stable_diffusion/sd_train.py | 3 +- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 431bc24907ed..6471fa3d011f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -208,48 +208,48 @@ pipeline { sh "rm -rf /home/TestData/multimodal/stable_diffusion_train" } } - //stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') { - // when { - // anyOf { - // branch 'main' - // changeRequest target: 'main' - // } - // } - // failFast true - // steps { - // sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - // sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ - // trainer.precision=16 \ - // trainer.num_nodes=1 \ - // trainer.devices=1 \ - // ++exp_manager.max_time_per_run=00:00:03:00 \ - // exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \ - // trainer.max_steps=20 \ - // model.micro_batch_size=1 \ - // model.global_batch_size=1 \ - // model.data.synthetic_data=True \ - // model.first_stage_key=images_moments \ - // model.cond_stage_key=clip_encoded \ - // model.optim.name=megatron_fused_adam \ - // +model.optim.capturable=True \ - // exp_manager.ema.enable=False \ - // model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ - // ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ - // ++model.cond_stage_config.max_length=77 \ - // model.inductor=False \ - // ~model.cond_stage_config.restore_from_path \ - // ~model.cond_stage_config.freeze \ - // ~model.cond_stage_config.layer \ - // model.first_stage_config.from_pretrained=null \ - // model.ddp_overlap=False \ - // model.capture_cudagraph_iters=15 \ - // model.unet_config.use_flash_attention=False \ - // model.unet_config.attention_resolutions=[1] \ - // model.unet_config.channel_mult=[1] \ - // " - // sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" - // } - //} + stage('L2: Multimodal Stable Diffusion Train with Cuda Graph') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" + sh "python examples/multimodal/text_to_image/stable_diffusion/sd_train.py \ + trainer.precision=16 \ + trainer.num_nodes=1 \ + trainer.devices=1 \ + ++exp_manager.max_time_per_run=00:00:03:00 \ + exp_manager.exp_dir=/home/TestData/multimodal/stable_diffusion_train_with_cuda_graph \ + trainer.max_steps=20 \ + model.micro_batch_size=1 \ + model.global_batch_size=1 \ + model.data.synthetic_data=True \ + model.first_stage_key=images_moments \ + model.cond_stage_key=clip_encoded \ + model.optim.name=megatron_fused_adam \ + +model.optim.capturable=True \ + exp_manager.ema.enable=False \ + 
model.cond_stage_config._target_=nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder \ + ++model.cond_stage_config.version=openai/clip-vit-large-patch14 \ + ++model.cond_stage_config.max_length=77 \ + model.inductor=False \ + ~model.cond_stage_config.restore_from_path \ + ~model.cond_stage_config.freeze \ + ~model.cond_stage_config.layer \ + model.first_stage_config.from_pretrained=null \ + model.ddp_overlap=False \ + model.capture_cudagraph_iters=15 \ + model.unet_config.use_flash_attention=False \ + model.unet_config.attention_resolutions=[1] \ + model.unet_config.channel_mult=[1] \ + " + sh "rm -rf /home/TestData/multimodal/stable_diffusion_train_with_cuda_graphs" + } + } // stage('L2: Multimodal ControlNet Train') { // when { // anyOf { @@ -5849,4 +5849,4 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' cleanWs() } } -} \ No newline at end of file +} diff --git a/examples/multimodal/text_to_image/stable_diffusion/sd_train.py b/examples/multimodal/text_to_image/stable_diffusion/sd_train.py index 968d9bec2884..b10eda550e9a 100644 --- a/examples/multimodal/text_to_image/stable_diffusion/sd_train.py +++ b/examples/multimodal/text_to_image/stable_diffusion/sd_train.py @@ -83,7 +83,8 @@ def main(cfg) -> None: else: autocast_enabled = True dgrad_dtype = torch.float16 - + # akoumparouli: temp fix. + autocast_enabled = True model = model.cuda() for _ in range(5): with torch.autocast(device_type="cuda", enabled=autocast_enabled, dtype=torch.float16): From 57444ae9b63afc5419a16948f7d032a1a9f7dc7f Mon Sep 17 00:00:00 2001 From: fedorovgv <50668534+fedorovgv@users.noreply.github.com> Date: Thu, 11 Apr 2024 17:10:47 +0300 Subject: [PATCH 16/39] Add Semi Sorted Batching. (#8584) --- docs/source/asr/datasets.rst | 119 +++++---- nemo/collections/asr/data/audio_to_text.py | 7 + nemo/collections/asr/models/ctc_bpe_models.py | 17 ++ nemo/collections/asr/models/ctc_models.py | 17 ++ .../asr/models/hybrid_rnnt_ctc_bpe_models.py | 17 ++ .../collections/asr/models/rnnt_bpe_models.py | 17 ++ nemo/collections/asr/models/rnnt_models.py | 17 ++ .../asr/parts/utils/asr_batching.py | 237 ++++++++++++++++++ nemo/core/optim/lr_scheduler.py | 8 + tests/collections/asr/test_asr_samplers.py | 157 ++++++++++++ 10 files changed, 566 insertions(+), 47 deletions(-) create mode 100644 nemo/collections/asr/parts/utils/asr_batching.py create mode 100644 tests/collections/asr/test_asr_samplers.py diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst index 7b6873de0ed7..7612c6a3f630 100644 --- a/docs/source/asr/datasets.rst +++ b/docs/source/asr/datasets.rst @@ -250,62 +250,49 @@ To enable sharded manifest filename expansion, set the ``shard_manifests`` field ``defer_setup`` flag needs to be true as well, so that the dataloader will be initialized after the DDP and its length can be collected from the distributed workers. +Batching strategies +--------------------- -Conversion to Tarred Datasets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For training ASR models, audios with different lengths may be grouped into a batch. It would make it necessary to use paddings to make all the same length. +These extra paddings is a significant source of computation waste. -You can easily convert your existing NeMo-compatible ASR datasets using the -`conversion script here `_. +Semi Sorted Batching +--------------------- -.. code:: bash +Sorting samples by duration and spliting them into batches speeds up training, but can degrade the quality of the model. 
To avoid quality degradation and maintain some randomness in the partitioning process, we add pseudo noise to the sample length when sorting.
-    python convert_to_tarred_audio_dataset.py \
-        --manifest_path= \
-        --target_dir= \
-        --num_shards=
-        --max_duration= \
-        --min_duration= \
-        --force_codec=flac \
-        --shuffle --shuffle_seed=0
+    .. image:: images/ssb.png
+        :align: center
+        :alt: semi sorted batching
+        :scale: 50%
-.. note:: For extra reduction of storage space at the cost of lossy (but high-quality) compression, you may use ``--force_codec=opus`` instead.
+It may result in a training speedup of more than 40 percent with the same quality. To enable semi sorted batching, add the following lines to the config.
-This script shuffles the entries in the given manifest (if ``--shuffle`` is set, which we recommend), filter
-audio files according to ``min_duration`` and ``max_duration``, and tar the remaining audio files to the directory
-``--target_dir`` in ``n`` shards, along with separate manifest and metadata files.
+    .. code::
-The files in the target directory should look similar to the following:
+        ++model.train_ds.use_semi_sorted_batching=true
+        ++model.train_ds.randomization_factor=0.1
-.. code:: none
+Semi sorted batching is supported by the following models:
-    target_dir/
-    ├── audio_1.tar
-    ├── audio_2.tar
-    ├── ...
-    ├── metadata.yaml
-    ├── tarred_audio_manifest.json
-    ├── sharded_manifests/
-      ├── manifest_1.json
-      ├── ...
-      └── manifest_N.json
+    .. code::
+        nemo.collections.asr.models.EncDecCTCModel
+        nemo.collections.asr.models.EncDecCTCModelBPE
+        nemo.collections.asr.models.EncDecRNNTModel
+        nemo.collections.asr.models.EncDecRNNTBPEModel
+        nemo.collections.asr.models.EncDecHybridRNNTCTCModel
+        nemo.collections.asr.models.EncDecHybridRNNTCTCBPEModel
-Note that file structures are flattened such that all audio files are at the top level in each tarball. This ensures that
-filenames are unique in the tarred dataset and the filepaths do not contain "-sub" and forward slashes in each ``audio_filepath`` are
-simply converted to underscores. For example, a manifest entry for ``/data/directory1/file.wav`` would be ``_data_directory1_file.wav``
-in the tarred dataset manifest, and ``/data/directory2/file.wav`` would be converted to ``_data_directory2_file.wav``.
-
-Sharded manifests are generated by default; this behavior can be toggled via the ``no_shard_manifests`` flag.
+For more details about this algorithm, see the `paper `_ .
 Bucketing Datasets
-------------------
+---------------------
-For training ASR models, audios with different lengths may be grouped into a batch. It would make it necessary to use paddings to make all the same length.
-These extra paddings is a significant source of computation waste. Splitting the training samples into buckets with different lengths and sampling from the same bucket for each batch would increase the computation efficicncy.
+Splitting the training samples into buckets with different lengths and sampling from the same bucket for each batch would increase the computation efficiency.
 It may result into training speeedup of more than 2X.
 To enable and use the bucketing feature, you need to create the bucketing version of the dataset by using `conversion script here `_.
 You may use --buckets_num to specify the number of buckets (Recommend to use 4 to 8 buckets).
 It creates multiple tarred datasets, one per bucket, based on the audio durations.
 The range of [min_duration, max_duration) is split into equal sized buckets.
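The snippet below is a small, self-contained illustration of the semi sorted batching idea described above (add pseudo noise to each duration, sort, then cut the sorted order into batches). It is a sketch for intuition only, not the actual sampler added in ``nemo/collections/asr/parts/utils/asr_batching.py``; the ``semi_sorted_batches`` helper, its arguments, and the example durations are made up for this illustration.

.. code-block:: python

    import random

    def semi_sorted_batches(durations, batch_size, randomization_factor=0.1, seed=0):
        # Perturb each duration by a relative amount controlled by randomization_factor:
        # 0.0 gives fully sorted batches, larger values give more randomness.
        rng = random.Random(seed)
        noisy = [
            (dur * (1.0 + rng.uniform(-randomization_factor, randomization_factor)), idx)
            for idx, dur in enumerate(durations)
        ]
        # Sort by the noisy durations and split the resulting order into batches,
        # so utterances of similar length tend to share a batch (less padding).
        order = [idx for _, idx in sorted(noisy)]
        return [order[i : i + batch_size] for i in range(0, len(order), batch_size)]

    durations = [2.1, 14.0, 2.3, 13.5, 7.0, 6.8, 2.2, 13.9]
    print(semi_sorted_batches(durations, batch_size=2))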
- To enable the bucketing feature in the dataset section of the config files, you need to pass the multiple tarred datasets as a list of lists. If user passes just a list of strings, then the datasets would simply get concatenated which would be different from bucketing. Here is an example for 4 buckets and 512 shards: @@ -352,6 +339,50 @@ The fully_randomized strategy would have lower speedup than synced_randomized bu Bucketing may improve the training speed more than 2x but may affect the final accuracy of the model slightly. Training for more epochs and using 'synced_randomized' strategy help to fill this gap. Currently bucketing feature is just supported for tarred datasets. + +Conversion to Tarred Datasets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can easily convert your existing NeMo-compatible ASR datasets using the +`conversion script here `_. + +.. code:: bash + + python convert_to_tarred_audio_dataset.py \ + --manifest_path= \ + --target_dir= \ + --num_shards= + --max_duration= \ + --min_duration= \ + --shuffle --shuffle_seed=0 + +This script shuffles the entries in the given manifest (if ``--shuffle`` is set, which we recommend), filter +audio files according to ``min_duration`` and ``max_duration``, and tar the remaining audio files to the directory +``--target_dir`` in ``n`` shards, along with separate manifest and metadata files. + +The files in the target directory should look similar to the following: + +.. code:: + + target_dir/ + ├── audio_1.tar + ├── audio_2.tar + ├── ... + ├── metadata.yaml + ├── tarred_audio_manifest.json + ├── sharded_manifests/ + ├── manifest_1.json + ├── ... + └── manifest_N.json + + +Note that file structures are flattened such that all audio files are at the top level in each tarball. This ensures that +filenames are unique in the tarred dataset and the filepaths do not contain "-sub" and forward slashes in each ``audio_filepath`` are +simply converted to underscores. For example, a manifest entry for ``/data/directory1/file.wav`` would be ``_data_directory1_file.wav`` +in the tarred dataset manifest, and ``/data/directory2/file.wav`` would be converted to ``_data_directory2_file.wav``. + +Sharded manifests are generated by default; this behavior can be toggled via the ``no_shard_manifests`` flag. + Upsampling Datasets ------------------- @@ -437,7 +468,7 @@ For tarred datasets, shards from the AIS cluster are used by piping ``ais get`` Tarred Dataset from AIS ^^^^^^^^^^^^^^^^^^^^^^^ -A tarred dataset can be easily used as described in the `Tarred Datasets`_ section by providing paths to manifests on an AIS cluster. +A tarred dataset can be easily used as described in the :ref:`Tarred Datasets` section by providing paths to manifests on an AIS cluster. For example, a tarred dataset from an AIS cluster can be configured as .. code:: @@ -445,7 +476,7 @@ For example, a tarred dataset from an AIS cluster can be configured as manifest_filepath='ais://bucket/tarred_audio_manifest.json' tarred_audio_filepaths='ais://bucket/shard_{1..64}.tar' -`Bucketing Datasets`_ are configured in a similar way by providing paths on an AIS cluster. +:ref:`Bucketing Datasets` are configured in a similar way by providing paths on an AIS cluster. Non-tarred Dataset from AIS ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -640,20 +671,14 @@ We recommend to pre-compute the bucket duration bins in order to accelerate the The following script may be used: .. 
code-block:: bash - $ python scripts/speech_recognition/estimate_duration_bins.py -b 30 manifest.json - Use the following options in your config: num_buckets=30 bucket_duration_bins=[1.78,2.34,2.69,... - For multi-dataset setups, one may provide multiple manifests and even their weights: - .. code-block:: bash - $ python scripts/speech_recognition/estimate_duration_bins.py -b 30 [[manifest.json,0.7],[other.json,0.3]] - Use the following options in your config: num_buckets=30 bucket_duration_bins=[1.91,3.02,3.56,... diff --git a/nemo/collections/asr/data/audio_to_text.py b/nemo/collections/asr/data/audio_to_text.py index a689450c94ba..00c15109b64f 100644 --- a/nemo/collections/asr/data/audio_to_text.py +++ b/nemo/collections/asr/data/audio_to_text.py @@ -16,6 +16,7 @@ import math import multiprocessing import os +from collections.abc import Iterable as IterableABC from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union import braceexpand @@ -472,6 +473,12 @@ def get_manifest_sample(self, sample_id): return self.manifest_processor.collection[sample_id] def __getitem__(self, index): + if isinstance(index, IterableABC): + return [self._process_sample(_index) for _index in index] + else: + return self._process_sample(index) + + def _process_sample(self, index): sample = self.manifest_processor.collection[index] offset = sample.offset diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index 9f3b6b4cf83b..f861a971f5ea 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -20,6 +20,7 @@ from omegaconf import DictConfig, ListConfig, OmegaConf, open_dict from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.losses.ctc import CTCLoss @@ -27,6 +28,7 @@ from nemo.collections.asr.models.ctc_models import EncDecCTCModel from nemo.collections.asr.parts.mixins import ASRBPEMixin from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging, model_utils @@ -129,9 +131,24 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + return torch.utils.data.DataLoader( dataset=dataset, batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, collate_fn=collate_fn, drop_last=config.get('drop_last', False), shuffle=shuffle, diff --git a/nemo/collections/asr/models/ctc_models.py 
b/nemo/collections/asr/models/ctc_models.py index 5f380619db68..4df02b1177cd 100644 --- a/nemo/collections/asr/models/ctc_models.py +++ b/nemo/collections/asr/models/ctc_models.py @@ -25,6 +25,7 @@ from tqdm.auto import tqdm from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToCharDALIDataset, DALIOutputs from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.losses.ctc import CTCLoss @@ -33,6 +34,7 @@ from nemo.collections.asr.parts.mixins import ASRModuleMixin, ASRTranscriptionMixin, InterCTCMixin, TranscribeConfig from nemo.collections.asr.parts.mixins.transcription import GenericTranscriptionType, TranscriptionReturnType from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecoding, CTCDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.parts.preprocessing.parsers import make_parser @@ -319,9 +321,24 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + return torch.utils.data.DataLoader( dataset=dataset, batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, collate_fn=collate_fn, drop_last=config.get('drop_last', False), shuffle=shuffle, diff --git a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py index 182acf3904db..39375f08e139 100644 --- a/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py +++ b/nemo/collections/asr/models/hybrid_rnnt_ctc_bpe_models.py @@ -21,6 +21,7 @@ from pytorch_lightning import Trainer from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.losses.ctc import CTCLoss @@ -30,6 +31,7 @@ from nemo.collections.asr.parts.mixins import ASRBPEMixin from nemo.collections.asr.parts.submodules.ctc_decoding import CTCBPEDecoding, CTCBPEDecodingConfig from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTBPEDecoding, RNNTBPEDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging, model_utils @@ -169,9 +171,24 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists 
collate_fn = dataset.datasets[0].datasets[0].collate_fn + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + return torch.utils.data.DataLoader( dataset=dataset, batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, collate_fn=collate_fn, drop_last=config.get('drop_last', False), shuffle=shuffle, diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 6fba163c65e1..bb4e7f718a8e 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -21,6 +21,7 @@ from pytorch_lightning import Trainer from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToBPEDALIDataset from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.losses.rnnt import RNNTLoss @@ -28,6 +29,7 @@ from nemo.collections.asr.models.rnnt_models import EncDecRNNTModel from nemo.collections.asr.parts.mixins import ASRBPEMixin from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTBPEDecoding, RNNTBPEDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging, model_utils @@ -527,9 +529,24 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + return torch.utils.data.DataLoader( dataset=dataset, batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, collate_fn=collate_fn, drop_last=config.get('drop_last', False), shuffle=shuffle, diff --git a/nemo/collections/asr/models/rnnt_models.py b/nemo/collections/asr/models/rnnt_models.py index 047e25b8dd5d..386f2a915142 100644 --- a/nemo/collections/asr/models/rnnt_models.py +++ b/nemo/collections/asr/models/rnnt_models.py @@ -25,6 +25,7 @@ from tqdm.auto import tqdm from nemo.collections.asr.data import audio_to_text_dataset +from nemo.collections.asr.data.audio_to_text import _AudioTextDataset from nemo.collections.asr.data.audio_to_text_dali import AudioToCharDALIDataset, DALIOutputs from nemo.collections.asr.data.audio_to_text_lhotse import LhotseSpeechToTextBpeDataset from nemo.collections.asr.losses.rnnt import RNNTLoss, 
resolve_rnnt_default_loss_name @@ -38,6 +39,7 @@ TranscriptionReturnType, ) from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecoding, RNNTDecodingConfig +from nemo.collections.asr.parts.utils.asr_batching import get_semi_sorted_batch_sampler from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.parts.preprocessing.parsers import make_parser @@ -467,9 +469,24 @@ def _setup_dataloader_from_config(self, config: Optional[Dict]): # support datasets that are lists of lists collate_fn = dataset.datasets[0].datasets[0].collate_fn + batch_sampler = None + if config.get('use_semi_sorted_batching', False): + if not isinstance(dataset, _AudioTextDataset): + raise RuntimeError( + "Semi Sorted Batch sampler can be used with AudioToCharDataset or AudioToBPEDataset " + f"but found dataset of type {type(dataset)}" + ) + # set batch_size and batch_sampler to None to disable automatic batching + batch_sampler = get_semi_sorted_batch_sampler(self, dataset, config) + config['batch_size'] = None + config['drop_last'] = False + shuffle = False + return torch.utils.data.DataLoader( dataset=dataset, batch_size=config['batch_size'], + sampler=batch_sampler, + batch_sampler=None, collate_fn=collate_fn, drop_last=config.get('drop_last', False), shuffle=shuffle, diff --git a/nemo/collections/asr/parts/utils/asr_batching.py b/nemo/collections/asr/parts/utils/asr_batching.py new file mode 100644 index 000000000000..dcbebdc0f949 --- /dev/null +++ b/nemo/collections/asr/parts/utils/asr_batching.py @@ -0,0 +1,237 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Iterator, List, Optional, Union + +import numpy as np +import torch +from torch.utils.data.distributed import DistributedSampler + +from nemo.collections.asr.data.audio_to_text import AudioToBPEDataset, AudioToCharDataset +from nemo.collections.asr.models.asr_model import ASRModel +from nemo.utils import logging + + +class SemiSortBatchSampler(DistributedSampler): + def __init__( + self, + global_rank: int, + world_size: int, + durations: List[int], + batch_size: int, + batch_shuffle: bool = True, + drop_last: bool = False, + randomization_factor: Optional[float] = None, + seed: int = 42, + ) -> None: + """ + Semi Sorted Batching, as proposed in _SSB ("Speed up training with variable + length inputs by efficient batching strategies.", Zhenhao Ge et al. (2021).). + + The Semi Sorted Batch Sampler (SSB) samples the indices by their duration + with the addition of pseudo noise that is sampled from the uniform + distribution \mathbb{U}\left[ -delta * r, delta * r \right], where delta is + defined as the difference between the maximum and minimum duration and r is + the randomization factor that controls the strength of the noise (when r = 0, + there will be a strong sorting). 
The heuristic value of the r according to + the experiments from paper is 0.2. + + The torch calls the set_epoch method from the distributed data loader sampler + at the end of each epoch to shuffle the samples according to the seed and + epoch number. So the SSB is passed to the dataloader as a sampler with the + dataloader's batch size options and the batch_sampler option set to None to + disable automatical batching. In this case, the sampler has become an iterator + that returns a list of batch indices. + + Args: + global_rank: Rank among all GPUs. + world_size: The number of GPUs used. + durations: Sample durations parsed from `dataset.manifest_processor`. + batch_size: Micro batch size or batch size per singe gpu. + batch_shuffle: Batch sort before each epoch. + drop_last: Drop the last batch if the number of samples is less than batch + size. Defaults to False. + randomization_factor: The strength of noise that will be added to the sample + duration. If no value is passed, the value 0.2 will be used. + seed: Seed for batch shuffleling. Defaults to 42. + + Raises: + ValueError: Wrong randomization factor value. + RuntimeError: Unexpected behavior. + + .. SSB_: + https://www.isca-speech.org/archive/pdfs/interspeech_2021/ge21_interspeech.pdf + """ + if randomization_factor is None: + randomization_factor = 0.1 + logging.info("Randomization factor not found in config, default value 0.1 will be set.") + else: + logging.info(f"A randomization factor {randomization_factor} will be used.") + + if randomization_factor < 0.0: + raise ValueError(f'Randomization factor must be non-negative but found {randomization_factor}.') + + self.rank: List = global_rank + self.num_replicas: int = world_size + + self.durations: np.array = np.array(durations, dtype=np.float32) + + self.shuffle: bool = batch_shuffle + self.micro_batch_size: int = batch_size + self.drop_last: bool = drop_last + self.epoch: int = 0 + self.seed: int = seed + self.randomization_factor: float = randomization_factor + + self.local_num_batches: int = self._calculate_local_num_batches() + + logging.info(f"Semi Sorted Batch Sampler will be used") + + def _calculate_local_num_batches(self) -> int: + init_num_samples = len(self.durations) + + # delete batches with a non-integer number of samples + if self.drop_last: + init_num_samples -= init_num_samples % self.micro_batch_size + + # calculate the number of batches according to the counted number of samples + global_num_batches = math.ceil(init_num_samples / self.micro_batch_size) + + # add extra batches to make it divisible by world size (num replicas) + num_batches_pad = (self.num_replicas - global_num_batches % self.num_replicas) % self.num_replicas + global_num_batches += num_batches_pad + + # calculate the number of batches per rank + local_num_batches = global_num_batches // self.num_replicas + + return local_num_batches + + def _make_batches(self) -> List[np.array]: + max_duration: float = np.max(self.durations) + min_duration: float = np.min(self.durations) + bound: float = (max_duration - min_duration) * self.randomization_factor / 2 + + # generate pseudo noise + noise: np.array = np.random.uniform(low=-bound, high=bound, size=len(self.durations)) + + # sort indices accroding to pseudo noise + sorted_indices: np.array = np.argsort(self.durations + noise) + + # delete batches with a non-integer number of samples + tail = 0 + if self.drop_last: + tail: int = len(sorted_indices) % self.micro_batch_size + exclude = np.random.choice(len(sorted_indices), tail, replace=False) + 
sorted_indices = np.delete(sorted_indices, exclude) + logging.warning(f"Drop last is set to True, so {len(exclude)} samples will be dropped.") + + global_num_batches: int = math.ceil(len(sorted_indices) / self.micro_batch_size) + + # if the global_num_batches is zero than return empty list + if global_num_batches == 0: + logging.warning( + f"The number of all batches is {global_num_batches}, than dataloader will " + "be empty. To avoid this try to decrease batch size or world size or set " + "drop_last to False." + ) + return [] + + # add extra batches to make it divisible by world size (num replicas) + pad_batches_num: int = (self.num_replicas - global_num_batches % self.num_replicas) % self.num_replicas + if global_num_batches < self.num_replicas: + logging.warning( + f"The number of all batches is {global_num_batches}, which is less than the " + f"world size of {self.num_replicas}. SSB Sampler will add {pad_batches_num} " + "batches. To avoid this try to decrease batch size or world size." + ) + + if pad_batches_num != 0: + # randomly select batch indeces to pad and concatenate them + batch_indeces_pad: np.array = np.random.randint( + low=0, high=len(sorted_indices), size=pad_batches_num * self.micro_batch_size, + ) + sorted_indices: np.array = np.concatenate( + (sorted_indices, sorted_indices[batch_indeces_pad]), axis=0, + ) + + # local indeces are selected by world size and local rank + local_indices: np.array = sorted_indices[self.rank :: self.num_replicas] + + # split local batches + size_mask = range(self.micro_batch_size, len(local_indices), self.micro_batch_size) + local_batches = np.split(local_indices, size_mask, axis=0) + + if len(local_batches) != self.local_num_batches: + raise RuntimeError( + f'Number of calculated indices {len(local_batches)} is not equal to calculated ' + f'number of local batches {self.local_num_batches}.' + ) + + return local_batches + + def __iter__(self) -> Iterator[List[int]]: + local_batches = self._make_batches() + + if self.shuffle: + g = torch.Generator() + g.manual_seed(self.seed + self.epoch + 1) + indices = torch.randperm(self.local_num_batches, generator=g) + else: + indices = torch.arange(0, self.local_num_batches) + + for _, index in enumerate(indices): + yield local_batches[index] + + def __len__(self) -> int: + return self.local_num_batches + + +def get_semi_sorted_batch_sampler( + model: ASRModel, dataset: Union[AudioToCharDataset, AudioToBPEDataset], config: dict +) -> SemiSortBatchSampler: + """ + Instantiates a Semi Sorted (Batch) Sampler. + + Args: + model: ASR Model. + dataset: Dataset which allow iterate over all object and parse durations. + config: Train, Vaidation or Test dataset config. + + Raises: + ValueError: Wrong dataset type. + + Returns: + SemiSortBatchSampler: Semi Sorted Batch Sampler class. + """ + if not (isinstance(dataset, AudioToCharDataset) or isinstance(dataset, AudioToBPEDataset)): + raise ValueError( + "Only AudioToCharDataset or AudioToBPEDataset supported with semi sorted batching, " + f"but found {type(dataset)}." 
+ ) + + durations = [sample.duration for sample in dataset.manifest_processor.collection.data] + + sampler = SemiSortBatchSampler( + global_rank=model.global_rank, + world_size=model.world_size, + durations=durations, + batch_size=config['batch_size'], + batch_shuffle=config.get('shuffle', True), + drop_last=config.get('drop_last', False), + randomization_factor=config.get('randomization_factor', None), + seed=config.get('semi_sort_sampler_seed', 42), + ) + + return sampler diff --git a/nemo/core/optim/lr_scheduler.py b/nemo/core/optim/lr_scheduler.py index 38ad372f3e51..473ca0f5c416 100644 --- a/nemo/core/optim/lr_scheduler.py +++ b/nemo/core/optim/lr_scheduler.py @@ -877,6 +877,14 @@ def prepare_lr_scheduler( batch_size = train_dataloader.batch_sampler.micro_batch_size else: raise ValueError(f'Could not find batch_size from batch_sampler: {train_dataloader.batch_sampler}') + elif hasattr(train_dataloader, 'sampler') and train_dataloader.sampler is not None: + if ( + hasattr(train_dataloader.sampler, 'micro_batch_size') + and train_dataloader.sampler.micro_batch_size is not None + ): + batch_size = train_dataloader.sampler.micro_batch_size + else: + raise ValueError(f'Could not find batch_size from sampler: {train_dataloader.sampler}') else: raise ValueError(f'Could not find batch_size from train_dataloader: {train_dataloader}') drop_last = train_dataloader.drop_last diff --git a/tests/collections/asr/test_asr_samplers.py b/tests/collections/asr/test_asr_samplers.py new file mode 100644 index 000000000000..0b4d11fe2946 --- /dev/null +++ b/tests/collections/asr/test_asr_samplers.py @@ -0,0 +1,157 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
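To make the integration above concrete, the following sketch shows how a ``SemiSortBatchSampler`` might be handed to a PyTorch ``DataLoader``, mirroring the ``_setup_dataloader_from_config`` changes: the sampler yields a full list of indices per batch, so automatic batching is disabled by passing ``batch_size=None``. Here ``my_dataset`` and ``my_collate_fn`` are placeholders for an ``AudioToCharDataset``/``AudioToBPEDataset`` instance and its collate function; the rank and world size are set for a single-GPU run.

.. code-block:: python

    import torch

    from nemo.collections.asr.parts.utils.asr_batching import SemiSortBatchSampler

    # Placeholders: my_dataset is an AudioToCharDataset or AudioToBPEDataset,
    # my_collate_fn is its collate function.
    durations = [sample.duration for sample in my_dataset.manifest_processor.collection.data]

    sampler = SemiSortBatchSampler(
        global_rank=0,            # single-GPU example
        world_size=1,
        durations=durations,
        batch_size=16,
        batch_shuffle=True,
        drop_last=False,
        randomization_factor=0.1,
        seed=42,
    )

    # batch_size=None disables automatic batching; each item yielded by the
    # sampler is already a complete list of sample indices for one batch.
    loader = torch.utils.data.DataLoader(
        dataset=my_dataset,
        batch_size=None,
        sampler=sampler,
        batch_sampler=None,
        collate_fn=my_collate_fn,
    )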
+import os +import tempfile + +import numpy as np +import pytest +import soundfile as sf +import torch + +from nemo.collections.asr.data import audio_to_text +from nemo.collections.asr.parts.utils.asr_batching import SemiSortBatchSampler +from nemo.collections.asr.parts.utils.manifest_utils import write_manifest + + +class TestASRSamplers: + labels = [ + " ", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "'", + ] + + @pytest.mark.unit + def test_ssb_sampler(self): + # Generate random signals + data_min_duration = 0.1 + data_max_duration = 16.7 + + random_seed = 42 + sample_rate = 16000 + + _rng = np.random.default_rng(seed=random_seed) + + def generate_samples(num_examples: int) -> list: + data_duration = np.round(_rng.uniform(low=data_min_duration, high=data_max_duration, size=num_examples), 3) + data_duration_samples = np.floor(data_duration * sample_rate).astype(int) + samples = [] + for data_duration_sample in data_duration_samples: + samples.append(_rng.uniform(low=-0.5, high=0.5, size=(data_duration_sample))) + return samples + + with tempfile.TemporaryDirectory() as test_dir: + # Build metadata for manifest + metadata = [] + + # Test size of dataloader with and without ssb + for num_samples in np.concatenate([np.array([1, 2]), _rng.integers(3, 10, 2), _rng.integers(10, 1000, 2)]): + samples = generate_samples(num_samples) + + for n, sample in enumerate(samples): + meta = dict() + signal_filename = f'{n:04d}.wav' + # write audio files + sf.write(os.path.join(test_dir, signal_filename), sample, sample_rate) + # update metadata + meta['audio_filepath'] = os.path.join(test_dir, signal_filename) + meta['duration'] = len(sample) / sample_rate + meta['text'] = 'non empty' + metadata.append(meta) + + # Save manifest + manifest_filepath = os.path.join(test_dir, 'manifest.json') + write_manifest(manifest_filepath, metadata) + + # Make dataset + dataset = audio_to_text.AudioToCharDataset( + manifest_filepath=manifest_filepath, + labels=self.labels, + sample_rate=sample_rate, + max_duration=data_max_duration, + min_duration=data_min_duration, + ) + durations = [sample.duration for sample in dataset.manifest_processor.collection.data] + + # Compare two dataloader + for batch_size in _rng.integers(1, n + 20, 5): + batch_size = int(batch_size) + drop_last = True if _rng.integers(0, 2) else False + sampler = SemiSortBatchSampler( + global_rank=0, + world_size=1, + durations=durations, + batch_size=batch_size, + batch_shuffle=True, + drop_last=drop_last, + randomization_factor=0.1, + seed=random_seed, + ) + dataloader_with_ssb = torch.utils.data.DataLoader( + dataset=dataset, + batch_size=None, + sampler=sampler, + batch_sampler=None, + collate_fn=lambda x: audio_to_text._speech_collate_fn(x, pad_id=0), + ) + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + batch_size=batch_size, + collate_fn=lambda x: audio_to_text._speech_collate_fn(x, pad_id=0), + drop_last=drop_last, + shuffle=True, + ) + + assert abs(len(dataloader) - len(dataloader_with_ssb)) == 0, ( + "Different num of batches with batch! Num of batches with ssb is " + f"{len(dataloader_with_ssb)} and without ssb is {len(dataloader)}!" 
+ ) + + dataloader_with_ssb_exception, dataloader_exception = False, False + + try: + list(dataloader_with_ssb) + except: + dataloader_with_ssb_exception = True + + try: + list(dataloader) + except: + dataloader_exception = True + + assert dataloader_with_ssb_exception == dataloader_exception From 9c80bdd9671ff32d6472fd7e8726220a5e349241 Mon Sep 17 00:00:00 2001 From: anteju <108555623+anteju@users.noreply.github.com> Date: Thu, 11 Apr 2024 23:13:11 +0200 Subject: [PATCH 17/39] Added codec checkpoint to docs (#8860) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ante Jukić --- docs/source/tts/checkpoints.rst | 7 +++++++ docs/source/tts/data/ngc_models_codec.csv | 2 ++ 2 files changed, 9 insertions(+) create mode 100644 docs/source/tts/data/ngc_models_codec.csv diff --git a/docs/source/tts/checkpoints.rst b/docs/source/tts/checkpoints.rst index 9c3de1ab4926..7d5daedd0559 100644 --- a/docs/source/tts/checkpoints.rst +++ b/docs/source/tts/checkpoints.rst @@ -152,3 +152,10 @@ End2End models :file: data/ngc_models_e2e.csv :align: left :header-rows: 1 + +Codec models +^^^^^^^^^^^^ +.. csv-table:: + :file: data/ngc_models_codec.csv + :align: left + :header-rows: 1 diff --git a/docs/source/tts/data/ngc_models_codec.csv b/docs/source/tts/data/ngc_models_codec.csv new file mode 100644 index 000000000000..d46567012600 --- /dev/null +++ b/docs/source/tts/data/ngc_models_codec.csv @@ -0,0 +1,2 @@ +Model Name,Dataset,Sampling Rate,Model Class,Overview,Checkpoint +audio_codec_16khz_small,Libri-Light,16000Hz,nemo.collections.tts.models.AudioCodecModel,`audio_codec_16khz_small `_,``https://api.ngc.nvidia.com/v2/models/nvidia/nemo/audio_codec_16khz_small/versions/v1/files/audio_codec_16khz_small.nemo`` From 83a5cad63c91d55b6c2a63a32beecd15ea12580a Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Thu, 11 Apr 2024 16:38:54 -0600 Subject: [PATCH 18/39] fix header (#8892) Signed-off-by: eharper --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 1beef67832f0..f7374641d66d 100644 --- a/README.rst +++ b/README.rst @@ -36,7 +36,7 @@ .. _main-readme: **NVIDIA NeMo Framework** -=============== +========================= Latest News ----------- From b63cbe0e072a6531e3276e1474cb7019240959e9 Mon Sep 17 00:00:00 2001 From: anmolgupt <14880251+anmolgupt@users.noreply.github.com> Date: Thu, 11 Apr 2024 16:35:08 -0700 Subject: [PATCH 19/39] disable bprop reduce for cpl when SP is enabled (#8889) * disable bprop reduce for cpl when SP is enabled Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * updated megatron commit in jenkinsfile Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> --------- Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> Signed-off-by: Chen Cui Co-authored-by: Chen Cui --- Jenkinsfile | 2 +- .../nlp/modules/common/megatron/adapters/parallel_adapters.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6471fa3d011f..c98d13fbed38 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -87,7 +87,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout 7fe863f3d94f7b64a927b04b85f5c9339d3fb784 && \ + git checkout f3a3020031f384ddafd9b7e9f3a587798c0aea21 && \ pip install . 
&& \ cd megatron/core/datasets && \ make' diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 70ed4d695b3c..419126bd3f18 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -187,6 +187,7 @@ def __init__( bias=False, gather_output=True, init_method=self._get_init_fn(column_init_method), + disable_grad_reduce=self._sequence_parallel, ) if gather_output: self.linear_out = RowParallelLinear( From 5da310109f9b09ac950b4bf83e226e955d376728 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Thu, 11 Apr 2024 17:10:24 -0700 Subject: [PATCH 20/39] [Nemo CICD] Add further runners for cpu-intensive-only (non-gpu using) jobs (#8894) * Cancel old runs for PR commit update * update dependencies for container build * temp for test * update back * Revert "temp for test" This reverts commit 9f9221155412393d05b2c862880f9128a93b26a4. * Add further runners for cpu-intensive-only (non-gpu using) jobs --- .github/workflows/cicd-main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 5cc990902953..a9509fda51e9 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -31,7 +31,7 @@ jobs: nvidia-smi cicd-cluster-clean: - runs-on: self-hosted-azure + runs-on: self-hosted-azure-cpu steps: - name: Clean server from old files run: | @@ -53,7 +53,7 @@ jobs: cicd-test-container-setup: needs: [cicd-cluster-clean] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-cpu # uses: actions/cache@v2 #container: # image: nvcr.io/nvidia/pytorch:24.01-py3 @@ -196,7 +196,7 @@ jobs: L0_Unit_Tests_CPU: needs: [cicd-test-container-setup] - runs-on: self-hosted-azure + runs-on: self-hosted-azure-cpu container: image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} options: From acf9a7eebfeb80ba7c7f540331c8d772839ff158 Mon Sep 17 00:00:00 2001 From: Jaemin Choi Date: Thu, 11 Apr 2024 18:30:24 -0700 Subject: [PATCH 21/39] Add TE guards for DGRAD RS overlap (#8879) * Add TE guards for DGRAD RS overlap Signed-off-by: Jaemin Choi * Fix TE guard Signed-off-by: Jaemin Choi --------- Signed-off-by: Jaemin Choi Co-authored-by: Jaemin Choi --- .../megatron/gpt_full_te_layer_autocast_spec.py | 10 ++++++---- .../nlp/modules/common/megatron/transformer.py | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py index 19766e4a34ca..02858b119bfa 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_full_te_layer_autocast_spec.py @@ -134,7 +134,8 @@ def __init__( transformer_layer_args[ub_overlap_flag] = kwargs.get(split_gemm_flag, True) or kwargs.get( atomic_gemm_flag, False ) - transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) + if te_version > packaging.version.Version("1.6.0.dev0"): + transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) else: transformer_layer_args["ub_split_ag"] = kwargs.get("ub_split_ag", True) transformer_layer_args["ub_split_rs"] = kwargs.get("ub_split_rs", True) @@ -225,9 
+226,10 @@ def __init__(self, config, layer_number=1, hidden_dropout=None): if hasattr(config, "tp_comm_overlap_rs") else config.tp_comm_split_rs or config.tp_comm_atomic_rs ) - transformer_layer_args["ub_overlap_rs_dgrad"] = ( - config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False - ) + if te_version > packaging.version.Version("1.6.0.dev0"): + transformer_layer_args["ub_overlap_rs_dgrad"] = ( + config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False + ) else: transformer_layer_args["ub_split_ag"] = config.tp_comm_split_ag transformer_layer_args["ub_split_rs"] = config.tp_comm_split_rs diff --git a/nemo/collections/nlp/modules/common/megatron/transformer.py b/nemo/collections/nlp/modules/common/megatron/transformer.py index b33a996b7987..cb23c4a6b1fd 100644 --- a/nemo/collections/nlp/modules/common/megatron/transformer.py +++ b/nemo/collections/nlp/modules/common/megatron/transformer.py @@ -850,7 +850,8 @@ def __init__( transformer_layer_args[ub_overlap_flag] = kwargs.get(split_gemm_flag, True) or kwargs.get( atomic_gemm_flag, False ) - transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) + if te_version > packaging.version.Version("1.6.0.dev0"): + transformer_layer_args["ub_overlap_rs_dgrad"] = kwargs.get("ub_overlap_rs_dgrad", False) else: transformer_layer_args["ub_split_ag"] = kwargs.get("ub_split_ag", True) transformer_layer_args["ub_split_rs"] = kwargs.get("ub_split_rs", True) @@ -1120,9 +1121,10 @@ def build_layer(layer_number): if hasattr(config, "tp_comm_overlap_rs") else config.tp_comm_split_rs or config.tp_comm_atomic_rs ) - transformer_layer_args["ub_overlap_rs_dgrad"] = ( - config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False - ) + if te_version > packaging.version.Version("1.6.0.dev0"): + transformer_layer_args["ub_overlap_rs_dgrad"] = ( + config.tp_comm_overlap_rs_dgrad if hasattr(config, "tp_comm_overlap_rs_dgrad") else False + ) else: transformer_layer_args["ub_split_ag"] = config.tp_comm_split_ag transformer_layer_args["ub_split_rs"] = config.tp_comm_split_rs From 752ed8a822a1135e73b94307099804e26f20b703 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Fri, 12 Apr 2024 06:45:06 -0700 Subject: [PATCH 22/39] Skip validation model gradient zeroing (#8890) * Skip validation model gradient zeroing Signed-off-by: Sangkug Lym * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Sangkug Lym Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../conf/megatron_gpt_config.yaml | 1 + .../language_modeling/megatron_gpt_model.py | 21 +++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 7be891a156c8..ea37237f2eac 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -142,6 +142,7 @@ model: gradient_as_bucket_view: True # PyTorch DDP argument. 
Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages nccl_communicator_config_path: null # Path to the yaml file with NCCL communicator options (min_ctas, max_ctas, and cga_cluster_size) + validation_param_sync_overlap: False # Overlap parameter AllGather with validation step. # FSDP fsdp: False # Enable training with torch FSDP. diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 8d1d428a9989..a651ada5c38a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -365,6 +365,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer): self.log_train_loss = bool(int(os.getenv("NEMO_LOG_TRAIN_LOSS", 1))) self.log_memory_usage = bool(int(os.getenv("NEMO_LOG_MEMORY_USAGE", 0))) self.loss_broadcast_src_rank = None + self.validation_param_sync_overlap = self.cfg.get('validation_param_sync_overlap', False) self.inference_params = None @@ -585,10 +586,14 @@ def fwd_bwd_step(self, dataloader_iter, forward_only, first_val_step=None): no_sync_func = None grad_sync_func = None param_sync_func = None - if not forward_only and self.with_distributed_adam: - no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) - grad_sync_func = self.reduce_overlap_gradients - param_sync_func = self.sync_overlap_parameters + if self.with_distributed_adam: + if forward_only: + if self.validation_param_sync_overlap: + param_sync_func = self.sync_overlap_parameters + else: + no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + grad_sync_func = self.reduce_overlap_gradients + param_sync_func = self.sync_overlap_parameters # pipeline schedules will get these from self.model.config for module in self.get_model_module_list(): @@ -1703,6 +1708,14 @@ def on_load_checkpoint(self, checkpoint) -> None: self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) parallel_state.set_virtual_pipeline_model_parallel_rank(0) + def on_validation_model_zero_grad(self) -> None: + """ + Skip gradient zeroing at the beginning of validation routine. + This is needed when overlapping the AllGather of the updated parameters with the following valdation step. + """ + if not self.validation_param_sync_overlap: + super().on_validation_model_zero_grad() + def sharded_state_dict(self, prefix: str = '') -> Dict[str, Any]: """ Creates the sharded state dict which is used by dist_checkpoint to save the sharded tensors to disk. From 8bf2bc0a435ef18dd43830f4727d7824ae7e60e4 Mon Sep 17 00:00:00 2001 From: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> Date: Fri, 12 Apr 2024 10:54:51 -0500 Subject: [PATCH 23/39] Correcting bullets and notes within NeMo Forced Aligner (#8903) * Update nemo_forced_aligner.rst Correcting bullets and note tags Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> * Update nemo_forced_aligner.rst Note that the bottom is not showing up correctly. 
Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> --------- Signed-off-by: Andrew Schilling <85314306+aschilling-nv@users.noreply.github.com> --- docs/source/tools/nemo_forced_aligner.rst | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/source/tools/nemo_forced_aligner.rst b/docs/source/tools/nemo_forced_aligner.rst index a4ed90fa7f9f..aa8d2139653f 100644 --- a/docs/source/tools/nemo_forced_aligner.rst +++ b/docs/source/tools/nemo_forced_aligner.rst @@ -45,7 +45,7 @@ Call the ``align.py`` script, specifying the parameters as follows: * ``model_path``: string specifying the local filepath to a CTC NeMo ASR model which will be used to generate the log-probs which we will use to do alignment. If ``pretrained_name`` is specified, ``model_path`` must not be specified. - Note: Currently NFA can only use CTC models, or Hybrid CTC-Transducer models (in CTC mode). Pure Transducer models cannot be used. + .. note:: Currently NFA can only use CTC models, or Hybrid CTC-Transducer models (in CTC mode). Pure Transducer models cannot be used. * ``manifest_filepath``: The path to the manifest of the data you want to align, containing ``'audio_filepath'`` and ``'text'`` fields. The audio filepaths need to be absolute paths. @@ -66,7 +66,7 @@ Optional parameters: * ``additional_segment_grouping_separator``: an optional string used to separate the text into smaller segments. If this is not specified, then the whole text will be treated as a single segment. (Default: ``None``. Cannot be empty string or space (" "), as NFA will automatically produce word-level timestamps for substrings separated by spaces). - Note: the ``additional_segment_grouping_separator`` will be removed from the reference text and all the output files, ie it is treated as a marker which is not part of the reference text. The separator will essentially be treated as a space, and any additional spaces around it will be amalgamated into one, i.e. if ``additional_segment_grouping_separator="|"``, the following texts will be treated equivalently: ``“abc|def”``, ``“abc |def”``, ``“abc| def”``, ``“abc | def"``. + .. note:: the ``additional_segment_grouping_separator`` will be removed from the reference text and all the output files, ie it is treated as a marker which is not part of the reference text. The separator will essentially be treated as a space, and any additional spaces around it will be amalgamated into one, i.e. if ``additional_segment_grouping_separator="|"``, the following texts will be treated equivalently: ``“abc|def”``, ``“abc |def”``, ``“abc| def”``, ``“abc | def"``. * ``remove_blank_tokens_from_ctm``: a boolean denoting whether to remove tokens from token-level output CTMs. (Default: False). @@ -98,13 +98,14 @@ By default, NFA needs to be provided with a 'manifest' file where each line spec You can omit the ``"text"`` field from the manifest if you specify ``align_using_pred_text=true``. In that case, any ``"text"`` fields in the manifest will be ignored: the ASR model at ``pretrained_name`` or ``model_path`` will be used to transcribe the audio and obtain ``"pred_text"``, which will be used as the reference text for the forced alignment process. The ``"pred_text"`` will also be saved in the output manifest JSON file at ``/_with_output_file_paths.json``. To remove the possibility of overwriting ``"pred_text"``, NFA will raise an error if ``align_using_pred_text=true`` and there are existing ``"pred_text"`` fields in the original manifest. 
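As a reference for the manifest format discussed above, a minimal NFA input manifest can be produced with a few lines of Python: each line is one JSON object with ``"audio_filepath"`` and ``"text"`` fields. The paths and transcripts below are placeholders; the audio paths must be absolute.

.. code-block:: python

    import json

    # Placeholder entries: substitute your own absolute audio paths and reference texts.
    entries = [
        {"audio_filepath": "/data/audio/utt_0001.wav", "text": "hello world"},
        {"audio_filepath": "/data/audio/utt_0002.wav", "text": "forced alignment example"},
    ]

    with open("manifest.json", "w") as f:
        for entry in entries:
            f.write(json.dumps(entry) + "\n")  # one JSON object per line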
- ..note:: NFA does not require ``"duration"`` fields in the manifest, and can align long audio files without running out of memory. The duration of audio file you can align will depend on the amount of memory on your machine. NFA will also produce better alignments the more accurate the reference text in ``"text"`` is. + .. note:: NFA does not require ``"duration"`` fields in the manifest, and can align long audio files without running out of memory. The duration of audio file you can align will depend on the amount of memory on your machine. NFA will also produce better alignments the more accurate the reference text in ``"text"`` is. Output CTM file format ---------------------- For each utterance specified in a line of ``manifest_filepath``, several CTM files will be generated: + * a CTM file containing token-level alignments at ``/ctm/tokens/.ctm``, * a CTM file containing word-level alignments at ``/ctm/words/.ctm``, * a CTM file containing segment-level alignments at ``/ctm/segments/.ctm``. If ``additional_segment_grouping_separator`` is specified, the segments will be parts of the text separated by ``additonal_segment_grouping_separator``. If it is not specified, the entire text will be treated as a single segment. @@ -117,6 +118,7 @@ Note the second item in the line (the 'channel ID', which is required by the CTM ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``CTMFileConfig`` (which is passed into the main NFA config) has the following parameters: + * ``remove_blank_tokens``: bool (default ``False``) to specify if the token-level CTM files should have the timestamps of the blank tokens removed. * ``minimum_timestamp_duration``: float (default ``0``) to specify the minimum duration that will be applied to all timestamps. If any line in the CTM has a duration lower than this, it will be enlarged from the middle outwards until it meets the ``minimum_timestamp_duration``, or reaches the beginning or end of the audio file. Note that using a non-zero value may cause timestamps to overlap. @@ -124,14 +126,17 @@ Output ASS file format ---------------------- NFA will produce the following ASS files, which you can use to generate subtitle videos: + * ASS files with token-level highlighting will be at ``/ass/tokens/.ass,`` * ASS files with word-level highlighting will be at ``/ass/words/.ass``. + All words belonging to the same segment 'segments' will appear at the same time in the subtitles generated with the ASS files. If you find that your segments are not the right size, you can use set ``ass_file_config.resegment_text_to_fill_space=true`` and specify some number of ``ass_file_config.max_lines_per_segment``. ``ASSFileConfig`` parameters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``ASSFileConfig`` (which is passed into the main NFA config) has the following parameters: + * ``fontsize``: int (default value ``20``) which will be the fontsize of the text * ``vertical_alignment``: string (default value ``center``) to specify the vertical alignment of the text. Can be one of ``center``, ``top``, ``bottom``. * ``resegment_text_to_fill_space``: bool (default value ``False``). If ``True``, the text will be resegmented such that each segment will not take up more than (approximately) ``max_lines_per_segment`` when the ASS file is applied to a video. @@ -144,6 +149,7 @@ Output JSON manifest file format -------------------------------- A new manifest file will be saved at ``/_with_output_file_paths.json``. 
It will contain the same fields as the original manifest, and additionally: + * ``"token_level_ctm_filepath"`` (if ``save_output_file_formats`` contains ``ctm``) * ``"word_level_ctm_filepath"`` (if ``save_output_file_formats`` contains ``ctm``) * ``"segment_level_ctm_filepath"`` (if ``save_output_file_formats`` contains ``ctm``) @@ -159,8 +165,9 @@ Ideally you would have some 'true' CTM files to compare with your generated CTM Alternatively (or additionally), you can visualize the quality of alignments using tools such as Gecko, which can play your audio file and display the predicted alignments at the same time. The Gecko tool requires you to upload an audio file and at least one CTM file. The Gecko tool can be accessed here: https://gong-io.github.io/gecko/. More information about the Gecko tool can be found on its Github page here: https://github.com/gong-io/gecko. -**Note**: the following may help improve your experience viewing the CTMs in Gecko: +.. note:: + The following may help improve your experience viewing the CTMs in Gecko: -* setting ``minimum_timestamp_duration`` to a larger number, as Gecko may not display some tokens/words/segments properly if their timestamps are too short. -* setting ``remove_blank_tokens_from_ctm=true`` if you are analyzing token-level CTMs, as it will make the Gecko visualization less cluttered. + * setting ``minimum_timestamp_duration`` to a larger number, as Gecko may not display some tokens/words/segments properly if their timestamps are too short. + * setting ``remove_blank_tokens_from_ctm=true`` if you are analyzing token-level CTMs, as it will make the Gecko visualization less cluttered. From f05ecb601556ccc72cf487603ee8774916b88698 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Date: Fri, 12 Apr 2024 11:27:08 -0700 Subject: [PATCH 24/39] Adding distributed checkpointing for bert (#8650) * Adding distributed checkpointing for bert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update megatron_bert_model.py Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Adding dist checkpointing to bert * Simple bug fix * Fixing parallel state * Simple bug fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixing bug * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Simple bug fix * Simple bug fix --------- Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay --- .../megatron_lm_ckpt_to_nemo.py | 2 +- .../language_modeling/megatron_retro_eval.py | 2 +- .../multimodal/data/common/webdataset.py | 2 +- .../models/multimodal_llm/neva/neva_model.py | 8 +-- .../megatron_bert_embedding_model.py | 4 +- .../megatron/bert/bert_model.py | 1 + .../language_modeling/megatron_base_model.py | 9 ++- .../language_modeling/megatron_bert_model.py | 69 +++++++++++++++---- .../language_modeling/megatron_gpt_model.py | 6 +- .../megatron_gpt_prompt_learning_model.py | 2 +- .../megatron_lm_encoder_decoder_model.py | 2 +- .../megatron_retrieval_model.py | 2 +- nemo/collections/nlp/models/nlp_model.py | 2 +- 
nemo/collections/nlp/parts/nlp_overrides.py | 4 +- nemo/export/quantize/quantizer.py | 2 +- nemo/utils/distributed.py | 2 +- .../convert_prompt_learning_ckpt_to_nemo.py | 2 +- .../start_retro_model_service.py | 2 +- 18 files changed, 80 insertions(+), 43 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py b/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py index d09c79f7a051..03d6fd94e4e2 100644 --- a/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py +++ b/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py @@ -529,7 +529,7 @@ def convert(local_rank, rank, world_size, args): if args.nemo_file_path: if args.model_type == 'gpt': - if mcore_output and parallel_state.is_unitialized(): + if mcore_output and not parallel_state.is_initialized(): parallel_state.initialize_model_parallel( tensor_model_parallel_size=args.tensor_model_parallel_size, pipeline_model_parallel_size=args.pipeline_model_parallel_size, diff --git a/examples/nlp/language_modeling/megatron_retro_eval.py b/examples/nlp/language_modeling/megatron_retro_eval.py index 79b1e2debdfa..9978bab78bfc 100644 --- a/examples/nlp/language_modeling/megatron_retro_eval.py +++ b/examples/nlp/language_modeling/megatron_retro_eval.py @@ -108,7 +108,7 @@ def main(cfg) -> None: } # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/multimodal/data/common/webdataset.py b/nemo/collections/multimodal/data/common/webdataset.py index 8d70a03fa911..79d22f34f77c 100644 --- a/nemo/collections/multimodal/data/common/webdataset.py +++ b/nemo/collections/multimodal/data/common/webdataset.py @@ -302,7 +302,7 @@ def run(self, src): epoch = self.epoch rng = random.Random() # This seed to be deterministic AND the same across all nodes/workers in each epoch - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): seed = self.seed + epoch else: seed = self.seed + epoch + (100 * parallel_state.get_data_parallel_rank()) diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 78a46ce3b0db..4556ba1b3e72 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -461,7 +461,7 @@ def model_provider_func(self, pre_process, post_process): media_end_id = self.tokenizer.token_to_id(DEFAULT_IM_END_TOKEN) if self.mcore_gpt: - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return @@ -795,9 +795,7 @@ def setup(self, stage=None): Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. 
""" - num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert( - self.model - ) + num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert() logging.info( f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' @@ -998,7 +996,7 @@ def generate( ) -> OutputType: # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py index 849438d408a5..d974c8182234 100644 --- a/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py +++ b/nemo/collections/nlp/models/information_retrieval/megatron_bert_embedding_model.py @@ -189,9 +189,7 @@ def setup(self, stage=None): stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ - num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert( - self.model - ) + num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert() logging.info( f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py index 749d960b9729..e7ae529fe4e2 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py @@ -347,6 +347,7 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw # Output if self.post_process: # TODO: Make sure you are passing in the mpu_vocab_size properly + self.lm_head = MCoreBertLMHead(self.config.hidden_size, self.config,) self.output_layer = tensor_parallel.ColumnParallelLinear( diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 980ea8f9f76d..035d194de09f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -996,10 +996,11 @@ def is_data_parallel_rank_zero(self): else: return False - def _get_total_params_across_model_parallel_groups_gpt_bert(self, model): + def _get_total_params_across_model_parallel_groups_gpt_bert(self): """Returns the total number of parameters across all model parallel groups.""" is_mcore_model = self.__dict__.get('mcore_gpt', False) or self.__dict__.get('mcore_bert', False) # log number of parameters + model = self.get_model_module_list() if isinstance(model, list): num_parameters_on_device = sum( [sum([p.nelement() for p in model_module.parameters()]) for model_module in model] @@ -1010,7 +1011,7 @@ def _get_total_params_across_model_parallel_groups_gpt_bert(self, model): and self.cfg.get('share_embeddings_and_output_weights', True) ): word_embeddings_weight = ( - model[-1].module.shared_embedding_or_output_weight() + model[-1].shared_embedding_or_output_weight() if is_mcore_model else model[-1].word_embeddings_weight() ) @@ -1025,9 +1026,7 @@ def _get_total_params_across_model_parallel_groups_gpt_bert(self, model): and 
self.cfg.get('share_embeddings_and_output_weights', True) ): word_embeddings_weight = ( - model.module.shared_embedding_or_output_weight() - if is_mcore_model - else model.word_embeddings_weight() + model.shared_embedding_or_output_weight() if is_mcore_model else model.word_embeddings_weight() ) # subtract the embedding weights on the last stage num_word_embedding_parameters = sum([p.nelement() for p in word_embeddings_weight]) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py index 82b2b1a96ff4..dc6d81649122 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_bert_model.py @@ -767,9 +767,7 @@ def setup(self, stage=None): stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ - num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert( - self.model - ) + num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert() logging.info( f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' @@ -1084,25 +1082,70 @@ def input_example(self, max_batch=1, max_dim=256): input_dict = {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids} return tuple([input_dict]) + def sharded_state_dict(self, prefix: str = '') -> Dict[str, Any]: + """ + Creates the sharded state dict which is used by dist_checkpoint to save the sharded tensors to disk. + When given the sharded_state_dict, dist_checkpoint.load will load the tensors corresponding to + self.state_dict(). + The sharded tensor mapping is defined in the GPTModel class from mcore. + """ + if self.mcore_bert: + module_prefix = f'{prefix}model.'
+ sharded_state_dict = {} + for index, module in enumerate(self.get_model_module_list()): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + # virtual pipeline rank must be set so that GPTModel returns the correct sharded state dict + parallel_state.set_virtual_pipeline_model_parallel_rank(index) + module_sharded_state_dict = module.sharded_state_dict(prefix=module_prefix) + sharded_state_dict[f'model_{index}'] = module_sharded_state_dict + else: + module_sharded_state_dict = module.sharded_state_dict(prefix=module_prefix) + sharded_state_dict.update(module_sharded_state_dict) + + # reset vp rank + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + parallel_state.set_virtual_pipeline_model_parallel_rank(0) + + return sharded_state_dict + def on_save_checkpoint(self, checkpoint) -> None: """LightningModule hook: https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-save-checkpoint """ - if isinstance(self.model, list): - for i in range(len(self.model)): - parallel_state.set_virtual_pipeline_model_parallel_rank(i) - checkpoint[f'model{i}'] = self.model[i].module.state_dict_for_save_checkpoint() - parallel_state.set_virtual_pipeline_model_parallel_rank(0) + if self.mcore_bert: + checkpoint['sharded_state_dict'] = self.sharded_state_dict() + else: + if isinstance(self.model, list): + for i in range(len(self.model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + checkpoint[f'model{i}'] = self.model[i].module.state_dict_for_save_checkpoint() + parallel_state.set_virtual_pipeline_model_parallel_rank(0) def on_load_checkpoint(self, checkpoint) -> None: """LightningModule hook: https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-load-checkpoint """ - if isinstance(self.model, list): - for i in range(len(self.model)): - parallel_state.set_virtual_pipeline_model_parallel_rank(i) - self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) - parallel_state.set_virtual_pipeline_model_parallel_rank(0) + if self.mcore_bert: + if 'state_dict' in checkpoint and checkpoint['state_dict']: + for index, module in enumerate(self.get_model_module_list()): + if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: + checkpoint_state_dict = checkpoint['state_dict'][f'model_{index}'] + else: + checkpoint_state_dict = checkpoint['state_dict'] + # checkpoint_state_dict has "model." but module does not so we need to remove it when loading + checkpoint_state_dict = { + key.replace('model.', ''): checkpoint_state_dict.pop(key) + for key in list(checkpoint_state_dict.keys()) + } + module.load_state_dict(checkpoint_state_dict, strict=True) + else: + checkpoint['state_dict'] = {} + else: + if isinstance(self.model, list): + for i in range(len(self.model)): + parallel_state.set_virtual_pipeline_model_parallel_rank(i) + self.model[i].module.load_state_dict(checkpoint[f'model{i}'], strict=True) + parallel_state.set_virtual_pipeline_model_parallel_rank(0) def build_transformer_config(self) -> TransformerConfig: """ Builds the megatron core gpt transformer config for the model.
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index a651ada5c38a..f5b1667be27f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1479,9 +1479,7 @@ def setup(self, stage=None): Args: stage (str, optional): Can be 'fit', 'validate', 'test' or 'predict'. Defaults to None. """ - num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert( - self.model - ) + num_parameters_on_device, total_num_parameters = self._get_total_params_across_model_parallel_groups_gpt_bert() logging.info( f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' @@ -1570,7 +1568,7 @@ def generate( ) -> OutputType: # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 617a585ef3a9..5ee7a3fcf480 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -729,7 +729,7 @@ def generate( ): # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py index 3a7ad3d6714c..459bf5b71c7e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_lm_encoder_decoder_model.py @@ -1006,7 +1006,7 @@ def encode(self, tokens_enc, enc_mask, encoder_input=None, batch_data=None, reco Format is not defined and should match the expected format of the used hiddens modules. """ # Check whether the DDP is initialized. This is needed when running inference outside of training loop. 
- if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index ebe936a8178a..acd85261f7e5 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -482,7 +482,7 @@ def generate( ) -> OutputType: # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/models/nlp_model.py b/nemo/collections/nlp/models/nlp_model.py index 04bbb2ca17fe..65d8645688fd 100644 --- a/nemo/collections/nlp/models/nlp_model.py +++ b/nemo/collections/nlp/models/nlp_model.py @@ -385,7 +385,7 @@ def load_from_checkpoint( sharded_state_dict = model.sharded_state_dict() checkpoint['state_dict'] = sharded_state_dict # dist checkpointing needs torch.distributed to load the checkpoint - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 91f1fab348da..d4a75e3353c7 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -869,7 +869,7 @@ def save_to(self, model, save_path: str): sharded_state_dict = model.sharded_state_dict() # dist checkpoint needs torch.distributed to save the checkpoint - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return @@ -1110,7 +1110,7 @@ def restore_from( # if we're using dist checkpointing then state_dict will be None if state_dict is None: # dist checkpointing needs torch.distributed to load the checkpoint - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py index 435ca6a496b1..d60ede29e22e 100644 --- a/nemo/export/quantize/quantizer.py +++ b/nemo/export/quantize/quantizer.py @@ -133,7 +133,7 @@ def _load_model( return model def _check_ddp_initialized(self, model): - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git a/nemo/utils/distributed.py b/nemo/utils/distributed.py index 9649089e40af..443c0216785e 100644 --- a/nemo/utils/distributed.py +++ b/nemo/utils/distributed.py @@ -81,7 +81,7 @@ def gather_objects(partial_results_list, main_rank=None): pickle.dump(predictions, open(output_fname, "wb")) """ # do not fail when DDP is not initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): return partial_results_list rank = parallel_state.get_data_parallel_rank() diff --git a/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py b/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py index 61cbbc1ae682..334b3415a93b 100644 --- a/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py +++ b/scripts/nlp_language_modeling/convert_prompt_learning_ckpt_to_nemo.py @@ -104,7 +104,7 @@ def main(cfg) -> None: raise ValueError("need at least a nemo file or checkpoint dir") # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return diff --git 
a/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py b/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py index 4a373dcaf278..ee32f69bf734 100644 --- a/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py +++ b/scripts/nlp_language_modeling/service_launch_scripts/start_retro_model_service.py @@ -80,7 +80,7 @@ def main(cfg) -> None: ) # check whether the DDP is initialized - if parallel_state.is_unitialized(): + if not parallel_state.is_initialized(): def dummy(): return From f3d45fd64482b15a6b0f63e7079d6db1be4f46e6 Mon Sep 17 00:00:00 2001 From: Rachit Garg Date: Fri, 12 Apr 2024 11:30:45 -0700 Subject: [PATCH 25/39] remove fp8 checkpoints for Attention (#8875) * remove fp8 checkpoints for Attention Signed-off-by: rachitg * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: rachitg * set default value and support mha Signed-off-by: rachitg --------- Signed-off-by: rachitg Co-authored-by: rachitg Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../models/language_modeling/megatron_gpt_model.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index f5b1667be27f..d3f5a7afd631 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -89,6 +89,8 @@ from megatron.core import InferenceParams, parallel_state, tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset + from megatron.core.dist_checkpointing.dict_utils import dict_list_map_inplace + from megatron.core.dist_checkpointing.mapping import LocalNonpersitentObject, ShardedObject # NeMo's implementation of the get_gpt_layer_ammo_spec function is temporarily used # from megatron.core.inference.gpt.model_specs import get_gpt_layer_ammo_spec @@ -1739,6 +1741,15 @@ def sharded_state_dict(self, prefix: str = '') -> Dict[str, Any]: if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: parallel_state.set_virtual_pipeline_model_parallel_rank(0) + # WAR: This is a temporary fix to skip loading FP8 parameters for Dot Product Attention + def skip_fp8_load(x): + if isinstance(x, ShardedObject) and 'fused_attention' in x.key and '_extra_state' in x.key: + x = LocalNonpersitentObject(x.data) # use the FP8 state from initialization, not from ckpt + return x + + if self.cfg.get('fp8_dot_product_attention', False) or self.cfg.get('fp8_multi_head_attention', False): + dict_list_map_inplace(skip_fp8_load, sharded_state_dict) + return sharded_state_dict def parameters(self): From ac95b5c8cb49a8b3023f39d1eff75226b0478d1c Mon Sep 17 00:00:00 2001 From: anmolgupt <14880251+anmolgupt@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:49:22 -0700 Subject: [PATCH 26/39] lora a2a after linear out when sp is enabled and parallel input (#8882) * disable reduce for lora CPL bprop Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * a2a for linear out lora Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci * moved a2a tp to init Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * add lora config option for enable a2a Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * added custom all2all * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * code cleanup Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Anmol Gupta <14880251+anmolgupt@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../megatron/adapters/parallel_adapters.py | 51 ++++++++++++++++++- nemo/collections/nlp/parts/peft_config.py | 1 + 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py index 419126bd3f18..5037bb1b3634 100644 --- a/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py +++ b/nemo/collections/nlp/modules/common/megatron/adapters/parallel_adapters.py @@ -47,6 +47,7 @@ try: from megatron.core import ModelParallelConfig + from megatron.core.parallel_state import get_tensor_model_parallel_group, get_tensor_model_parallel_world_size from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear from megatron.core.tensor_parallel.mappings import ( gather_from_sequence_parallel_region, @@ -146,6 +147,7 @@ def __init__( model_parallel_config: Optional[ModelParallelConfig] = None, alpha: float | None = None, dropout_position: str = 'post', + a2a_experimental: bool = False, # TODO: should rename this or make it a default feature **kwargs, ): super().__init__() @@ -161,6 +163,9 @@ def __init__( self.alpha = alpha if alpha is not None else self.dim self.input_is_parallel = input_is_parallel self.dropout_position = dropout_position + self.tp_world_size = None + self.tp_group = None + self.use_a2a = a2a_experimental # megatron_gpt_peft_models will provide this arg, but deprecated ones do not. # in case this arg is not provided, use the dummy default config. @@ -202,12 +207,17 @@ def __init__( else: # (@adithyare) we use this option to mirror the behavior a column parallel layer with two low-rank column parallel layers # if the original column parallel layer uses gather_output=False, then we will use the self.liner_out layer defined below. 
+ lin_out_gather_output = True if input_is_parallel else False + if self.use_a2a and input_is_parallel and self._sequence_parallel: + lin_out_gather_output = False + self.tp_world_size = get_tensor_model_parallel_world_size() + self.tp_group = get_tensor_model_parallel_group() self.linear_out = ColumnParallelLinear( dim, out_features, config=model_parallel_config, bias=False, - gather_output=True if input_is_parallel else False, + gather_output=lin_out_gather_output, init_method=self._get_init_fn(row_init_method), ) @@ -291,7 +301,11 @@ def forward(self, x): # layernorm after lora is impacted by sequence parallel, # hence seq dim need to be scattered right after lora linear layers # this function also handles the backward pass correctly - x = scatter_to_sequence_parallel_region(x) + if self.use_a2a: + # all2all hidden_size / TP to seq_len / TP + x = all2all_hp2sp(x, self.tp_world_size, self.tp_group) + else: + x = scatter_to_sequence_parallel_region(x) if self.norm_position == 'post': x = self.layer_norm(x) @@ -305,6 +319,38 @@ def forward(self, x): return x +class _All2AllHp2Sp(torch.autograd.Function): + """ + All-2-All from Hidden Parallel to Sequence Parallel + This is a temporary workaround and can be updated in the future + TODO: Move the functionality to MCore + """ + + @staticmethod + def forward(ctx, input_, world_size, group): + ctx.world_size = world_size + ctx.group = group + send_list = list(input_.chunk(world_size, dim=0)) + send_list = [tensor.contiguous() for tensor in send_list] + receive_list = [torch.empty_like(send_list[0]) for _ in range(world_size)] + torch.distributed.all_to_all(receive_list, send_list, group=group) + x = torch.cat(receive_list, dim=-1) + return x + + @staticmethod + def backward(ctx, grad_output): + send_list = list(grad_output.chunk(ctx.world_size, dim=-1)) + send_list = [tensor.contiguous() for tensor in send_list] + receive_list = [torch.empty_like(send_list[0]) for _ in range(ctx.world_size)] + torch.distributed.all_to_all(receive_list, send_list, group=ctx.group) + x = torch.cat(receive_list, dim=0) + return x, None, None + + +def all2all_hp2sp(input_, world_size, group): + return _All2AllHp2Sp.apply(input_, world_size, group) + + @dataclass class ParallelLinearAdapterConfig(AdapterConfig): in_features: int @@ -321,6 +367,7 @@ class ParallelLinearAdapterConfig(AdapterConfig): dropout_position: str = 'post' alpha: float | None = None network_alpha: int | None = None + a2a_experimental: bool = False _target_: str = "{0}.{1}".format(ParallelLinearAdapter.__module__, ParallelLinearAdapter.__name__) diff --git a/nemo/collections/nlp/parts/peft_config.py b/nemo/collections/nlp/parts/peft_config.py index e6f0fe267d18..63caa409b218 100644 --- a/nemo/collections/nlp/parts/peft_config.py +++ b/nemo/collections/nlp/parts/peft_config.py @@ -184,6 +184,7 @@ def _create_lora_config(self, cfg, lora_cfg, in_features, out_features, adapter_ "dropout": lora_cfg.adapter_dropout, "alpha": lora_cfg.get("alpha", lora_cfg.adapter_dim), "dropout_position": lora_cfg.get("dropout_position", "post"), + "a2a_experimental": lora_cfg.get("a2a_experimental", False), } if lora_cfg.weight_tying: From 08ea4cb15889d604652115de8e4d8544a2a76776 Mon Sep 17 00:00:00 2001 From: Wil Kong Date: Sat, 13 Apr 2024 06:50:44 +0800 Subject: [PATCH 27/39] Fix Distributed Fused Adam Issues (#8880) * Fix distributed fused adam issue with NHWC layout. * Fix the CUDA graph issue if there's kernel in zero_grad. * Add option to distribute adam states within node. 
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- nemo/core/optim/distributed_adam.py | 35 +++++++++++++++++++++++++++++ nemo/utils/callbacks/cuda_graph.py | 9 +++++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/nemo/core/optim/distributed_adam.py b/nemo/core/optim/distributed_adam.py index 43a784cd7736..32bd7e6c1154 100644 --- a/nemo/core/optim/distributed_adam.py +++ b/nemo/core/optim/distributed_adam.py @@ -50,6 +50,11 @@ class MegatronDistributedFusedAdam(DistributedFusedAdam): disable_distributed_parameters (bool, optional): use standard data-parallel communication instead of ZeRO. (default: False) + distribute_within_nodes (bool, optional): distribute states + within the same node, e.g. DGX. This can improve performance + but requires larger memory than distributing within all + ranks, especially for pure data parallel models. + (default: False). **kwargs: keyword arguments to pass to Apex DistributedFusedAdam. @@ -59,6 +64,7 @@ def __init__( self, params: Union[Iterable[torch.nn.Parameter], Iterable[dict]], disable_distributed_parameters: bool = False, + distribute_within_nodes: bool = False, **kwargs, ): @@ -71,6 +77,28 @@ def __init__( self_groups = [torch.distributed.new_group(ranks=[i]) for i in range(world_size)] kwargs['distributed_process_group'] = self_groups[rank] kwargs['redundant_process_group'] = kwargs['process_group'] + elif distribute_within_nodes: + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + devices = torch.cuda.device_count() + nodes = world_size // devices + assert nodes * devices == world_size, "Expected all nodes to have the same amount of devices." + node_id = rank // devices + device_id = rank % devices + + distributed_pgs = [] + for i in range(nodes): + ranks = [i * devices + j for j in range(devices)] + pg = torch.distributed.new_group(ranks=ranks) + distributed_pgs.append(pg) + kwargs['distributed_process_group'] = distributed_pgs[node_id] + + redundant_pgs = [] + for i in range(devices): + ranks = [i + j * devices for j in range(nodes)] + pg = torch.distributed.new_group(ranks=ranks) + redundant_pgs.append(pg) + kwargs['redundant_process_group'] = redundant_pgs[device_id] # Make sure dtypes are in right type for keyword in ('dtype', 'grad_sync_dtype', 'param_sync_dtype'): @@ -425,6 +453,13 @@ def _param_copy_fragments(self, fragments: Iterable[DistributedFusedAdam.Paramet buffers_in.append(buffer_in) buffers_out.append(buffer_out) elif torch.is_floating_point(buffer_in) and torch.is_floating_point(param): + # Conv with NHWC layout, i.e. shape (N, C, H, W) and stride + # (HWC, 1, WC, C), can't `.view(-1)`. Here to turn it to + # tensor with shape (N, H, W, C) and stride (HWC, WC, C, 1).
+ # Note: https://github.com/NVIDIA/apex/pull/1794 + if param.is_contiguous(memory_format=torch.channels_last): + param = param.permute(0, 2, 3, 1) + # Cast between floating-point dtypes buffer_out = param.detach().view(-1)[param_start:param_end] buffers_in.append(buffer_in) diff --git a/nemo/utils/callbacks/cuda_graph.py b/nemo/utils/callbacks/cuda_graph.py index 247c67856c7b..77dc33e7b567 100644 --- a/nemo/utils/callbacks/cuda_graph.py +++ b/nemo/utils/callbacks/cuda_graph.py @@ -180,12 +180,19 @@ def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure=None,) - torch.cuda.current_stream().wait_stream(state.stream) if state.current_iteration == state.capture_iteration: - optimizer.zero_grad(**zero_grad_kwargs) torch.cuda.synchronize() # Sleep for one second to let environment stable time.sleep(1) rank_zero_info("CUDAGraphCallback: capturing CUDA graph for module %s.", self.__class__.__name__) with torch.cuda.graph(state.graph, stream=state.stream, capture_error_mode="global"): + # PyTorch CUDA graph doc for whole-network capturing mentions: + # + # Sets grads to None before capture, so backward() will create + # .grad attributes with allocations from the graph's private pool + # + # But it's not necessary, and it can lead to CUDA kernels inside + # `zero_grad()` being not captured. + optimizer.zero_grad(**zero_grad_kwargs) self.__orig_optimizer_step__( epoch, batch_idx, optimizer, optimizer_closure=optimizer_closure, ) From 21913a015d28293532a4550f1138df4a6d6e26e5 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Fri, 12 Apr 2024 18:16:05 -0700 Subject: [PATCH 28/39] update mcore 24.04.12 (#8910) --- .github/workflows/cicd-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a9509fda51e9..29ea34dba197 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -114,7 +114,7 @@ jobs: # Megatron Core installation git clone https://github.com/NVIDIA/Megatron-LM.git && \ pushd Megatron-LM && \ - git checkout 7fe863f3d94f7b64a927b04b85f5c9339d3fb784 && \ + git checkout f3a3020031f384ddafd9b7e9f3a587798c0aea21 && \ pip install . 
&& \ pushd megatron/core/datasets && \ make && \ From ac53e2296fcd9f699c928b62948a0b673c3817bc Mon Sep 17 00:00:00 2001 From: Jie Xin <932141413@qq.com> Date: Sat, 13 Apr 2024 09:33:30 +0800 Subject: [PATCH 29/39] Support alternative mapping TP->PP->DP (#8909) * support new tp-pp-dp mapping Signed-off-by: jxin * add test Signed-off-by: jxin * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * refine Signed-off-by: jxin * change mcore commit Signed-off-by: jxin --------- Signed-off-by: jxin Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper --- Jenkinsfile | 2 +- .../stable_diffusion/ldm/ddpm.py | 7 +- .../language_modeling/megatron_base_model.py | 1 + .../language_modeling/megatron_gpt_model.py | 8 +- .../modules/common/megatron/megatron_init.py | 95 ++++++------- nemo/collections/nlp/parts/nlp_overrides.py | 1 + nemo/utils/app_state.py | 9 ++ tests/collections/nlp/test_initialize.py | 134 ++++++++++++++++++ 8 files changed, 189 insertions(+), 68 deletions(-) create mode 100644 tests/collections/nlp/test_initialize.py diff --git a/Jenkinsfile b/Jenkinsfile index c98d13fbed38..55e836eea13a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -87,7 +87,7 @@ pipeline { steps { sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \ cd Megatron-LM && \ - git checkout f3a3020031f384ddafd9b7e9f3a587798c0aea21 && \ + git checkout fbb375d4b5e88ce52f5f7125053068caff47f93f && \ pip install . && \ cd megatron/core/datasets && \ make' diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 33a194500a69..a96c3c47e44e 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -1770,12 +1770,7 @@ def fwd_bwd_step(self, dataloader_iter, forward_only): # we can avoid this broadcast by updating the PTL log function to accept specific ranks if parallel_state.get_pipeline_model_parallel_world_size() > 1: if self.loss_broadcast_src_rank is None: - dp_size = parallel_state.get_data_parallel_world_size() - tp_size = parallel_state.get_tensor_model_parallel_world_size() - pp_size = parallel_state.get_pipeline_model_parallel_world_size() - rank_in_dp_tp_group = torch.distributed.get_rank() % (dp_size * tp_size) - last_pipeline_stage_offset = (tp_size * dp_size) * (pp_size - 1) - self.loss_broadcast_src_rank = last_pipeline_stage_offset + rank_in_dp_tp_group + self.loss_broadcast_src_rank = parallel_state.get_pipeline_model_parallel_last_rank() torch.distributed.broadcast( loss_mean, self.loss_broadcast_src_rank, group=parallel_state.get_pipeline_model_parallel_group(), ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 035d194de09f..f431d43716b9 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -195,6 +195,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): pipeline_model_parallel_size=cfg.get('pipeline_model_parallel_size', 1), virtual_pipeline_model_parallel_size=vp_size, pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0), + use_tp_pp_dp_mapping=cfg.get('use_tp_pp_dp_mapping', False), 
context_parallel_size=cfg.get('context_parallel_size', 1), micro_batch_size=cfg.get('micro_batch_size'), global_batch_size=cfg.get('global_batch_size'), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index d3f5a7afd631..ede72439615e 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1310,13 +1310,7 @@ def on_validation_epoch_end(self): # it should be casted to other pipeline stages for logging. if parallel_state.get_pipeline_model_parallel_world_size() > 1: if self.loss_broadcast_src_rank is None: - dp_size = parallel_state.get_data_parallel_world_size() - cp_size = parallel_state.get_context_parallel_world_size() - tp_size = parallel_state.get_tensor_model_parallel_world_size() - pp_size = parallel_state.get_pipeline_model_parallel_world_size() - rank_in_dp_tp_group = torch.distributed.get_rank() % (dp_size * cp_size * tp_size) - last_pipeline_stage_offset = (tp_size * cp_size * dp_size) * (pp_size - 1) - self.loss_broadcast_src_rank = last_pipeline_stage_offset + rank_in_dp_tp_group + self.loss_broadcast_src_rank = parallel_state.get_pipeline_model_parallel_last_rank() torch.distributed.broadcast( averaged_loss, self.loss_broadcast_src_rank, group=parallel_state.get_pipeline_model_parallel_group(), ) diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 7ba2e28008ac..5d5b65b360ee 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -32,6 +32,7 @@ try: from megatron.core import tensor_parallel from megatron.core.parallel_state import ( + RankGenerator, get_pipeline_model_parallel_rank, set_expert_model_parallel_rank, set_expert_model_parallel_world_size, @@ -74,6 +75,7 @@ def initialize_model_parallel_for_nemo( init_mpi_proc_group=False, seed=1234, apex_transformer_log_level=30, + use_tp_pp_dp_mapping=False, ): if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED: @@ -84,6 +86,7 @@ def initialize_model_parallel_for_nemo( app_state.global_rank = global_rank app_state.world_size = world_size app_state.local_rank = local_rank + app_state.use_tp_pp_dp_mapping = use_tp_pp_dp_mapping app_state.expert_model_parallel_size = expert_model_parallel_size app_state.tensor_model_parallel_size = tensor_model_parallel_size app_state.pipeline_model_parallel_size = pipeline_model_parallel_size @@ -108,6 +111,7 @@ def initialize_model_parallel_for_nemo( pipeline_model_parallel_split_rank_=pipeline_model_parallel_split_rank, context_parallel_size_=context_parallel_size, expert_model_parallel_size_=expert_model_parallel_size, + use_tp_pp_dp_mapping=use_tp_pp_dp_mapping, ) # update apex.transformer globals @@ -192,6 +196,7 @@ def fake_initialize_model_parallel( virtual_pipeline_model_parallel_size_=None, expert_model_parallel_size_=1, context_parallel_size_=1, + use_tp_pp_dp_mapping=False, ): """ Fake initialize model data parallel groups so that we can instantiate model parallel models before DDP is initialized. 
@@ -241,24 +246,29 @@ def fake_initialize_model_parallel( if virtual_pipeline_model_parallel_size_ is not None: virtual_pipeline_model_parallel_rank = 0 + rank_generator = RankGenerator( + tp=tensor_model_parallel_size, + ep=expert_model_parallel_size_, + dp=data_parallel_size, + pp=pipeline_model_parallel_size, + cp=context_parallel_size, + order='tp-pp-dp' if use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp', + ) + # Build the data-parallel groups. all_data_parallel_group_ranks_with_cp = [] - for i in range(pipeline_model_parallel_size): - start_rank = i * num_pipeline_model_parallel_groups - end_rank = (i + 1) * num_pipeline_model_parallel_groups - for j in range(context_parallel_size * tensor_model_parallel_size): - ranks = range(start_rank + j, end_rank, context_parallel_size * tensor_model_parallel_size) - if rank in ranks: - data_parallel_group = list(ranks) - logging.info(f'Rank {rank} has data parallel group : {data_parallel_group}') - for j in range(tensor_model_parallel_size): - ranks_with_cp = range(start_rank + j, end_rank, tensor_model_parallel_size) - all_data_parallel_group_ranks_with_cp.append(list(ranks_with_cp)) - if rank in ranks_with_cp: - data_parallel_group_with_cp = list(ranks_with_cp) - logging.info( - f'Rank {rank} has combined group of data parallel and context parallel : {data_parallel_group_with_cp}' - ) + for ranks in rank_generator.get_ranks('dp'): + if rank in ranks: + data_parallel_group = list(ranks) + logging.info(f'Rank {rank} has data parallel group : {data_parallel_group}') + + for ranks_with_cp in rank_generator.get_ranks('dp-cp'): + all_data_parallel_group_ranks_with_cp.append(ranks_with_cp) + if rank in ranks_with_cp: + data_parallel_group_with_cp = ranks_with_cp + logging.info( + f'Rank {rank} has combined group of data parallel and context parallel : {data_parallel_group_with_cp}' + ) data_parallel_rank = data_parallel_group.index(rank) logging.info( @@ -268,20 +278,11 @@ def fake_initialize_model_parallel( # Build the context-parallel groups. all_context_parallel_group_ranks = [] - for i in range(pipeline_model_parallel_size): - for j in range(data_parallel_size): - start_rank = ( - i * num_pipeline_model_parallel_groups + j * tensor_model_parallel_size * context_parallel_size - ) - end_rank = ( - i * num_pipeline_model_parallel_groups + (j + 1) * tensor_model_parallel_size * context_parallel_size - ) - for k in range(tensor_model_parallel_size): - ranks = range(start_rank + k, end_rank, tensor_model_parallel_size) - all_context_parallel_group_ranks.append(list(ranks)) - if rank in ranks: - context_parallel_group = list(ranks) - logging.info(f'Rank {rank} has context parallel group: {context_parallel_group}') + for ranks in rank_generator.get_ranks('cp'): + all_context_parallel_group_ranks.append(ranks) + if rank in ranks: + context_parallel_group = ranks + logging.info(f'Rank {rank} has context parallel group: {context_parallel_group}') context_parallel_rank = context_parallel_group.index(rank) logging.info(f'All context parallel group ranks: {all_context_parallel_group_ranks}') @@ -289,11 +290,7 @@ def fake_initialize_model_parallel( # Build the model-parallel groups. 
all_model_parallel_group_ranks = [] - for i in range(data_parallel_size * context_parallel_size): - ranks = [ - data_parallel_group_ranks_with_cp[i] - for data_parallel_group_ranks_with_cp in all_data_parallel_group_ranks_with_cp - ] + for ranks in rank_generator.get_ranks('tp-pp'): all_model_parallel_group_ranks.append(ranks) if rank in ranks: logging.info(f'Rank {rank} has model parallel group: {list(ranks)}') @@ -302,11 +299,10 @@ def fake_initialize_model_parallel( # Build the tensor model-parallel groups. all_tensor_model_parallel_group_ranks = [] tensor_model_parallel_group = None - for i in range(num_tensor_model_parallel_groups): - ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) - all_tensor_model_parallel_group_ranks.append(list(ranks)) + for ranks in rank_generator.get_ranks('tp'): + all_tensor_model_parallel_group_ranks.append(ranks) if rank in ranks: - tensor_model_parallel_group = list(ranks) + tensor_model_parallel_group = ranks logging.info(f'Rank {rank} has tensor model parallel group: {tensor_model_parallel_group}') tensor_model_parallel_rank = tensor_model_parallel_group.index(rank) @@ -317,17 +313,9 @@ def fake_initialize_model_parallel( # EP rank expert_model_parallel_rank = 0 if expert_model_parallel_size_ is not None and expert_model_parallel_size_ > 1: - tensor_and_data_group_size: int = tensor_model_parallel_size * data_parallel_size - num_tensor_and_data_groups: int = world_size // tensor_and_data_group_size - tensor_and_expert_group_size: int = tensor_model_parallel_size * expert_model_parallel_size_ - num_expert_groups: int = data_parallel_size // expert_model_parallel_size_ - for i in range(num_tensor_and_data_groups): - for j in range(num_expert_groups): - start_rank = i * tensor_and_data_group_size + j * tensor_and_expert_group_size - end_rank = i * tensor_and_data_group_size + (j + 1) * tensor_and_expert_group_size - ranks = range(start_rank, end_rank) - if rank in ranks: - expert_model_parallel_rank = list(ranks).index(rank) // tensor_model_parallel_size + for ranks in rank_generator.get_ranks('ep', independent_ep=True): + if rank in ranks: + expert_model_parallel_rank = list(ranks).index(rank) // tensor_model_parallel_size # Build the pipeline model-parallel groups and embedding groups # (first and last rank in each pipeline model-parallel group). 
@@ -336,11 +324,10 @@ def fake_initialize_model_parallel( pipeline_model_parallel_group = None embedding_group = None embedding_rank = None - for i in range(num_pipeline_model_parallel_groups): - ranks = range(i, world_size, num_pipeline_model_parallel_groups) - all_pipeline_model_parallel_group_ranks.append(list(ranks)) + for ranks in rank_generator.get_ranks('pp'): + all_pipeline_model_parallel_group_ranks.append(ranks) if rank in ranks: - pipeline_model_parallel_group = list(ranks) + pipeline_model_parallel_group = ranks logging.info(f'Rank {rank} has pipeline model parallel group: {pipeline_model_parallel_group}') # Setup embedding group (to exchange gradients between diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index d4a75e3353c7..983b76784a66 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -137,6 +137,7 @@ def init_model_parallel(sharp: bool, nccl_communicator_config_path: str = None) nccl_communicator_config_path=nccl_communicator_config_path, use_sharp=sharp, expert_model_parallel_size=app_state.expert_model_parallel_size, + order='tp-pp-dp' if app_state.use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp', ) # assert that fake tp and pp rank match after model parallel init diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 8ba9880219ec..34a03fc28871 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -55,6 +55,7 @@ def __init__(self): self._is_megatron_initialized = False self._data_parallel_size = None self._data_parallel_group = None + self._use_tp_pp_dp_mapping = False self._megatron_checkpoint_version = None self._use_fp8 = False self._context_parallel_size = None @@ -191,6 +192,14 @@ def pipeline_model_parallel_size(self, size): """ self._pipeline_model_parallel_size = size + @property + def use_tp_pp_dp_mapping(self): + return self._use_tp_pp_dp_mapping + + @use_tp_pp_dp_mapping.setter + def use_tp_pp_dp_mapping(self, use_new_mapping): + self._use_tp_pp_dp_mapping = use_new_mapping + @property def virtual_pipeline_model_parallel_size(self): """ Property returns the number of GPUs in each model parallel group. diff --git a/tests/collections/nlp/test_initialize.py b/tests/collections/nlp/test_initialize.py new file mode 100644 index 000000000000..b8e27573ce61 --- /dev/null +++ b/tests/collections/nlp/test_initialize.py @@ -0,0 +1,134 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel + + +def old_fake_initialize_model_parallel( + world_size, + rank, + tensor_model_parallel_size_, + pipeline_model_parallel_size_, + pipeline_model_parallel_split_rank_=None, + virtual_pipeline_model_parallel_size_=None, + expert_model_parallel_size_=1, + context_parallel_size_=1, +): + # Get world size and rank. Ensure some consistencies. 
+ tensor_model_parallel_size = min(tensor_model_parallel_size_, world_size) + pipeline_model_parallel_size = min(pipeline_model_parallel_size_, world_size) + model_parallel_size = tensor_model_parallel_size * pipeline_model_parallel_size + context_parallel_size = min(context_parallel_size_, world_size) + + assert ( + world_size % (tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size) == 0 + ), f'world_size: {world_size} must be divisible by tensor_model_parallel_size: {tensor_model_parallel_size} times pipeline_model_parallel_size {pipeline_model_parallel_size} times context_parallel_size {context_parallel_size}' + data_parallel_size = world_size // ( + tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size + ) + + num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size + num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size + + virtual_pipeline_model_parallel_rank = None + if virtual_pipeline_model_parallel_size_ is not None: + virtual_pipeline_model_parallel_rank = 0 + + # Build the tensor model-parallel groups. + tensor_model_parallel_group = None + for i in range(num_tensor_model_parallel_groups): + ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) + if rank in ranks: + tensor_model_parallel_group = list(ranks) + + tensor_model_parallel_rank = tensor_model_parallel_group.index(rank) + + # EP rank + expert_model_parallel_rank = 0 + if expert_model_parallel_size_ is not None and expert_model_parallel_size_ > 1: + tensor_and_data_group_size: int = tensor_model_parallel_size * data_parallel_size + num_tensor_and_data_groups: int = world_size // tensor_and_data_group_size + tensor_and_expert_group_size: int = tensor_model_parallel_size * expert_model_parallel_size_ + num_expert_groups: int = data_parallel_size // expert_model_parallel_size_ + for i in range(num_tensor_and_data_groups): + for j in range(num_expert_groups): + start_rank = i * tensor_and_data_group_size + j * tensor_and_expert_group_size + end_rank = i * tensor_and_data_group_size + (j + 1) * tensor_and_expert_group_size + ranks = range(start_rank, end_rank) + if rank in ranks: + expert_model_parallel_rank = list(ranks).index(rank) // tensor_model_parallel_size + + # Build the pipeline model-parallel groups and embedding groups + # (first and last rank in each pipeline model-parallel group). 
+ pipeline_model_parallel_group = None + for i in range(num_pipeline_model_parallel_groups): + ranks = range(i, world_size, num_pipeline_model_parallel_groups) + if rank in ranks: + pipeline_model_parallel_group = list(ranks) + + pipeline_model_parallel_rank = pipeline_model_parallel_group.index(rank) + + return ( + tensor_model_parallel_rank, + pipeline_model_parallel_rank, + expert_model_parallel_rank, + model_parallel_size, + data_parallel_size, + pipeline_model_parallel_split_rank_, + virtual_pipeline_model_parallel_rank, + ) + + +@pytest.mark.parametrize( + 'nodes, num_gpu, tp, pp, cp, ep', + [ + (1, 1, 1, 1, 1, 1), + (4, 8, 2, 4, 1, 1), + (8, 8, 8, 8, 1, 1), + (16, 8, 4, 8, 1, 1), + (16, 8, 4, 8, 4, 1), + (32, 8, 8, 8, 1, 1), + (32, 8, 4, 8, 1, 4), + (32, 8, 8, 8, 4, 1), + ], +) +def test_fake_initialize(nodes, num_gpu, tp, pp, cp, ep): + ( + tensor_model_parallel_rank, + pipeline_model_parallel_rank, + expert_model_parallel_rank, + model_parallel_size, + data_parallel_size, + pipeline_model_parallel_split_rank, + virtual_pipeline_model_parallel_rank, + ) = old_fake_initialize_model_parallel(nodes * num_gpu, 0, tp, pp, None, None, ep, cp) + + ( + m_tensor_model_parallel_rank, + n_pipeline_model_parallel_rank, + n_expert_model_parallel_rank, + n_model_parallel_size, + n_data_parallel_size, + n_pipeline_model_parallel_split_rank, + n_virtual_pipeline_model_parallel_rank, + ) = fake_initialize_model_parallel(nodes * num_gpu, 0, tp, pp, None, None, ep, cp) + assert m_tensor_model_parallel_rank == tensor_model_parallel_rank + assert n_pipeline_model_parallel_rank == pipeline_model_parallel_rank + assert n_expert_model_parallel_rank == expert_model_parallel_rank + assert n_model_parallel_size == model_parallel_size + assert n_data_parallel_size == data_parallel_size + assert n_pipeline_model_parallel_split_rank == pipeline_model_parallel_split_rank + assert n_virtual_pipeline_model_parallel_rank == virtual_pipeline_model_parallel_rank From cb22d71e335bc25bfe09947c64a2223550fc65ae Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Fri, 12 Apr 2024 20:03:36 -0600 Subject: [PATCH 30/39] update package info (#8793) Signed-off-by: eharper --- Dockerfile | 2 +- nemo/package_info.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 970c34a690d4..fa825d61f015 100644 --- a/Dockerfile +++ b/Dockerfile @@ -141,7 +141,7 @@ COPY . . # start building the final container FROM nemo-deps as nemo -ARG NEMO_VERSION=1.23.0 +ARG NEMO_VERSION=2.0.0 # Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container # version information as runtime environment variable for introspection purposes diff --git a/nemo/package_info.py b/nemo/package_info.py index e0ff2247e6ad..b253927a6b38 100644 --- a/nemo/package_info.py +++ b/nemo/package_info.py @@ -13,8 +13,8 @@ # limitations under the License. 
-MAJOR = 1 -MINOR = 23 +MAJOR = 2 +MINOR = 0 PATCH = 0 PRE_RELEASE = 'rc0' From 378a9b3d9845a02eacc392e267f2e66dc62f151f Mon Sep 17 00:00:00 2001 From: Rachit Garg Date: Sat, 13 Apr 2024 10:05:18 -0700 Subject: [PATCH 31/39] Rachitg/dpa (#8911) * remove fp8 checkpoints for Attention Signed-off-by: rachitg * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes Signed-off-by: rachitg * set default value and support mha Signed-off-by: rachitg * skip by default Signed-off-by: rachitg --------- Signed-off-by: rachitg Co-authored-by: rachitg Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../nlp/models/language_modeling/megatron_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index ede72439615e..4493532f88bf 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1741,7 +1741,7 @@ def skip_fp8_load(x): x = LocalNonpersitentObject(x.data) # use the FP8 state from initialization, not from ckpt return x - if self.cfg.get('fp8_dot_product_attention', False) or self.cfg.get('fp8_multi_head_attention', False): + if self.cfg.get('skip_fp8_attention_checkpoint_load', True): dict_list_map_inplace(skip_fp8_load, sharded_state_dict) return sharded_state_dict From de983ff6eb164944197c0e96807c3ee74119057c Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Sat, 13 Apr 2024 15:28:26 -0700 Subject: [PATCH 32/39] update mcore (#8917) --- .github/workflows/cicd-main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 29ea34dba197..c4350a42f59b 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -114,7 +114,7 @@ jobs: # Megatron Core installation git clone https://github.com/NVIDIA/Megatron-LM.git && \ pushd Megatron-LM && \ - git checkout f3a3020031f384ddafd9b7e9f3a587798c0aea21 && \ + git checkout fbb375d4b5e88ce52f5f7125053068caff47f93f && \ pip install . 
&& \ pushd megatron/core/datasets && \ make && \ From dca6f7427b2c1c19a28d1023dcc5a1d789f523ea Mon Sep 17 00:00:00 2001 From: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 15 Apr 2024 09:30:43 -0700 Subject: [PATCH 33/39] Remove precision args in trainer due to PTL update (#8908) * Fix precision args in trainer due to PTL update Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * roll back one change Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Pablo Garay --- .../multimodal_llm/neva/convert_hf_llava_to_neva.py | 2 +- .../megatron_change_num_partitions.py | 10 +++------- .../convert_baichuan2_hf_to_nemo.py | 2 +- .../convert_chatglm_hf_to_nemo.py | 2 +- .../convert_mistral_7b_hf_to_nemo.py | 2 +- .../convert_mixtral_hf_to_nemo.py | 2 +- .../convert_starcoder2_hf_to_nemo.py | 2 +- 7 files changed, 9 insertions(+), 13 deletions(-) diff --git a/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py b/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py index c9263ea85bbf..2cbb4c2b3b82 100644 --- a/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py +++ b/examples/multimodal/multimodal_llm/neva/convert_hf_llava_to_neva.py @@ -205,7 +205,7 @@ def convert(args): nemo_config.precision = precision print(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = hf_config["hidden_size"] head_num = hf_config["num_attention_heads"] diff --git a/examples/nlp/language_modeling/megatron_change_num_partitions.py b/examples/nlp/language_modeling/megatron_change_num_partitions.py index 436661e01b5d..c035346e3bf1 100644 --- a/examples/nlp/language_modeling/megatron_change_num_partitions.py +++ b/examples/nlp/language_modeling/megatron_change_num_partitions.py @@ -938,7 +938,7 @@ def main(): # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both # precision plugins and precision to exist precision = None - trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision) + trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu") if tp_size < 0 or pp_size < 0: logging.info(f"Loading model config from {args.model_file} to get TP and PP size") @@ -1205,9 +1205,7 @@ def main(): if vp_size > 1: set_virtual_parallel_rank_safely(None) - trainer = Trainer( - plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision - ) + trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu") with open_dict(model.cfg): if args.tokenizer_model_path is not None: @@ -1413,9 +1411,7 @@ def main(): app_state.pipeline_model_parallel_size * app_state.tensor_model_parallel_size ) - trainer = Trainer( - plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=precision - ) + trainer = Trainer(plugins=plugins, devices=1, strategy=NLPDDPStrategy(), accelerator="cpu") if args.tokenizer_model_path is not None: with open_dict(model.cfg): model.cfg.tokenizer.model = args.tokenizer_model_path diff --git 
a/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py b/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py index 585741de9b9a..b87f7e028cdb 100644 --- a/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_baichuan2_hf_to_nemo.py @@ -175,7 +175,7 @@ def convert(args): nemo_config.precision = precision print(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = hf_config["hidden_size"] head_num = hf_config["num_attention_heads"] diff --git a/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py b/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py index c3f210deefac..363e4de09ef7 100644 --- a/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_chatglm_hf_to_nemo.py @@ -142,7 +142,7 @@ def convert(args): nemo_config.precision = precision - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = hf_config["hidden_size"] head_num = hf_config["num_attention_heads"] diff --git a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py index db0fe28cbf73..cb11bb5da564 100644 --- a/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py @@ -193,7 +193,7 @@ def convert(args): nemo_config.precision = precision logging.info(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = nemo_config.hidden_size head_num = nemo_config.num_attention_heads diff --git a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py index d8ad9d5030b8..ac323757a2f6 100644 --- a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py @@ -194,7 +194,7 @@ def convert(args): nemo_config.precision = precision print(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = nemo_config.hidden_size head_num = nemo_config.num_attention_heads diff --git a/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py b/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py index eccca3a04621..fc898c797a9e 100644 --- a/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_starcoder2_hf_to_nemo.py @@ -194,7 +194,7 @@ def convert(args): nemo_config.precision = precision logging.info(f"nemo_config: {nemo_config}") - trainer = Trainer(plugins=plugins, accelerator='cpu', precision=precision, strategy=NLPDDPStrategy()) + trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) hidden_size = nemo_config.hidden_size head_num = nemo_config.num_attention_heads From e9d826657d2f10f7e632f091996908f85e64fa2e Mon Sep 17 00:00:00 2001 From: yaoyu-33 
<54727607+yaoyu-33@users.noreply.github.com> Date: Mon, 15 Apr 2024 09:30:55 -0700 Subject: [PATCH 34/39] Fix module.training for neva in FusedAttn backward (#8877) Signed-off-by: yaoyu-33 Co-authored-by: Pablo Garay --- nemo/collections/multimodal/data/neva/neva_dataset.py | 2 +- .../multimodal/models/multimodal_llm/neva/neva_model.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/nemo/collections/multimodal/data/neva/neva_dataset.py b/nemo/collections/multimodal/data/neva/neva_dataset.py index 15d755a7d59a..71d9bda12de1 100644 --- a/nemo/collections/multimodal/data/neva/neva_dataset.py +++ b/nemo/collections/multimodal/data/neva/neva_dataset.py @@ -782,7 +782,7 @@ class DataCollatorForSupervisedDataset(object): def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: max_len = max(instance['tokens'].shape[0] for instance in instances) - max_len = (max_len - 1) // 4 * 4 + 4 + max_len = (max_len - 1) // 64 * 64 + 64 for instance in instances: pad_len = max_len - instance['tokens'].shape[0] instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0) diff --git a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py index 4556ba1b3e72..cff8ab1a7b5f 100644 --- a/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py +++ b/nemo/collections/multimodal/models/multimodal_llm/neva/neva_model.py @@ -387,9 +387,6 @@ def __init__( def freeze_llm(self, mm_cfg): for param in chain(self.embedding.parameters(), self.decoder.parameters(), self.output_layer.parameters(),): param.requires_grad = False - self.embedding = self.embedding.eval() - self.decoder = self.decoder.eval() - self.output_layer = self.output_layer.eval() def forward( self, *args, **kwargs, From 32e630220d2c24550a869f43ce618949ccf1a1a5 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Tue, 16 Apr 2024 01:17:16 -0400 Subject: [PATCH 35/39] Updates for TRT-LLM 0.9 (#8873) * upgrade to trtllm0.9 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update gpt to config based export Signed-off-by: Onur Yilmaz * fix for lora checkpoint * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix for in flight batching case * Update falcon for trt-llm 0.9 Signed-off-by: Onur Yilmaz * Removed unused import and comment Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz Co-authored-by: abharwani Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/export/trt_llm/decoder/falcon.py | 6 +-- nemo/export/trt_llm/decoder/gpt.py | 46 +++++++++++++---------- nemo/export/trt_llm/decoder/llama.py | 6 +-- nemo/export/trt_llm/tensorrt_llm_build.py | 4 ++ nemo/export/trt_llm/tensorrt_llm_model.py | 18 +++------ nemo/export/trt_llm/tensorrt_llm_run.py | 5 ++- 6 files changed, 44 insertions(+), 41 deletions(-) diff --git a/nemo/export/trt_llm/decoder/falcon.py b/nemo/export/trt_llm/decoder/falcon.py index b0e69d2b99c4..91edc7794607 100644 --- a/nemo/export/trt_llm/decoder/falcon.py +++ b/nemo/export/trt_llm/decoder/falcon.py @@ -17,8 +17,7 @@ from tensorrt_llm.functional import non_gated_version from tensorrt_llm.models.falcon.model import FalconDecoderLayer -from tensorrt_llm.models.modeling_utils import 
PretrainedConfig -from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig from typing_extensions import override from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder @@ -119,8 +118,7 @@ def build_decoder(self, layer): world_size=self.tensor_parallel, tp_size=self.tensor_parallel, pp_size=1, - quant_mode=QuantMode(0), - quant_kwargs=None, + quantization=QuantConfig(), max_lora_rank=layer.max_lora_rank, use_parallel_embedding=False, ) diff --git a/nemo/export/trt_llm/decoder/gpt.py b/nemo/export/trt_llm/decoder/gpt.py index 294ccb737c1f..8af4e4ef01e4 100644 --- a/nemo/export/trt_llm/decoder/gpt.py +++ b/nemo/export/trt_llm/decoder/gpt.py @@ -17,6 +17,7 @@ from tensorrt_llm.layers import AttentionMaskType, PositionEmbeddingType from tensorrt_llm.models.gpt.model import GPTDecoderLayer +from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig from typing_extensions import override from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder @@ -85,13 +86,10 @@ class GPTDecoderLayerBuilder(DecoderLayerBuilder): @override def build_decoder(self, layer): rotary_pct = layer.rotary_pct - position_embedding_type = ( - PositionEmbeddingType.rope_gpt_neox - if layer.position_embedding_type == "rope" - else PositionEmbeddingType.learned_absolute - ) - assert not (position_embedding_type == PositionEmbeddingType.rope_gpt_neox and rotary_pct == 0.0) + position_embedding_type = "rope_gpt_neox" if layer.position_embedding_type == "rope" else "learned_absolute" + + assert not (position_embedding_type == "rope_gpt_neox" and rotary_pct == 0.0) bias_qkv = layer.attention.qkv.bias is not None @@ -99,23 +97,33 @@ def build_decoder(self, layer): if layer.rotary_scaling is not None: rotary_scaling = {"type": "linear", "factor": float(layer.rotary_scaling)} - return GPTDecoderLayer( + config = PretrainedConfig( + architecture=None, + dtype=self.dtype, + logits_dtype=self.dtype, + vocab_size=layer.vocab_size, + max_position_embeddings=self.max_position_embeddings, hidden_size=self.hidden_size, + num_hidden_layers=self.num_layers, num_attention_heads=self.num_attention_heads, - max_position_embeddings=self.max_position_embeddings, - num_layers=self.num_layers, - dtype=self.dtype, - apply_query_key_layer_scaling=False, - attention_mask_type=AttentionMaskType.causal, + num_key_value_heads=self.num_kv_heads, hidden_act=self.hidden_act, + intermediate_size=layer.ffn_hidden_size_local * self.tensor_parallel, + norm_epsilon=layer.norm_epsilon, position_embedding_type=position_embedding_type, - rotary_embedding_percentage=rotary_pct, - rotary_base=layer.rotary_base, - rotary_scaling=rotary_scaling, - inter_size=layer.ffn_hidden_size_local * self.tensor_parallel, - bias=bias_qkv, - num_kv_heads=self.num_kv_heads, - tp_group=self.tp_group, + world_size=self.tensor_parallel, tp_size=self.tensor_parallel, + pp_size=1, max_lora_rank=layer.max_lora_rank, + quantization=QuantConfig(), ) + + config.set_if_not_exist('hidden_act', self.hidden_act) + config.set_if_not_exist('apply_query_key_layer_scaling', False) + config.set_if_not_exist('bias', bias_qkv) + config.set_if_not_exist('rotary_base', layer.rotary_base) + config.set_if_not_exist('rotary_scaling', rotary_scaling) + config.set_if_not_exist('rotary_pct', rotary_pct) + config.set_if_not_exist('moe_num_experts', 0) + + return GPTDecoderLayer(config=config, layer_idx=self.layer_id,) diff --git 
a/nemo/export/trt_llm/decoder/llama.py b/nemo/export/trt_llm/decoder/llama.py index e554e18608f7..873c0306375b 100644 --- a/nemo/export/trt_llm/decoder/llama.py +++ b/nemo/export/trt_llm/decoder/llama.py @@ -18,8 +18,7 @@ from tensorrt_llm.functional import non_gated_version from tensorrt_llm.layers import MoeConfig from tensorrt_llm.models.llama.model import LLaMADecoderLayer -from tensorrt_llm.models.modeling_utils import PretrainedConfig -from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.models.modeling_utils import PretrainedConfig, QuantConfig from typing_extensions import override from nemo.export.trt_llm.decoder.decoder import DecoderLayerBuilder, DecoderLayerConfigBuilder @@ -118,9 +117,8 @@ def build_decoder(self, layer): world_size=self.tensor_parallel, tp_size=self.tensor_parallel, pp_size=1, - quant_mode=QuantMode(0), - quant_kwargs=None, max_lora_rank=layer.max_lora_rank, + quantization=QuantConfig(), ) config.set_if_not_exist('mlp_bias', False) diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index 0941a6d3dbba..3ad27a2eb9a6 100644 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -27,6 +27,7 @@ from tensorrt_llm._utils import np_dtype_to_trt from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger +from tensorrt_llm.models.modeling_utils import add_lora from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -170,6 +171,9 @@ def _build_impl(tensorrt_llm_model, args): timing_cache_file = args.timing_cache if args.timing_cache else args.output_dir / "model.cache" timing_cache = timing_cache_file + if args.use_lora_plugin is not None: + add_lora(tensorrt_llm_model, args.max_lora_rank) + builder = Builder() apply_query_key_layer_scaling = False diff --git a/nemo/export/trt_llm/tensorrt_llm_model.py b/nemo/export/trt_llm/tensorrt_llm_model.py index b2da7855ccdc..52e9c4960fc9 100644 --- a/nemo/export/trt_llm/tensorrt_llm_model.py +++ b/nemo/export/trt_llm/tensorrt_llm_model.py @@ -144,15 +144,7 @@ def forward( if attention_mask is not None: attention_mask = expand_mask(attention_mask, shape(input_ids, -1)) - for layer_idx, (layer, past, pointer, host_pointer, max_attention_window_size) in enumerate( - zip( - self.layers, - kv_cache_params.past_key_value, - kv_cache_params.kv_cache_block_pointers, - kv_cache_params.host_kv_cache_block_pointers, - kv_cache_params.host_max_attention_window_sizes, - ) - ): + for layer_idx, (layer, past) in enumerate(zip(self.layers, kv_cache_params.past_key_value,)): decoder_params = { "hidden_states": hidden_states, @@ -161,8 +153,8 @@ def forward( "kv_cache_params": KeyValueCacheParams( past_key_value=[past], host_past_key_value_lengths=kv_cache_params.host_past_key_value_lengths, - kv_cache_block_pointers=[pointer], - host_max_attention_window_sizes=max_attention_window_size, + kv_cache_block_pointers=kv_cache_params.kv_cache_block_pointers, + host_max_attention_window_sizes=kv_cache_params.host_max_attention_window_sizes, cache_indirection=kv_cache_params.cache_indirection, host_sink_token_length=kv_cache_params.host_sink_token_length, host_kv_cache_block_pointers=kv_cache_params.host_kv_cache_block_pointers, @@ -329,8 +321,8 @@ def prepare_inputs( past_key_value=model_inputs['past_key_value'], host_past_key_value_lengths=model_inputs['host_past_key_value_lengths'], 
host_max_attention_window_sizes=model_inputs['host_max_attention_window_sizes'], - kv_cache_block_pointers=model_inputs['kv_cache_block_pointers_list'], - host_kv_cache_block_pointers=model_inputs['host_kv_cache_block_pointers_list'], + kv_cache_block_pointers=model_inputs['kv_cache_block_pointers'], + host_kv_cache_block_pointers=model_inputs['host_kv_cache_block_pointers'], cache_indirection=model_inputs['cache_indirection'], host_sink_token_length=model_inputs['host_sink_token_length'], ), diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index cdc0b78d6c18..1e24f4f207a4 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -24,12 +24,14 @@ import torch from mpi4py.futures import MPIPoolExecutor from tensorrt_llm.logger import logger +from tensorrt_llm.lora_manager import LoraManager from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import LoraManager, ModelConfig, SamplingConfig +from tensorrt_llm.runtime import ModelConfig, SamplingConfig from transformers import PreTrainedTokenizer from nemo.export.trt_llm.tensor_utils import get_tensor_parallel_group from nemo.export.trt_llm.tensorrt_llm_model import LMHeadModelBuilder + from nemo.export.trt_llm.tensorrt_llm_build import get_engine_name, MODEL_NAME, refit_runtime_engine # isort:skip from nemo.export.trt_llm.nemo_utils import to_word_list_format # isort:skip @@ -90,6 +92,7 @@ def _read_config(config_path: Path): model_config = ModelConfig( model_name=config["builder_config"]["name"], max_batch_size=config["builder_config"]["max_batch_size"], + max_beam_width=config["builder_config"]["max_beam_width"], vocab_size=config["builder_config"]["vocab_size"], num_layers=config["builder_config"]["num_layers"], num_heads=num_heads, From c6c45c41ecf4ce7d115ec66d50bb3acc763ff4b0 Mon Sep 17 00:00:00 2001 From: huvunvidia <86480512+huvunvidia@users.noreply.github.com> Date: Tue, 16 Apr 2024 08:26:25 -0400 Subject: [PATCH 36/39] Huvu/mcore retro (#8861) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update branch Signed-off-by: eharper * Add dist ckpt support for regular optimizers (#7749) * Add dist ckpt support for regular optimizers Signed-off-by: Mikołaj Błaż * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * fix imports Signed-off-by: dimapihtar * imports fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ci imports fix Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert asr notebook Signed-off-by: dimapihtar * revert asr notebook Signed-off-by: dimapihtar --------- Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Pin lhotse=1.19.2 in r1.23.0 (#8303) Signed-off-by: Piotr Żelasko * Cache Aware Streaming tutorial notebook (#8296) * add notebook Signed-off-by: Elena Rastorgueva * rename old notebook to Buffered_Streaming Signed-off-by: Elena Rastorgueva * call setup_streaming_params in set_default_att_context_size method Signed-off-by: Elena Rastorgueva * update links in docs Signed-off-by: Elena Rastorgueva * update links to tutorials in docs Signed-off-by: Elena Rastorgueva * remove hard-coding Signed-off-by: Elena Rastorgueva * rename var Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva * fix path location and branch (#8304) * fix path location and branch Signed-off-by: Nithin Rao Koluguri * change to a floating point number Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri Co-authored-by: Somshubra Majumdar * add deallocate pipeline output optimization (#8279) * add deallocate pipeline output optimization Signed-off-by: Jimmy Zhang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Jimmy Zhang Co-authored-by: Jimmy Zhang Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fix memory leak caused by context parallelism hanging references by omegaconf (#8299) * save cp_size to self Signed-off-by: Jimmy Zhang * use parallel_state instead of self Signed-off-by: Jimmy Zhang --------- Signed-off-by: Jimmy Zhang Co-authored-by: Jimmy Zhang Co-authored-by: Eric Harper * remove assertion (#8302) Signed-off-by: dimapihtar * Update PEFT Doc (#8262) * update peft doc Signed-off-by: Chen Cui * remove old prompt learning doc and notebook Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * fix table Signed-off-by: Chen Cui * Merge branch 'r1.23.0' into chcui/update_peft_doc Signed-off-by: Chen Cui * revert accidental changes Signed-off-by: Chen Cui * revert accidental changes Signed-off-by: Chen Cui --------- Signed-off-by: Chen Cui * Attention encoder-decoder models for multiple speech-to-text tasks (#8242) (#8324) * Rebasing canary changes at current main Signed-off-by: Piotr Żelasko * Move the changes from asr transformer to nlp transformer as originally intended Signed-off-by: Piotr Żelasko * update eval to strip spaces before punctuations Signed-off-by: stevehuang52 * update pc strip Signed-off-by: stevehuang52 * [canary] Refactor: `PromptedAudioToTextLhotseDataset` and `EncDecMultiTaskModel` (#8247) * Create a separate 
CanaryDataset and use it inside `transformer_bpe_models.py`. Ditches `token_sequence_format`. Signed-off-by: Piotr Żelasko * [canary] Refactor: move changes in transformer_bpe_models.py to Canar… (#8252) * [canary] Refactor: move changes in transformer_bpe_models.py to CanaryModel Signed-off-by: Piotr Żelasko * Rename `CanaryModel` to `EncDecMultiTaskModel` and remove inheritance from `EncDecTransfModelBPE`; add a separate config for this model Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * Rename `CanaryDataset` to `PromptedAudioToTextLhotseDataset`; add `prompt_format_fn` argument; clean-up the `_canary_prompt_format` function a bit Signed-off-by: Piotr Żelasko * Move tokenization into `prompt_format_fn`, fix usage, add docs Signed-off-by: Piotr Żelasko * Backward-compatible utterance validation Signed-off-by: Piotr Żelasko * Improve type annotations Signed-off-by: Piotr Żelasko * config and prompt_fn registration changes from review Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * fix transcribe config Signed-off-by: stevehuang52 * Refactor Canary to follow schema of remaining ASR models (#8260) * Initial draft of multi task beam decoding strategy Signed-off-by: smajumdar * Stabilize inference Signed-off-by: smajumdar * Update AED Multi Task model to mostly conform to Archetype-Type format. Update config Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add change decoding strategy Signed-off-by: smajumdar * Remove redundant imports Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Cleanup Signed-off-by: smajumdar * Cleanup Signed-off-by: smajumdar * remove asr transformer dependency on nlp Signed-off-by: stevehuang52 * clean up Signed-off-by: stevehuang52 * copy token_classifier from nlp to asr Signed-off-by: stevehuang52 * Address comments Signed-off-by: smajumdar * Add typing to beam decoding Signed-off-by: smajumdar * Make prompt format configurable Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * drop asr dependency on nlp Signed-off-by: stevehuang52 --------- Signed-off-by: smajumdar Signed-off-by: stevehuang52 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: stevehuang52 * fix transcribe, update asr evaluator Signed-off-by: stevehuang52 * Extend the docs for the canary prompt_fn Signed-off-by: Piotr Żelasko * Incorporate changes from Nithin's code review Signed-off-by: Piotr Żelasko * training bug fix and adding launch script for speech_multitask (#8270) * bug fix and adding launch script for speech_multitask Signed-off-by: Krishna Puvvada * update launch script example in speech_to_text_aed.py Signed-off-by: Krishna Puvvada --------- Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada * Fix: drop_last must be true in validation/test otherwise the training will hang Signed-off-by: Piotr Żelasko * revert to current transcribe API Signed-off-by: stevehuang52 * revert changes to NLP, update docs Signed-off-by: stevehuang52 * update eval utils Signed-off-by: stevehuang52 * update docs Signed-off-by: stevehuang52 * Remove DALI; rename compute_audio_loss to compute_loss Signed-off-by: Piotr Żelasko * set default use_model_transcribe=False Signed-off-by: stevehuang52 * change os.path.dirname to pathlib Signed-off-by: stevehuang52 * [canary] 
Test for CanaryTokenizer + refactoring (#8285) * Test for CanaryTokenizer Signed-off-by: Piotr Żelasko * Attempt at refactor... Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * Update config for AED models (#8294) Signed-off-by: smajumdar * set default calculate_wer=False in transcribe_speech.py Signed-off-by: stevehuang52 * Attention encoder-decoder models for multiple speech-to-text tasks Signed-off-by: Piotr Żelasko * Apply suggestions from code review, part 1 Co-authored-by: Nithin Rao Signed-off-by: Piotr Żelasko * Apply suggestions from code review, part 2 Signed-off-by: Piotr Żelasko * Document compute_loss Signed-off-by: Piotr Żelasko * update transcribe_speech.py Signed-off-by: stevehuang52 * add docstring Signed-off-by: stevehuang52 * Attention encoder-decoder models for multiple speech-to-text tasks Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Signed-off-by: stevehuang52 Signed-off-by: smajumdar Signed-off-by: Krishna Puvvada Signed-off-by: Piotr Żelasko Co-authored-by: stevehuang52 Co-authored-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Co-authored-by: Krishna Puvvada Co-authored-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Co-authored-by: Nithin Rao (cherry picked from commit d10726da72f74eb5a95056843d1f9e2562a5051c) Co-authored-by: Piotr Żelasko * add code for calling mcore_retro in NeMo * add code for calling mcore_retro in NeMo * runnable, training curve match retro mcore and nemo * working on retro inference * working on megatron_retro_eval.py and megatron_retro_inference.yaml * refactoring text_generation_utils code and retro inference relevant files * clean PR * resolving quick hacks (reading number of train/valid samples from workdir, discrepancy in total samples and samples with neighbors retrieved, tokenizers) * clean repository * revert changes to inference/eval code to original in main * clean code * runable training code, with already implemented eval code * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * Add Bert HF checkpoint converter (#8088) * Add Bert HF checkpoint converter Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reformat Signed-off-by: yaoyu-33 * Add BERT ONNX export * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add NeMo BERT to HF BERT script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clean code Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update argument names Signed-off-by: yaoyu-33 * Update build_transformer_config in Bert Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen * revert to original eval code files * revert to original eval code files 2 * revert to original eval code files 3 * revert to original eval code files 4 * clean code * clean code * update my code to support changes from lastest main * commit before rebase r1.23.0 * Multimodal r1.23.0 bug fix (#8315) * Rename quick-gelu Signed-off-by: yaoyu-33 * ddpm config guard Signed-off-by: yaoyu-33 * Fix ddpm edit api Signed-off-by: yaoyu-33 * Fix insert_image_token cfg issue Signed-off-by: yaoyu-33 * neva updates Signed-off-by: yaoyu-33 * reformat Signed-off-by: yaoyu-33 * Add back jenkins Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix jenkins Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix bugs Signed-off-by: yaoyu-33 * Update default neva template Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: Eric Harper Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * copy paste files from r1.23.0 * clean PR * Fixes for MoE parameter passing & use of AutoTokenizer/Model for mistral. 
(#8272) Signed-off-by: Alexandros Koumparoulis * Keep max_seqlen and cu_seqlens_argmin for later micro-batches when PP>1 (#8334) Signed-off-by: Sangkug Lym Co-authored-by: Eric Harper * Remove asr webapp (#8347) Signed-off-by: smajumdar * remove _target_ at model level in aed config (#8351) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada * revert changes for tts and asr * Add change_vocabulary and save_tokenizers() support to Multitask ASR models (#8357) * Add change_vocabulary and save_tokenizers() support Signed-off-by: smajumdar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update nemo/collections/asr/models/aed_multitask_models.py Co-authored-by: Piotr Żelasko Signed-off-by: Somshubra Majumdar --------- Signed-off-by: smajumdar Signed-off-by: Somshubra Majumdar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko * Change default (#8371) Signed-off-by: smajumdar * implement retro's own fwd_bwd_step() and validation_step() to not have argument first_val_step, which the MLM commit doesn't support * adding megatron compile_helpers(), in future can be fixed with correct MLM commit * bug fix in fast-conformer-aed.yaml and adding jenkins test for speech_to_text_aed model (#8368) Signed-off-by: Krishna Puvvada Co-authored-by: Krishna Puvvada Co-authored-by: Somshubra Majumdar * Enable megatron core loggers for GPT pretraining (#8354) * Logging changes tested for gpt_pretraining Signed-off-by: Aishwarya Bhandare * Additional args Signed-off-by: Aishwarya Bhandare * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Aishwarya Bhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * mcore ds fix (#8283) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * mcore ds fix Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore Signed-off-by: dimapihtar * revert asr files Signed-off-by: dimapihtar * add comments Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * fix Bert unit tests Signed-off-by: dimapihtar * update bert tests Signed-off-by: dimapihtar * fix bert mcore test Signed-off-by: dimapihtar * fix gpt jenkins tests Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update apex & TE commits Signed-off-by: dimapihtar * revert apex installation Signed-off-by: dimapihtar * turn off the fusion for jenkins Signed-off-by: dimapihtar --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay * addressing Eric's reviews * adding existing implementation RETRO files * adding existing implementation RETRO files * Add Finetuning tutorial with HF Datasets (#8356) * Add Finetuning tutorial with HF Datasets Signed-off-by: Nithin Rao Koluguri * update on Som comments Signed-off-by: Nithin Rao Koluguri --------- Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * release updates (#8378) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * mcore ds fix Signed-off-by: Dmytro Pykhtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mcore Signed-off-by: dimapihtar * revert asr files Signed-off-by: dimapihtar * add comments Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for mcore mock dataset Signed-off-by: dimapihtar * update mcore version Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update gpt cfg Signed-off-by: dimapihtar * update mcore commit Signed-off-by: dimapihtar * fix Bert unit tests Signed-off-by: dimapihtar * update bert tests Signed-off-by: dimapihtar * fix bert mcore test Signed-off-by: dimapihtar * fix gpt jenkins tests Signed-off-by: dimapihtar * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add support for dict data input type Signed-off-by: dimapihtar * add mock ds test Signed-off-by: dimapihtar * add test for dict data input type Signed-off-by: dimapihtar * mcore ds fix Signed-off-by: dimapihtar * data input fix Signed-off-by: dimapihtar --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pablo Garay * MCore dataset compatibility for tokenizers (#8390) * Add unique_identifiers for all tokenizers and eod for SentencePieceTokenizer Signed-off-by: Valerie Sarge * Add generalized token aliases to TokenizerSpec to conform with MegatronTokenizer's interface. Remove now-redundant individual fixes from AutoTokenizer and SentencePieceTokenizer. Signed-off-by: Valerie Sarge --------- Signed-off-by: Valerie Sarge Co-authored-by: Pablo Garay * Mcore customization doc (#8298) * [tutorial] fixed missing RIR scripts file. 
(#8257) Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> * add values to en tts dict (#7879) Signed-off-by: Mariana Graterol Fuenmayor * Add Bert HF checkpoint converter (#8088) * Add Bert HF checkpoint converter Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Reformat Signed-off-by: yaoyu-33 * Add BERT ONNX export * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add NeMo BERT to HF BERT script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Clean code Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update argument names Signed-off-by: yaoyu-33 * Update build_transformer_config in Bert Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen * initial placeholder Signed-off-by: Huiying Li * add to intro/index.rst Signed-off-by: Huiying Li * initial content update Signed-off-by: Huiying Li * add diff images Signed-off-by: Huiying Li size Signed-off-by: Huiying Li * minor fixes * minor language change Signed-off-by: Chen Cui * clean changes --------- Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: yaoyu-33 Signed-off-by: Huiying Li Signed-off-by: Huiying Li Signed-off-by: Chen Cui Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bobby Chen Co-authored-by: Huiying Li Co-authored-by: Chen Cui * wer fix (#8404) Signed-off-by: Travis Bartley * updated link to pubmed (#8402) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * Update NFA video download link (#8406) * update nfa nasa video link Signed-off-by: Elena Rastorgueva * update link in markdown Signed-off-by: Elena Rastorgueva --------- Signed-off-by: Elena Rastorgueva * revert changes (#8410) Signed-off-by: Chen Cui * Fix dreambooth data sampler issue (#8400) * Turn on drop last Signed-off-by: yaoyu-33 * Some neva fixes Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Fixed errors in the CTM gen functions (#8416) Signed-off-by: Taejin Park * add ensemble decoding fix (#8427) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri * SDE bugfix log (#8430) Signed-off-by: George * mcore customization doc minor fix (#8421) Signed-off-by: Huiying Li * NeMo-Mistral to HF converter bugfix. 
(#8353) Signed-off-by: Alexandros Koumparoulis * Fixing mcore bert for TP, PP and SP (#8336) * Fixing mcore bert for TP, PP and SP * Fixing mcore bert for TP, PP and SP * Fixing mcore version * Fixing mcore version * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> * Update Jenkinsfile Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> --------- Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Eric Harper * Add settings to suppress bf16 compile errors in CI on V100 (#8481) * Add settings to suppress bf16 compile errors in CI on V100 Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Abhishree Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * MoE parameter passing (#8255) * MoE parameter passing Signed-off-by: Alexandros Koumparoulis * Pass EP/MoE params in consumer scripts. Signed-off-by: Alexandros Koumparoulis * PR fixes Signed-off-by: Alexandros Koumparoulis * Use latest commit of mcore-0.5 Signed-off-by: Alexandros Koumparoulis * CI fix Signed-off-by: Alexandros Koumparoulis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Alexandros Koumparoulis Co-authored-by: Alexandros Koumparoulis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update k2 version (#8478) (#8492) Signed-off-by: Vladimir Bataev * Add fp8 support for SD/Update notebook paths (#8489) * Add fp8 support for SD/Update notebook paths Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Harper * pin to 0.5.0 (#8465) Signed-off-by: eharper * Update NeMo Multimodal Requirements (#8515) * Update requirements_multimodal.txt Signed-off-by: yaoyu-33 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: yaoyu-33 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * update github raw content link (#8517) Signed-off-by: Chen Cui * Add dep notice for notebooks (#8522) * add dep notice Signed-off-by: eharper * revert Signed-off-by: eharper --------- Signed-off-by: eharper * Revert FP8 integration (#8520) * Revert FP8 integration Signed-off-by: Mingyuan Ma * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mingyuan Ma Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * Update data prep notebook (#8532) Signed-off-by: Mingyuan Ma * before update branch with latest r1.23.0 * update to run with MLM ae2817b3dde4efb1515061a5311d01d8f85bd99c (runnable training and saving checkpoint) * remove compile_helpers * reverse changes from main branch to r1.23.0 * adding *_legacy files * update MLM commit in Jenkinsfile to latest * debugging Jenkinstest: test different mcore import in retro_dataset * update Jenkinsfile edit 
megatron_retro_mutransfer_pretrain_legacy.py * removing all mcore RETRO to pass the Jenkinstest * fixing import legacy problem for tests/collections/nlp/test_indexed_retrieval_dataset.py * update Jenkinsfile file to use TE v0.7 * update NeMo to work with latest mcore RETRO (solving TE problems) * update TE commit Jenkinsfile to be the same with r1.23.0's Jenkinsfile * update commit for MLM * jenkinstest debugging * temporary fix RETRO's __init__ for jenkinstest * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * edit splits_string in jenkinsfile to correct format; put RETRO test in front to test faster * add model.data.dataloader_type=cyclic to jenkinsfile * update code to work with latest megatron-lm main 81dab6067 * update M-LM commit in Jenkinsfile to latest main M-LM 81dab6067 * fix to by pass CI test bf16 problem (following this PR https://github.com/NVIDIA/NeMo/pull/8481/files) * isort and black * adjusting model.micro_batch_size to 1 * fix BRANCH = 'r1.23.0' * replace tutorials dir from main branch to huvu/mcore_retro * fix minor merges conflict * update Jenkinsfile * runnable with a temporary fix from Jacek (unfound -unfinished problem) * runnable with a temporary fix from Jacek (unfound -unfinished problem) * modified nlp_overrides.py back to original * fix checkpoint from Jacek Bieniusiewicz * config Jenkinsfile test * set RETRO Jenkins MBS to 1 * black fix * isort fix * update TE commit * update to latest Jenkinsfile with latest container and commits * remove new RETRO jenkinstest * merge latest main * put RETRO Jenkinstest to the right place * update code for megatron_retro_pretraining_legacy.py * untrack ipa_cmudict-0.7b_nv23.01.txt * untrack ipa_cmudict-0.7b_nv23.01.txt * set config in megatron_retro_pretraining_legacy.py to megatron_retro_config_legacy * update new RETRO jenkinstest to run faster * merging latest main, and edit Jenkinstest * update Jenkinstest for new RETRO to run faster * fix isort * fix whitespace Signed-off-by: eharper --------- Signed-off-by: eharper Signed-off-by: Mikołaj Błaż Signed-off-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Signed-off-by: dimapihtar Signed-off-by: Piotr Żelasko Signed-off-by: Elena Rastorgueva Signed-off-by: Nithin Rao Koluguri Signed-off-by: Jimmy Zhang Signed-off-by: Chen Cui Signed-off-by: Mariana Graterol Fuenmayor Signed-off-by: yaoyu-33 Signed-off-by: Alexandros Koumparoulis Signed-off-by: Sangkug Lym Signed-off-by: smajumdar Signed-off-by: Krishna Puvvada Signed-off-by: Somshubra Majumdar Signed-off-by: Aishwarya Bhandare Signed-off-by: Dmytro Pykhtar Signed-off-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Signed-off-by: Valerie Sarge Signed-off-by: Huiying Li Signed-off-by: Huiying Li Signed-off-by: Travis Bartley Signed-off-by: Taejin Park Signed-off-by: George Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Signed-off-by: Abhishree Signed-off-by: Vladimir Bataev Signed-off-by: Mingyuan Ma Co-authored-by: eharper Co-authored-by: mikolajblaz Co-authored-by: Eric Harper Co-authored-by: Xuesong Yang <1646669+XuesongYang@users.noreply.github.com> Co-authored-by: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Co-authored-by: dimapihtar Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Piotr Żelasko Co-authored-by: Elena Rastorgueva <80532067+erastorgueva-nv@users.noreply.github.com> Co-authored-by: Nithin Rao Co-authored-by: Somshubra Majumdar Co-authored-by: JimmyZhang12 <67203904+JimmyZhang12@users.noreply.github.com> Co-authored-by: Jimmy Zhang Co-authored-by: Chen Cui Co-authored-by: Huy Vu2 Co-authored-by: Mariana <47233618+mgrafu@users.noreply.github.com> Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: Bobby Chen Co-authored-by: akoumpa <153118171+akoumpa@users.noreply.github.com> Co-authored-by: Sangkug Lym Co-authored-by: Krishna Puvvada <93558329+krishnacpuvvada@users.noreply.github.com> Co-authored-by: Krishna Puvvada Co-authored-by: ashbhandare Co-authored-by: Aishwarya Bhandare Co-authored-by: Dmytro Pykhtar Co-authored-by: Pablo Garay Co-authored-by: Valerie Sarge Co-authored-by: Huiying Co-authored-by: Huiying Li Co-authored-by: tbartley94 <90423858+tbartley94@users.noreply.github.com> Co-authored-by: Taejin Park Co-authored-by: George <37293288+Jorjeous@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Co-authored-by: Shanmugam Ramasamy Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Co-authored-by: Alexandros Koumparoulis Co-authored-by: Vladimir Bataev Co-authored-by: Ming <111467530+Victor49152@users.noreply.github.com> Co-authored-by: Huy Vu2 --- Jenkinsfile | 75 +- .../conf/megatron_bert_config.yaml | 2 +- .../conf/megatron_retro_config.yaml | 248 +++++-- .../conf/megatron_retro_config_legacy.yaml | 127 ++++ .../megatron_retro_pretraining.py | 76 +- .../megatron_retro_pretraining_legacy.py | 102 +++ .../megatron/retro_dataset.py | 557 ++++----------- .../megatron/retro_dataset_legacy.py | 469 +++++++++++++ .../nlp/models/language_modeling/__init__.py | 1 + .../language_modeling/megatron_gpt_model.py | 15 +- .../megatron_retrieval_model.py | 2 +- .../language_modeling/megatron_retro_model.py | 651 ++++++++++++++++++ .../nlp/modules/common/tokenizer_utils.py | 9 + nemo/utils/callbacks/nemo_model_checkpoint.py | 15 +- .../nlp/test_indexed_retrieval_dataset.py | 2 +- 15 files changed, 1795 insertions(+), 556 deletions(-) mode change 100644 => 100755 examples/nlp/language_modeling/conf/megatron_retro_config.yaml create mode 100644 examples/nlp/language_modeling/conf/megatron_retro_config_legacy.yaml create mode 100644 examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py create mode 100644 nemo/collections/nlp/data/language_modeling/megatron/retro_dataset_legacy.py create mode 100644 nemo/collections/nlp/models/language_modeling/megatron_retro_model.py diff --git a/Jenkinsfile b/Jenkinsfile index 55e836eea13a..83e6daa8ccb7 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -125,6 +125,7 @@ pipeline { sh 'python tests/core_ptl/check_imports.py --domain "nlp"' } } + stage('L0: Unit Tests GPU') { steps { sh 'NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads' @@ -3517,6 +3518,64 @@ pipeline { failFast true steps { sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.num_nodes=1 \ + trainer.devices=2 \ + trainer.precision=bf16 \ + trainer.accelerator=gpu \ + model.data.data_prefix=['none'] \ + exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + 
model.optim.name=distributed_fused_adam \ + model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ + model.data.num_workers=4 \ + model.micro_batch_size=1 \ + model.data.shuffle_documents=False \ + trainer.val_check_interval=30 \ + +trainer.num_sanity_val_steps=0 \ + model.init_method_std=0.023 \ + model.optim.lr=6.0e-4 \ + model.megatron_amp_O2=True \ + model.data.splits_string=\'\"98,2,0\"\' \ + model.data.dataloader_type=cyclic \ + trainer.max_steps=10" + sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + trainer.num_nodes=1 \ + trainer.devices=2 \ + trainer.precision=bf16 \ + trainer.accelerator=gpu \ + model.data.data_prefix=['none'] \ + exp_manager.exp_dir=examples/nlp/language_modeling/mcore_retro_results \ + model.mcore_gpt=True \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.optim.name=distributed_fused_adam \ + model.retro.retro_project_dir=/home/TestData/nlp/megatron_retro/mcore_retro/micro-wiki-core \ + model.data.num_workers=4 \ + model.micro_batch_size=1 \ + model.data.shuffle_documents=False \ + trainer.val_check_interval=30 \ + +trainer.num_sanity_val_steps=0 \ + model.init_method_std=0.023 \ + model.optim.lr=6.0e-4 \ + model.megatron_amp_O2=True \ + model.data.splits_string=\'\"98,2,0\"\' \ + model.data.dataloader_type=cyclic \ + trainer.max_steps=20" + sh "rm -rf examples/nlp/language_modeling/mcore_retro_results" + } + } + stage('L2: (Legacy) Megatron RETRO Pretraining and Resume Training') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + steps { + sh "python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ trainer.devices=2 \ trainer.num_nodes=1 \ trainer.accelerator=gpu \ @@ -3527,7 +3586,7 @@ pipeline { trainer.precision=16 \ trainer.gradient_clip_val=1.0 \ trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_results \ + exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ model.data.data_prefix='' \ model.data.knn_index='' \ model.data.retrieval_prefix='' \ @@ -3546,7 +3605,7 @@ pipeline { model.enc_cross_attention=[1] \ model.dec_cross_attention=[1] \ +model.data.mock=True" - sh "python examples/nlp/language_modeling/megatron_retro_pretraining.py \ + sh "python examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py \ trainer.devices=2 \ trainer.num_nodes=1 \ trainer.accelerator=gpu \ @@ -3557,7 +3616,7 @@ pipeline { trainer.precision=16 \ trainer.gradient_clip_val=1.0 \ trainer.val_check_interval=10 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_results \ + exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results \ model.data.data_prefix='' \ model.data.knn_index='' \ model.data.retrieval_prefix='' \ @@ -3576,10 +3635,10 @@ pipeline { model.enc_cross_attention=[1] \ model.dec_cross_attention=[1] \ +model.data.mock=True" - sh "rm -rf examples/nlp/language_modeling/retro_results" + sh "rm -rf examples/nlp/language_modeling/retro_legacy_results" } } - stage('L2: Megatron RETRO muTransfer Pretraining Performance') { + stage('L2: (Legacy) Megatron RETRO muTransfer Pretraining Performance') { when { anyOf { branch 'main' @@ -3600,7 +3659,7 @@ pipeline { trainer.limit_val_batches=0 \ trainer.gradient_clip_val=1.0 \ +trainer.num_sanity_val_steps=0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/retro_results/ \ + exp_manager.exp_dir=examples/nlp/language_modeling/retro_legacy_results/ \ 
+exp_manager.version=smalltest \ model.data.neighbors=2 \ model.megatron_amp_O2=False \ @@ -3651,7 +3710,7 @@ import torch if not (torch.cuda.is_available() and 'A100' in torch.cuda.get_device_name()): import sys sys.exit(0) -event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] +event_file = list(pathlib.Path('examples/nlp/language_modeling/retro_legacy_results/megatron_retro/smalltest').glob('events.out.tfevents*'))[0] ea = EventAccumulator(str(event_file)).Reload() vals = [] for i in ea.Scalars('reduced_train_loss'): @@ -3659,7 +3718,7 @@ for i in ea.Scalars('reduced_train_loss'): training_curve = pd.DataFrame({'loss': vals}) gt_curve = pd.read_csv('/home/TestData/nlp/megatron_retro/expected_learning_curve.csv') assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"''' - sh "rm -rf examples/nlp/language_modeling/retro_results" + sh "rm -rf examples/nlp/language_modeling/retro_legacy_results" } } stage('L2: BioMegatron Bert NER Task') { diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index 58e874386c44..bc66ae717ebb 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -158,4 +158,4 @@ model: name: CosineAnnealing warmup_steps: 500 constant_steps: 50000 - min_lr: 2e-5 + min_lr: 2e-5 \ No newline at end of file diff --git a/examples/nlp/language_modeling/conf/megatron_retro_config.yaml b/examples/nlp/language_modeling/conf/megatron_retro_config.yaml old mode 100644 new mode 100755 index dafdcf542f11..159bb163ad0a --- a/examples/nlp/language_modeling/conf/megatron_retro_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_retro_config.yaml @@ -1,127 +1,257 @@ defaults: - - .@model: megatron_model_base_config + - _self_ + - optional tp_overlap@model.ub_tp_comm_overlap_cfg: -name: test_retro +name: megatron_retro restore_from_path: null # used when starting from a .nemo file trainer: - devices: 2 + devices: 1 num_nodes: 1 accelerator: gpu precision: 16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False - max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. 
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 val_check_interval: 100 - limit_val_batches: null - limit_test_batches: null - accumulate_grad_batches: 1 + limit_val_batches: 50 + limit_test_batches: 500 + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models gradient_clip_val: 1.0 + benchmark: False + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually exp_manager: explicit_log_dir: null exp_dir: null - name: megatron_retro + name: ${name} create_wandb_logger: False wandb_logger_kwargs: project: null name: null resume_if_exists: True resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} create_checkpoint_callback: True checkpoint_callback_params: monitor: val_loss save_top_k: 10 mode: min always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits filename: 'megatron_retro--{val_loss:.2f}-{step}-{consumed_samples}' model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} - model: - version: 1 # indicate the retro model version + # use RETROModel from megatron.core, since RETRO model inherited from gpt, mcore_gpt is used + mcore_gpt: True - # model parallelism - micro_batch_size: 4 - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 # has to be one. not supporting pipeline parallel yet + # specify micro_batch_size, global_batch_size, and model parallelism + # gradient accumulation will be done automatically based on data_parallel_size + micro_batch_size: 16 # limited by GPU memory + global_batch_size: 256 # will be overrided by value from RETRO preprocessed workdir + rampup_batch_size: null # Should be a list of 3 values: [, , ] + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + virtual_pipeline_model_parallel_size: null # interleaved pipeline # model architecture - encoder_seq_length: 2048 - max_position_embeddings: ${.encoder_seq_length} - - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - dump_debug_info: False # dump out the debug information - dump_debug_info_to_file: False # dump out the debug information to files - - # retro architecture - chunk_size: 64 # the chunk size used to retrive - enc_num_layers: 4 # total number of encoder layers - dec_num_layers: 6 # total number of decoder layers - enc_cross_attention: [3] # layer numbers for cross attention in encoder - dec_cross_attention: [3, 5] # layer numbers for chunked cross attention in decoder - add_position_embedding: False # whether use the absolute position encoding - + encoder_seq_length: 512 # will be overrided by value from RETRO preprocessed workdir + max_position_embeddings: ${.encoder_seq_length} # will be overrided by value from RETRO preprocessed workdir + num_layers: 12 + hidden_size: 768 + ffn_hidden_size: 3072 # Transformer FFN hidden size. Usually 4 * hidden_size. 
+ num_attention_heads: 12 + init_method_std: 0.023 # Standard deviation of the zero mean normal distribution used for weight initialization.') + use_scaled_init_method: True # use scaled residuals initialization + hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention + ffn_dropout: 0.1 # Dropout probability in the feed-forward layer. + kv_channels: 64 # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: False # scale Q * K^T by 1 / layer-number. + normalization: 'layernorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' + layernorm_epsilon: 1e-5 + do_layer_norm_weight_decay: False # True means weight decay on all params make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. pre_process: True # add embedding post_process: True # add pooler - bert_binary_head: True # BERT binary head + persist_layer_norm: True # Use of persistent fused layer norm kernel. + bias: True # Whether to use bias terms in all weight matrices. + activation: 'gelu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] + headscale: False # Whether to learn extra parameters that scale the output of the each self-attention head. + transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] + openai_gelu: False # Use OpenAI's GELU instead of the default GeLU + normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to se this to True. + position_embedding_type: 'learned_absolute' # Position embedding type. Options ['learned_absolute', 'rope', 'alibi', 'kerple' , 'xpos', 'sandwich'] xpos and sandwich are experimental. + rotary_percentage: 0.5 # If using position_embedding_type=rope, then the per head dim is multiplied by this. + attention_type: 'multihead' # Attention type. Options ['multihead'] + share_embeddings_and_output_weights: True # Share embedding and output layer weights. + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. - megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. 
- grad_allreduce_chunk_size_mb: 125 - - megatron_lm_compatible: False # a flag to indicate whether the model is compatible with Megatron LM + retro: # specific arguments for RETRO model + retro_project_dir: null + retro_encoder_num_layers: 2 + retro_encoder_hidden_dropout: 0.1 + retro_encoder_attention_dropout: 0.1 + retro_num_neighbors: 2 + retro_num_retrieved_chunks: 2 + retro_verify_neighbor_count: True tokenizer: library: 'megatron' - type: 'GPT2BPETokenizer' - model: null - vocab_file: null - merge_file: null + type: null # will be overrided by value from RETRO preprocessed workdir + model: null # will be overrided by value from RETRO preprocessed workdir + vocab_file: null # will be overrided by value from RETRO preprocessed workdir + merge_file: null # will be overrided by value from RETRO preprocessed workdir delimiter: null # only used for tabular tokenizer + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. - # precision + # Mixed precision native_amp_init_scale: 4294967296 # 2 ** 32 native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - # miscellaneous - seed: 1234 + # Megatron O2-style half-precision + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + grad_allreduce_chunk_size_mb: 125 + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce. Only used with O2 and no pipeline parallelism.. + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism and O2. + bias_activation_fusion: False # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + get_attention_mask_from_fusion: False # When using fused softmax it will create the attention mask so we won't copy it to the pipeline stages. + # Miscellaneous + seed: 1234 # will be overrided by value from RETRO preprocessed workdir + resume_from_checkpoint: null # manually set the checkpoint file to load from + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + sync_batch_comm: False # Enable stream synchronization after each p2p communication between pipeline stages + + ## Activation Checkpointing + # NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed. + # These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+). + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + # 'full' will checkpoint the entire transformer layer. 
+ activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model. + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null + # when using 'uniform' this creates groups of transformer layers to checkpoint. Usually set to 1. Increase to save more memory. + # when using 'block' this this will checkpoint the first activations_checkpoint_num_layers per pipeline stage. + num_micro_batches_with_partial_activation_checkpoints: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value is provided, it sets the number of micro-batches where only a partial number of Transformer layers get checkpointed + # and recomputed within a window of micro-batches. The rest of micro-batches in the window checkpoint all Transformer layers. The size of window is + # set by the maximum outstanding micro-batch backpropagations, which varies at different pipeline stages. The number of partial layers to checkpoint + # per micro-batch is set by 'activations_checkpoint_num_layers' with 'activations_checkpoint_method' of 'block'. + # This feature enables using activation checkpoint at a fraction of micro-batches up to the point of full GPU memory usage. + activations_checkpoint_layers_per_pipeline: null + # This feature is valid only when used with pipeline-model-parallelism. + # When an integer value (rounded down when float is given) is provided, it sets the number of Transformer layers to skip checkpointing at later + # pipeline stages. For example, 'activations_checkpoint_layers_per_pipeline' of 3 makes pipeline stage 1 to checkpoint 3 layers less than + # stage 0 and stage 2 to checkpoint 6 layers less stage 0, and so on. This is possible because later pipeline stage + # uses less GPU memory with fewer outstanding micro-batch backpropagations. Used with 'num_micro_batches_with_partial_activation_checkpoints', + # this feature removes most of activation checkpoints at the last pipeline stage, which is the critical execution path. + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Transformer Engine + transformer_engine: True + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + reduce_amax: True # Perform reduction to sync amax tensors across GPUs after every iteration + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. + ub_tp_comm_overlap: False + # Use userbuffer backend to overlap tensor-parallel communications with computes. 
+ # This feature is only available with Transformer Engine and squence parallelism enabled and, currently, supports only GPT models. + ub_tp_comm_overlap_cfg: null + # A yaml file with userbuffer communicator configurations. This file should provide `method`, `dtype`, `num_sm`, `num_splits`, + # `cga_size`, `num_splits`, `set_sm_margin`, and `aggregate` for the communicators to use custom settings. + # If the configuration file is not provided a default setting is used for all communicators. + + ## Flash Attention + use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True or mcore_gpt=True + data: - # Path to data must be specified by the user. - # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Path to data must be specified by the user. + # Supports List, String and Dictionary + # List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", # Or see example below: # data_prefix: # - .5 # - /raid/data/pile/my-gpt3_00_text_document # - .5 # - /raid/data/pile/my-gpt3_01_text_document - data_prefix: ??? # list of training datasets - knn_index: ??? # list of KNN map index files - retrieval_prefix: ??? # a singe path to retrieval data + # Dictionary: can override from CLI "model.data.data_prefix"={"train":[1.0, /path/to/data], "validation":/path/to/data, "test":/path/to/test} + # Or see example below: + # "model.data.data_prefix: {train:[1.0,/path/to/data], validation:[/path/to/data], test:[/path/to/test]}" + data_prefix: ??? # will be overrided by value from RETRO preprocessed workdir index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_impl: retmmap # for retro model, this is the only allowed type - splits_string: 900,50,50 - seq_length: ${model.encoder_seq_length} # must be multiple of the chunk_size in your dataset + data_impl: mmap + splits_string: 98,2,0 + seq_length: ${model.encoder_seq_length} # will be overrided by value from RETRO preprocessed workdir skip_warmup: True - num_workers: 0 + num_workers: 2 dataloader_type: single # cyclic - neighbors: 2 # number of retrieved neighbors + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + validation_drop_last: True # Set to false if the last partial validation samples is to be consumed + no_seqlen_plus_one_input_tokens: False # Set to True to disable fetching (sequence length + 1) input tokens, instead get (sequence length) input tokens and mask the last token + pad_samples_to_global_batch_size: False # Set to True if you want to pad the last partial batch with -1's to equal global batch size + shuffle_documents: True # Set to False to disable documents shuffling. 
Sample index will still be shuffled + exchange_indices_distributed: False # Set to True to exchange indices via torch.distributed instead of filesystem + retro_data: + retro_block_size: 10000 + retro_chunk_length: 64 + retro_split_preprocessing: 98,2,0 + retro_neighbor_dirs: null + + + # Nsys profiling options + nsys_profile: + enabled: False + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes optim: - name: fused_adam - lr: 1e-4 - weight_decay: 0.01 + name: distributed_fused_adam + lr: 6.0e-4 + weight_decay: 0.1 betas: - 0.9 - - 0.98 + - 0.95 sched: name: CosineAnnealing - warmup_steps: 500 - constant_steps: 50000 - min_lr: 1e-5 + min_lr: 6.0e-5 + warmup_steps: null + max_steps: 750000 + + gc_interval: 0 + # Interval of the host memory garbage collection. When it is zero, collectiion relies on the automatic garbage collector. + # If an interger value larger than zero is set, collection is done manually by the batch step interval of `gc_interval`. diff --git a/examples/nlp/language_modeling/conf/megatron_retro_config_legacy.yaml b/examples/nlp/language_modeling/conf/megatron_retro_config_legacy.yaml new file mode 100644 index 000000000000..dafdcf542f11 --- /dev/null +++ b/examples/nlp/language_modeling/conf/megatron_retro_config_legacy.yaml @@ -0,0 +1,127 @@ +defaults: + - .@model: megatron_model_base_config + +name: test_retro +restore_from_path: null # used when starting from a .nemo file + +trainer: + devices: 2 + num_nodes: 1 + accelerator: gpu + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. + max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 + val_check_interval: 100 + limit_val_batches: null + limit_test_batches: null + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: megatron_retro + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + filename: 'megatron_retro--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} + + +model: + version: 1 # indicate the retro model version + + # model parallelism + micro_batch_size: 4 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 # has to be one. 
not supporting pipeline parallel yet + + # model architecture + encoder_seq_length: 2048 + max_position_embeddings: ${.encoder_seq_length} + + gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + dump_debug_info: False # dump out the debug information + dump_debug_info_to_file: False # dump out the debug information to files + + # retro architecture + chunk_size: 64 # the chunk size used to retrive + enc_num_layers: 4 # total number of encoder layers + dec_num_layers: 6 # total number of decoder layers + enc_cross_attention: [3] # layer numbers for cross attention in encoder + dec_cross_attention: [3, 5] # layer numbers for chunked cross attention in decoder + add_position_embedding: False # whether use the absolute position encoding + + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + bert_binary_head: True # BERT binary head + + megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting. + grad_allreduce_chunk_size_mb: 125 + + megatron_lm_compatible: False # a flag to indicate whether the model is compatible with Megatron LM + + tokenizer: + library: 'megatron' + type: 'GPT2BPETokenizer' + model: null + vocab_file: null + merge_file: null + delimiter: null # only used for tabular tokenizer + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # miscellaneous + seed: 1234 + + data: + # Path to data must be specified by the user. + # can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]", + # Or see example below: + # data_prefix: + # - .5 + # - /raid/data/pile/my-gpt3_00_text_document + # - .5 + # - /raid/data/pile/my-gpt3_01_text_document + data_prefix: ??? # list of training datasets + knn_index: ??? # list of KNN map index files + retrieval_prefix: ??? # a singe path to retrieval data + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_impl: retmmap # for retro model, this is the only allowed type + splits_string: 900,50,50 + seq_length: ${model.encoder_seq_length} # must be multiple of the chunk_size in your dataset + skip_warmup: True + num_workers: 0 + dataloader_type: single # cyclic + neighbors: 2 # number of retrieved neighbors + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 50000 + min_lr: 1e-5 diff --git a/examples/nlp/language_modeling/megatron_retro_pretraining.py b/examples/nlp/language_modeling/megatron_retro_pretraining.py index c84656d4b657..2a0c04f695f6 100644 --- a/examples/nlp/language_modeling/megatron_retro_pretraining.py +++ b/examples/nlp/language_modeling/megatron_retro_pretraining.py @@ -12,88 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
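
The legacy config above ties several sizes together: `seq_length` must be a multiple of `chunk_size`, and each chunk retrieves `neighbors` neighbor chunks of length `2 * chunk_size` (the neighbor plus its continuation), which is exactly the shape the legacy RETRODataset later in this patch produces. A purely illustrative helper showing that arithmetic with the defaults from this config:

    def retro_batch_shapes(seq_length: int = 2048, chunk_size: int = 64, neighbors: int = 2):
        # The legacy dataset asserts this: every sample splits into fixed-size chunks.
        assert seq_length % chunk_size == 0
        num_chunks = seq_length // chunk_size
        return {
            "tokens": (seq_length,),  # input ids; labels are the same sequence shifted by one
            "retrieved_ids": (num_chunks, neighbors, 2 * chunk_size),  # neighbor chunk + continuation
        }

    print(retro_batch_shapes())
    # {'tokens': (2048,), 'retrieved_ids': (32, 2, 128)}

The neighbor ids themselves come from the KNN index files listed under `knn_index`; the helper above only makes the resulting tensor shapes explicit.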
-import os +# To suppress BF16 compile related issue in the CI runs with turing/V100 +import torch._dynamo +import torch.multiprocessing as mp from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.plugins.precision import MixedPrecisionPlugin -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector -from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel -from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo -from nemo.collections.nlp.parts.nlp_overrides import ( - CustomProgressBar, - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - NLPSaveRestoreConnector, -) +from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager +torch._dynamo.config.suppress_errors = True + @hydra_runner(config_path="conf", config_name="megatron_retro_config") def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True if megatron_amp_O2 else False, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_O2: - plugins.append(MegatronHalfPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(MixedPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) - # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both - # precision plugins and precision to exist - cfg.trainer.precision = None - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - callbacks = [] - # enable_progress_bar is True by default. 
If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks - if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar: - callbacks.append(CustomProgressBar()) - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=callbacks) - + trainer = MegatronTrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) - # resume_from_checkpoint = uninject_model_parallel_rank(resume_from_checkpoint) - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - - # load existing nemo retro model - if cfg.get("restore_from_path", None) is not None: - save_restore_connector = NLPSaveRestoreConnector() - if os.path.isdir(cfg.restore_from_path): - save_restore_connector.model_extracted_dir = cfg.restore_from_path - model = MegatronRetrievalModel.restore_from( - restore_path=cfg.restore_from_path, - trainer=trainer, - override_config_path=cfg.model, - save_restore_connector=save_restore_connector, - strict=False, - ) - else: - model = MegatronRetrievalModel(cfg.model, trainer) + model = MegatronRetroModel(cfg.model, trainer) trainer.fit(model) diff --git a/examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py b/examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py new file mode 100644 index 000000000000..4653222b3438 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_retro_pretraining_legacy.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
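
The precision handling that MegatronTrainerBuilder now owns, and that this legacy script keeps inline (shown below), reduces to the following selection. This is a condensed sketch of that decision, not the builder's actual code; note that plain torch GradScaler is used here, while NeMo's own GradScaler subclass additionally accepts a `hysteresis` argument.

    from torch.cuda.amp import GradScaler

    def pick_precision_plugin(trainer_precision, model_cfg):
        """Reduced sketch of the fp16/bf16 plugin selection done around the trainer."""
        scaler = None
        if trainer_precision in (16, '16', '16-mixed'):
            # fp16 needs dynamic loss scaling; the scale bounds come from the model config.
            scaler = GradScaler(
                init_scale=model_cfg.get('native_amp_init_scale', 2 ** 32),
                growth_interval=model_cfg.get('native_amp_growth_interval', 1000),
            )
            plugin_precision = '16-mixed'
        else:
            # bf16 has enough dynamic range that no loss scaler is attached.
            plugin_precision = 'bf16-mixed'
        return plugin_precision, scaler

With megatron_amp_O2 enabled the resulting pair feeds MegatronHalfPrecisionPlugin, otherwise PTL's MixedPrecisionPlugin, and `cfg.trainer.precision` is then cleared because PTL >= 2.1 does not allow both a precision plugin and a precision value.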
+ +import os + +from omegaconf.omegaconf import OmegaConf, open_dict +from pytorch_lightning import Trainer +from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from pytorch_lightning.plugins.precision import MixedPrecisionPlugin +from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector + +from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import initialize_model_parallel_for_nemo +from nemo.collections.nlp.parts.nlp_overrides import ( + CustomProgressBar, + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + NLPSaveRestoreConnector, +) +from nemo.core.config import hydra_runner +from nemo.utils import logging +from nemo.utils.exp_manager import exp_manager + + +@hydra_runner(config_path="conf", config_name="megatron_retro_config_legacy") +def main(cfg) -> None: + logging.info("\n\n************** Experiment configuration ***********") + logging.info(f'\n{OmegaConf.to_yaml(cfg)}') + + megatron_amp_O2 = cfg.model.get('megatron_amp_O2', False) + plugins = [] + strategy = NLPDDPStrategy( + no_ddp_communication_hook=True if megatron_amp_O2 else False, + gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) + + if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: + scaler = None + if cfg.trainer.precision in [16, '16', '16-mixed']: + scaler = GradScaler( + init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), + growth_interval=cfg.model.get('native_amp_growth_interval', 1000), + hysteresis=cfg.model.get('hysteresis', 2), + ) + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' + if megatron_amp_O2: + plugins.append(MegatronHalfPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) + else: + plugins.append(MixedPrecisionPlugin(plugin_precision, device='cuda', scaler=scaler)) + # Set precision None after precision plugins are created as PTL >= 2.1 does not allow both + # precision plugins and precision to exist + cfg.trainer.precision = None + + if cfg.get('cluster_type', None) == 'BCP': + plugins.append(TorchElasticEnvironment()) + + callbacks = [] + # enable_progress_bar is True by default. 
If cfg.trainer.enable_progress_bar=False, CustomProgressBar is not appended to callbacks + if 'enable_progress_bar' not in cfg.trainer or cfg.trainer.enable_progress_bar: + callbacks.append(CustomProgressBar()) + trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=callbacks) + + exp_manager(trainer, cfg.exp_manager) + + # resume_from_checkpoint = uninject_model_parallel_rank(resume_from_checkpoint) + logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') + + # load existing nemo retro model + if cfg.get("restore_from_path", None) is not None: + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.restore_from_path): + save_restore_connector.model_extracted_dir = cfg.restore_from_path + model = MegatronRetrievalModel.restore_from( + restore_path=cfg.restore_from_path, + trainer=trainer, + override_config_path=cfg.model, + save_restore_connector=save_restore_connector, + strict=False, + ) + else: + model = MegatronRetrievalModel(cfg.model, trainer) + + trainer.fit(model) + + +if __name__ == '__main__': + main() diff --git a/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset.py index f0a501d7cc13..377bff309b7c 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset.py @@ -12,32 +12,37 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""RETRO Style dataset.""" +"""RETRO style dataset.""" import os -from typing import List +import time import numpy as np import torch +from omegaconf.dictconfig import DictConfig from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( get_datasets_weights_and_num_samples, get_train_valid_test_split_, ) from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset -from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import ( - _build_index_mappings, - get_indexed_dataset_, -) -from nemo.collections.nlp.data.language_modeling.megatron.indexed_retrieval_dataset import ( - KNNIndex, - MMapRetrievalIndexedDataset, -) +from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import deallocate_indexed_dataset_memory +from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import make_dataset as make_indexed_dataset from nemo.core import Dataset from nemo.utils import logging try: - from megatron.core import parallel_state + from megatron.core import mpu, tensor_parallel + from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder + from megatron.core.datasets.retro.config import RetroGPTChunkDatasets + from megatron.core.datasets.retro.query.multi_split_gpt_dataset import ( + MultiSplitGPTDataset, + MultiSplitGPTDatasetConfig, + ) + from megatron.core.datasets.retro.query.retro_dataset import get_retro_datasets + from megatron.core.models.retro import RetroConfig + + from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids HAVE_MEGATRON_CORE = True @@ -45,425 +50,163 @@ HAVE_MEGATRON_CORE = False -__all__ = [ - "RETRODataset", - "build_train_valid_test_datasets", - "MockRETRODataset", - "build_mock_train_valid_test_datasets", -] - class RETRODataset(Dataset): - """ - Dataset for RETRO model. 
- - It constructs single data record from the training/retrieval indexed retrieval dataset and knn index file. - The KNN index file maps data chunk id to K-nearest neighbors in the the retrieval dataset chunk ids. - First, it loads a long sequence (2048) from training dataset. Then for each chunk in the sequence, it finds the kNN - chunks from the retrieval dataset using the KNN index. Lastly, compute the masks based on pad id. - """ - - def __init__( - self, - cfg, - trainer, - tokenizer, - name: str, - data_prefix: str, - documents, # document ids in the indexed_dataset used for this dataset - indexed_dataset: MMapRetrievalIndexedDataset, - num_samples: int, # number of data samples, max_steps * global_batch_size - seq_length: int, # input seq length - seed: int, - knn_index: KNNIndex, - retrieval_index: MMapRetrievalIndexedDataset, - ): - if not HAVE_MEGATRON_CORE: - raise ImportError( - "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." - ) - + def __init__(self, cfg, retro_config: RetroConfig, tokenizer, mcore_retro_dataset, number_samples_with_neighbors): super().__init__() - self.name = name - self.indexed_dataset: MMapRetrievalIndexedDataset = indexed_dataset - self.knn_index: KNNIndex = knn_index - self.retrieval_index: MMapRetrievalIndexedDataset = retrieval_index - self.chunk_size = self.indexed_dataset.chunk_size - - # make sure seq_length is a multiple of chunk_size - assert seq_length % self.chunk_size == 0 - # Checks - assert np.min(documents) >= 0 - assert np.max(documents) < indexed_dataset.sizes.shape[0] + self.reset_position_ids = cfg.data.get('reset_position_ids', False) + self.reset_attention_mask = cfg.data.get('reset_attention_mask', False) + self.eod_mask_loss = cfg.data.get('eod_mask_loss', False) self.eos_id = tokenizer.eos_id - self.pad_id = tokenizer.pad_id - - assert self.retrieval_index._index.retrieval_db - self._validate_pad_id() - - # save index mappings to a configurable dir - self.index_mapping_dir = cfg.data.get('index_mapping_dir', None) - self.neighbors = cfg.data.get('neighbors', self.knn_index.K) - # the number of neighbors cannot exceed the max number of neighbors in the index - assert self.neighbors <= self.knn_index.K - # create index_mapping_dir on rank 0 - if torch.distributed.is_available() and torch.distributed.is_initialized(): - if torch.distributed.get_rank() == 0: - if self.index_mapping_dir is not None and not os.path.isdir(self.index_mapping_dir): - os.makedirs(self.index_mapping_dir) - torch.distributed.barrier() - - # Build index mappings. 
- self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( - self.name, - data_prefix, - documents, - self.indexed_dataset.sizes, - num_samples, - seq_length, - seed, - index_mapping_dir=self.index_mapping_dir, - ) - if len(self.doc_idx) > np.iinfo('int32').max: - raise "number of epochs exceeds the maximum number for int32 used by sample_idx" - self.padding_context = np.ones(2 * self.chunk_size, dtype=self.retrieval_index._index.dtype) * self.pad_id - - def _validate_pad_id(self): - # validate the pad_id matches the dataset pad_id - ptr, size = self.retrieval_index._index[0] - ptr += size * np.dtype(self.retrieval_index._index.dtype).itemsize - # padded chunk_size of pad_ids at the end of the doc - retrieval_paddings = np.frombuffer( - self.retrieval_index._bin_buffer, - dtype=self.retrieval_index._index.dtype, - count=self.chunk_size, - offset=ptr, - ) - assert (retrieval_paddings == self.pad_id).all() + self.retro_config = retro_config + self.mcore_retro_dataset = mcore_retro_dataset + self.number_samples_with_neighbors = number_samples_with_neighbors # quick fix for problems of mismatch in processed/indexed retro data, # of GPT samples is different from # of samples with neighbors retrieved + self.tokenizer = tokenizer - ptr, size = self.indexed_dataset._index[0] - ptr += (size - 1) * np.dtype(self.indexed_dataset._index.dtype).itemsize - data_paddings = np.frombuffer( - self.indexed_dataset._bin_buffer, dtype=self.indexed_dataset._index.dtype, count=1, offset=ptr - ) - # the last element is either a padding or an eos - assert (data_paddings == self.pad_id).all() or (data_paddings == self.eos_id).all() + return def __len__(self): - # -1 is due to data structure used to retieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - return self.sample_idx.shape[0] - 1 - - def _get_chunks(self, chunk_id: int, num_chunks: int, chunks: List): - """ - starting from chunk_id, loop for num_chunks, get the - KNN chunk ids from retrieval dataset, and get the chunk token ids, - put them into the chunks list - """ - for i in range(chunk_id, chunk_id + num_chunks): - knn = self.knn_index.get_KNN_chunk_ids(i) - for rid in knn[: self.neighbors]: - if rid < 0: - # no neighbor, just pad it - one_chunk = self.padding_context - else: - one_chunk = self.retrieval_index.get_chunk(rid) - chunks.append(one_chunk) - - def _get_text(self, idx: int) -> np.ndarray: - # Get the shuffled index. - idx = self.shuffle_idx[idx] - # Start and end documents and offsets. - doc_index_f = self.sample_idx[idx][0] - doc_index_l = self.sample_idx[idx + 1][0] - offset_f = self.sample_idx[idx][1] - offset_l = self.sample_idx[idx + 1][1] - # If we are within the same document, just extract the chunk. - if doc_index_f == doc_index_l: - sample = self.indexed_dataset.get( - self.doc_idx[doc_index_f], offset=offset_f, length=offset_l - offset_f + 1 - ) - chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_f], offset_f) - num_chunks = (offset_l - offset_f) // self.chunk_size - chunks = [] - self._get_chunks(chunk_id, num_chunks, chunks) - chunks = np.stack(chunks, axis=0).reshape(num_chunks, self.neighbors, -1).astype(np.int64) - else: - # Otherwise, get the rest of the initial document. 
- sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)] - num_chunks = (self.indexed_dataset._index.sizes[self.doc_idx[doc_index_f]] - offset_f) // self.chunk_size - total_chunks = num_chunks - chunks = [] - chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_f], offset_f) - self._get_chunks(chunk_id, num_chunks, chunks) - # Loop over all in between documents and add the entire document. - for i in range(doc_index_f + 1, doc_index_l): - sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) - chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[i], 0) - num_chunks = self.indexed_dataset._index.sizes[self.doc_idx[i]] // self.chunk_size - total_chunks += num_chunks - self._get_chunks(chunk_id, num_chunks, chunks) - # And finally add the relevant portion of last document. - chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_l], 0) - num_chunks = (offset_l) // self.chunk_size - total_chunks += num_chunks - self._get_chunks(chunk_id, num_chunks, chunks) - sample_list.append(self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1)) - sample = np.concatenate(sample_list) - chunks = np.stack(chunks, axis=0).reshape(total_chunks, self.neighbors, -1).astype(np.int64) - return sample.astype(np.int64), chunks + return len(self.mcore_retro_dataset.chunk_dataset.sample_dataset) - def __getitem__(self, idx): - text, retrieved = self._get_text(idx) - text = torch.from_numpy(text) - retrieved = torch.from_numpy(retrieved) - tokens = text[:-1].contiguous() - labels = text[1:].contiguous() - hidden_mask = tokens != self.pad_id - context_mask = retrieved != self.pad_id - return { - 'tokens': tokens, - 'labels': labels, - 'tokens_mask': hidden_mask, - 'loss_mask': hidden_mask, - 'retrieved_emb_mask': context_mask, - 'retrieved_ids': retrieved, - } + def _get_text(self, idx: int): + # return the tokens ids of idx + # Caveat: these tokens are got from the already pre-tokenized data file, mcore's GPTDataset doesn't run __getitem__, only run _query_document_sample_shuffle_indices + return self.mcore_retro_dataset[idx] + def __getitem__(self, idx): -def build_train_valid_test_datasets( - cfg, - trainer, - data_prefix: List[str], - data_impl: str, - splits_string: str, - train_valid_test_num_samples, - seq_length: int, - seed: int, - skip_warmup: bool, - tokenizer, - retrieval_prefix: str, - knn_map_path: List[str], -): - """Build train, valid, and test RETRO datasets. - There is one to one mapping between data_prefix and knn_map_path. - Currently only supports one retrieval dataset. - """ - # make sure there is one to one mapping between data_prefix and knn_map_path - assert len(data_prefix) == len(knn_map_path) - - # Single dataset. - if len(data_prefix) == 1: - return _build_train_valid_test_datasets( - cfg, - trainer, - data_prefix[0], - data_impl, - splits_string, - train_valid_test_num_samples, - seq_length, - seed, - skip_warmup, - tokenizer, - retrieval_prefix, - knn_map_path[0], - ) - - # Blending dataset. - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - train_n, valid_n, test_n = map(sum, zip(*datasets_train_valid_test_num_samples)) - - # Build individual datasets. 
- train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - cfg, - trainer, - prefixes[i], - data_impl, - splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, - seed, - skip_warmup, - tokenizer, - retrieval_prefix, - knn_map_path[i], + # quick fix for problems of mismatch in processed/indexed retro data, # of GPT samples is different from # of samples with neighbors retrieved + idx = idx % self.number_samples_with_neighbors + + sample = self._get_text(idx) + + # Unpack + tokens_ = torch.from_numpy(sample['text']) + tokens_ = tokens_.long() # size should be [seq_length] + labels = tokens_[1:].contiguous() + tokens = tokens_[:-1].contiguous() + neighbor_tokens = torch.from_numpy(sample['neighbor_tokens']) + neighbor_tokens = neighbor_tokens.long() # size should be [l, k, r] + + # note: [l, k, r] => [l*k, r] + # note: 2x == neighbor, continuation + neighbor_tokens = neighbor_tokens.view(-1, self.retro_config.retro_retrieved_length).long() + + # Get the masks and postition ids for tokens and neighbor_tokens + tokens = torch.unsqueeze( + tokens, 0 + ) # get_ltor_masks_and_position_ids takes as input tokens arguments as a batch (2D tensor), so need to convert tokens from 1D to 2D + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, self.eos_id, self.reset_position_ids, self.reset_attention_mask, self.eod_mask_loss ) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights, train_n) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights, valid_n) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights, test_n) - - return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) - - -def _build_train_valid_test_datasets( - cfg, - trainer, - data_prefix: str, - data_impl: str, - splits_string: str, - train_valid_test_num_samples, - seq_length: int, - seed: int, - skip_warmup: bool, - tokenizer, - retrieval_prefix: str, - knn_map_path: str, -): - """Build train, valid, and test datasets.""" - - # Indexed dataset. - indexed_dataset: MMapRetrievalIndexedDataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) - knn_index: KNNIndex = KNNIndex(knn_map_path, skip_warmup) - retrieval_index: MMapRetrievalIndexedDataset = get_indexed_dataset_(retrieval_prefix, data_impl, skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - splits = get_train_valid_test_split_(splits_string, total_num_of_documents) - - # Print stats about the splits. 
- logging.info(' > dataset split:') - - def print_split_stats(name, index): - logging.info(' {}:'.format(name)) - logging.info( - ' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) + tokens, attention_mask, loss_mask, position_ids = tokens[0], attention_mask[0], loss_mask[0], position_ids[0] + _, _, neighbor_position_ids = get_ltor_masks_and_position_ids( # neighbor_tokens is already a 2D array + neighbor_tokens, self.eos_id, self.reset_position_ids, self.reset_attention_mask, self.eod_mask_loss ) - - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) - - def build_dataset(index, name): - dataset = None - if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32) - dataset = RETRODataset( - cfg, - trainer, - tokenizer, - name, - data_prefix, - documents, - indexed_dataset, - train_valid_test_num_samples[index], - seq_length, - seed, - knn_index, - retrieval_index, - ) - return dataset - - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') - - return (train_dataset, valid_dataset, test_dataset) - - -class MockRETRODataset(torch.utils.data.Dataset): - def __init__(self, cfg, trainer, tokenizer, name, size): - super().__init__() - self.name = name - self.tokenizer = tokenizer - self._cfg = cfg - self.size = size - seed_val = parallel_state.get_data_parallel_rank() * 131 + 97 - torch.manual_seed(seed_val) - - def __len__(self): - return self.size - - def __getitem__(self, idx): - vocab_size = self.tokenizer.vocab_size - - neighbors = self._cfg.data.neighbors - input_length = self._cfg.data.seq_length - chunks = input_length // self._cfg.chunk_size - chunk_size = self._cfg.chunk_size - pad_id = self.tokenizer.pad_id - - all_tokens = torch.randint(0, vocab_size, (input_length + 1,)) - # make sure the eod happens at the end of each chunk, can add paddings to it - # e.g. 
[..., id, id, pad, pad, pad, eod] each has chunk_size, each sentence - # has length of multiple of chunk_size - hidden = all_tokens[:-1] - labels = all_tokens[1:] - - hidden_mask = hidden != pad_id - # to mask out the token ids [id, id, eod, id, pad, eod, id, id] - # so attention is not across eod, mask should be: - # [false, true, true, true, true, true, true, true] - # [false, false, true, true, true, true, true, true] - # [false, false, false,true, true, true, true, true] - # [true, true, true, false, true, true, true, true] - # [true, true, true, true, true, true, true, true] - # [true, true, true, false, true, false, true, true] - # [true, true, true, true, true, true, false, true] - # [true, true, true, true, true, true, false, false] - retrieved = torch.randint(0, vocab_size, (chunks, neighbors, 2 * chunk_size)) - - context_mask = retrieved != pad_id + neighbor_attention_mask = torch.zeros( + [1, 1] + ) # just a dummy values, since the batch neighbor_attention_mask will be set to None in megatron_retro_model.py following Lawrence's implementation return { - 'tokens': hidden, + 'tokens': tokens, 'labels': labels, - 'tokens_mask': hidden_mask, - 'loss_mask': hidden_mask, - 'retrieved_emb_mask': context_mask, - 'retrieved_ids': retrieved, + 'loss_mask': loss_mask, + 'attention_mask': attention_mask, + 'position_ids': position_ids, + 'context_input_ids': neighbor_tokens, + 'context_attention_mask': neighbor_attention_mask, + 'context_position_ids': neighbor_position_ids, } -def build_mock_train_valid_test_datasets( - cfg, trainer, splits_string, tokenizer, mock_data_size, +def build_train_valid_test_datasets( + cfg, retro_config: RetroConfig, train_valid_test_num_samples, seq_length, tokenizer, ): - """Build train, valid, and test datasets.""" - - splits = get_train_valid_test_split_(splits_string, mock_data_size) - # Print stats about the splits. 
- logging.info(' > dataset split:') - - def print_split_stats(name, index): - logging.info(' {}:'.format(name)) - logging.info( - ' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) + # gpt dataset + train_ds, valid_ds, test_ds = gpt_train_valid_test_datasets_provider(cfg, train_valid_test_num_samples, tokenizer) + + gpt_datasets = { + "train": (train_ds, train_valid_test_num_samples[0]), + "valid": (valid_ds, train_valid_test_num_samples[1]), + "test": (test_ds, train_valid_test_num_samples[2]), + } + + retro_train_ds, retro_valid_ds, retro_test_ds = get_retro_datasets( + config=retro_config, gpt_datasets=gpt_datasets, sample_length=seq_length, eod_token_id=tokenizer.eos_id, + ) + + train_ds = ( + RETRODataset( + cfg=cfg, + retro_config=retro_config, + tokenizer=tokenizer, + mcore_retro_dataset=retro_train_ds, + number_samples_with_neighbors=train_valid_test_num_samples[0], + ) + if retro_train_ds + else None + ) + valid_ds = ( + RETRODataset( + cfg=cfg, + retro_config=retro_config, + tokenizer=tokenizer, + mcore_retro_dataset=retro_valid_ds, + number_samples_with_neighbors=train_valid_test_num_samples[1], + ) + if retro_valid_ds + else None + ) + test_ds = ( + RETRODataset( + cfg=cfg, + retro_config=retro_config, + tokenizer=tokenizer, + mcore_retro_dataset=retro_test_ds, + number_samples_with_neighbors=train_valid_test_num_samples[2], ) + if retro_test_ds + else None + ) - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) + return train_ds, valid_ds, test_ds - def build_dataset(index, name): - dataset = None - if splits[index + 1] > splits[index]: - dataset = MockRETRODataset(cfg, trainer, tokenizer, name, splits[index + 1] - splits[index],) - return dataset - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') +def gpt_train_valid_test_datasets_provider(cfg, train_val_test_num_samples, tokenizer): + """Build the train test and validation datasets. + Implemented from train_valid_test_datasets_provider in M-LM/pretrain_gpt.py + + Args: + train_val_test_num_samples : A list containing the number of samples in train test and validation. 
+ """ - return (train_dataset, valid_dataset, test_dataset) + def is_dataset_built_on_rank(): + return ( + mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage() + ) and mpu.get_tensor_model_parallel_rank() == 0 + + data_config = MultiSplitGPTDatasetConfig( + random_seed=cfg.seed, + sequence_length=cfg.data.seq_length, + blend=cfg.data.data_prefix, + split=cfg.data.splits_string, + split_preprocessing=cfg.data.retro_data.retro_split_preprocessing, + path_to_cache=None, + return_document_ids=False, + reset_position_ids=cfg.data.get('reset_position_ids', False), + reset_attention_mask=cfg.data.get('reset_attention_mask', False), + eod_mask_loss=cfg.data.get('eod_mask_loss', False), + tokenizer=tokenizer, + ) + + print("> building train, validation, and test datasets for GPT ...") + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + MultiSplitGPTDataset, train_val_test_num_samples, is_dataset_built_on_rank, data_config + ).build() + + print("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds diff --git a/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset_legacy.py b/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset_legacy.py new file mode 100644 index 000000000000..f0a501d7cc13 --- /dev/null +++ b/nemo/collections/nlp/data/language_modeling/megatron/retro_dataset_legacy.py @@ -0,0 +1,469 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""RETRO Style dataset.""" + +import os +from typing import List + +import numpy as np +import torch + +from nemo.collections.nlp.data.language_modeling.megatron.base_dataset_utils import ( + get_datasets_weights_and_num_samples, + get_train_valid_test_split_, +) +from nemo.collections.nlp.data.language_modeling.megatron.blendable_dataset import BlendableDataset +from nemo.collections.nlp.data.language_modeling.megatron.gpt_dataset import ( + _build_index_mappings, + get_indexed_dataset_, +) +from nemo.collections.nlp.data.language_modeling.megatron.indexed_retrieval_dataset import ( + KNNIndex, + MMapRetrievalIndexedDataset, +) +from nemo.core import Dataset +from nemo.utils import logging + +try: + from megatron.core import parallel_state + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + HAVE_MEGATRON_CORE = False + +__all__ = [ + "RETRODataset", + "build_train_valid_test_datasets", + "MockRETRODataset", + "build_mock_train_valid_test_datasets", +] + + +class RETRODataset(Dataset): + """ + Dataset for RETRO model. + + It constructs single data record from the training/retrieval indexed retrieval dataset and knn index file. + The KNN index file maps data chunk id to K-nearest neighbors in the the retrieval dataset chunk ids. + First, it loads a long sequence (2048) from training dataset. Then for each chunk in the sequence, it finds the kNN + chunks from the retrieval dataset using the KNN index. Lastly, compute the masks based on pad id. 
+ """ + + def __init__( + self, + cfg, + trainer, + tokenizer, + name: str, + data_prefix: str, + documents, # document ids in the indexed_dataset used for this dataset + indexed_dataset: MMapRetrievalIndexedDataset, + num_samples: int, # number of data samples, max_steps * global_batch_size + seq_length: int, # input seq length + seed: int, + knn_index: KNNIndex, + retrieval_index: MMapRetrievalIndexedDataset, + ): + if not HAVE_MEGATRON_CORE: + raise ImportError( + "megatron-core was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt." + ) + + super().__init__() + self.name = name + self.indexed_dataset: MMapRetrievalIndexedDataset = indexed_dataset + self.knn_index: KNNIndex = knn_index + self.retrieval_index: MMapRetrievalIndexedDataset = retrieval_index + self.chunk_size = self.indexed_dataset.chunk_size + + # make sure seq_length is a multiple of chunk_size + assert seq_length % self.chunk_size == 0 + # Checks + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + self.eos_id = tokenizer.eos_id + self.pad_id = tokenizer.pad_id + + assert self.retrieval_index._index.retrieval_db + self._validate_pad_id() + + # save index mappings to a configurable dir + self.index_mapping_dir = cfg.data.get('index_mapping_dir', None) + self.neighbors = cfg.data.get('neighbors', self.knn_index.K) + # the number of neighbors cannot exceed the max number of neighbors in the index + assert self.neighbors <= self.knn_index.K + # create index_mapping_dir on rank 0 + if torch.distributed.is_available() and torch.distributed.is_initialized(): + if torch.distributed.get_rank() == 0: + if self.index_mapping_dir is not None and not os.path.isdir(self.index_mapping_dir): + os.makedirs(self.index_mapping_dir) + torch.distributed.barrier() + + # Build index mappings. 
+ self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + self.name, + data_prefix, + documents, + self.indexed_dataset.sizes, + num_samples, + seq_length, + seed, + index_mapping_dir=self.index_mapping_dir, + ) + if len(self.doc_idx) > np.iinfo('int32').max: + raise "number of epochs exceeds the maximum number for int32 used by sample_idx" + self.padding_context = np.ones(2 * self.chunk_size, dtype=self.retrieval_index._index.dtype) * self.pad_id + + def _validate_pad_id(self): + # validate the pad_id matches the dataset pad_id + ptr, size = self.retrieval_index._index[0] + ptr += size * np.dtype(self.retrieval_index._index.dtype).itemsize + # padded chunk_size of pad_ids at the end of the doc + retrieval_paddings = np.frombuffer( + self.retrieval_index._bin_buffer, + dtype=self.retrieval_index._index.dtype, + count=self.chunk_size, + offset=ptr, + ) + assert (retrieval_paddings == self.pad_id).all() + + ptr, size = self.indexed_dataset._index[0] + ptr += (size - 1) * np.dtype(self.indexed_dataset._index.dtype).itemsize + data_paddings = np.frombuffer( + self.indexed_dataset._bin_buffer, dtype=self.indexed_dataset._index.dtype, count=1, offset=ptr + ) + # the last element is either a padding or an eos + assert (data_paddings == self.pad_id).all() or (data_paddings == self.eos_id).all() + + def __len__(self): + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + return self.sample_idx.shape[0] - 1 + + def _get_chunks(self, chunk_id: int, num_chunks: int, chunks: List): + """ + starting from chunk_id, loop for num_chunks, get the + KNN chunk ids from retrieval dataset, and get the chunk token ids, + put them into the chunks list + """ + for i in range(chunk_id, chunk_id + num_chunks): + knn = self.knn_index.get_KNN_chunk_ids(i) + for rid in knn[: self.neighbors]: + if rid < 0: + # no neighbor, just pad it + one_chunk = self.padding_context + else: + one_chunk = self.retrieval_index.get_chunk(rid) + chunks.append(one_chunk) + + def _get_text(self, idx: int) -> np.ndarray: + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + if doc_index_f == doc_index_l: + sample = self.indexed_dataset.get( + self.doc_idx[doc_index_f], offset=offset_f, length=offset_l - offset_f + 1 + ) + chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_f], offset_f) + num_chunks = (offset_l - offset_f) // self.chunk_size + chunks = [] + self._get_chunks(chunk_id, num_chunks, chunks) + chunks = np.stack(chunks, axis=0).reshape(num_chunks, self.neighbors, -1).astype(np.int64) + else: + # Otherwise, get the rest of the initial document. + sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f)] + num_chunks = (self.indexed_dataset._index.sizes[self.doc_idx[doc_index_f]] - offset_f) // self.chunk_size + total_chunks = num_chunks + chunks = [] + chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_f], offset_f) + self._get_chunks(chunk_id, num_chunks, chunks) + # Loop over all in between documents and add the entire document. 
+ for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[i], 0) + num_chunks = self.indexed_dataset._index.sizes[self.doc_idx[i]] // self.chunk_size + total_chunks += num_chunks + self._get_chunks(chunk_id, num_chunks, chunks) + # And finally add the relevant portion of last document. + chunk_id = self.indexed_dataset.get_chunk_id(self.doc_idx[doc_index_l], 0) + num_chunks = (offset_l) // self.chunk_size + total_chunks += num_chunks + self._get_chunks(chunk_id, num_chunks, chunks) + sample_list.append(self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1)) + sample = np.concatenate(sample_list) + chunks = np.stack(chunks, axis=0).reshape(total_chunks, self.neighbors, -1).astype(np.int64) + return sample.astype(np.int64), chunks + + def __getitem__(self, idx): + text, retrieved = self._get_text(idx) + text = torch.from_numpy(text) + retrieved = torch.from_numpy(retrieved) + tokens = text[:-1].contiguous() + labels = text[1:].contiguous() + hidden_mask = tokens != self.pad_id + context_mask = retrieved != self.pad_id + return { + 'tokens': tokens, + 'labels': labels, + 'tokens_mask': hidden_mask, + 'loss_mask': hidden_mask, + 'retrieved_emb_mask': context_mask, + 'retrieved_ids': retrieved, + } + + +def build_train_valid_test_datasets( + cfg, + trainer, + data_prefix: List[str], + data_impl: str, + splits_string: str, + train_valid_test_num_samples, + seq_length: int, + seed: int, + skip_warmup: bool, + tokenizer, + retrieval_prefix: str, + knn_map_path: List[str], +): + """Build train, valid, and test RETRO datasets. + There is one to one mapping between data_prefix and knn_map_path. + Currently only supports one retrieval dataset. + """ + # make sure there is one to one mapping between data_prefix and knn_map_path + assert len(data_prefix) == len(knn_map_path) + + # Single dataset. + if len(data_prefix) == 1: + return _build_train_valid_test_datasets( + cfg, + trainer, + data_prefix[0], + data_impl, + splits_string, + train_valid_test_num_samples, + seq_length, + seed, + skip_warmup, + tokenizer, + retrieval_prefix, + knn_map_path[0], + ) + + # Blending dataset. + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + train_n, valid_n, test_n = map(sum, zip(*datasets_train_valid_test_num_samples)) + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + cfg, + trainer, + prefixes[i], + data_impl, + splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, + skip_warmup, + tokenizer, + retrieval_prefix, + knn_map_path[i], + ) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. 
+ blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights, train_n) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights, valid_n) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights, test_n) + + return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) + + +def _build_train_valid_test_datasets( + cfg, + trainer, + data_prefix: str, + data_impl: str, + splits_string: str, + train_valid_test_num_samples, + seq_length: int, + seed: int, + skip_warmup: bool, + tokenizer, + retrieval_prefix: str, + knn_map_path: str, +): + """Build train, valid, and test datasets.""" + + # Indexed dataset. + indexed_dataset: MMapRetrievalIndexedDataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) + knn_index: KNNIndex = KNNIndex(knn_map_path, skip_warmup) + retrieval_index: MMapRetrievalIndexedDataset = get_indexed_dataset_(retrieval_prefix, data_impl, skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. + logging.info(' > dataset split:') + + def print_split_stats(name, index): + logging.info(' {}:'.format(name)) + logging.info( + ' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) + ) + + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange(start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32) + dataset = RETRODataset( + cfg, + trainer, + tokenizer, + name, + data_prefix, + documents, + indexed_dataset, + train_valid_test_num_samples[index], + seq_length, + seed, + knn_index, + retrieval_index, + ) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +class MockRETRODataset(torch.utils.data.Dataset): + def __init__(self, cfg, trainer, tokenizer, name, size): + super().__init__() + self.name = name + self.tokenizer = tokenizer + self._cfg = cfg + self.size = size + seed_val = parallel_state.get_data_parallel_rank() * 131 + 97 + torch.manual_seed(seed_val) + + def __len__(self): + return self.size + + def __getitem__(self, idx): + vocab_size = self.tokenizer.vocab_size + + neighbors = self._cfg.data.neighbors + input_length = self._cfg.data.seq_length + chunks = input_length // self._cfg.chunk_size + chunk_size = self._cfg.chunk_size + pad_id = self.tokenizer.pad_id + + all_tokens = torch.randint(0, vocab_size, (input_length + 1,)) + # make sure the eod happens at the end of each chunk, can add paddings to it + # e.g. 
[..., id, id, pad, pad, pad, eod] each has chunk_size, each sentence + # has length of multiple of chunk_size + hidden = all_tokens[:-1] + labels = all_tokens[1:] + + hidden_mask = hidden != pad_id + # to mask out the token ids [id, id, eod, id, pad, eod, id, id] + # so attention is not across eod, mask should be: + # [false, true, true, true, true, true, true, true] + # [false, false, true, true, true, true, true, true] + # [false, false, false,true, true, true, true, true] + # [true, true, true, false, true, true, true, true] + # [true, true, true, true, true, true, true, true] + # [true, true, true, false, true, false, true, true] + # [true, true, true, true, true, true, false, true] + # [true, true, true, true, true, true, false, false] + retrieved = torch.randint(0, vocab_size, (chunks, neighbors, 2 * chunk_size)) + + context_mask = retrieved != pad_id + + return { + 'tokens': hidden, + 'labels': labels, + 'tokens_mask': hidden_mask, + 'loss_mask': hidden_mask, + 'retrieved_emb_mask': context_mask, + 'retrieved_ids': retrieved, + } + + +def build_mock_train_valid_test_datasets( + cfg, trainer, splits_string, tokenizer, mock_data_size, +): + """Build train, valid, and test datasets.""" + + splits = get_train_valid_test_split_(splits_string, mock_data_size) + + # Print stats about the splits. + logging.info(' > dataset split:') + + def print_split_stats(name, index): + logging.info(' {}:'.format(name)) + logging.info( + ' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) + ) + + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + dataset = MockRETRODataset(cfg, trainer, tokenizer, name, splits[index + 1] - splits[index],) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) diff --git a/nemo/collections/nlp/models/language_modeling/__init__.py b/nemo/collections/nlp/models/language_modeling/__init__.py index f63d289f8925..437a7003483b 100644 --- a/nemo/collections/nlp/models/language_modeling/__init__.py +++ b/nemo/collections/nlp/models/language_modeling/__init__.py @@ -17,4 +17,5 @@ MegatronGPTPromptLearningModel, ) from nemo.collections.nlp.models.language_modeling.megatron_retrieval_model import MegatronRetrievalModel +from nemo.collections.nlp.models.language_modeling.megatron_retro_model import MegatronRetroModel from nemo.collections.nlp.models.language_modeling.transformer_lm_model import TransformerLMModel diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 4493532f88bf..43cc8c26444f 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -1481,7 +1481,7 @@ def setup(self, stage=None): f'Pipeline model parallel rank: {parallel_state.get_pipeline_model_parallel_rank()}, ' f'Tensor model parallel rank: {parallel_state.get_tensor_model_parallel_rank()}, ' f'Number of model parameters on device: {num_parameters_on_device:.2e}. ' - f'Total number of model parameters: {total_num_parameters:.2e}.' + f'Number of precise model parameters on device: {total_num_parameters}.' 
) resume_checkpoint_path = self.trainer.ckpt_path @@ -1548,11 +1548,14 @@ def setup_validation_data(self, cfg): def setup_test_data(self, cfg): if hasattr(self, '_test_ds'): - consumed_samples = 0 - logging.info( - f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds)} and consumed samples: {consumed_samples}' - ) - self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) + if self._test_ds is not None: + consumed_samples = 0 + logging.info( + f'Setting up test dataloader with len(len(self._test_ds)): {len(self._test_ds)} and consumed samples: {consumed_samples}' + ) + self._test_dl = self.build_pretraining_data_loader(self._test_ds, consumed_samples) + else: + self._test_dl = None def generate( self, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py index acd85261f7e5..42323e503f7d 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retrieval_model.py @@ -23,7 +23,7 @@ MegatronPretrainingRandomSampler, MegatronPretrainingSampler, ) -from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset import ( +from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset_legacy import ( build_mock_train_valid_test_datasets, build_train_valid_test_datasets, ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py new file mode 100644 index 000000000000..8cc39056554c --- /dev/null +++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py @@ -0,0 +1,651 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import itertools +import json +import os +import queue +import types +import warnings +from dataclasses import fields +from functools import partial +from typing import Any, Dict, Iterator, List, Optional, Union + +import torch +from omegaconf import OmegaConf, open_dict +from omegaconf.dictconfig import DictConfig +from pytorch_lightning.accelerators import CPUAccelerator +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.nlp.data.language_modeling.megatron.data_samplers import ( + MegatronPretrainingRandomSampler, + MegatronPretrainingSampler, +) + +# from nemo.collections.nlp.data.language_modeling.megatron.retro_dummy_dataset import build_train_valid_test_datasets as dummy_build_train_valid_test_datasets # turn on when running with dummy data +from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset import build_train_valid_test_datasets +from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.build_model import build_model +from nemo.collections.nlp.modules.common.megatron.module import Float16Module +from nemo.collections.nlp.modules.common.megatron.utils import ( + ApexGuardDefaults, + average_losses_across_data_parallel_group, + get_all_params_for_weight_decay_optimization, + get_ltor_masks_and_position_ids, + get_params_for_weight_decay_optimization, +) +from nemo.collections.nlp.modules.common.text_generation_strategy import TextGenerationStrategy +from nemo.collections.nlp.modules.common.text_generation_utils import ( + generate, + get_computeprob_response, + get_default_length_params, + get_default_sampling_params, + megatron_gpt_generate, +) +from nemo.collections.nlp.modules.common.transformer.text_generation import ( + LengthParam, + OutputType, + SamplingParam, + TextGeneration, +) +from nemo.collections.nlp.parts import utils_funcs +from nemo.collections.nlp.parts.utils_funcs import activation_to_func, get_last_rank +from nemo.core.classes import Exportable +from nemo.core.classes.common import PretrainedModelInfo +from nemo.core.neural_types import ChannelType, NeuralType +from nemo.utils import logging + +try: + import apex.transformer.pipeline_parallel.utils + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + + HAVE_APEX = True + +except (ImportError, ModuleNotFoundError): + + HAVE_APEX = False + +try: + from megatron.core import InferenceParams, parallel_state + from megatron.core.models.retro import RetroModel as MCoreRetroModel + from megatron.core.models.retro.config import RetroConfig + from megatron.core.models.retro.decoder_spec import get_retro_decoder_block_spec + from megatron.core.models.retro.utils import get_config_path as get_retro_config_path + from megatron.core.models.retro.utils import get_gpt_data_dir as get_retro_data_dir + from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.module import Float16Module as MCoreFloat16Module + from megatron.core.transformer.transformer_config import TransformerConfig + from megatron.core.utils import init_method_normal, scaled_init_method_normal + + # TODO @tmoon: Use once available in Megatron-LM + # from megatron.core.pipeline_parallel.schedules import DataIteratorList + + HAVE_MEGATRON_CORE = True + +except (ImportError, ModuleNotFoundError): + + TransformerConfig = ApexGuardDefaults + + 
HAVE_MEGATRON_CORE = False
+
+try:
+    import transformer_engine
+    from transformer_engine.pytorch import module as te_module
+
+    HAVE_TE = True
+
+except (ImportError, ModuleNotFoundError):
+    HAVE_TE = False
+
+
+class MegatronRetroModel(MegatronGPTModel):
+    """
+    Megatron Retro pretraining
+    """
+
+    def load_retro_config(self, cfg: DictConfig):
+        assert cfg.retro.get('retro_project_dir') is not None, "`--retro-project-dir` must be set to use Retro."
+
+        # Retro config path.
+        retro_config_path = get_retro_config_path(cfg.retro.get('retro_project_dir'))
+        assert os.path.exists(retro_config_path), "retro project dir missing config.json."
+
+        # Load retro config.
+        with open(retro_config_path) as f:
+
+            # Parse config.
+            retro_preprocess_config = types.SimpleNamespace(**json.load(f))
+
+            # Retro data path is relative to data path (via hard or soft links).
+            data_dir = get_retro_data_dir(cfg.retro.get('retro_project_dir'))
+            data_path = list(retro_preprocess_config.retro_gpt_data_path)
+            if len(data_path) % 2 == 0:
+                for i in range(len(data_path) - 1, -1, -2):
+                    data_path[i] = os.path.join(data_dir, data_path[i])
+            else:
+                assert len(data_path) == 1
+                data_path[0] = os.path.join(data_dir, data_path[0])
+
+            # Update args.
+            cfg.global_batch_size = retro_preprocess_config.retro_gpt_global_batch_size
+            cfg.seed = retro_preprocess_config.retro_gpt_seed
+            cfg.data.data_prefix = data_path
+            cfg.encoder_seq_length = retro_preprocess_config.retro_gpt_seq_length
+            cfg.data.seq_length = retro_preprocess_config.retro_gpt_seq_length
+            cfg.max_position_embeddings = retro_preprocess_config.retro_gpt_seq_length
+            # cfg.data.splits_string = retro_preprocess_config.retro_gpt_split  # removed because the latest RETRO data objects have separate RETRO training and RETRO preprocessing splits
+            cfg.tokenizer.model = (
+                cfg.retro.get('retro_project_dir') + '/' + retro_preprocess_config.retro_gpt_tokenizer_model
+            )
+            cfg.tokenizer.type = retro_preprocess_config.retro_gpt_tokenizer_type
+            cfg.tokenizer.vocab_file = retro_preprocess_config.retro_gpt_vocab_file
+            cfg.tokenizer.merge_file = retro_preprocess_config.retro_gpt_merge_file
+            with open_dict(cfg):
+                cfg.retro_train_samples_with_neighbors = retro_preprocess_config.retro_gpt_train_samples
+                cfg.retro_valid_samples_with_neighbors = retro_preprocess_config.retro_gpt_valid_samples
+                cfg.data.retro_data.retro_block_size = retro_preprocess_config.retro_block_size
+                cfg.data.retro_data.retro_chunk_length = retro_preprocess_config.retro_gpt_chunk_length
+                cfg.data.retro_data.retro_split_preprocessing = retro_preprocess_config.retro_gpt_split
+                cfg.data.retro_data.retro_neighbor_dirs = retro_preprocess_config.retro_neighbor_dirs
+
+        return cfg
+
+    def __init__(self, cfg: DictConfig, trainer: Trainer):
+
+        # override pre-processing arguments with retro pre-processing arguments
+        cfg = self.load_retro_config(cfg)
+
+        super().__init__(cfg, trainer=trainer)
+
+        logging.info(
+            "\n\n************** Experiment configuration (after overriding with RETRO's workdir values) ***********"
+        )
+        logging.info(f'\n{OmegaConf.to_yaml(cfg)}')
+
+        return
+
+    def model_provider_func(self, pre_process, post_process):
+        """Model depends on pipeline parallelism."""
+        if self.mcore_gpt:
+            self.retro_model_config = self.build_retro_config()
+            model = MCoreRetroModel(
+                config=self.retro_model_config,
+                transformer_layer_spec=get_retro_decoder_block_spec(
+                    self.retro_model_config, use_transformer_engine=True
+                ),
+                vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size),
max_sequence_length=self.cfg.data.get('seq_length', 512), + pre_process=pre_process, + post_process=post_process, + parallel_output=True, + share_embeddings_and_output_weights=self.cfg.get('share_embeddings_and_output_weights', True), + position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'), + rotary_percent=self.cfg.get('rotary_percentage', 1.0), + seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None), + ) + + return model + else: + assert self.mcore_gpt == True, "Currently only support mcore Retro." + + def forward( + self, tokens, text_position_ids, attention_mask, labels, context_input_ids, context_position_ids, context_mask + ): + output_tensor = self.model( + tokens, + text_position_ids, + attention_mask, + context_input_ids=context_input_ids, + context_position_ids=context_position_ids, + context_mask=context_mask, + labels=labels, + ) + return output_tensor + + def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: Optional[int] = None, **extra) -> Any: + # batch = {'prompts': List, 'neighbors': List[List]} + + inference_config = self.get_inference_config() + + if torch.distributed.get_rank() == 0: + logging.info("inference_config: ") + logging.info(inference_config) + + if inference_config is None: + return None + else: + # need to overwrite some configuration, make it immutable + inference_config = inference_config.copy() + compute_logprob = inference_config['compute_logprob'] + if compute_logprob: + inference_config['inputs'] = batch['prompts'] + inference_config['neighbors'] = batch['neighbors'] + inference_config['tokens_to_generate'] = 1 + inference_config['all_probs'] = True + inference_config["add_BOS"] = False + inference_config['greedy'] = True + inference_config['retro_inference'] = inference_config['retro_inference'] + response = generate(self, **inference_config) + compute_prob_response = get_computeprob_response(self.tokenizer, response, batch) + return compute_prob_response + else: + inference_config['inputs'] = batch['prompts'] + inference_config['neighbors'] = batch['neighbors'] + inference_config['retro_inference'] = inference_config['retro_inference'] + return generate(self, **inference_config) + + def get_batch(self, data_iterator): + """Generate a batch.""" + + # Broadcast data. 
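+        # Note: each element yielded by the iterator is expected to be a dict carrying both the standard GPT
+        # fields (tokens, labels, loss_mask, attention_mask, position_ids) and the RETRO retrieval fields
+        # (context_input_ids, context_attention_mask, context_position_ids), as assembled below.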
+ if data_iterator is not None: + # If tuple, 1st element in it is the batch since dataloader_iter returns batch, batch_idx, dataloader_idx + data = next(data_iterator) + if isinstance(data, tuple): + data = data[0] + else: + data = None + + batch = { + 'tokens': data["tokens"], + 'labels': data["labels"], + 'loss_mask': data["loss_mask"], + 'attention_mask': data["attention_mask"], + 'position_ids': data["position_ids"], + 'context_input_ids': data["context_input_ids"], + 'context_attention_mask': data["context_attention_mask"], + 'context_position_ids': data["context_position_ids"], + } + + return batch + + def get_forward_output_and_loss_func(self, validation_step=False): + def fwd_output_and_loss_func(dataloader_iter, model, checkpoint_activations_all_layers=None): + + # Get data batch + batch = self.get_batch(dataloader_iter) + + # Transfer needed data to GPU + required_keys = set() + if parallel_state.get_pipeline_model_parallel_world_size() == 1: + required_keys.update(batch.keys()) + else: + required_keys.add('attention_mask') + if parallel_state.is_pipeline_first_stage(): + required_keys.update( + ('tokens', 'position_ids', 'context_input_ids', 'context_position_ids', 'context_mask') + ) + if parallel_state.is_pipeline_last_stage(): + required_keys.update(('labels', 'loss_mask')) + if self.get_attention_mask_from_fusion: + required_keys.remove('attention_mask') + batch = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in batch.items()} + + # reshape context_input_ids and context_position_ids for RETRO from [bs, l*k, r] => [bs*l*k, r] + context_input_ids = batch['context_input_ids'] + context_position_ids = batch['context_position_ids'] + context_input_ids = context_input_ids.view(-1, context_input_ids.shape[-1]).long() + context_position_ids = context_position_ids.view(-1, context_position_ids.shape[-1]).long() + batch['context_input_ids'] = context_input_ids + batch['context_position_ids'] = context_position_ids + + # slice batch along sequence dimension for context parallelism + batch = self.get_batch_on_this_context_parallel_rank(batch) + + # Model forward pass + forward_args = { + 'input_ids': batch['tokens'], + 'position_ids': batch['position_ids'], + 'attention_mask': batch['attention_mask'], + 'context_input_ids': batch['context_input_ids'], + 'context_position_ids': batch['context_position_ids'], + 'context_mask': None, # batch neighbor_attention_mask will be set to None following Lawrence's implementation + 'labels': batch['labels'], + 'loss_mask': batch['loss_mask'], + } + + if not self.mcore_gpt: + forward_args['checkpoint_activations_all_layers'] = checkpoint_activations_all_layers + if not self.use_loss_mask: + forward_args.pop('loss_mask') + else: + # TODO: @eharper can we add this to mcore? 
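+                # The mcore RETRO forward() does not accept `loss_mask` as an argument;
+                # masking is applied later in loss_func below via batch['loss_mask'].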
+                forward_args.pop('loss_mask')
+            output_tensor = model(**forward_args)
+
+            def loss_func(output_tensor):
+                # Loss for a micro-batch (ub)
+                loss_for_ub = self.loss_func(batch['loss_mask'], batch['num_valid_tokens_in_ub'], output_tensor)
+                if validation_step and not self.cfg.data.get('validation_drop_last', True):
+                    num_valid_tokens_in_ub = batch['loss_mask'].sum()
+                    if loss_for_ub.isnan():
+                        assert batch['loss_mask'].count_nonzero() == 0, 'Got NaN loss with non-empty input'
+                        loss_sum_for_ub = torch.zeros_like(num_valid_tokens_in_ub)
+                    else:
+                        loss_sum_for_ub = num_valid_tokens_in_ub * loss_for_ub
+
+                    loss_sum_and_ub_size_all_gpu = torch.cat(
+                        [
+                            loss_sum_for_ub.clone().detach().view(1),
+                            torch.tensor([num_valid_tokens_in_ub]).cuda().clone().detach(),
+                        ]
+                    )
+                    # Could potentially reduce num_valid_samples_in_microbatch and use that to aggregate instead of len(self._validation_ds)
+                    torch.distributed.all_reduce(
+                        loss_sum_and_ub_size_all_gpu, group=parallel_state.get_data_parallel_group()
+                    )
+                    return loss_for_ub, {'loss_sum_and_ub_size': loss_sum_and_ub_size_all_gpu}
+                else:
+                    reduced_loss = average_losses_across_data_parallel_group([loss_for_ub])
+                    return loss_for_ub, {'avg': reduced_loss}
+
+            return output_tensor, loss_func
+
+        return fwd_output_and_loss_func
+
+    def get_forward_output_only_func(self):
+        def fwd_output_only_func(dataloader_iter, model):
+            batch = next(dataloader_iter)
+            extra_arg = {}
+            if len(batch) == 6:
+                batch = [x.cuda() for x in batch]
+                tokens, attention_mask, position_ids, context_input_ids, context_position_ids, context_mask = batch
+                attention_mask = attention_mask[0:1]
+            else:
+                (
+                    tokens,
+                    attention_mask,
+                    position_ids,
+                    context_input_ids,
+                    context_position_ids,
+                    context_mask,
+                    set_inference_key_value_memory,
+                    inference_max_sequence_len,
+                ) = batch
+                tokens = tokens.cuda()
+                position_ids = position_ids.cuda()
+                if attention_mask is not None:
+                    attention_mask = attention_mask.cuda()
+                    attention_mask = attention_mask[0:1]
+                context_input_ids = context_input_ids.cuda()
+                context_position_ids = context_position_ids.cuda()
+                context_mask = None
+            if self.mcore_gpt:
+                # if first step, then clear KV cache, otherwise reuse inference_params
+                if set_inference_key_value_memory[0].item():
+                    self.inference_params = InferenceParams(
+                        max_batch_size=tokens.size(0), max_sequence_length=inference_max_sequence_len[0].item()
+                    )
+                extra_arg['inference_params'] = self.inference_params
+            else:
+                extra_arg['set_inference_key_value_memory'] = set_inference_key_value_memory[0].item()
+                extra_arg['inference_max_sequence_len'] = inference_max_sequence_len[0].item()
+            output_tensor = model(
+                tokens,
+                position_ids,
+                attention_mask,
+                context_input_ids=context_input_ids,
+                context_position_ids=context_position_ids,
+                context_mask=None,  # batch neighbor_attention_mask will be set to None following Lawrence's implementation
+                **extra_arg,
+            )
+
+            # Advance inference sequence offset.
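+            # The KV-cache offset grows by the length of the sequence dimension just processed;
+            # which axis holds the sequence depends on whether this is the last pipeline stage (see below).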
+            if self.inference_params:
+                # if last stage, then (final) output is [b, s, h], otherwise it's [s, b, h]
+                if parallel_state.is_pipeline_last_stage():
+                    self.inference_params.sequence_len_offset += output_tensor.size(1)
+                else:
+                    self.inference_params.sequence_len_offset += output_tensor.size(0)
+
+            def id_func(output_tensor):
+                return output_tensor, {'logits': output_tensor}
+
+            return output_tensor, id_func
+
+        return fwd_output_only_func
+
+    def build_retro_config(self) -> RetroConfig:
+        """ Builds a RetroConfig from the already constructed TransformerConfig
+            by adding the Retro-relevant variables. This method runs after build_transformer_config().
+        """
+        retro_config = self.transformer_config
+
+        # retro model args
+        retro_config.retro_project_dir = self.cfg.retro.get('retro_project_dir')
+        retro_config.retro_block_size = self.cfg.data.retro_data.get('retro_block_size')
+        retro_config.retro_chunk_length = self.cfg.data.retro_data.get('retro_chunk_length')
+        retro_config.retro_encoder_num_layers = self.cfg.retro.get('retro_encoder_num_layers', 2)
+        retro_config.retro_encoder_hidden_dropout = self.cfg.retro.get('retro_encoder_hidden_dropout', 0.1)
+        retro_config.retro_encoder_attention_dropout = self.cfg.retro.get('retro_encoder_attention_dropout', 0.1)
+        retro_config.retro_num_neighbors = self.cfg.retro.get('retro_num_neighbors', 2)
+        retro_config.retro_num_retrieved_chunks = self.cfg.retro.get('retro_num_retrieved_chunks', 2)
+        retro_config.retro_verify_neighbor_count = self.cfg.retro.get('retro_verify_neighbor_count', True)
+        retro_config.retro_retrieved_length = retro_config.retro_num_retrieved_chunks * retro_config.retro_chunk_length
+        retro_config.retro_split_preprocessing = self.cfg.data.retro_data.get('retro_split_preprocessing')
+        retro_config.retro_neighbor_dirs = self.cfg.data.retro_data.get('retro_neighbor_dirs')
+        logging.info("retro_config: ")
+        logging.info(retro_config)
+
+        # Validate Transformer Engine version.
+        from importlib.metadata import version
+
+        from pkg_resources import packaging
+
+        te_version = packaging.version.Version(version("transformer-engine"))
+        if te_version >= packaging.version.Version("1.3"):
+            try:
+                os.environ["NVTE_FLASH_ATTN"] = "0"
+                os.environ["NVTE_FUSED_ATTN"] = "0"
+                assert os.getenv("NVTE_FLASH_ATTN") == "0"
+                assert os.getenv("NVTE_FUSED_ATTN") == "0"
+            except Exception as e:
+                raise Exception(
+                    "When using Transformer Engine >= 1.3, the environment variables NVTE_FLASH_ATTN and NVTE_FUSED_ATTN must both be defined and set to '0'. Currently, NVTE_FLASH_ATTN == %s, NVTE_FUSED_ATTN == %s."
+                    % (os.getenv("NVTE_FLASH_ATTN", "[unset]"), os.getenv("NVTE_FUSED_ATTN", "[unset]"),)
+                )
+
+        return retro_config
+
+    def build_train_valid_test_datasets(self):
+        # Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step
+        # self._reconfigure_val_batches()
+        logging.info('Building mcore RETRO datasets.')
+        if self.trainer.limit_val_batches > 1.0 and isinstance(self.trainer.limit_val_batches, float):
+            raise ValueError("limit_val_batches must be an integer or float less than or equal to 1.0.")
+        global_batch_size = self.cfg.global_batch_size
+        # max_train_steps = self.trainer.max_steps
+        # eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches  # check this carefully, we want to match the mcore dataset value; should this be computed, or overridden?
+        # test_iters = self.trainer.limit_test_batches
+
+        # getting train_valid_test_num_samples from values in RETRO's workdir
+        train_valid_test_num_samples = [  # compute the number of training/validating samples from workdir/query/train_*; dividing number of chunks for (2048/64)
+            self.cfg.retro_train_samples_with_neighbors,
+            self.cfg.retro_valid_samples_with_neighbors,
+            0,
+        ]
+
+        if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float):
+            train_valid_test_num_samples[
+                1
+            ] = 1  # This is to make sure we only have one epoch on every validation iteration
+
+        self._train_ds, self._validation_ds, self._test_ds = build_train_valid_test_datasets(
+            cfg=self.cfg,
+            retro_config=self.retro_model_config,
+            train_valid_test_num_samples=train_valid_test_num_samples,
+            seq_length=self.cfg.data.seq_length,
+            tokenizer=self.tokenizer,
+        )
+
+        if self._train_ds is not None:
+            logging.info(f'Length of train dataset: {len(self._train_ds)}')
+        if self._validation_ds is not None:
+            logging.info(f'Length of val dataset: {len(self._validation_ds)}')
+        if self._test_ds is not None:
+            logging.info(f'Length of test dataset: {len(self._test_ds)}')
+        logging.info(f'Finished building mcore RETRO datasets.')
+
+        return self._train_ds, self._validation_ds, self._test_ds
+
+    def build_pretraining_data_loader(
+        self, dataset, consumed_samples, dataset_type=None, drop_last=True, pad_samples_to_global_batch_size=False
+    ):
+        """Build dataloader given an input dataset."""
+
+        logging.info(f'Building dataloader with consumed samples: {consumed_samples}')
+        # Megatron sampler
+        if hasattr(self.cfg.data, 'dataloader_type') and self.cfg.data.dataloader_type is not None:
+            if self.cfg.data.dataloader_type == 'single':
+                batch_sampler = MegatronPretrainingSampler(
+                    total_samples=len(dataset),
+                    consumed_samples=consumed_samples,
+                    micro_batch_size=self.cfg.micro_batch_size,
+                    data_parallel_rank=parallel_state.get_data_parallel_rank(),
+                    data_parallel_size=parallel_state.get_data_parallel_world_size(),
+                    drop_last=drop_last,
+                    global_batch_size=self.cfg.global_batch_size,
+                    rampup_batch_size=self.cfg.get('rampup_batch_size', None),
+                    pad_samples_to_global_batch_size=pad_samples_to_global_batch_size,
+                )
+            elif self.cfg.data.dataloader_type == 'cyclic':
+                batch_sampler = MegatronPretrainingRandomSampler(
+                    total_samples=len(dataset),
+                    consumed_samples=consumed_samples,
+                    micro_batch_size=self.cfg.micro_batch_size,
+                    data_parallel_rank=parallel_state.get_data_parallel_rank(),
+                    data_parallel_size=parallel_state.get_data_parallel_world_size(),
+                    drop_last=self.cfg.get('drop_last', True),
+                )
+            else:
+                raise ValueError('cfg.data.dataloader_type must be "single" or "cyclic"')
+        else:
+            raise ValueError('cfg.data.dataloader_type not found.
Must be "single" or "cyclic"') + + return torch.utils.data.DataLoader( + dataset, + batch_sampler=batch_sampler, + num_workers=self.cfg.data.num_workers, + pin_memory=True, + persistent_workers=True if self.cfg.data.num_workers > 0 else False, + ) + + def fwd_bwd_step(self, dataloader_iter, forward_only): + + # handle asynchronous grad reduction + no_sync_func = None + grad_sync_func = None + param_sync_func = None + if not forward_only and self.with_distributed_adam: + no_sync_func = partial(self._optimizer.no_sync, greedy_grad_copy=self.megatron_amp_O2,) + grad_sync_func = self.reduce_overlap_gradients + param_sync_func = self.sync_overlap_parameters + + # pipeline schedules will get these from self.model.config + for module in self.get_model_module_list(): + module.config.no_sync_func = no_sync_func + module.config.grad_sync_func = grad_sync_func + module.config.param_sync_func = param_sync_func + + # run forward and backwards passes for an entire global batch + # we do this inside training_step to support pipeline parallelism + fwd_bwd_function = get_forward_backward_func() + + # TODO @akhattar: add num_micro_batches_with_partial_activation_checkpoints when ready + losses_reduced_per_micro_batch = fwd_bwd_function( + forward_step_func=self.get_forward_output_and_loss_func(forward_only), + data_iterator=self._make_data_iterator_list(dataloader_iter), + model=self.model, + num_microbatches=get_num_microbatches(), + forward_only=forward_only, + seq_length=self.cfg.encoder_seq_length, + micro_batch_size=self.cfg.micro_batch_size, + ) + + # only the last stages of the pipeline return losses + if losses_reduced_per_micro_batch: + if (not forward_only) or self.cfg.data.get('validation_drop_last', True): + # average loss across micro batches + loss_tensors_list = [loss_reduced['avg'] for loss_reduced in losses_reduced_per_micro_batch] + loss_tensor = torch.concat(loss_tensors_list) + loss_mean = loss_tensor.mean() + else: + # Get the total loss since micro batches sizes are not uniform + loss_sum_tensors_list = [ + loss_sum['loss_sum_and_ub_size'] + for loss_sum in losses_reduced_per_micro_batch + if loss_sum['loss_sum_and_ub_size'][1] > 0 + ] + loss_sum = ( + torch.vstack(loss_sum_tensors_list).sum(axis=0) + if len(loss_sum_tensors_list) > 0 + else torch.tensor([0.0, 0.0]).cuda() + ) + return loss_sum + else: + # we're not on the last pipeline stage so no losses + if forward_only: + loss_mean = [] + else: + loss_mean = torch.tensor(0.0).cuda() + + return loss_mean + + def validation_step(self, dataloader_iter, dataloader_idx=0): + """ + Our dataloaders produce a micro-batch and then we fetch + a number of microbatches depending on the global batch size and model parallel size + from the dataloader to produce a list of microbatches. + The list of microbatches is then piped through the pipeline using megatron-core fwd/bwd functions. + """ + mode = 'test' if self.trainer.testing else 'val' + # Initialize userbuffer communicators. 
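+        # Userbuffer communicators (used to overlap tensor-parallel communication with compute)
+        # are created lazily here on the first step for which `self.initialize_ub` is still set.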
+ if self.initialize_ub: + self.initialize_ub_func() + + if isinstance(self.model, list): + for model_module in self.model: + model_module.eval() + else: + self.model.eval() + + if self.cfg.get('fp8', False): + first_val_step = self.prev_step_training and not self.training + self.prev_step_training = self.training + else: + first_val_step = None + + with torch.no_grad(): + loss = self.fwd_bwd_step(dataloader_iter, True) + + if isinstance(self.model, list): + for model_module in self.model: + model_module.train() + else: + self.model.train() + + if mode == 'val': + # Append with the correct dataloader_idx in case of multiple dataloaders + if type(self.trainer.val_dataloaders) == list and len(self.trainer.val_dataloaders) > 1: + self.validation_step_outputs[dataloader_idx].append(loss) + else: + self.validation_step_outputs.append(loss) + else: + if type(self.trainer.test_dataloaders) == list and len(self.trainer.test_dataloaders) > 1: + self.test_step_outputs[dataloader_idx].append(loss) + else: + self.test_step_outputs.append(loss) + + return loss diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py index 84df4a6965e1..67c94ae5d608 100644 --- a/nemo/collections/nlp/modules/common/tokenizer_utils.py +++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py @@ -114,6 +114,7 @@ def get_tokenizer( tokenizer_name = get_megatron_tokenizer(tokenizer_name) if tokenizer_name == 'sentencepiece': + logging.info("tokenizer_model: " + str(tokenizer_model)) return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer( model_path=tokenizer_model, special_tokens=special_tokens, legacy=True ) @@ -195,6 +196,14 @@ def get_nmt_tokenizer( logging.info(f'Using regex tokenization') return RegExTokenizer().load_tokenizer(regex_file=tokenizer_model, vocab_file=vocab_file) elif library == 'megatron': + + if model_name == 'GPTSentencePieceTokenizer': + logging.info("tokenizer_model: ") + logging.info(tokenizer_model) + return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer( + model_path=tokenizer_model, legacy=legacy + ) + if model_name in megatron_tokenizer_model_map: model_name = megatron_tokenizer_model_map[model_name] logging.info( diff --git a/nemo/utils/callbacks/nemo_model_checkpoint.py b/nemo/utils/callbacks/nemo_model_checkpoint.py index 059ce4455977..e532297d9747 100644 --- a/nemo/utils/callbacks/nemo_model_checkpoint.py +++ b/nemo/utils/callbacks/nemo_model_checkpoint.py @@ -357,12 +357,15 @@ def remove_checkpoint_unfinished_marker(checkpoint_path: Union[Path, str], barri barrier_before: Synchronize ranks before removing the marker file. Defaults to False. """ - if barrier_before and torch.distributed.is_initialized(): - torch.distributed.barrier() - if is_global_rank_zero(): - marker_path = NeMoModelCheckpoint.format_checkpoint_unfinished_marker_path(checkpoint_path) - if marker_path.exists(): - marker_path.unlink() + try: + if barrier_before and torch.distributed.is_initialized(): + torch.distributed.barrier() + if is_global_rank_zero(): + marker_path = NeMoModelCheckpoint.format_checkpoint_unfinished_marker_path(checkpoint_path) + if marker_path.exists(): + marker_path.unlink() + except: + return def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) -> None: # barrier_after=True, so all ranks continue after the unfinished checkpoint marker is placed. 
diff --git a/tests/collections/nlp/test_indexed_retrieval_dataset.py b/tests/collections/nlp/test_indexed_retrieval_dataset.py index e35c3ab36840..5110651b34a6 100644 --- a/tests/collections/nlp/test_indexed_retrieval_dataset.py +++ b/tests/collections/nlp/test_indexed_retrieval_dataset.py @@ -28,7 +28,7 @@ MMapRetrievalIndexedDatasetBuilder, merge_knn_files, ) -from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset import RETRODataset +from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset_legacy import RETRODataset try: from megatron.core import parallel_state From d2ab8435222e4928b057ff271a957f143d3b75b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Tue, 16 Apr 2024 09:18:49 -0400 Subject: [PATCH 37/39] Extended input configuration + Lhotse multimodal (mixed audio and text-only) dataloading (#8581) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * wip Signed-off-by: Piotr Żelasko * Partially working config groups Signed-off-by: Piotr Żelasko * Working test with abasic group in the input config Signed-off-by: Piotr Żelasko * Working test with nested groups in input config Signed-off-by: Piotr Żelasko * Working test with specifying a YAML path for input_cfg Signed-off-by: Piotr Żelasko * a very rough example of text dataloading via lhotse Signed-off-by: Piotr Żelasko * Cleaner integration of multimodal audio/text loading that allows to control the effective audio vs text size (requires latest lhotse) Signed-off-by: Piotr Żelasko * remove obsolete test Signed-off-by: Piotr Żelasko * Fix an import in export_utils.py (#8571) Signed-off-by: w4-jinhyeonkim <131935801+w4-jinhyeonkim@users.noreply.github.com> Signed-off-by: Piotr Żelasko * Yttm deprecation (#8322) * yttm deprecation init commit Signed-off-by: AlexGrinch * removed tests Signed-off-by: AlexGrinch * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * bug fix Signed-off-by: AlexGrinch * path fix Signed-off-by: AlexGrinch * fixing path Signed-off-by: AlexGrinch * updated tests to spm Signed-off-by: AlexGrinch * updated Jenkinsfile Signed-off-by: AlexGrinch * new model with spm in tests Signed-off-by: AlexGrinch * yttm removed Signed-off-by: AlexGrinch * updated aayn config Signed-off-by: AlexGrinch --------- Signed-off-by: AlexGrinch Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Piotr Żelasko * Fixed missing copy import in rnnt_decoder.py (#8580) * Added copy import to rnnt_decoding.py Signed-off-by: Isaac McFadyen * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Isaac McFadyen Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Piotr Żelasko * Fix bug in RNNT Joint WER calculation for fused batch (#8587) Signed-off-by: smajumdar Signed-off-by: Piotr Żelasko * Fixed Context Parallel HtoD sync (#8557) * Fixed cp HtoD sync Signed-off-by: Selvaraj Anandaraj * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Selvaraj Anandaraj Co-authored-by: Selvaraj Anandaraj Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Piotr Żelasko * change default and add key to config files (#8594) Signed-off-by: Chen Cui Signed-off-by: Piotr Żelasko * Fix triton import guards (#8552) * Fix 
triton import guards Signed-off-by: Michal Futrega * Update attention.py Signed-off-by: Michal Futrega --------- Signed-off-by: Michal Futrega Signed-off-by: Piotr Żelasko * Add config key for dropout position in LoRA adapter (#8583) Signed-off-by: Michal Futrega Signed-off-by: Piotr Żelasko * fix ia3 mlp infused adapter (#8597) Signed-off-by: Chen Cui Signed-off-by: Piotr Żelasko * Prevent Redundant Gather for LoRA Sequence Parallel (#8602) * enable layernorm output gathered Signed-off-by: Chen Cui * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Chen Cui Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: Piotr Żelasko * Accelerate `transcribe_speech.py` for short-form data: pre-sorting support (#8564) * POC using bucketing in transcribe_speech.py Signed-off-by: Piotr Żelasko * extend to multi task aed Signed-off-by: Piotr Żelasko * fixes for aed multi task text/lang field selectors Signed-off-by: Piotr Żelasko * remove assert Signed-off-by: Piotr Żelasko * fix Signed-off-by: Piotr Żelasko * expose option for bucket buffer size Signed-off-by: Piotr Żelasko * fixes, ctc support Signed-off-by: Piotr Żelasko * support pre-sorting manifests in transcribe_speech.py Signed-off-by: Piotr Żelasko * cleanup Signed-off-by: Piotr Żelasko * reorder transcriptions back to original manifest order Signed-off-by: Piotr Żelasko * remove bucketing entirely Signed-off-by: Piotr Żelasko * code review changes Signed-off-by: Piotr Żelasko * code review changes--amend Signed-off-by: Piotr Żelasko * refactor text_field/lang_field passing Signed-off-by: Piotr Żelasko * Fix reordering bug; disable presorting for multi task for now Signed-off-by: Piotr Żelasko * Add support for presort + multi task model Signed-off-by: Piotr Żelasko * Code reviews Signed-off-by: Piotr Żelasko * Fix jenkins tests, add user-friendly error msg for canary Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko * fix tests Signed-off-by: Piotr Żelasko * Bump min required lhotse version Signed-off-by: Piotr Żelasko * Add some documentation about this config format and the multimodal features Signed-off-by: Piotr Żelasko * Add caution about multiple shards Signed-off-by: Piotr Żelasko * Address Tom's code review Signed-off-by: Piotr Żelasko * Add copyright header Signed-off-by: Piotr Żelasko * Fix (hopefully) issue with forced ascii encoding in CI Signed-off-by: Piotr Żelasko * Support resolving input_cfg path into config contents Signed-off-by: Piotr Żelasko * Code review changes in docs Signed-off-by: Piotr Żelasko * Fix unicode decode error Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Signed-off-by: w4-jinhyeonkim <131935801+w4-jinhyeonkim@users.noreply.github.com> Signed-off-by: AlexGrinch Signed-off-by: Isaac McFadyen Signed-off-by: smajumdar Signed-off-by: Selvaraj Anandaraj Signed-off-by: Chen Cui Signed-off-by: Michal Futrega Co-authored-by: w4-jinhyeonkim <131935801+w4-jinhyeonkim@users.noreply.github.com> Co-authored-by: Aleksey Grinchuk (Oleksii Hrinchuk) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Isaac McFadyen Co-authored-by: Somshubra Majumdar Co-authored-by: Selvaraj Anandaraj Co-authored-by: Selvaraj Anandaraj Co-authored-by: Chen Cui Co-authored-by: Michal Futrega Co-authored-by: Pablo Garay --- docs/source/asr/datasets.rst | 135 ++++++ nemo/collections/common/data/lhotse/cutset.py | 220 
++++++++- .../common/data/lhotse/dataloader.py | 156 +++++- .../common/data/lhotse/nemo_adapters.py | 16 +- .../common/data/lhotse/text_adapters.py | 97 ++++ .../tokenizers/sentencepiece_tokenizer.py | 6 +- requirements/requirements_asr.txt | 2 +- .../common/test_lhotse_dataloading.py | 453 +++++++++++++++++- 8 files changed, 1042 insertions(+), 43 deletions(-) create mode 100644 nemo/collections/common/data/lhotse/text_adapters.py diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst index 7612c6a3f630..ca16d0538a31 100644 --- a/docs/source/asr/datasets.rst +++ b/docs/source/asr/datasets.rst @@ -664,6 +664,141 @@ Some other Lhotse related arguments we support: The full and always up-to-date list of supported options can be found in ``LhotseDataLoadingConfig`` class. +Extended multi-dataset configuration format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Combining a large number of datasets and defining weights for them can be tricky. +We offer an extended configuration format that allows you to explicitly define datasets, +dataset groups, and their weights either inline in the experiment configuration, +or as a path to a separate YAML file. + +In addition to the features above, this format introduces a special ``tags`` dict-like field. +The keys and values in ``tags`` are automatically attached to every sampled example, which +is very useful when combining multiple datasets with different properties. +The dataset class which converts these examples to tensors can partition the mini-batch and apply +different processing to each group. +For example, you may want to construct different prompts for the model using metadata in ``tags``. + +.. note:: When fine-tuning a model that was trained with ``input_cfg`` option, typically you'd only need + to override the following options: ``input_cfg=null`` and ``manifest_filepath=path/to/manifest.json``. + +Example 1. Combine two datasets with equal weights and attach custom metadata in ``tags`` to each cut: + +.. code-block:: yaml + + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.4 + tags: + lang: en + pnc: no + - type: nemo_tarred + manifest_filepath: /path/to/other/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/other/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.6 + tags: + lang: pl + pnc: yes + +Example 2. Combine multiple (4) datasets, corresponding to different tasks (ASR, AST). +Each task gets its own group and its own weight. +Then within each task, each dataset get its own within-group weight as well. +The final weight is the product of outer and inner weight: + +.. 
code-block:: yaml + + input_cfg: + - type: group + weight: 0.7 + tags: + task: asr + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/asr1/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/asr1/audio__OP_0..512_CL_.tar + weight: 0.6 + tags: + source_lang: en + target_lang: en + - type: nemo_tarred + manifest_filepath: /path/to/asr2/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/asr2/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.4 + tags: + source_lang: pl + target_lang: pl + - type: group + weight: 0.3 + tags: + task: ast + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/ast1/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/ast1/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.2 + tags: + source_lang: en + target_lang: pl + - type: nemo_tarred + manifest_filepath: /path/to/ast2/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/ast2/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.8 + tags: + source_lang: pl + target_lang: en + +Configuring multi-modal dataloading +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Our configuration format supports specifying data sources from other modalities than just audio. +At this time, this support is extended to text-only data. We provide the following parser types: + +* ``txt`` for raw text files, sharded or unsharded. This can represent, for example, language modeling data. +* ``txt_pair`` for pairs of raw text files, sharded or unsharded. This can represent, for example, machine translation data. + +The key strength of this approach is that we can easily combine audio datasets and text datasets, +and benefit from every other technique we described above such as dynamic data mixing, data weighting, dynamic bucketing, and so on. +To enable multimodal dataloading, we provide several configuration options: + +* ``use_multimodal_sampling`` when set to True, we'll discard the settings of ``batch_duration`` and ``quadratic_duration`` and consider the settings below instead. + +* ``batch_tokens`` is the maximum number of tokens we want to find inside a mini-batch. Similarly to ``batch_duration``, this number does consider padding tokens too, therefore enabling bucketing is recommended to maximize the ratio of real vs padding tokens. + +* ``token_equivalent_duration`` is used to be able to measure audio examples in the number of "tokens". For example, if we're using fbank with 0.01s frame shift and an acoustic model that has a subsampling factor of 0.08, then a reasonable setting for this could be 0.08 (which means every subsampled frame counts as one token). Calibrate this value to fit your needs. Note that this value acts as a "balancer" between how much audio data vs text data gets sampled into a mini-batch. + +* ``quadratic_factor`` works the same way as ``quadratic_duration``, but is defined in the number of tokens. + +Example 3. Combine an ASR (audio-text) dataset with an MT (text-only) dataset so that mini-batches have some examples from both datasets. 
Provide a custom prompt field for both datasets (to be leveraged by a relevant dataset class): + +```yaml +use_multimodal_sampling: true +batch_tokens: 1024 +token_equivalent_duration: 0.08 # 0.01 frame shift * 8 subsampling factor +quadratic_factor: 50 +num_buckets: 30 +use_bucketing: true +input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.5 + tags: + lang: en + prompt: "Given the following recording, transcribe what the person is saying:" + - type: txt_pair + source_path: /path/to/en__OP_0..512_CL_.txt + target_path: /path/to/pl__OP_0..512_CL_.txt + source_language: en + target_language: pl + weight: 0.5 + tags: + prompt: "Translate the following text to Polish:" +``` + +.. caution:: We strongly recommend to use multiple shards for text files as well so that different nodes and dataloading workers are able to randomize the order of text iteration. Otherwise, multi-GPU training has a high risk of duplication of text examples. + Pre-computing bucket duration bins ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py index 028ea8bfef90..fa5ae5804c4b 100644 --- a/nemo/collections/common/data/lhotse/cutset.py +++ b/nemo/collections/common/data/lhotse/cutset.py @@ -14,30 +14,36 @@ import logging import warnings +from functools import partial from itertools import repeat from pathlib import Path from typing import Sequence, Tuple from lhotse import CutSet +from omegaconf import DictConfig, ListConfig, OmegaConf from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator, LazyNeMoTarredIterator +from nemo.collections.common.data.lhotse.text_adapters import LhotseTextAdapter, LhotseTextPairAdapter -def read_cutset_from_config(config) -> Tuple[CutSet, bool]: +def read_cutset_from_config(config: DictConfig) -> Tuple[CutSet, bool]: """ Reads NeMo configuration and creates a CutSet either from Lhotse or NeMo manifests. Returns a tuple of ``CutSet`` and a boolean indicating whether the data is tarred (True) or not (False). """ - # First, we'll figure out if we should read Lhotse manifest or NeMo manifest. - use_nemo_manifest = all(config[opt] is None for opt in ("cuts_path", "shar_path")) + # First, check if the dataset is specified in the new configuration format and use it if possible. + if config.get("input_cfg") is not None: + return read_dataset_config(config) + # Now, we'll figure out if we should read Lhotse manifest or NeMo manifest. + use_nemo_manifest = all(config.get(opt) is None for opt in ("cuts_path", "shar_path")) if use_nemo_manifest: assert ( - config.manifest_filepath is not None - ), "You must specify either: manifest_filepath, lhotse.cuts_path, or lhotse.shar_path" - is_tarred = config.tarred_audio_filepaths is not None + config.get("manifest_filepath") is not None + ), "You must specify either: manifest_filepath, cuts_path, or shar_path" + is_tarred = config.get("tarred_audio_filepaths") is not None else: - is_tarred = config.shar_path is not None + is_tarred = config.get("shar_path") is not None if use_nemo_manifest: # Read NeMo manifest -- use the right wrapper depending on tarred/non-tarred. 
cuts = read_nemo_manifest(config, is_tarred) @@ -47,6 +53,193 @@ def read_cutset_from_config(config) -> Tuple[CutSet, bool]: return cuts, is_tarred +KNOWN_DATASET_CONFIG_TYPES = frozenset(("nemo", "nemo_tarred", "lhotse", "lhotse_shar", "txt", "txt_pair", "group")) + + +def read_dataset_config(config) -> tuple[CutSet, bool]: + """ + Input configuration format examples. + Example 1. Combine two datasets with equal weights and attach custom metadata in ``tags`` to each cut:: + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.5 + tags: + lang: en + some_metadata: some_value + - type: nemo_tarred + manifest_filepath: /path/to/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.5 + tags: + lang: pl + some_metadata: some_value + Example 2. Combine multiple (4) datasets, with 2 corresponding to different tasks (ASR, AST). + There are two levels of weights: per task (outer) and per dataset (inner). + The final weight is the product of outer and inner weight:: + input_cfg: + - type: group + weight: 0.7 + tags: + task: asr + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/asr1/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/tarred_audio/asr1/audio__OP_0..512_CL_.tar + weight: 0.6 + tags: + lang: en + some_metadata: some_value + - type: nemo_tarred + manifest_filepath: /path/to/asr2/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/asr2/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.4 + tags: + lang: pl + some_metadata: some_value + - type: group + weight: 0.3 + tags: + task: ast + input_cfg: + - type: nemo_tarred + manifest_filepath: /path/to/ast1/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/ast1/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.2 + tags: + src_lang: en + tgt_lang: pl + - type: nemo_tarred + manifest_filepath: /path/to/ast2/manifest__OP_0..512_CL_.json + tarred_audio_filepath: /path/to/ast2/tarred_audio/audio__OP_0..512_CL_.tar + weight: 0.8 + tags: + src_lang: pl + tgt_lang: en + """ + propagate_attrs = { + "shuffle": config.shuffle, + "shard_seed": config.shard_seed, + "text_field": config.text_field, + "lang_field": config.lang_field, + "missing_sampling_rate_ok": config.missing_sampling_rate_ok, + "max_open_streams": config.max_open_streams, + } + input_cfg = config.input_cfg + if isinstance(input_cfg, (str, Path)): + # Resolve /path/to/input_cfg.yaml into config contents if needed. + input_cfg = OmegaConf.load(input_cfg) + cuts, is_tarred = parse_and_combine_datasets(input_cfg, propagate_attrs=propagate_attrs) + return cuts, is_tarred + + +def parse_group(grp_cfg: DictConfig, propagate_attrs: dict) -> [CutSet, bool]: + assert grp_cfg.type in KNOWN_DATASET_CONFIG_TYPES, f"Unknown item type in dataset config list: {grp_cfg.type=}" + if grp_cfg.type == "nemo_tarred": + is_tarred = True + cuts = read_nemo_manifest(grp_cfg, is_tarred=is_tarred) + elif grp_cfg.type == "nemo": + is_tarred = False + cuts = read_nemo_manifest(grp_cfg, is_tarred=is_tarred) + elif grp_cfg.type == "lhotse_shar": + is_tarred = True + cuts = read_lhotse_manifest(grp_cfg, is_tarred=is_tarred) + elif grp_cfg.type == "lhotse": + is_tarred = False + cuts = read_lhotse_manifest(grp_cfg, is_tarred=is_tarred) + # Note: "txt" and "txt_pair" have "is_tarred" set to True. 
+ # The main reason is to enable combination of tarred audio and text dataloading, + # since we don't allow combination of tarred and non-tarred datasets. + # We choose to treat text as-if it was tarred, which also tends to be more + # efficient as it moves the text file iteration into dataloading subprocess. + elif grp_cfg.type == "txt": + is_tarred = True + cuts = read_txt_paths(grp_cfg) + elif grp_cfg.type == "txt_pair": + is_tarred = True + cuts = read_txt_pair_paths(grp_cfg) + elif grp_cfg.type == "group": + cuts, is_tarred = parse_and_combine_datasets(grp_cfg.input_cfg, propagate_attrs=propagate_attrs,) + else: + raise ValueError(f"Unrecognized group: {grp_cfg.type}") + # Attach extra tags to every utterance dynamically, if provided. + if (extra_tags := grp_cfg.get("tags")) is not None: + cuts = cuts.map(partial(attach_tags, tags=extra_tags), apply_fn=None) + return cuts, is_tarred + + +def read_txt_paths(config: DictConfig) -> CutSet: + return CutSet( + LhotseTextAdapter( + paths=config.paths, language=config.language, shuffle_shards=config.shuffle, shard_seed=config.shard_seed, + ) + ).repeat() + + +def read_txt_pair_paths(config: DictConfig) -> CutSet: + return CutSet( + LhotseTextPairAdapter( + source_paths=config.source_paths, + target_paths=config.target_paths, + source_language=config.source_language, + target_language=config.target_language, + shuffle_shards=config.shuffle, + shard_seed=config.shard_seed, + ) + ).repeat() + + +def attach_tags(cut, tags: dict): + for key, val in tags.items(): + setattr(cut, key, val) + return cut + + +def parse_and_combine_datasets( + config_list: list[DictConfig] | ListConfig, propagate_attrs: dict +) -> tuple[CutSet, bool]: + cuts = [] + weights = [] + tarred_status = [] + assert len(config_list) > 0, "Empty group in dataset config list." + + for item in config_list: + + # Check if we have any attributes that are propagated downwards to each item in the group. + # If a key already exists in the item, it takes precedence (we will not overwrite); + # otherwise we will assign it. + # We also update propagate_atts for the next sub-groups based on what's present in this group + next_propagate_attrs = propagate_attrs.copy() + for k, v in propagate_attrs.items(): + if k not in item: + item[k] = v + else: + next_propagate_attrs[k] = item[k] + + # Load the item (which may also be another group) as a CutSet. + item_cuts, item_is_tarred = parse_group(item, next_propagate_attrs) + cuts.append(item_cuts) + tarred_status.append(item_is_tarred) + if (w := item.get("weight")) is not None: + weights.append(w) + + assert all(t == tarred_status[0] for t in tarred_status), "Mixing tarred and non-tarred datasets is not supported." + assert len(weights) == 0 or len(cuts) == len( + weights + ), "Missing dataset weight. When weighting datasets, every dataset must have a specified weight." + if len(cuts) > 1: + cuts = mux( + *cuts, + weights=weights if weights else None, + max_open_streams=propagate_attrs["max_open_streams"], + seed=propagate_attrs["shard_seed"], + ) + else: + (cuts,) = cuts + return cuts, tarred_status[0] + + def read_lhotse_manifest(config, is_tarred: bool) -> CutSet: if is_tarred: # Lhotse Shar is the equivalent of NeMo's native "tarred" dataset. @@ -64,7 +257,7 @@ def read_lhotse_manifest(config, is_tarred: bool) -> CutSet: # - integer means we'll set a specific seed in every worker, and data would be duplicated across them. # This is mostly useful for unit testing or debugging. 
shard_seed = config.shard_seed - if config.cuts_path is not None: + if config.get("cuts_path") is not None: warnings.warn("Note: lhotse.cuts_path will be ignored because lhotse.shar_path was provided.") if isinstance(config.shar_path, (str, Path)): logging.info(f"Initializing Lhotse Shar CutSet (tarred) from a single data source: '{config.shar_path}'") @@ -119,9 +312,10 @@ def read_nemo_manifest(config, is_tarred: bool) -> CutSet: config.manifest_filepath, tar_paths=config.tarred_audio_filepaths, shuffle_shards=config.shuffle, + shard_seed=config.shard_seed, **common_kwargs, ) - ) + ).repeat() else: cuts = CutSet(LazyNeMoIterator(config.manifest_filepath, **notar_kwargs, **common_kwargs)) else: @@ -132,7 +326,7 @@ def read_nemo_manifest(config, is_tarred: bool) -> CutSet: # this ensures that we distribute the data from each source uniformly throughout each epoch. # Setting equal weights would exhaust the shorter data sources closer the towards the beginning # of an epoch (or over-sample it in the case of infinite CutSet iteration with .repeat()). - # Format option 1: + # Format option 2: # Assume it's [[path1, weight1], [path2, weight2], ...] (while tarred_audio_filepaths remain unchanged). # Note: this option allows to manually set the weights for multiple datasets. logging.info( @@ -148,7 +342,11 @@ def read_nemo_manifest(config, is_tarred: bool) -> CutSet: manifest_path = manifest_info[0] if is_tarred: nemo_iter = LazyNeMoTarredIterator( - manifest_path=manifest_path, tar_paths=tar_path, shuffle_shards=config.shuffle, **common_kwargs + manifest_path=manifest_path, + tar_paths=tar_path, + shuffle_shards=config.shuffle, + shard_seed=config.shard_seed, + **common_kwargs, ) else: nemo_iter = LazyNeMoIterator(manifest_path, **notar_kwargs, **common_kwargs) diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index 9eeb8800066a..fd2a69725a0e 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -15,12 +15,14 @@ import logging import warnings from dataclasses import dataclass -from functools import partial +from functools import partial, singledispatch from typing import Any, Optional +import numpy as np import torch from lhotse import CutSet from lhotse.cut import Cut +from lhotse.cut.text import TextExample, TextPairExample from lhotse.dataset import ( CutConcatenate, DynamicBucketingSampler, @@ -28,11 +30,14 @@ IterableDatasetWrapper, make_worker_init_fn, ) +from lhotse.dataset.sampling.base import SamplingConstraint, TimeConstraint, TokenConstraint from lhotse.lazy import LazyFlattener from lhotse.utils import fastcopy from omegaconf import DictConfig, OmegaConf +from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper from nemo.collections.common.data.lhotse.cutset import read_cutset_from_config +from nemo.collections.common.tokenizers import TokenizerSpec @dataclass @@ -46,6 +51,7 @@ class LhotseDataLoadingConfig: # 1. Data inputs. # a. "Classic" NeMo input path fields. + input_cfg: Any = None # TODO(pzelasko): typing manifest_filepath: Any = None # str | list[list[str | float]] | None = None tarred_audio_filepaths: Any = None # str | list[list[str]] | None = None # b. Lhotse CutSet manifest / Lhotse Shar tar dir paths. 
@@ -70,6 +76,12 @@ class LhotseDataLoadingConfig: shard_seed: int | str = "trng" max_open_streams: int | None = None + # 2.1 Multimodal sampling override options + use_multimodal_sampling: bool = False + token_equivalent_duration: float | None = None + batch_tokens: int | None = None + quadratic_factor: float | None = None + # 3. Supported existing NeMo options. shuffle: bool = False sample_rate: int = 16000 @@ -102,12 +114,16 @@ class LhotseDataLoadingConfig: def get_lhotse_dataloader_from_config( - config: DictConfig, global_rank: int, world_size: int, dataset: torch.utils.data.Dataset + config: DictConfig, + global_rank: int, + world_size: int, + dataset: torch.utils.data.Dataset, + tokenizer: TokenizerSpec | TokenizerWrapper = None, ) -> torch.utils.data.DataLoader: """ Set up a Lhotse training dataloder. - Expects a typical NeMo dataset configuration format, with additional fields: "use_lhotse=True" and "lhotse: ". + Expects a typical NeMo dataset configuration format, with additional fields: "use_lhotse=True". Some fields in the original NeMo configuration may be ignored. The ``dataset`` parameter should be an instance of a Lhotse-compatible PyTorch Dataset class. @@ -115,8 +131,15 @@ def get_lhotse_dataloader_from_config( This dataset is not expected to hold a reference to any actual data; it may be interpreted as a function mapping a Lhotse CutSet into a mini-batch of tensors. - For example, see: :class:`nemo.collections.asr.data.audio_to_text_lhotse.LhotseSpeechToTextBpeDataset`, + For an example, see: :class:`nemo.collections.asr.data.audio_to_text_lhotse.LhotseSpeechToTextBpeDataset`, which is constructed from just a tokenizer and essentially loads and collates audio and tokenizes the transcript. + + The ``tokenizer`` is used when text-only datasets are included in dataloading. + In these cases we will tokenize ``TextExample``s before sampling mini-batches so that + we can account for their number of tokens. + Note: this behaviour might eventually be extended to audio datasets too. + + Note that ``tokenizer`` can be any tokenizer type (e.g. both SentencePiece and Aggregate tokenizers work). """ logging.info("We will be using a Lhotse DataLoader.") @@ -132,7 +155,16 @@ def get_lhotse_dataloader_from_config( cuts = cuts.filter(DurationFilter(config.min_duration, config.max_duration)) # Expands cuts if multiple translations are provided. - cuts = CutSet(LazyFlattener(cuts.map(_flatten_alt_text))) + cuts = CutSet(LazyFlattener(cuts.map(_flatten_alt_text, apply_fn=None))) + + if config.use_multimodal_sampling: + assert ( + tokenizer is not None + ), "You must pass a tokenizer to `get_lhotse_dataloader_from_config` in order to read text-only datasets (enabled via use_multimodal_dataloading)" + if not isinstance(tokenizer, TokenizerWrapper): + tokenizer = TokenizerWrapper(tokenizer) + # Note this code can also pre-tokenize the text in cuts, but for now we disable it with apply_fn. + cuts = cuts.map(partial(tokenize, tokenizer=tokenizer), apply_fn=is_text) # 2. Optional augmentations. # 2.a. Noise mixing. 
@@ -149,6 +181,20 @@ def get_lhotse_dataloader_from_config( if config.perturb_speed: cuts = CutSet.mux(cuts, cuts.perturb_speed(0.9), cuts.perturb_speed(1.1),) + if config.use_multimodal_sampling: + constraint = MultimodalSamplingConstraint( + token_equivalent_duration=config.token_equivalent_duration, + batch_size=config.batch_size, + batch_tokens=config.batch_tokens, + quadratic_factor=config.quadratic_factor, + ) + else: + constraint = TimeConstraint( + max_cuts=config.batch_size, + max_duration=config.batch_duration, + quadratic_duration=config.quadratic_duration, + ) + # 3. The sampler. if config.use_bucketing: # Bucketing. Some differences from NeMo's native bucketing: @@ -161,12 +207,10 @@ def get_lhotse_dataloader_from_config( ) sampler = DynamicBucketingSampler( cuts, - max_duration=config.batch_duration, - max_cuts=config.batch_size, + constraint=constraint, shuffle=config.shuffle, drop_last=config.drop_last, shuffle_buffer_size=config.shuffle_buffer_size, - quadratic_duration=config.quadratic_duration, seed=config.seed, num_buckets=config.num_buckets, duration_bins=config.bucket_duration_bins, @@ -185,12 +229,10 @@ def get_lhotse_dataloader_from_config( ) sampler = DynamicCutSampler( cuts, - max_duration=config.batch_duration, - max_cuts=config.batch_size, + constraint=constraint, shuffle=config.shuffle, drop_last=config.drop_last, shuffle_buffer_size=config.shuffle_buffer_size, - quadratic_duration=config.quadratic_duration, seed=config.seed, rank=0 if is_tarred else global_rank, world_size=1 if is_tarred else world_size, @@ -260,6 +302,89 @@ def make_structured_with_schema_warnings(config: DictConfig) -> DictConfig: return OmegaConf.merge(default, config) +@dataclass +class MultimodalSamplingConstraint(SamplingConstraint): + # how many seconds of audio is a text token worth; balances audio to text ratio in a mini-batch + token_equivalent_duration: float + + # defines maximum batch size (may be lower than that if batch_length is also specified) + batch_size: int | None = None + + # defines the total number of tokens in a mini-batch + # setting this enables dynamic batch sizes + # we will use ``token_equivalent_duration`` to convert audio examples to token sizes + batch_tokens: int | None = None + + # when specified, this value is inversely proportional to the penalty we assign + # to longer examples when measuring their length/duration; + # i.e. 
large quadratic factor is a small penalty, small quadratic factor is a large penalty + # tweaking this helps equalize the GPU memory usage for dynamic batch sizes when using bucketing + quadratic_factor: float | None = None + + _internal = None + + def __post_init__(self): + self._internal = TokenConstraint( + max_tokens=self.batch_tokens, max_examples=self.batch_size, quadratic_length=self.quadratic_factor, + ) + + def add(self, example: Any) -> None: + if isinstance(example, Cut): + num_tokens = self.measure_length(example) + example.num_tokens = num_tokens + self._internal.add(example) + + def exceeded(self) -> bool: + return self._internal.exceeded() + + def close_to_exceeding(self) -> bool: + return self._internal.close_to_exceeding() + + def reset(self) -> None: + self._internal.reset() + + def measure_length(self, example: Any) -> float: + if isinstance(example, Cut): + return example.duration / self.token_equivalent_duration + if isinstance(example, (TextExample, TextPairExample)): + return example.num_tokens + raise RuntimeError(f"Unsupported example type: {type(example)}") + + +# The functions below are overloads for different types of examples. +# This is required for multi-modal dataloading since we will iterate +# over a union type now. + + +def is_text(example) -> bool: + return isinstance(example, (TextExample, TextPairExample)) + + +@singledispatch +def tokenize(example, tokenizer): + raise RuntimeError(f"Unsupported type of example: {type(example)}") + + +@tokenize.register +def _(example: Cut, tokenizer) -> Cut: + for s in example.supervisions: + s.tokens = np.asarray(tokenizer(s.text, s.language)) + return example + + +@tokenize.register +def _(example: TextExample, tokenizer) -> TextExample: + example.tokens = np.asarray(tokenizer(example.text, example.language)) + return example + + +@tokenize.register +def _(example: TextPairExample, tokenizer) -> TextPairExample: + example.source.tokens = np.asarray(tokenizer(example.source.text, example.source.language)) + example.target.tokens = np.asarray(tokenizer(example.source.text, example.target.language)) + return example + + # The helper callables below exist to avoid passing lambdas into lhotse CutSet map/filter methods. # Lambdas are not serializable across processes by pickle. # Note: lhotse offers LHOTSE_DILL_ENABLED=1 and ``lhotse.lazy.set_dill_enabled(True)`` @@ -273,8 +398,11 @@ def __init__(self, d_min: float, d_max: float) -> None: self.d_min = d_min self.d_max = d_max - def __call__(self, cut: Cut) -> bool: - return self.d_min <= cut.duration <= self.d_max + def __call__(self, example) -> bool: + if isinstance(example, Cut): + return self.d_min <= example.duration <= self.d_max + else: + return True # does not apply to text etc. def _normalize_loudness(cuts: CutSet, db_norm: float) -> CutSet: @@ -287,7 +415,7 @@ def _merge_supervisions(cuts: CutSet) -> CutSet: def _flatten_alt_text(cut) -> list: ans = [cut] - if cut.custom is None or cut.custom.get("alt_text") is None: + if not isinstance(cut, Cut) or cut.custom is None or cut.custom.get("alt_text") is None: return ans cut = cut.move_to_memory(audio_format="wav") # performs I/O once and holds audio in memory from now on # Popping to ease eyesight on debug. 
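To make the multimodal constraint concrete, a small back-of-the-envelope example (values are illustrative): ``measure_length`` converts an audio cut's duration into a token-equivalent count, so audio and text examples draw from the same ``batch_tokens`` budget.

# Illustrative arithmetic only; mirrors MultimodalSamplingConstraint.measure_length above.
token_equivalent_duration = 0.08   # seconds of audio treated as one token
batch_tokens = 1024                # per-batch token budget

audio_cut_duration = 4.0           # seconds
audio_as_tokens = audio_cut_duration / token_equivalent_duration   # 4.0 / 0.08 = 50.0

text_example_tokens = 37           # num_tokens of a tokenized TextExample

remaining_budget = batch_tokens - audio_as_tokens - text_example_tokens   # 937.0
print(audio_as_tokens, remaining_budget)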
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index 4fae72e6f467..02b3e1f4edda 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -18,12 +18,13 @@ import tarfile from io import BytesIO from pathlib import Path -from typing import Generator, Iterable, List +from typing import Generator, Iterable, List, Literal import soundfile from cytoolz import groupby from lhotse import AudioSource, Recording, SupervisionSegment from lhotse.cut import Cut +from lhotse.dataset.dataloading import resolve_seed from lhotse.lazy import LazyIteratorChain, LazyJsonlIterator from lhotse.serialization import open_best from lhotse.utils import compute_num_samples @@ -147,6 +148,12 @@ class LazyNeMoTarredIterator: Args ``manifest_path`` and ``tar_paths`` can be either a path/string to a single file, or a string in NeMo format that indicates multiple paths (e.g. "[[data/bucket0/tarred_audio_paths.json],[data/bucket1/...]]"). + The ``shard_seed`` argument is used to seed the RNG shuffling the shards. + By default it's ``trng`` which samples a seed number from OS-provided TRNG (see Python ``secrets`` module). + Seed is resolved lazily so that every dataloading worker may sample a different one. + Override with an integer value for deterministic behaviour and consult Lhotse documentation for details: + https://lhotse.readthedocs.io/en/latest/datasets.html#handling-random-seeds + Example of CutSet with inter-shard shuffling enabled:: >>> cuts = lhotse.CutSet(LazyNeMoTarredIterator( @@ -161,6 +168,7 @@ def __init__( manifest_path: str | Path, tar_paths: str | list, shuffle_shards: bool = False, + shard_seed: int | Literal["trng", "randomized"] = "trng", text_field: str = "text", lang_field: str = "lang", ) -> None: @@ -189,6 +197,7 @@ def strip_pipe(p): tar_paths = expand_sharded_filepaths(tar_paths) self.shard_id_to_tar_path: dict[int, str] = {int(strip_pipe(p).stem.split("_")[1]): p for p in tar_paths} self.shuffle_shards = shuffle_shards + self.shard_seed = shard_seed self.text_field = text_field self.lang_field = lang_field self._validate() @@ -205,6 +214,7 @@ def to_shards(self) -> List["LazyNeMoTarredIterator"]: manifest_path=path, tar_paths=tarpath, shuffle_shards=False, + shard_seed=self.shard_seed, text_field=self.text_field, lang_field=self.lang_field, ) @@ -227,8 +237,8 @@ def __iter__(self) -> Generator[Cut, None, None]: shard_ids = self.shard_ids if self.shuffle_shards: - # Use TRNG for 100% randomness - random.Random(secrets.randbelow(2 ** 32)).shuffle(shard_ids) + seed = resolve_seed(self.shard_seed) + random.Random(seed).shuffle(shard_ids) for sid in shard_ids: shard_manifest = self.shard_id_to_manifest[sid] diff --git a/nemo/collections/common/data/lhotse/text_adapters.py b/nemo/collections/common/data/lhotse/text_adapters.py new file mode 100644 index 000000000000..805ef5dd542f --- /dev/null +++ b/nemo/collections/common/data/lhotse/text_adapters.py @@ -0,0 +1,97 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +from dataclasses import dataclass +from pathlib import Path +from typing import Iterator, Literal + +from lhotse.cut.text import TextExample, TextPairExample +from lhotse.dataset.dataloading import resolve_seed +from lhotse.utils import Pathlike + +from nemo.collections.common.data.lhotse.nemo_adapters import expand_sharded_filepaths + + +@dataclass +class LhotseTextAdapter: + """ + ``LhotseTextAdapter`` is used to read a text file and wrap + each line into a ``TextExample``. + """ + + paths: Pathlike | list[Pathlike] + language: str | None = None + shuffle_shards: bool = False + shard_seed: int | Literal["trng", "randomized"] = "trng" + + def __post_init__(self): + self.paths = expand_sharded_filepaths(self.paths) + + def __iter__(self) -> Iterator[TextExample]: + paths = self.paths + if self.shuffle_shards: + seed = resolve_seed(self.shard_seed) + random.Random(seed).shuffle(paths) + for path in paths: + with open(path) as f: + for line in f: + example = TextExample(line) + if self.language is not None: + example.language = self.language + yield example + + +@dataclass +class LhotseTextPairAdapter: + """ + ``LhotseTextAdapter`` is used to read a tuple of N text files + (e.g., a pair of files with translations in different languages) + and wrap them in a ``TextExample`` object to enable dataloading + with Lhotse together with training examples in audio modality. + """ + + source_paths: Pathlike | list[Pathlike] + target_paths: Pathlike | list[Pathlike] + source_language: str | None = None + target_language: str | None = None + shuffle_shards: bool = False + shard_seed: int | Literal["trng", "randomized"] = "trng" + + def __post_init__(self): + ASSERT_MSG = "Both source and target must be a single path or lists of paths" + if isinstance(self.source_paths, (str, Path)): + assert isinstance(self.target_paths, (str, Path)), ASSERT_MSG + else: + assert isinstance(self.source_paths, list) and isinstance(self.target_paths, list), ASSERT_MSG + assert len(self.source_paths) == len( + self.target_paths + ), f"Source ({len(self.source_paths)}) and target ({len(self.target_paths)}) path lists must have the same number of items." 
+ self.source_paths = expand_sharded_filepaths(self.source_paths) + self.target_paths = expand_sharded_filepaths(self.target_paths) + + def __iter__(self) -> Iterator[TextPairExample]: + paths = list(zip(self.source_paths, self.target_paths)) + if self.shuffle_shards: + seed = resolve_seed(self.shard_seed) + random.Random(seed).shuffle(paths) + for source_path, target_path in paths: + with open(source_path) as fs, open(target_path) as ft: + for ls, lt in zip(fs, ft): + example = TextPairExample(source=TextExample(ls.strip()), target=TextExample(lt.strip())) + if self.source_language is not None: + example.source.language = self.source_language + if self.target_language is not None: + example.target.language = self.target_language + yield example diff --git a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py index bc10b67af880..b686322c0882 100644 --- a/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py +++ b/nemo/collections/common/tokenizers/sentencepiece_tokenizer.py @@ -376,7 +376,9 @@ def create_spt_model( # Add BERT control symbols tokens = [] - with open(f"{output_dir}/tokenizer.vocab", "r") as f: + # Encoding arg is added for compatibility with systems which enforce + # ASCII encoding in Python. Sentencepiece always uses Unicode (UTF8). + with open(f"{output_dir}/tokenizer.vocab", "r", encoding="utf8") as f: # Read tokens from each line and parse for vocab for line in f: piece = line.split("\t")[0] @@ -394,7 +396,7 @@ def create_spt_model( # Save vocabulary to output file vocab_file = f'{output_dir}/vocab.txt' - with open(vocab_file, "w") as f: + with open(vocab_file, "w", encoding="utf8") as f: for token in vocab: f.write(f"{token}\n") return f'{output_dir}/tokenizer.model', vocab_file diff --git a/requirements/requirements_asr.txt b/requirements/requirements_asr.txt index 6df223209cc1..b7863714eb2d 100644 --- a/requirements/requirements_asr.txt +++ b/requirements/requirements_asr.txt @@ -5,7 +5,7 @@ ipywidgets jiwer kaldi-python-io kaldiio -lhotse>=1.20.0 +lhotse>=1.22.0 librosa>=0.10.0 marshmallow matplotlib diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index 955765019133..4e89a93e83e4 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -11,19 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +from collections import Counter from io import BytesIO from itertools import islice from pathlib import Path from typing import Dict, List, Tuple +import lhotse +import numpy as np import pytest import torch +from lhotse.cut import Cut +from lhotse.cut.text import TextPairExample from omegaconf import OmegaConf +from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config - -lhotse = pytest.importorskip("lhotse", reason="Lhotse + NeMo tests require Lhotse (pip install lhotse).") +from nemo.collections.common.data.lhotse.text_adapters import TextExample +from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer, create_spt_model requires_torchaudio = pytest.mark.skipif( not lhotse.utils.is_torchaudio_available(), reason="Lhotse Shar format support requires torchaudio." 
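The new text adapters can also be exercised on their own, outside the config-driven dataloader. A self-contained sketch (files are generated on the fly, so all names and contents are hypothetical): each zipped pair of lines becomes a ``TextPairExample`` whose source and target carry the configured languages.

from pathlib import Path
from tempfile import TemporaryDirectory

from nemo.collections.common.data.lhotse.text_adapters import LhotseTextPairAdapter

with TemporaryDirectory() as d:
    src = Path(d) / "text.en"
    tgt = Path(d) / "text.es"
    src.write_text("Hello world.\nGood morning.\n")
    tgt.write_text("Hola mundo.\nBuenos dias.\n")

    adapter = LhotseTextPairAdapter(
        source_paths=str(src),
        target_paths=str(tgt),
        source_language="en",
        target_language="es",
    )
    for example in adapter:
        # example is a lhotse TextPairExample; .source and .target are TextExamples
        print(example.source.text, "->", example.target.text)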
@@ -332,7 +337,7 @@ def test_dataloader_from_tarred_nemo_manifest(nemo_tarred_manifest_path: tuple[s config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() ) - batches = [batch for batch in dl] + batches = [batch for batch in islice(dl, 4)] assert len(batches) == 4 b = batches[0] @@ -349,7 +354,7 @@ def test_dataloader_from_tarred_nemo_manifest(nemo_tarred_manifest_path: tuple[s b = batches[3] assert set(b.keys()) == {"audio", "audio_lens", "ids"} - assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 1 + assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 3 def test_dataloader_from_tarred_nemo_manifest_weighted_combination(nemo_tarred_manifest_path: tuple[str, str]): @@ -411,7 +416,7 @@ def test_dataloader_from_tarred_nemo_manifest_multi(nemo_tarred_manifest_path_mu config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() ) - batches = [batch for batch in dl] + batches = [batch for batch in islice(dl, 4)] assert len(batches) == 4 b = batches[0] @@ -428,7 +433,7 @@ def test_dataloader_from_tarred_nemo_manifest_multi(nemo_tarred_manifest_path_mu b = batches[3] assert set(b.keys()) == {"audio", "audio_lens", "ids"} - assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 1 + assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 3 def test_dataloader_from_tarred_nemo_manifest_multi_max_open_streams(nemo_tarred_manifest_path_multi: tuple[str, str]): @@ -489,7 +494,7 @@ def test_dataloader_from_tarred_nemo_manifest_concat(nemo_tarred_manifest_path: config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() ) - batches = [batch for batch in dl] + batches = [batch for batch in islice(dl, 4)] assert len(batches) == 4 @@ -513,8 +518,8 @@ def test_dataloader_from_tarred_nemo_manifest_concat(nemo_tarred_manifest_path: b = batches[3] assert set(b.keys()) == {"audio", "audio_lens", "ids"} - assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 1 - torch.testing.assert_close(b["audio_lens"], torch.tensor([16000], dtype=torch.int32)) + assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 2 + torch.testing.assert_close(b["audio_lens"], expected_audio_lens) @requires_torchaudio @@ -728,7 +733,7 @@ def test_lazy_nemo_iterator_with_offset_field(tmp_path: Path): assert cut.supervisions[0].text == "irrelevant" audio = cut.load_audio() assert audio.shape == (1, 8000) - np.testing.assert_equal(audio[0], expected_audio[8000:]) + np.testing.assert_allclose(audio[0], expected_audio[8000:], atol=5e-5) assert cuts[0].id != cuts[1].id @@ -736,6 +741,7 @@ def test_lazy_nemo_iterator_with_offset_field(tmp_path: Path): def test_lazy_nemo_iterator_with_relative_paths(tmp_path: Path): import numpy as np import soundfile as sf + from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator # Have to generate as INT16 to avoid quantization error after saving to 16-bit WAV @@ -764,4 +770,427 @@ def test_lazy_nemo_iterator_with_relative_paths(tmp_path: Path): assert cut.num_samples == 8000 assert cut.supervisions[0].text == "irrelevant" assert audio.shape == (1, 8000) - np.testing.assert_equal(audio[0], expected_audio[:8000]) + np.testing.assert_allclose(audio[0], expected_audio[:8000], atol=5e-5) + + +class Identity(torch.utils.data.Dataset): + def __getitem__(self, cuts: lhotse.CutSet) -> lhotse.CutSet: + return cuts + + +def test_extended_data_input_cfg(cutset_shar_path, nemo_tarred_manifest_path_multi): + config = OmegaConf.create( + { + "input_cfg": [ + { + "type": "nemo_tarred", + "manifest_filepath": 
nemo_tarred_manifest_path_multi[0], + "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + }, + { + "type": "lhotse_shar", + "shar_path": cutset_shar_path, + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + }, + ], + "sample_rate": 16000, + "shuffle": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + "shard_seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + # Note: we use islice here because the dataloader will be infinite. + batches = [batch for batch in islice(dl, 2)] + + b = batches[0] + assert isinstance(b, lhotse.CutSet) + assert all(c.custom["language"] == "en" for c in b) + assert all(c.custom["modality"] == "audio" for c in b) + assert sum(c.custom["dataset_name"] == "D1" for c in b) == 2 + assert sum(c.custom["dataset_name"] == "D2" for c in b) == 2 + + b = batches[1] + assert isinstance(b, lhotse.CutSet) + assert all(c.custom["language"] == "en" for c in b) + assert all(c.custom["modality"] == "audio" for c in b) + assert sum(c.custom["dataset_name"] == "D1" for c in b) == 1 + assert sum(c.custom["dataset_name"] == "D2" for c in b) == 3 + + +def test_extended_data_input_cfg_subgroup(cutset_shar_path, nemo_tarred_manifest_path_multi): + config = OmegaConf.create( + { + "input_cfg": [ + { + "type": "group", + "input_cfg": [ + { + "type": "nemo_tarred", + "manifest_filepath": nemo_tarred_manifest_path_multi[0], + "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + }, + { + "type": "lhotse_shar", + "shar_path": cutset_shar_path, + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + }, + ], + "weight": 0.2, + "tags": {"group_name": "G1",}, + }, + { + "type": "group", + "weight": 0.8, + "input_cfg": [ + { + "type": "nemo_tarred", + "manifest_filepath": nemo_tarred_manifest_path_multi[0], + "tarred_audio_filepaths": nemo_tarred_manifest_path_multi[1], + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D3",}, + }, + { + "type": "lhotse_shar", + "shar_path": cutset_shar_path, + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D4",}, + }, + ], + "tags": {"group_name": "G2",}, + }, + ], + "sample_rate": 16000, + "shuffle": True, + "num_workers": 0, + "batch_size": 32, + "seed": 0, + "shard_seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + # Sample 100 mini-batches and test statistical properties + group_occurrences = Counter() + dataset_occurrences = Counter() + for batch in islice(dl, 100): + for cut in batch: + group_occurrences[cut.group_name] += 1 + dataset_occurrences[cut.dataset_name] += 1 + + tot = sum(group_occurrences.values()) + for k in group_occurrences: + group_occurrences[k] /= tot + for k in dataset_occurrences: + dataset_occurrences[k] /= tot + + def almost(number): + return pytest.approx(number, abs=0.02) + + assert group_occurrences["G1"] == almost(0.2) # group weight: 0.2 + assert group_occurrences["G2"] == almost(0.8) # group weight: 0.8 + assert dataset_occurrences["D1"] == almost(0.1) # group weight: 0.2 * dataset weight 0.5 => 0.1 + assert dataset_occurrences["D2"] == almost(0.1) # group weight: 0.2 * dataset weight 0.5 => 0.1 + assert 
dataset_occurrences["D3"] == almost(0.4) # group weight: 0.8 * dataset weight 0.5 => 0.4 + assert dataset_occurrences["D4"] == almost(0.4) # group weight: 0.8 * dataset weight 0.5 => 0.4 + + +def test_extended_data_input_cfg_yaml_path(tmp_path, cutset_shar_path, nemo_tarred_manifest_path_multi): + input_cfg = [ + { + "type": "nemo_tarred", + "manifest_filepath": str(nemo_tarred_manifest_path_multi[0]), + "tarred_audio_filepaths": str(nemo_tarred_manifest_path_multi[1]), + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D1",}, + }, + { + "type": "lhotse_shar", + "shar_path": str(cutset_shar_path), + "weight": 0.5, + "tags": {"language": "en", "modality": "audio", "dataset_name": "D2",}, + }, + ] + + yaml_path = tmp_path / "input_cfg.yaml" + lhotse.serialization.save_to_yaml(input_cfg, yaml_path) + + config = OmegaConf.create( + { + "input_cfg": input_cfg, + "sample_rate": 16000, + "shuffle": True, + "num_workers": 0, + "batch_size": 32, + "seed": 0, + "shard_seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + batch = next(iter(dl)) + assert isinstance(batch, lhotse.CutSet) + for cut in batch: + assert cut.dataset_name in ("D1", "D2") + + +@pytest.fixture(scope="session") +def txt_en_path(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp("text_data") + en_path = tmp_path / "text.en" + en_path.write_text( + """Example text in English. +Another sentence. + """ + ) + return en_path + + +@pytest.fixture(scope="session") +def txt_es_path(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp("text_data") + es_path = tmp_path / "text.es" + es_path.write_text( + """Otro texto en ingles. +Otra frase.""" + ) + return es_path + + +def test_text_file_input(txt_en_path, txt_es_path): + config = OmegaConf.create( + { + "input_cfg": [{"type": "txt", "paths": txt_en_path, "language": "en",},], + "shuffle": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + "shard_seed": 0, + } + ) + + # Note: this test does not need to pass a tokenizer because we use static batch sizes + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + # Note: we use islice here because the dataloader will be infinite. + batches = [batch for batch in islice(dl, 2)] + + b = batches[0] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextExample) for c in b) + assert all(c.language == "en" for c in b) + + b = batches[1] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextExample) for c in b) + assert all(c.language == "en" for c in b) + + +def test_text_file_pairs_input(txt_en_path, txt_es_path): + config = OmegaConf.create( + { + "input_cfg": [ + { + "type": "txt_pair", + "source_paths": txt_en_path, + "target_paths": txt_es_path, + "source_language": "en", + "target_language": "es", + }, + ], + "shuffle": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + "shard_seed": 0, + } + ) + + # Note: this test does not need to pass a tokenizer because we use static batch sizes + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + # Note: we use islice here because the dataloader will be infinite. 
+ batches = [batch for batch in islice(dl, 2)] + + b = batches[0] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextPairExample) for c in b) + assert all(c.source.language == "en" for c in b) + assert all(c.target.language == "es" for c in b) + + b = batches[1] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextPairExample) for c in b) + assert all(c.source.language == "en" for c in b) + assert all(c.target.language == "es" for c in b) + + +@pytest.fixture(scope="session") +def txt_pair_paths_shards(tmp_path_factory, txt_en_path, txt_es_path): + tmp_path = tmp_path_factory.mktemp("text_data_shards") + + en_text = txt_en_path.read_text().splitlines() + (tmp_path / "en_0.txt").write_text("\n".join(en_text[:5])) + (tmp_path / "en_1.txt").write_text("\n".join(en_text[5:])) + + es_text = txt_es_path.read_text().splitlines() + (tmp_path / "es_0.txt").write_text("\n".join(es_text[:5])) + (tmp_path / "es_1.txt").write_text("\n".join(es_text[5:])) + + return f"{tmp_path}/en__OP_0..1_CL_.txt", f"{tmp_path}/es__OP_0..1_CL_.txt" + + +def test_text_file_pairs_shards_input(txt_pair_paths_shards: tuple[str, str]): + en_paths, es_paths = txt_pair_paths_shards + + config = OmegaConf.create( + { + "input_cfg": [ + { + "type": "txt_pair", + "source_paths": en_paths, + "target_paths": es_paths, + "source_language": "en", + "target_language": "es", + }, + ], + "shuffle": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + "shard_seed": 0, + } + ) + + # Note: this test does not need to pass a tokenizer because we use static batch sizes + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=Identity()) + + # Note: we use islice here because the dataloader will be infinite. + batches = [batch for batch in islice(dl, 2)] + + b = batches[0] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextPairExample) for c in b) + assert all(c.source.language == "en" for c in b) + assert all(c.target.language == "es" for c in b) + + b = batches[1] + assert isinstance(b, lhotse.CutSet) + assert all(isinstance(c, TextPairExample) for c in b) + assert all(c.source.language == "en" for c in b) + assert all(c.target.language == "es" for c in b) + + +@pytest.fixture(scope="session") +def en_es_tokenizer(tmp_path_factory, txt_en_path, txt_es_path) -> TokenizerWrapper: + tmpdir = tmp_path_factory.mktemp("en_es_tokenizer") + text_path = tmpdir / "text.txt" + text_path.write_text(txt_en_path.read_text() + "\n" + txt_es_path.read_text()) + create_spt_model(text_path, vocab_size=128, sample_size=-1, do_lower_case=False, output_dir=str(tmpdir)) + return TokenizerWrapper(SentencePieceTokenizer(str(tmpdir / "tokenizer.model"))) + + +def test_multimodal_text_audio_dataloading( + txt_pair_paths_shards: tuple[str, str], + nemo_tarred_manifest_path_multi: tuple[str, str], + en_es_tokenizer: TokenizerWrapper, +): + en_paths, es_paths = txt_pair_paths_shards + manifest_filepath, tarred_audio_filepaths = nemo_tarred_manifest_path_multi + config = OmegaConf.create( + { + "input_cfg": [ + { + "type": "txt_pair", + "source_paths": en_paths, + "target_paths": es_paths, + "source_language": "en", + "target_language": "es", + "tags": {"modality": "text",}, + }, + { + "type": "nemo_tarred", + "manifest_filepath": manifest_filepath, + "tarred_audio_filepaths": tarred_audio_filepaths, + "tags": {"modality": "audio",}, + }, + ], + "shuffle": True, + "num_workers": 0, + "use_multimodal_sampling": True, + "batch_tokens": 1024, + # How to set token equivalent 
duration in actual training? + # assuming fbank frames: 0.01 is the base due to frame shift; + # + subsampling x8 gives us 0.08 + # assuming discrete audio tokens, with frame rate 50Hz, + # we'd get 0.02 + # in this test we'll just use 0.1 for simplicity + "token_equivalent_duration": 0.1, + "quadratic_factor": 50, + "seed": 0, + "shard_seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config( + config=config, global_rank=0, world_size=1, dataset=Identity(), tokenizer=en_es_tokenizer, + ) + + # Note: we use islice here because the dataloader will be infinite. + batches = [batch for batch in islice(dl, 2)] + + b = batches[0] + assert isinstance(b, lhotse.CutSet) + assert len(b) == 48 + assert sum(ex.num_tokens for ex in b) == pytest.approx(574.0) + assert min(ex.num_tokens for ex in b) == pytest.approx(10) + assert max(ex.num_tokens for ex in b) == pytest.approx(16) + assert sum(isinstance(ex, Cut) for ex in b) == 29 + assert sum(isinstance(ex, TextPairExample) for ex in b) == 19 + for ex in b: + if isinstance(ex, Cut): + assert ex.modality == "audio" + assert isinstance(ex.load_audio(), np.ndarray) + assert isinstance(ex.supervisions[0].text, str) + if isinstance(ex, TextPairExample): + assert ex.modality == "text" + assert ex.source.language == "en" + assert ex.target.language == "es" + assert isinstance(ex.source.text, str) + assert isinstance(ex.target.text, str) + assert isinstance(ex.source.tokens, np.ndarray) + assert isinstance(ex.target.tokens, np.ndarray) + + b = batches[1] + assert isinstance(b, lhotse.CutSet) + assert len(b) == 48 + assert sum(ex.num_tokens for ex in b) == pytest.approx(614.0) + assert min(ex.num_tokens for ex in b) == pytest.approx(10) + assert max(ex.num_tokens for ex in b) == pytest.approx(16) + assert sum(isinstance(ex, Cut) for ex in b) == 21 + assert sum(isinstance(ex, TextPairExample) for ex in b) == 27 + for ex in b: + if isinstance(ex, Cut): + assert ex.modality == "audio" + assert isinstance(ex.load_audio(), np.ndarray) + assert isinstance(ex.supervisions[0].text, str) + if isinstance(ex, TextPairExample): + assert ex.modality == "text" + assert ex.source.language == "en" + assert ex.target.language == "es" + assert isinstance(ex.source.text, str) + assert isinstance(ex.target.text, str) + assert isinstance(ex.source.tokens, np.ndarray) + assert isinstance(ex.target.tokens, np.ndarray) From 12e7cf9d24070e26a7d4e0fb4d3ebf0f4a7c09c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Tue, 16 Apr 2024 13:55:51 -0400 Subject: [PATCH 38/39] Lhotse AudioToAudio dataset (supports ref recording and embedding) (#8477) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Draft for Lhotse AudioToAudio dataset (supports ref recording and embedding) Signed-off-by: Piotr Żelasko * Integrate with speech enhancement models Signed-off-by: Piotr Żelasko * Fix absolute path + write cuts in the output manifest Signed-off-by: Ante Jukić * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Support channel selectors for input, reference, and target recordings Signed-off-by: Piotr Żelasko * Support on the fly truncation and/or cutting into windows Signed-off-by: Piotr Żelasko * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Bump min required lhotse version Signed-off-by: Piotr Żelasko * Add copyright headers Signed-off-by: Piotr Żelasko * Added unit tests checking lhotse dataloader is matching the existing 
dataset (#8619) Signed-off-by: Ante Jukić * Fix batch unpacking, test_ds, use nemo logging Signed-off-by: Ante Jukić * fixed some code scanning issues Signed-off-by: Ante Jukić * Fixed a couple CI issues Signed-off-by: Ante Jukić * Support NeMo-style resolution of relative paths in native lhotse cuts Signed-off-by: Piotr Żelasko * Added option to leave original paths or force absolute paths in the converted manifests Signed-off-by: Ante Jukić * Fix support for relative path resolution in lhotse arrays Signed-off-by: Piotr Żelasko * Fix unit tests Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko Signed-off-by: Ante Jukić Co-authored-by: Ante Jukić Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: anteju <108555623+anteju@users.noreply.github.com> Co-authored-by: Pablo Garay --- examples/audio_tasks/audio_to_audio_eval.py | 7 + examples/audio_tasks/speech_enhancement.py | 2 +- .../asr/data/audio_to_audio_lhotse.py | 207 +++++++++++ .../asr/models/enhancement_models.py | 25 +- nemo/collections/common/data/lhotse/cutset.py | 56 ++- .../common/data/lhotse/dataloader.py | 32 +- .../audio_to_audio/convert_nemo_to_lhotse.py | 77 ++++ tests/collections/asr/test_asr_datasets.py | 336 ++++++++++++------ .../common/test_lhotse_dataloading.py | 99 +++++- 9 files changed, 727 insertions(+), 114 deletions(-) create mode 100644 nemo/collections/asr/data/audio_to_audio_lhotse.py create mode 100644 scripts/audio_to_audio/convert_nemo_to_lhotse.py diff --git a/examples/audio_tasks/audio_to_audio_eval.py b/examples/audio_tasks/audio_to_audio_eval.py index 57d7095057a9..4ac68dfc84e7 100644 --- a/examples/audio_tasks/audio_to_audio_eval.py +++ b/examples/audio_tasks/audio_to_audio_eval.py @@ -73,7 +73,9 @@ from tqdm import tqdm from nemo.collections.asr.data import audio_to_audio_dataset +from nemo.collections.asr.data.audio_to_audio_lhotse import LhotseAudioToTargetDataset from nemo.collections.asr.metrics.audio import AudioMetricWrapper +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.collections.common.parts.preprocessing import manifest from nemo.core.config import hydra_runner from nemo.utils import logging @@ -103,6 +105,11 @@ class AudioEvaluationConfig(process_audio.ProcessConfig): def get_evaluation_dataloader(config): """Prepare a dataloader for evaluation. """ + if config.get("use_lhotse", False): + return get_lhotse_dataloader_from_config( + config, global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset = audio_to_audio_dataset.get_audio_to_target_dataset(config=config) return torch.utils.data.DataLoader( diff --git a/examples/audio_tasks/speech_enhancement.py b/examples/audio_tasks/speech_enhancement.py index 5b32d9b95298..250d212d2a25 100644 --- a/examples/audio_tasks/speech_enhancement.py +++ b/examples/audio_tasks/speech_enhancement.py @@ -51,7 +51,7 @@ def main(cfg): trainer.fit(model) # Run on test data, if available - if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: + if hasattr(cfg.model, 'test_ds'): if trainer.is_global_zero: # Destroy the current process group and let the trainer initialize it again with a single device. 
if torch.distributed.is_initialized(): diff --git a/nemo/collections/asr/data/audio_to_audio_lhotse.py b/nemo/collections/asr/data/audio_to_audio_lhotse.py new file mode 100644 index 000000000000..6317d8a929c2 --- /dev/null +++ b/nemo/collections/asr/data/audio_to_audio_lhotse.py @@ -0,0 +1,207 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import numpy as np +import torch +from lhotse import AudioSource, CutSet, Recording +from lhotse.array import Array +from lhotse.audio import info +from lhotse.cut import MixedCut +from lhotse.dataset.collation import collate_audio, collate_custom_field +from lhotse.serialization import load_jsonl + +from nemo.collections.common.parts.preprocessing.manifest import get_full_path + +INPUT_CHANNEL_SELECTOR = "input_channel_selector" +TARGET_CHANNEL_SELECTOR = "target_channel_selector" +REFERENCE_CHANNEL_SELECTOR = "reference_channel_selector" +LHOTSE_TARGET_CHANNEL_SELECTOR = "target_recording_channel_selector" +LHOTSE_REFERENCE_CHANNEL_SELECTOR = "reference_recording_channel_selector" + + +class LhotseAudioToTargetDataset(torch.utils.data.Dataset): + """ + A dataset for audio-to-audio tasks where the goal is to use + an input signal to recover the corresponding target signal. + + .. note:: This is a Lhotse variant of :class:`nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset`. 
+ """ + + TARGET_KEY = "target_recording" + REFERENCE_KEY = "reference_recording" + EMBEDDING_KEY = "embedding_vector" + + def __getitem__(self, cuts: CutSet) -> dict[str, torch.Tensor]: + src_audio, src_audio_lens = collate_audio(cuts) + ans = { + "input_signal": src_audio, + "input_length": src_audio_lens, + } + if _key_available(cuts, self.TARGET_KEY): + tgt_audio, tgt_audio_lens = collate_audio(cuts, recording_field=self.TARGET_KEY) + ans.update(target_signal=tgt_audio, target_length=tgt_audio_lens) + if _key_available(cuts, self.REFERENCE_KEY): + ref_audio, ref_audio_lens = collate_audio(cuts, recording_field=self.REFERENCE_KEY) + ans.update(reference_signal=ref_audio, reference_length=ref_audio_lens) + if _key_available(cuts, self.EMBEDDING_KEY): + emb = collate_custom_field(cuts, field=self.EMBEDDING_KEY) + ans.update(embedding_signal=emb) + return ans + + +def _key_available(cuts: CutSet, key: str) -> bool: + for cut in cuts: + if isinstance(cut, MixedCut): + cut = cut._first_non_padding_cut + if cut.custom is not None and key in cut.custom: + continue + else: + return False + return True + + +def create_recording(path_or_paths: str | list[str]) -> Recording: + if isinstance(path_or_paths, list): + cur_channel_idx = 0 + sources = [] + infos = [] + for p in path_or_paths: + i = info(p) + infos.append(i) + sources.append( + AudioSource(type="file", channels=list(range(cur_channel_idx, cur_channel_idx + i.channels)), source=p) + ) + cur_channel_idx += i.channels + assert all( + i.samplerate == infos[0].samplerate for i in infos[1:] + ), f"Mismatched sampling rates for individual audio files in: {path_or_paths}" + recording = Recording( + id=path_or_paths[0], + sources=sources, + sampling_rate=infos[0].samplerate, + num_samples=infos[0].frames, + duration=infos[0].duration, + channel_ids=list(range(0, cur_channel_idx)), + ) + else: + recording = Recording.from_file(path_or_paths) + return recording + + +def create_array(path: str) -> Array: + assert path.endswith(".npy"), f"Currently only conversion of numpy files is supported (got: {path})" + arr = np.load(path) + parent, path = os.path.split(path) + return Array(storage_type="numpy_files", storage_path=parent, storage_key=path, shape=list(arr.shape),) + + +def convert_manifest_nemo_to_lhotse( + input_manifest: str, + output_manifest: str, + input_key: str = 'input_filepath', + target_key: str = 'target_filepath', + reference_key: str = 'reference_filepath', + embedding_key: str = 'embedding_filepath', + force_absolute_paths: bool = False, +): + """ + Convert an audio-to-audio manifest from NeMo format to Lhotse format. + + Args: + input_manifest: Path to the input NeMo manifest. + output_manifest: Path where we'll write the output Lhotse manifest (supported extensions: .jsonl.gz and .jsonl). + input_key: Key of the input recording, mapped to Lhotse's 'Cut.recording'. + target_key: Key of the target recording, mapped to Lhotse's 'Cut.target_recording'. + reference_key: Key of the reference recording, mapped to Lhotse's 'Cut.reference_recording'. + embedding_key: Key of the embedding, mapped to Lhotse's 'Cut.embedding_vector'. + force_absolute_paths: If True, the paths in the output manifest will be absolute. + """ + with CutSet.open_writer(output_manifest) as writer: + for item in load_jsonl(input_manifest): + + # Create Lhotse recording and cut object, apply offset and duration slicing if present. 
+ item_input_key = item.pop(input_key) + recording = create_recording(get_full_path(audio_file=item_input_key, manifest_file=input_manifest)) + cut = recording.to_cut().truncate(duration=item.pop("duration"), offset=item.pop("offset", 0.0)) + + _as_relative(cut.recording, item_input_key, enabled=not force_absolute_paths) + + if (channels := item.pop(INPUT_CHANNEL_SELECTOR, None)) is not None: + if cut.num_channels == 1: + assert ( + len(channels) == 1 and channels[0] == 0 + ), f"The input recording has only a single channel, but manifest specified {INPUT_CHANNEL_SELECTOR}={channels}" + else: + cut = cut.with_channels(channels) + + if target_key in item: + item_target_key = item.pop(target_key) + cut.target_recording = create_recording( + get_full_path(audio_file=item_target_key, manifest_file=input_manifest) + ) + + _as_relative(cut.target_recording, item_target_key, enabled=not force_absolute_paths) + + if (channels := item.pop(TARGET_CHANNEL_SELECTOR, None)) is not None: + if cut.target_recording.num_channels == 1: + assert ( + len(channels) == 1 and channels[0] == 0 + ), f"The target recording has only a single channel, but manifest specified {TARGET_CHANNEL_SELECTOR}={channels}" + else: + cut = cut.with_custom(LHOTSE_TARGET_CHANNEL_SELECTOR, channels) + + if reference_key in item: + item_reference_key = item.pop(reference_key) + cut.reference_recording = create_recording( + get_full_path(audio_file=item_reference_key, manifest_file=input_manifest) + ) + + _as_relative(cut.reference_recording, item_target_key, enabled=not force_absolute_paths) + + if (channels := item.pop(REFERENCE_CHANNEL_SELECTOR, None)) is not None: + if cut.reference_recording.num_channels == 1: + assert ( + len(channels) == 1 and channels[0] == 0 + ), f"The reference recording has only a single channel, but manifest specified {REFERENCE_CHANNEL_SELECTOR}={channels}" + else: + cut = cut.with_custom(LHOTSE_REFERENCE_CHANNEL_SELECTOR, channels) + + if embedding_key in item: + item_embedding_key = item.pop(embedding_key) + cut.embedding_vector = create_array( + get_full_path(audio_file=item_embedding_key, manifest_file=input_manifest) + ) + + if not force_absolute_paths: + # Use the same format for paths as in the original manifest + cut.embedding_vector.storage_path = "" + cut.embedding_vector.storage_key = item_embedding_key + + if item: + cut.custom.update(item) # any field that's still left goes to custom fields + + writer.write(cut) + + +def _as_relative(recording: Recording, paths: list[str] | str, enabled: bool) -> None: + if not enabled: + return + if isinstance(paths, str): + paths = [paths] + assert len(recording.sources) == len( + paths + ), f"Mismatched number of sources for lhotse Recording and the override list. 
Got {recording=} and {paths=}" + for source, path in zip(recording.sources, paths): + source.source = path diff --git a/nemo/collections/asr/models/enhancement_models.py b/nemo/collections/asr/models/enhancement_models.py index 7cc5c3d8459f..b80c357364aa 100644 --- a/nemo/collections/asr/models/enhancement_models.py +++ b/nemo/collections/asr/models/enhancement_models.py @@ -24,9 +24,11 @@ from tqdm import tqdm from nemo.collections.asr.data import audio_to_audio_dataset +from nemo.collections.asr.data.audio_to_audio_lhotse import LhotseAudioToTargetDataset from nemo.collections.asr.data.audio_to_text_dataset import inject_dataloader_value_from_model_config from nemo.collections.asr.models.audio_to_audio_model import AudioToAudioModel from nemo.collections.asr.parts.utils.audio_utils import ChannelSelectorType +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.core.neural_types import AudioSignal, LengthsType, NeuralType from nemo.utils import logging @@ -198,6 +200,11 @@ def process( def _setup_dataloader_from_config(self, config: Optional[Dict]): + if config.get("use_lhotse", False): + return get_lhotse_dataloader_from_config( + config, global_rank=self.global_rank, world_size=self.world_size, dataset=LhotseAudioToTargetDataset() + ) + is_concat = config.get('is_concat', False) if is_concat: raise NotImplementedError('Concat not implemented') @@ -398,7 +405,14 @@ def forward(self, input_signal, input_length=None): # PTL-specific methods def training_step(self, batch, batch_idx): - input_signal, input_length, target_signal, target_length = batch + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch['target_signal'] + else: + input_signal, input_length, target_signal, _ = batch # Expand channel dimension, if necessary # For consistency, the model uses multi-channel format, even if the channel dimension is 1 @@ -426,7 +440,14 @@ def training_step(self, batch, batch_idx): return loss def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = 'val'): - input_signal, input_length, target_signal, target_length = batch + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch['target_signal'] + else: + input_signal, input_length, target_signal, _ = batch # Expand channel dimension, if necessary # For consistency, the model uses multi-channel format, even if the channel dimension is 1 diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py index fa5ae5804c4b..6dd01479a501 100644 --- a/nemo/collections/common/data/lhotse/cutset.py +++ b/nemo/collections/common/data/lhotse/cutset.py @@ -19,11 +19,14 @@ from pathlib import Path from typing import Sequence, Tuple -from lhotse import CutSet +from lhotse import CutSet, Features, Recording +from lhotse.array import Array, TemporalArray +from lhotse.cut import Cut, MixedCut, PaddingCut from omegaconf import DictConfig, ListConfig, OmegaConf from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator, LazyNeMoTarredIterator from nemo.collections.common.data.lhotse.text_adapters import LhotseTextAdapter, LhotseTextPairAdapter +from nemo.collections.common.parts.preprocessing.manifest import get_full_path def 
read_cutset_from_config(config: DictConfig) -> Tuple[CutSet, bool]: @@ -291,10 +294,59 @@ def read_lhotse_manifest(config, is_tarred: bool) -> CutSet: cuts = mux(*cutsets, weights=weights, max_open_streams=config.max_open_streams, seed=config.shard_seed) else: # Regular Lhotse manifest points to individual audio files (like native NeMo manifest). - cuts = CutSet.from_file(config.cuts_path) + path = config.cuts_path + cuts = CutSet.from_file(path).map(partial(resolve_relative_paths, manifest_path=path)) return cuts +def resolve_relative_paths(cut: Cut, manifest_path: str) -> Cut: + if isinstance(cut, PaddingCut): + return cut + + if isinstance(cut, MixedCut): + for track in cut.tracks: + track.cut = resolve_relative_paths(track.cut, manifest_path) + return cut + + def resolve_recording(value): + for audio_source in value.sources: + if audio_source.type == "file": + audio_source.source = get_full_path(audio_source.source, manifest_file=manifest_path) + + def resolve_array(value): + if isinstance(value, TemporalArray): + value.array = resolve_array(value.array) + else: + if value.storage_type in ("numpy_files", "lilcom_files"): + abspath = Path( + get_full_path(str(Path(value.storage_path) / value.storage_key), manifest_file=manifest_path) + ) + value.storage_path = str(abspath.parent) + value.storage_key = str(abspath.name) + elif value.storage_type in ( + "kaldiio", + "chunked_lilcom_hdf5", + "lilcom_chunky", + "lilcom_hdf5", + "numpy_hdf5", + ): + value.storage_path = get_full_path(value.storage_path, manifest_file=manifest_path) + # ignore others i.e. url, in-memory data, etc. + + if cut.has_recording: + resolve_recording(cut.recording) + if cut.has_features: + resolve_array(cut.features) + if cut.custom is not None: + for key, value in cut.custom.items(): + if isinstance(value, Recording): + resolve_recording(value) + elif isinstance(value, (Array, TemporalArray, Features)): + resolve_array(value) + + return cut + + def read_nemo_manifest(config, is_tarred: bool) -> CutSet: common_kwargs = { "text_field": config.text_field, diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py index fd2a69725a0e..83920660302b 100644 --- a/nemo/collections/common/data/lhotse/dataloader.py +++ b/nemo/collections/common/data/lhotse/dataloader.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging import warnings from dataclasses import dataclass from functools import partial, singledispatch @@ -38,6 +37,7 @@ from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper from nemo.collections.common.data.lhotse.cutset import read_cutset_from_config from nemo.collections.common.tokenizers import TokenizerSpec +from nemo.utils import logging @dataclass @@ -104,6 +104,15 @@ class LhotseDataLoadingConfig: concatenate_duration_factor: float = 1.0 concatenate_merge_supervisions: bool = True db_norm: Optional[float] = -25.0 # from CodeSwitchingDataset + # d. 
On-the-fly cut truncation or window slicing + # I) truncate: select one chunk of a fixed duration for each cut + truncate_duration: Optional[float] = None # set this to enable + truncate_offset_type: str = "random" # "random" | "start" (fixed) | "end" (fixed, counted back) + # II) cut_into_windows: convert each cut to smaller cut using a sliding window (define hop for overlapping windows) + cut_into_windows_duration: Optional[float] = None # set this to enable + cut_into_windows_hop: Optional[float] = None + # III) common options + keep_excessive_supervisions: bool = True # when a cut is truncated in the middle of a supervision, should we keep them. # 5. Other Lhotse options. text_field: str = "text" # key to read the transcript from @@ -151,9 +160,6 @@ def get_lhotse_dataloader_from_config( # Resample as a safeguard; it's a no-op when SR is already OK cuts = cuts.resample(config.sample_rate) - # Duration filtering, same as native NeMo dataloaders. - cuts = cuts.filter(DurationFilter(config.min_duration, config.max_duration)) - # Expands cuts if multiple translations are provided. cuts = CutSet(LazyFlattener(cuts.map(_flatten_alt_text, apply_fn=None))) @@ -181,6 +187,24 @@ def get_lhotse_dataloader_from_config( if config.perturb_speed: cuts = CutSet.mux(cuts, cuts.perturb_speed(0.9), cuts.perturb_speed(1.1),) + # 2.d: truncation/slicing + if config.truncate_duration is not None: + cuts = cuts.truncate( + max_duration=config.truncate_duration, + offset_type=config.truncate_offset_type, + keep_excessive_supervisions=config.keep_excessive_supervisions, + ) + if config.cut_into_windows_duration is not None: + cuts = cuts.cut_into_windows( + duration=config.cut_into_windows_duration, + hop=config.cut_into_windows_hop, + keep_excessive_supervisions=config.keep_excessive_supervisions, + ) + + # Duration filtering, same as native NeMo dataloaders. + # We can filter after the augmentations because they are applied only when calling load_audio(). + cuts = cuts.filter(DurationFilter(config.min_duration, config.max_duration)) + if config.use_multimodal_sampling: constraint = MultimodalSamplingConstraint( token_equivalent_duration=config.token_equivalent_duration, diff --git a/scripts/audio_to_audio/convert_nemo_to_lhotse.py b/scripts/audio_to_audio/convert_nemo_to_lhotse.py new file mode 100644 index 000000000000..e498a3b2d460 --- /dev/null +++ b/scripts/audio_to_audio/convert_nemo_to_lhotse.py @@ -0,0 +1,77 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +from nemo.collections.asr.data.audio_to_audio_lhotse import convert_manifest_nemo_to_lhotse + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert an audio-to-audio manifest from NeMo format to Lhotse format. " + "This step enables the use of Lhotse datasets for audio-to-audio processing. 
" + ) + parser.add_argument("input", help='Path to the input NeMo manifest.') + parser.add_argument( + "output", help="Path where we'll write the output Lhotse manifest (supported extensions: .jsonl.gz and .jsonl)" + ) + parser.add_argument( + "-i", + "--input_key", + default="audio_filepath", + help="Key of the input recording, mapped to Lhotse's 'Cut.recording'.", + ) + parser.add_argument( + "-t", + "--target_key", + default="target_filepath", + help="Key of the target recording, mapped to Lhotse's 'Cut.target_recording'.", + ) + parser.add_argument( + "-r", + "--reference_key", + default="reference_filepath", + help="Key of the reference recording, mapped to Lhotse's 'Cut.reference_recording'.", + ) + parser.add_argument( + "-e", + "--embedding_key", + default="embedding_filepath", + help="Key of the embedding, mapped to Lhotse's 'Cut.embedding_vector'.", + ) + parser.add_argument( + "-a", + "--force_absolute_paths", + action='store_true', + default=False, + help="Force absolute paths in the generated manifests.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + convert_manifest_nemo_to_lhotse( + input_manifest=args.input, + output_manifest=args.output, + input_key=args.input_key, + target_key=args.target_key, + reference_key=args.reference_key, + embedding_key=args.embedding_key, + force_absolute_paths=args.force_absolute_paths, + ) + + +if __name__ == "__main__": + main() diff --git a/tests/collections/asr/test_asr_datasets.py b/tests/collections/asr/test_asr_datasets.py index 4040227c5e3e..946acb614f11 100644 --- a/tests/collections/asr/test_asr_datasets.py +++ b/tests/collections/asr/test_asr_datasets.py @@ -34,6 +34,7 @@ AudioToTargetWithReferenceDataset, _audio_collate_fn, ) +from nemo.collections.asr.data.audio_to_audio_lhotse import LhotseAudioToTargetDataset, convert_manifest_nemo_to_lhotse from nemo.collections.asr.data.audio_to_text import ( DataStoreObject, TarredAudioToBPEDataset, @@ -52,6 +53,7 @@ from nemo.collections.asr.parts.utils.audio_utils import get_segment_start from nemo.collections.asr.parts.utils.manifest_utils import write_manifest from nemo.collections.common import tokenizers +from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config from nemo.utils import logging try: @@ -970,6 +972,27 @@ def test_audio_to_target_dataset(self): } dataset_factory = audio_to_audio_dataset.get_audio_to_target_dataset(config) + # Prepare lhotse manifest + cuts_path = manifest_filepath.replace('.json', '_cuts.jsonl') + convert_manifest_nemo_to_lhotse( + input_manifest=manifest_filepath, + output_manifest=cuts_path, + input_key=data_key['input_signal'], + target_key=data_key['target_signal'], + ) + + # Prepare lhotse dataset + config_lhotse = { + 'cuts_path': cuts_path, + 'use_lhotse': True, + 'sample_rate': sample_rate, + 'batch_size': 1, + } + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset_lhotse = [item for item in dl_lhotse] + # Test number of channels for signal in data: assert data_num_channels[signal] == dataset.num_channels( @@ -981,23 +1004,25 @@ def test_audio_to_target_dataset(self): # Test returned examples for n in range(num_examples): - item = dataset.__getitem__(n) - item_factory = dataset_factory.__getitem__(n) - for signal in data: - item_signal = item[signal].cpu().detach().numpy() golden_signal = data[signal][n] - assert ( - item_signal.shape == golden_signal.shape - ), f'Signal {signal}: item shape 
{item_signal.shape} not matching reference shape {golden_signal.shape}' - assert np.allclose( - item_signal, golden_signal, atol=atol - ), f'Test 1: Failed for example {n}, signal {signal} (random seed {random_seed})' - item_factory_signal = item_factory[signal].cpu().detach().numpy() - assert np.allclose( - item_factory_signal, golden_signal, atol=atol - ), f'Test 1: Failed for factory example {n}, signal {signal} (random seed {random_seed})' + for use_lhotse in [False, True]: + item_signal = ( + dataset_lhotse[n][signal].squeeze(0) if use_lhotse else dataset.__getitem__(n)[signal] + ) + item_factory_signal = dataset_factory.__getitem__(n)[signal] + + assert ( + item_signal.shape == golden_signal.shape + ), f'Test 1, use_lhotse={use_lhotse}: Signal {signal} item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' + assert np.allclose( + item_signal, golden_signal, atol=atol + ), f'Test 1, use_lhotse={use_lhotse}: Failed for example {n}, signal {signal} (random seed {random_seed})' + + assert np.allclose( + item_factory_signal, golden_signal, atol=atol + ), f'Test 1, use_lhotse={use_lhotse}: Failed for factory example {n}, signal {signal} (random seed {random_seed})' # Test 2 # - Filtering based on signal duration @@ -1013,20 +1038,36 @@ def test_audio_to_target_dataset(self): sample_rate=sample_rate, ) + # Prepare lhotse dataset + config_lhotse = { + 'cuts_path': cuts_path, + 'use_lhotse': True, + 'min_duration': min_duration, + 'max_duration': max_duration, + 'sample_rate': sample_rate, + 'batch_size': 1, + } + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset_lhotse = [item for item in dl_lhotse] + filtered_examples = [n for n, val in enumerate(data_duration) if min_duration <= val <= max_duration] for n in range(len(dataset)): - item = dataset.__getitem__(n) + for use_lhotse in [False, True]: + for signal in data: + item_signal = ( + dataset_lhotse[n][signal].squeeze(0) if use_lhotse else dataset.__getitem__(n)[signal] + ) + golden_signal = data[signal][filtered_examples[n]] + assert ( + item_signal.shape == golden_signal.shape + ), f'Test 2, use_lhotse={use_lhotse}: Signal {signal} item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' - for signal in data: - item_signal = item[signal].cpu().detach().numpy() - golden_signal = data[signal][filtered_examples[n]] - assert ( - item_signal.shape == golden_signal.shape - ), f'Signal {signal}: item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' - assert np.allclose( - item_signal, golden_signal, atol=atol - ), f'Test 2: Failed for example {n}, signal {signal} (random seed {random_seed})' + assert np.allclose( + item_signal, golden_signal, atol=atol + ), f'Test 2, use_lhotse={use_lhotse}: Failed for example {n}, signal {signal} (random seed {random_seed})' # Test 3 # - Use channel selector @@ -1078,58 +1119,98 @@ def test_audio_to_target_dataset(self): random_offset=random_offset, # random offset when selecting subsegment ) - for n in range(len(dataset)): - item = dataset.__getitem__(n) - - golden_start = golden_end = None - for signal in data: - item_signal = item[signal].cpu().detach().numpy() - full_golden_signal = data[signal][filtered_examples[n]] - - # Find random segment using correlation on the first channel - # of the first signal, and then use it fixed for other signals - if golden_start is None: - golden_start = get_segment_start( - 
signal=full_golden_signal[0, :], segment=item_signal[0, :] - ) - if not random_offset: - assert ( - golden_start == 0 - ), f'Expecting the signal to start at 0 when random_offset is False' - - golden_end = golden_start + audio_duration_samples - golden_signal = full_golden_signal[..., golden_start:golden_end] - - # Test length is correct - assert ( - item_signal.shape[-1] == audio_duration_samples - ), f'Test 4: Signal length ({item_signal.shape[-1]}) not matching the expected length ({audio_duration_samples})' + # Prepare lhotse dataset + config_lhotse = { + 'cuts_path': cuts_path, + 'use_lhotse': True, + 'min_duration': audio_duration, + 'truncate_duration': audio_duration, + 'truncate_offset_type': 'random' if random_offset else 'start', + 'sample_rate': sample_rate, + 'batch_size': 1, + } + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset_lhotse = [item for item in dl_lhotse] - assert ( - item_signal.shape == golden_signal.shape - ), f'Signal {signal}: item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' - # Test signal values - assert np.allclose( - item_signal, golden_signal, atol=atol - ), f'Test 4: Failed for example {n}, signal {signal} (random seed {random_seed})' + for n in range(len(dataset)): + for use_lhotse in [False, True]: + item = dataset_lhotse[n] if use_lhotse else dataset.__getitem__(n) + golden_start = golden_end = None + for signal in data: + item_signal = item[signal].squeeze(0) if use_lhotse else item[signal] + full_golden_signal = data[signal][filtered_examples[n]] + + # Find random segment using correlation on the first channel + # of the first signal, and then use it fixed for other signals + if golden_start is None: + golden_start = get_segment_start( + signal=full_golden_signal[0, :], segment=item_signal[0, :] + ) + if not random_offset: + assert ( + golden_start == 0 + ), f'Test 4, use_lhotse={use_lhotse}: Expecting the signal to start at 0 when random_offset is False' + + golden_end = golden_start + audio_duration_samples + golden_signal = full_golden_signal[..., golden_start:golden_end] + + # Test length is correct + assert ( + item_signal.shape[-1] == audio_duration_samples + ), f'Test 4, use_lhotse={use_lhotse}: Signal length ({item_signal.shape[-1]}) not matching the expected length ({audio_duration_samples})' + + assert ( + item_signal.shape == golden_signal.shape + ), f'Test 4, use_lhotse={use_lhotse}: Signal {signal} item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' + # Test signal values + assert np.allclose( + item_signal, golden_signal, atol=atol + ), f'Test 4, use_lhotse={use_lhotse}: Failed for example {n}, signal {signal} (random seed {random_seed})' # Test 5: # - Test collate_fn batch_size = 16 - batch = [dataset.__getitem__(n) for n in range(batch_size)] - batched = dataset.collate_fn(batch) - for n, signal in enumerate(data.keys()): - signal_shape = batched[2 * n].shape - signal_len = batched[2 * n + 1] + for use_lhotse in [False, True]: + if use_lhotse: + # Get batch from lhotse dataloader + config_lhotse['batch_size'] = batch_size + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), + global_rank=0, + world_size=1, + dataset=LhotseAudioToTargetDataset(), + ) + batched = next(iter(dl_lhotse)) + else: + # Get examples from dataset and collate into a batch + batch = [dataset.__getitem__(n) for n in range(batch_size)] + batched = 
dataset.collate_fn(batch) - assert signal_shape == ( - batch_size, - data_num_channels[signal], - audio_duration_samples, - ), f'Test 5: Unexpected signal {signal} shape {signal_shape}' - assert len(signal_len) == batch_size, f'Test 5: Unexpected length of signal_len ({len(signal_len)})' - assert all(signal_len == audio_duration_samples), f'Test 5: Unexpected signal_len {signal_len}' + # Test all shapes and lengths + for n, signal in enumerate(data.keys()): + length = signal.replace('_signal', '_length') + + if isinstance(batched, dict): + signal_shape = batched[signal].shape + signal_len = batched[length] + else: + signal_shape = batched[2 * n].shape + signal_len = batched[2 * n + 1] + + assert signal_shape == ( + batch_size, + data_num_channels[signal], + audio_duration_samples, + ), f'Test 5, use_lhotse={use_lhotse}: Unexpected signal {signal} shape {signal_shape}' + assert ( + len(signal_len) == batch_size + ), f'Test 5, use_lhotse={use_lhotse}: Unexpected length of signal_len ({len(signal_len)})' + assert all( + signal_len == audio_duration_samples + ), f'Test 5, use_lhotse={use_lhotse}: Unexpected signal_len {signal_len}' @pytest.mark.unit def test_audio_to_target_dataset_with_target_list(self): @@ -1237,28 +1318,49 @@ def test_audio_to_target_dataset_with_target_list(self): } dataset_factory = audio_to_audio_dataset.get_audio_to_target_dataset(config) - for n in range(num_examples): - item = dataset.__getitem__(n) - item_factory = dataset_factory.__getitem__(n) + # Prepare lhotse manifest + cuts_path = manifest_filepath.replace('.json', '_cuts.jsonl') + convert_manifest_nemo_to_lhotse( + input_manifest=manifest_filepath, + output_manifest=cuts_path, + input_key=data_key['input_signal'], + target_key=data_key['target_signal'], + ) - for signal in data: - item_signal = item[signal].cpu().detach().numpy() - golden_signal = data[signal][n] - assert ( - item_signal.shape == golden_signal.shape - ), f'Signal {signal}: item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' - assert np.allclose( - item_signal, golden_signal, atol=atol - ), f'Test 1: Failed for example {n}, signal {signal} (random seed {random_seed})' + # Prepare lhotse dataset + config_lhotse = { + 'cuts_path': cuts_path, + 'use_lhotse': True, + 'sample_rate': sample_rate, + 'batch_size': 1, + } + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset_lhotse = [item for item in dl_lhotse] - item_factory_signal = item_factory[signal].cpu().detach().numpy() - assert np.allclose( - item_factory_signal, golden_signal, atol=atol - ), f'Test 1: Failed for factory example {n}, signal {signal} (random seed {random_seed})' + for n in range(num_examples): + for use_lhotse in [False, True]: + item = dataset_lhotse[n] if use_lhotse else dataset.__getitem__(n) + item_factory = dataset_factory.__getitem__(n) + for signal in data: + item_signal = item[signal].squeeze(0) if use_lhotse else item[signal] + golden_signal = data[signal][n] + assert ( + item_signal.shape == golden_signal.shape + ), f'Test 1, use_lhotse={use_lhotse}: Signal {signal} item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' + assert np.allclose( + item_signal, golden_signal, atol=atol + ), f'Test 1, use_lhotse={use_lhotse}: Failed for example {n}, signal {signal} (random seed {random_seed})' + + assert np.allclose( + item_factory[signal], golden_signal, atol=atol + ), f'Test 1, use_lhotse={use_lhotse}: 
Failed for factory example {n}, signal {signal} (random seed {random_seed})' # Test 2 # Set target as the first channel of input_filepath and all files listed in target_filepath. # In this case, the target will have 3 channels. + # Note: this is currently not supported by lhotse, so we only test the default dataset here. dataset = AudioToTargetDataset( manifest_filepath=manifest_filepath, input_key=data_key['input_signal'], @@ -1367,29 +1469,55 @@ def test_audio_to_target_dataset_for_inference(self): } dataset_factory = audio_to_audio_dataset.get_audio_to_target_dataset(config) + # Prepare lhotse manifest + cuts_path = manifest_filepath.replace('.json', '_cuts.jsonl') + convert_manifest_nemo_to_lhotse( + input_manifest=manifest_filepath, + output_manifest=cuts_path, + input_key=data_key['input_signal'], + target_key=None, + ) + + # Prepare lhotse dataset + config_lhotse = { + 'cuts_path': cuts_path, + 'use_lhotse': True, + 'sample_rate': sample_rate, + 'batch_size': 1, + } + dl_lhotse = get_lhotse_dataloader_from_config( + OmegaConf.create(config_lhotse), global_rank=0, world_size=1, dataset=LhotseAudioToTargetDataset() + ) + dataset_lhotse = [item for item in dl_lhotse] + for n in range(num_examples): - item = dataset.__getitem__(n) - item_factory = dataset_factory.__getitem__(n) - # Check target is None - assert item['target_signal'].numel() == 0, 'target_signal is expected to be empty.' - assert item_factory['target_signal'].numel() == 0, 'target_signal is expected to be empty.' + for label in ['original', 'factory', 'lhotse']: - # Check valid signals - for signal in data: - item_signal = item[signal].cpu().detach().numpy() - golden_signal = data[signal][n] - assert ( - item_signal.shape == golden_signal.shape - ), f'Signal {signal}: item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' - assert np.allclose( - item_signal, golden_signal, atol=atol - ), f'Test 1: Failed for example {n}, signal {signal} (random seed {random_seed})' + if label == 'original': + item = dataset.__getitem__(n) + elif label == 'factory': + item = dataset_factory.__getitem__(n) + elif label == 'lhotse': + item = dataset_lhotse[n] + else: + raise ValueError(f'Unknown label {label}') - item_factory_signal = item_factory[signal].cpu().detach().numpy() - assert np.allclose( - item_factory_signal, golden_signal, atol=atol - ), f'Test 1: Failed for factory example {n}, signal {signal} (random seed {random_seed})' + # Check target is None + if 'target_signal' in item: + assert item['target_signal'].numel() == 0, f'{label}: target_signal is expected to be empty.' 
+ + # Check valid signals + for signal in data: + + item_signal = item[signal].squeeze(0) if label == 'lhotse' else item[signal] + golden_signal = data[signal][n] + assert ( + item_signal.shape == golden_signal.shape + ), f'{label} -- Signal {signal}: item shape {item_signal.shape} not matching reference shape {golden_signal.shape}' + assert np.allclose( + item_signal, golden_signal, atol=atol + ), f'{label} -- Test 1: Failed for example {n}, signal {signal} (random seed {random_seed})' @pytest.mark.unit def test_audio_to_target_with_reference_dataset(self): diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py index 4e89a93e83e4..791c5df1c018 100644 --- a/tests/collections/common/test_lhotse_dataloading.py +++ b/tests/collections/common/test_lhotse_dataloading.py @@ -21,6 +21,8 @@ import numpy as np import pytest import torch +from lhotse import CutSet, NumpyFilesWriter, Recording +from lhotse.audio import AudioLoadingError from lhotse.cut import Cut from lhotse.cut.text import TextPairExample from omegaconf import OmegaConf @@ -189,6 +191,62 @@ def test_dataloader_from_lhotse_cuts(cutset_path: Path): assert b["audio"].shape[0] == b["audio_lens"].shape[0] == 1 +def test_dataloader_from_lhotse_cuts_truncate(cutset_path: Path): + config = OmegaConf.create( + { + "cuts_path": cutset_path, + "truncate_duration": 0.5, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config( + config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() + ) + + batches = [b for b in dl] + assert len(batches) == 3 + # 0.5s = 8000 samples, note the constant duration and batch size except for last batch + assert batches[0]["audio"].shape == (4, 8000) + assert batches[1]["audio"].shape == (4, 8000) + assert batches[2]["audio"].shape == (2, 8000) + # exactly 10 cuts were used + + +def test_dataloader_from_lhotse_cuts_cut_into_windows(cutset_path: Path): + config = OmegaConf.create( + { + "cuts_path": cutset_path, + "cut_into_windows_duration": 0.5, + "sample_rate": 16000, + "shuffle": True, + "use_lhotse": True, + "num_workers": 0, + "batch_size": 4, + "seed": 0, + } + ) + + dl = get_lhotse_dataloader_from_config( + config=config, global_rank=0, world_size=1, dataset=UnsupervisedAudioDataset() + ) + + batches = [b for b in dl] + assert len(batches) == 5 + # 0.5s = 8000 samples, note the constant duration and batch size + assert batches[0]["audio"].shape == (4, 8000) + assert batches[1]["audio"].shape == (4, 8000) + assert batches[2]["audio"].shape == (4, 8000) + assert batches[3]["audio"].shape == (4, 8000) + assert batches[4]["audio"].shape == (4, 8000) + # exactly 20 cuts were used because we cut 10x 1s cuts into 20x 0.5s cuts + + @requires_torchaudio def test_dataloader_from_lhotse_shar_cuts(cutset_shar_path: Path): config = OmegaConf.create( @@ -770,7 +828,46 @@ def test_lazy_nemo_iterator_with_relative_paths(tmp_path: Path): assert cut.num_samples == 8000 assert cut.supervisions[0].text == "irrelevant" assert audio.shape == (1, 8000) - np.testing.assert_allclose(audio[0], expected_audio[:8000], atol=5e-5) + np.testing.assert_equal(audio[0], expected_audio[:8000]) + + +def test_lhotse_cuts_resolve_relative_paths(tmp_path: Path): + cuts_path = tmp_path / "cuts.jsonl.gz" + audio_path = tmp_path / "_relative_test_audio_.wav" + lhotse.audio.save_audio(audio_path, np.random.rand(16000) - 0.5, 16000) + cut = 
Recording.from_file(audio_path).to_cut() + cut.recording.sources[0].source = str(audio_path.name) # make the path relative + cut.target_recording = cut.recording # assign a custom field with relative path + with NumpyFilesWriter(tmp_path) as w: + cut.some_array = w.store_array(cut.id, np.random.randn(32)) + cut.some_array.storage_path = "" # relative path + + with pytest.raises(AudioLoadingError): + cut.load_audio() # Lhotse doesn't know about what the path should be relative to + cut.load_target_recording() + + CutSet([cut]).to_file(cuts_path) + + config = OmegaConf.create( + {"cuts_path": cuts_path, "sample_rate": 16000, "use_lhotse": True, "num_workers": 0, "batch_size": 2,} + ) + + class _Identity(torch.utils.data.Dataset): + def __getitem__(self, x): + return x + + dl = get_lhotse_dataloader_from_config(config=config, global_rank=0, world_size=1, dataset=_Identity()) + + batches = [batch for batch in dl] + assert len(batches) == 1 + + for cut in batches[0]: + assert cut.has_recording + cut.load_audio() # works + assert cut.has_custom("target_recording") + cut.load_target_recording() + assert cut.has_custom("some_array") + cut.load_some_array() class Identity(torch.utils.data.Dataset): From 468d5b6d369733909524d42b80a514f33bc19263 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:39:31 -0700 Subject: [PATCH 39/39] Akoumparouli/low mem mixtral ckpt converter (#8895) * add --low-mem option to enable conversion of large checkpoints with low ram requirements Signed-off-by: Alexandros Koumparoulis * delete param_to_weights Signed-off-by: Alexandros Koumparoulis * various fixes; set hf dtype to auto Signed-off-by: Alexandros Koumparoulis * remove unused lien Signed-off-by: Alexandros Koumparoulis * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../convert_mixtral_hf_to_nemo.py | 109 ++++++++++++++---- 1 file changed, 88 insertions(+), 21 deletions(-) diff --git a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py index ac323757a2f6..98143c0328ec 100644 --- a/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_mixtral_hf_to_nemo.py @@ -24,6 +24,7 @@ import os from argparse import ArgumentParser from collections import OrderedDict +from pathlib import Path import megatron.core.parallel_state as parallel_state import torch @@ -43,6 +44,8 @@ ) from nemo.utils import logging +torch.set_grad_enabled(False) + def get_args(): parser = ArgumentParser() @@ -51,6 +54,8 @@ def get_args(): ) parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to output .nemo file.") parser.add_argument("--precision", type=str, default="32", help="Model precision") + parser.add_argument('--low-ram', action='store_true') + parser.add_argument('--tmp-dir', default='/tmp/mixtral_ckpt_parts/') args = parser.parse_args() return args @@ -108,6 +113,9 @@ def load_config(mixtral_config, tokenizer_path): # RMSNorm's epsilon. 
nemo_config.layernorm_epsilon = mixtral_config['rms_norm_eps'] nemo_config.normalization = 'rmsnorm' + nemo_config.micro_batch_size = 1 + nemo_config.global_batch_size = 1 + nemo_config.expert_model_parallel_size = 1 if 'num_key_value_heads' in mixtral_config: nemo_config.num_query_groups = mixtral_config['num_key_value_heads'] @@ -132,24 +140,28 @@ def load_config(mixtral_config, tokenizer_path): return nemo_config -def load_mixtral_ckpt(in_dir): +def load_hf_model_args(in_dir): params_file = os.path.join(in_dir, 'config.json') assert os.path.exists(params_file) with open(params_file, 'r') as fp: model_args = json.load(fp) + return model_args + - model = AutoModelForCausalLM.from_pretrained(in_dir) - ckpt = model.state_dict() +def load_mixtral_ckpt(in_dir, load_model=True): + model_args = load_hf_model_args(in_dir) + ckpt = None + if load_model: + model = AutoModelForCausalLM.from_pretrained(in_dir, torch_dtype='auto') + ckpt = model.state_dict() tokenizer = AutoTokenizer.from_pretrained(in_dir) assert tokenizer.vocab_size == model_args['vocab_size'] return model_args, ckpt, tokenizer -def convert(args): - logging.info(f"loading checkpoint {args.input_name_or_path}") - - model_args, ckpt, tokenizer = load_mixtral_ckpt(args.input_name_or_path) +def make_trainer(args, nemo_config): + model_args, ckpt, tokenizer = load_mixtral_ckpt(args.input_name_or_path, load_model=False) nemo_config = load_config(model_args, tokenizer.vocab_file) if args.precision in ["32", "16"]: @@ -195,6 +207,14 @@ def convert(args): print(f"nemo_config: {nemo_config}") trainer = Trainer(plugins=plugins, accelerator='cpu', strategy=NLPDDPStrategy()) + return trainer, dtype + + +def convert(args): + logging.info(f"loading checkpoint {args.input_name_or_path}") + + model_args, ckpt, tokenizer = load_mixtral_ckpt(args.input_name_or_path) + nemo_config = load_config(model_args, tokenizer.vocab_file) hidden_size = nemo_config.hidden_size head_num = nemo_config.num_attention_heads @@ -207,8 +227,6 @@ def convert(args): 'transformer_engine', False ), "mcore_gpt transformer_engine must be enabled (or disabled) together." - param_to_weights = lambda param: param.float() - checkpoint = OrderedDict() checkpoint['state_dict'] = OrderedDict() @@ -217,7 +235,7 @@ def convert(args): embed_weights_base_name = f'model.embedding.word_embeddings.weight' else: embed_weights_base_name = f'model.language_model.embedding.word_embeddings.weight' - checkpoint['state_dict'][embed_weights_base_name] = param_to_weights(embed_weight) + checkpoint['state_dict'][embed_weights_base_name] = embed_weight if nemo_config.num_query_groups is None or nemo_config.num_query_groups == head_num: num_query_groups = head_num @@ -227,6 +245,10 @@ def convert(args): if mcore_gpt: assert nemo_config.activation.startswith('fast-'), 'mcore only supports fast version of gated linear unit.' 
+ yield checkpoint + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() + for l in range(int(num_layers)): print(f"converting layer {l}") old_tensor_shape = ckpt[f'model.layers.{l}.self_attn.q_proj.weight'].size() @@ -249,7 +271,7 @@ def convert(args): qkv_weights_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.weight' else: qkv_weights_base_name = f'model.language_model.encoder.layers.{l}.self_attention.query_key_value.weight' - checkpoint['state_dict'][qkv_weights_base_name] = param_to_weights(qkv_weights) + checkpoint['state_dict'][qkv_weights_base_name] = qkv_weights # attention dense o_weight = ckpt[f'model.layers.{l}.self_attn.o_proj.weight'] @@ -257,7 +279,7 @@ def convert(args): o_weight_base_name = f'model.decoder.layers.{l}.self_attention.linear_proj.weight' else: o_weight_base_name = f'model.language_model.encoder.layers.{l}.self_attention.dense.weight' - checkpoint['state_dict'][o_weight_base_name] = param_to_weights(o_weight) + checkpoint['state_dict'][o_weight_base_name] = o_weight # # MLP # Handle gate @@ -266,7 +288,7 @@ def convert(args): moe_gate_name = f'model.decoder.layers.{l}.mlp.router.weight' else: raise Exception("not implemented") - checkpoint['state_dict'][moe_gate_name] = param_to_weights(moe_gate) + checkpoint['state_dict'][moe_gate_name] = moe_gate # Handle experts for i in range(nemo_config.num_moe_experts): gate_proj = ckpt[f'model.layers.{l}.block_sparse_moe.experts.{i}.w1.weight'] @@ -276,14 +298,14 @@ def convert(args): else: raise Exception("not implemented") mlp_down_weight = torch.cat((gate_proj, up_proj), axis=0) - checkpoint['state_dict'][mlp_down_base_name] = param_to_weights(mlp_down_weight) + checkpoint['state_dict'][mlp_down_base_name] = mlp_down_weight mlp_up_weight = ckpt[f'model.layers.{l}.block_sparse_moe.experts.{i}.w2.weight'] if mcore_gpt: mlp_up_base_name = f'model.decoder.layers.{l}.mlp.experts.local_experts.{i}.linear_fc2.weight' else: raise Exception("not implemented") - checkpoint['state_dict'][mlp_up_base_name] = param_to_weights(mlp_up_weight) + checkpoint['state_dict'][mlp_up_base_name] = mlp_up_weight # LayerNorm input_ln_weight = ckpt[f'model.layers.{l}.input_layernorm.weight'] @@ -292,7 +314,7 @@ def convert(args): input_ln_base_name = f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight' else: input_ln_base_name = f'model.language_model.encoder.layers.{l}.input_layernorm.weight' - checkpoint['state_dict'][input_ln_base_name] = param_to_weights(input_ln_weight) + checkpoint['state_dict'][input_ln_base_name] = input_ln_weight post_attn_ln_weight = ckpt[f'model.layers.{l}.post_attention_layernorm.weight'] if mcore_gpt: @@ -301,28 +323,57 @@ def convert(args): post_attn_ln_base_name = f'model.decoder.layers.{l}.pre_mlp_layernorm.weight' else: post_attn_ln_base_name = f'model.language_model.encoder.layers.{l}.post_attention_layernorm.weight' - checkpoint['state_dict'][post_attn_ln_base_name] = param_to_weights(post_attn_ln_weight) + checkpoint['state_dict'][post_attn_ln_base_name] = post_attn_ln_weight print(f"done layer {l}") + yield checkpoint + checkpoint = OrderedDict() + checkpoint['state_dict'] = OrderedDict() + final_ln_weight = ckpt[f'model.norm.weight'] if mcore_gpt: final_ln_base_name = f'model.decoder.final_layernorm.weight' else: final_ln_base_name = f'model.language_model.encoder.final_layernorm.weight' - checkpoint['state_dict'][final_ln_base_name] = param_to_weights(final_ln_weight) + checkpoint['state_dict'][final_ln_base_name] = final_ln_weight 
output_layer_weight = ckpt[f'lm_head.weight'] if mcore_gpt: output_layer_base_name = f'model.output_layer.weight' else: output_layer_base_name = f'model.language_model.output_layer.weight' - checkpoint['state_dict'][output_layer_base_name] = param_to_weights(output_layer_weight) + checkpoint['state_dict'][output_layer_base_name] = output_layer_weight checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config - + yield checkpoint del ckpt + +def merge(a: dict, b: dict, path=[]): + is_dict = lambda x: isinstance(x, OrderedDict) or isinstance(x, dict) + for key in b: + if key in a: + if is_dict(a[key]) and is_dict(b[key]): + merge(a[key], b[key], path + [str(key)]) + elif a[key] != b[key]: + raise Exception('Value conflict: ' + '.'.join(path + [str(key)])) + else: + a[key] = b[key] + return a + + +def save_to_nemo(args, checkpoint): + + logging.info(f"loading checkpoint {args.input_name_or_path}") + model_args, ckpt, tokenizer = load_mixtral_ckpt(args.input_name_or_path, load_model=False) + nemo_config = load_config(model_args, tokenizer.vocab_file) + trainer, dtype = make_trainer(args, nemo_config) + + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY] = nemo_config + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY].use_cpu_initialization = True + checkpoint[MegatronGPTModel.CHECKPOINT_HYPER_PARAMS_KEY].perform_initialization = False + if nemo_config.get('megatron_amp_O2', False): keys = list(checkpoint['state_dict'].keys()) for key in keys: @@ -342,5 +393,21 @@ def convert(args): if __name__ == '__main__': args = get_args() + if args.low_ram: + os.makedirs(args.tmp_dir, exist_ok=True) + parallel_state.set_expert_model_parallel_world_size(1) - convert(args) + checkpoint = OrderedDict() + for i, ckpt_part in enumerate(convert(args)): + if args.low_ram: + torch.save(ckpt_part, f'{args.tmp_dir}/nemo_ckpt_part_{i}.pth') + else: + checkpoint = merge(checkpoint, ckpt_part) + + if args.low_ram: + print("Loading partial checkpoints") + for path in map(str, Path(args.tmp_dir).rglob("*.pth")): + print(f"Loading checkpoint: {path}") + checkpoint = merge(checkpoint, torch.load(path, mmap=True)) + + save_to_nemo(args, checkpoint)