diff --git a/Jenkinsfile b/Jenkinsfile
index 1f974333dd3a..12fafac57a67 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -72,8 +72,8 @@ pipeline {
       steps {
         sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
             cd Megatron-LM && \
-            git checkout e122536b7645edcb7ebf099b5c92a443f7dbf8e7 && \
-            pip install -e .'
+            git checkout 973330e9c3681604703bf1eb6b5a265d1b9b9b38 && \
+            pip install .'
       }
     }
 
diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py
index 79dd20fcf84a..b6325be40829 100644
--- a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py
+++ b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py
@@ -90,6 +90,9 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
         if cfg.model.get('seq_len_interpolation_factor', None) is not None:
             gpt_cfg.seq_len_interpolation_factor = cfg.model.seq_len_interpolation_factor
 
+        if cfg.model.get('rotary_base', None) is not None:
+            gpt_cfg.rotary_base = cfg.model.rotary_base
+
         sft_cls = MegatronGPTSFTModel
         gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}"
 
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
index 5b14532016c5..c2e39ea03a3e 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -318,6 +318,7 @@ def model_provider_func(self, pre_process, post_process):
                 position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'),
                 rotary_percent=self.cfg.get('rotary_percentage', 1.0),
                 seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None),
+                rotary_base=self.cfg.get('rotary_base', 10000),
             )
         else:
             assert self.cfg.get('num_query_groups', None) is None or self.cfg.get(
diff --git a/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py
index c281088f8c5c..d1453aeee972 100644
--- a/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py
+++ b/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py
@@ -116,6 +116,8 @@ def load_config(args, llama_config):
             nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor']
         else:
             raise ValueError("Only linear rope scaling type is supported now")
+    if llama_config['rope_theta'] is not None:
+        nemo_config['rotary_base'] = llama_config['rope_theta']
 
     base = 128
     while llama_config['vocab_size'] % base != 0:
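
Taken together, the patch threads a configurable RoPE base through NeMo: the Llama converter copies `rope_theta` from the Hugging Face config into the NeMo config as `rotary_base`, the SFT config-merging helper propagates `model.rotary_base`, and the model provider falls back to the conventional default of 10000 when the key is absent. The dependency-free Python sketch below is illustrative only and is not part of the patch; the helper names are hypothetical, while the key names and the 10000 default come from the diff above.

# Illustrative sketch of the rotary_base plumbing added by this patch.
# The helper names are hypothetical; only the config keys and the
# default value of 10000 are taken from the patch itself.

def hf_to_nemo_rotary_base(llama_config: dict, nemo_config: dict) -> dict:
    """Mirror of the converter change: copy HF `rope_theta` into `rotary_base`."""
    if llama_config.get('rope_theta') is not None:
        nemo_config['rotary_base'] = llama_config['rope_theta']
    return nemo_config

def resolve_rotary_base(model_cfg: dict) -> float:
    """Mirror of `self.cfg.get('rotary_base', 10000)` in the model provider."""
    return model_cfg.get('rotary_base', 10000)

if __name__ == '__main__':
    hf_cfg = {'rope_theta': 1000000.0}              # hypothetical Llama config value
    nemo_cfg = hf_to_nemo_rotary_base(hf_cfg, {})
    print(resolve_rotary_base(nemo_cfg))            # -> 1000000.0
    print(resolve_rotary_base({}))                  # falls back to 10000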