Pass in rotary_base to mcore and from HF (#7933)

* Pass in rotary_base to mcore and from HF Signed-off-by: Igor Gitman <igitman@nvidia.com> * Allow changing rotary_base from the sft config file Signed-off-by: Igor Gitman <igitman@nvidia.com> * Update mcore in jenkins Signed-off-by: Igor Gitman <igitman@nvidia.com> --------- Signed-off-by: Igor Gitman <igitman@nvidia.com> Co-authored-by: Eric Harper <complex451@gmail.com>
NVIDIA · Dec 3, 2023 · ae5d7e8 · ae5d7e8
1 parent acf1d9b
commit ae5d7e8
Show file tree

Hide file tree

Showing 4 changed files with 8 additions and 2 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -72,8 +72,8 @@ pipeline {
       steps {
          sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
              cd Megatron-LM && \
-             git checkout e122536b7645edcb7ebf099b5c92a443f7dbf8e7 && \
-             pip install -e .'
+             git checkout 973330e9c3681604703bf1eb6b5a265d1b9b9b38 && \
+             pip install .'
       }
     }
 

diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py b/examples/nlp/language_modeling/tuning/megatron_gpt_sft.py
@@ -90,6 +90,9 @@ def _modify_config(gpt_cfg, cfg, add_cfg_to_tree=False):
         if cfg.model.get('seq_len_interpolation_factor', None) is not None:
             gpt_cfg.seq_len_interpolation_factor = cfg.model.seq_len_interpolation_factor
 
+        if cfg.model.get('rotary_base', None) is not None:
+            gpt_cfg.rotary_base = cfg.model.rotary_base
+
         sft_cls = MegatronGPTSFTModel
         gpt_cfg.target = f"{sft_cls.__module__}.{sft_cls.__name__}"
 

diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py
@@ -318,6 +318,7 @@ def model_provider_func(self, pre_process, post_process):
                 position_embedding_type=self.cfg.get('position_embedding_type', 'learned_absolute'),
                 rotary_percent=self.cfg.get('rotary_percentage', 1.0),
                 seq_len_interpolation_factor=self.cfg.get('seq_len_interpolation_factor', None),
+                rotary_base=self.cfg.get('rotary_base', 10000),
             )
         else:
             assert self.cfg.get('num_query_groups', None) is None or self.cfg.get(

diff --git a/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py b/scripts/nlp_language_modeling/convert_hf_llama_to_nemo.py
@@ -116,6 +116,8 @@ def load_config(args, llama_config):
             nemo_config['seq_len_interpolation_factor'] = llama_config['rope_scaling']['factor']
         else:
             raise ValueError("Only linear rope scaling type is supported now")
+    if llama_config['rope_theta'] is not None:
+        nemo_config['rotary_base'] = llama_config['rope_theta']
 
     base = 128
     while llama_config['vocab_size'] % base != 0: