Remove unnecessary optim in convert_HF (#71)
francoishernandez authored Jul 12, 2024
1 parent 2009415 commit d3f05fe
Showing 7 changed files with 0 additions and 13 deletions.
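All 13 deletions follow the same pattern: every converter stops pinning optim="fusedadam" in its training config, and six of the seven also stop pinning self_attn_type="scaled-dot" in the model config, presumably deferring to the library defaults instead. A minimal sketch of the pattern, using plain dicts for illustration (the keyword names are taken from the hunks below; the enclosing config class names are not shown in the diff and are not assumed here):

    # Illustrative only: dict() stands in for the config objects the converters build.
    training_kwargs = dict(
        accum_count=[32],
        accum_steps=[0],
        valid_batch_size=256,
        # optim="fusedadam",  # dropped by this commit in all seven converters
    )
    model_kwargs = dict(
        layer_norm="rms",
        pos_ffn_activation_fn="silu",
        # self_attn_type="scaled-dot",  # dropped in six of the seven converters
    )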
1 change: 0 additions & 1 deletion eole/bin/convert/convert_HF.py
@@ -1029,7 +1029,6 @@ def get_weight(checkpoint, tensor_name):
quant_type=quant_type,
w_bit=w_bit,
group_size=group_size,
- optim="fusedadam",
),
)
config_dict = recursive_model_fields_set(config)
2 changes: 0 additions & 2 deletions eole/bin/convert/convert_T5.py
@@ -420,7 +420,6 @@ def run(cls, args):
),
layer_norm="rms",
pos_ffn_activation_fn="gated-gelu",
self_attn_type="scaled-dot",
relative_positions_buckets=params["relative_attention_num_buckets"],
parallel_residual=False,
add_qkvbias=False,
@@ -437,7 +436,6 @@ def run(cls, args):
accum_count=[32],
accum_steps=[0],
valid_batch_size=256,
- optim="fusedadam",
),
)

2 changes: 0 additions & 2 deletions eole/bin/convert/convert_falcon.py
@@ -381,7 +381,6 @@ def run(cls, args):
# tgt_word_vec_size=tgt_word_vec_size,
model_type="text",
pos_ffn_activation_fn="gelu",
- self_attn_type="scaled-dot", # not sure if scaled-dot-flash is fine
num_kv=num_kv,
parallel_residual=True,
shared_layer_norm=shared_layer,
@@ -397,7 +396,6 @@ def run(cls, args):
accum_count=[32],
accum_steps=[0],
valid_batch_size=256,
- optim="fusedadam",
),
)

2 changes: 0 additions & 2 deletions eole/bin/convert/convert_llama.py
@@ -374,7 +374,6 @@ def run(cls, args):
layer_norm="rms",
norm_eps=norm_eps,
pos_ffn_activation_fn="silu",
- self_attn_type="scaled-dot",
rotary_interleave=True,
rotary_theta=10000,
rotary_dim=0,
@@ -393,7 +392,6 @@ def run(cls, args):
accum_count=[32],
accum_steps=[0],
valid_batch_size=256,
- optim="fusedadam",
),
)

2 changes: 0 additions & 2 deletions eole/bin/convert/convert_mpt.py
@@ -223,7 +223,6 @@ def run(cls, args):
# tgt_word_vec_size=tgt_word_vec_size,
layer_norm="standard",
pos_ffn_activation_fn="gelu",
- self_attn_type="scaled-dot",
parallel_residual=False,
add_qkvbias=False,
add_ffnbias=False,
@@ -237,7 +236,6 @@ def run(cls, args):
accum_count=[32],
accum_steps=[0],
valid_batch_size=256,
- optim="fusedadam",
),
)

2 changes: 0 additions & 2 deletions eole/bin/convert/convert_redpajama.py
@@ -281,7 +281,6 @@ def run(cls, args):
# tgt_word_vec_size=tgt_word_vec_size,
layer_norm="standard",
pos_ffn_activation_fn="gelu",
- self_attn_type="scaled-dot",
parallel_residual=False,
add_qkvbias=True,
add_ffnbias=True,
@@ -295,7 +294,6 @@ def run(cls, args):
accum_count=[32],
accum_steps=[0],
valid_batch_size=256,
- optim="fusedadam",
),
)

2 changes: 0 additions & 2 deletions eole/bin/convert/convert_xgen.py
@@ -239,7 +239,6 @@ def run(cls, args):
# tgt_word_vec_size=tgt_word_vec_size,
layer_norm="rms",
pos_ffn_activation_fn="silu",
- self_attn_type="scaled-dot",
parallel_residual=False,
add_qkvbias=False,
add_ffnbias=False,
@@ -253,7 +252,6 @@ def run(cls, args):
accum_count=[32],
accum_steps=[0],
valid_batch_size=256,
- optim="fusedadam",
),
)

