Merge pull request #731 from TransformerLensOrg/model-llama-32

Model llama 3.2
TransformerLensOrg · Sep 26, 2024 · 4b8321c · 4b8321c
2 parents 762bb5d + 2f3f1ae
commit 4b8321c
Showing 1 changed file with 80 additions and 0 deletions.
diff --git a/transformer_lens/loading_from_pretrained.py b/transformer_lens/loading_from_pretrained.py
@@ -151,6 +151,10 @@
  "meta-llama/Meta-Llama-3-8B-Instruct",
  "meta-llama/Meta-Llama-3-70B",
  "meta-llama/Meta-Llama-3-70B-Instruct",
+ "meta-llama/Llama-3.2-1B",
+ "meta-llama/Llama-3.2-3B",
+ "meta-llama/Llama-3.2-1B-Instruct",
+ "meta-llama/Llama-3.2-3B-Instruct",
  "Baidicoot/Othello-GPT-Transformer-Lens",
  "bert-base-cased",
  "roneneldan/TinyStories-1M",
@@ -885,6 +889,82 @@ def convert_hf_model_config(model_name: str, **kwargs):
  "final_rms": True,
  "gated_mlp": True,
  }
+ elif "Llama-3.2-1B" in official_model_name:
+ cfg_dict = {
+ "d_model": 2048,
+ "d_head": 64,
+ "n_heads": 32,
+ "d_mlp": 8192,
+ "n_layers": 16,
+ "n_ctx": 2048, # capped due to memory issues
+ "eps": 1e-5,
+ "d_vocab": 128256,
+ "act_fn": "silu",
+ "n_key_value_heads": 8,
+ "normalization_type": "RMS",
+ "positional_embedding_type": "rotary",
+ "rotary_adjacent_pairs": False,
+ "rotary_dim": 64,
+ "final_rms": True,
+ "gated_mlp": True,
+ }
+ elif "Llama-3.2-3B" in official_model_name:
+ cfg_dict = {
+ "d_model": 3072,
+ "d_head": 128,
+ "n_heads": 24,
+ "d_mlp": 8192,
+ "n_layers": 28,
+ "n_ctx": 2048, # capped due to memory issues
+ "eps": 1e-5,
+ "d_vocab": 128256,
+ "act_fn": "silu",
+ "n_key_value_heads": 8,
+ "normalization_type": "RMS",
+ "positional_embedding_type": "rotary",
+ "rotary_adjacent_pairs": False,
+ "rotary_dim": 128,
+ "final_rms": True,
+ "gated_mlp": True,
+ }
+ elif "Llama-3.2-1B-Instruct" in official_model_name:
+ cfg_dict = {
+ "d_model": 2048,
+ "d_head": 64,
+ "n_heads": 32,
+ "d_mlp": 8192,
+ "n_layers": 16,
+ "n_ctx": 2048, # capped due to memory issues
+ "eps": 1e-5,
+ "d_vocab": 128256,
+ "act_fn": "silu",
+ "n_key_value_heads": 8,
+ "normalization_type": "RMS",
+ "positional_embedding_type": "rotary",
+ "rotary_adjacent_pairs": False,
+ "rotary_dim": 64,
+ "final_rms": True,
+ "gated_mlp": True,
+ }
+ elif "Llama-3.2-3B-Instruct" in official_model_name:
+ cfg_dict = {
+ "d_model": 3072,
+ "d_head": 128,
+ "n_heads": 24,
+ "d_mlp": 8192,
+ "n_layers": 28,
+ "n_ctx": 2048, # capped due to memory issues
+ "eps": 1e-5,
+ "d_vocab": 128256,
+ "act_fn": "silu",
+ "n_key_value_heads": 8,
+ "normalization_type": "RMS",
+ "positional_embedding_type": "rotary",
+ "rotary_adjacent_pairs": False,
+ "rotary_dim": 128,
+ "final_rms": True,
+ "gated_mlp": True,
+ }
  elif architecture == "GPTNeoForCausalLM":
  cfg_dict = {
  "d_model": hf_config.hidden_size,