From fd14eca1d38eb33d87ee4f25f52fd5122ed6c414 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Mon, 15 Apr 2024 17:07:38 +0200 Subject: [PATCH 01/10] test 1 --- src/lighteval/models/nanotron_model.py | 105 ++++++++++++++++--------- 1 file changed, 68 insertions(+), 37 deletions(-) diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 89c1b75c..64ce85bd 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -# ruff: noqa: C901,E120 +# ruff: noqa: C901 import os import time from typing import List, Optional, Tuple, Type, Union @@ -54,6 +54,7 @@ LoglikelihoodDataset, LoglikelihoodSingleTokenDataset, ) +from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.models.base_model import LightevalModel from lighteval.models.model_config import EnvConfig from lighteval.models.model_output import Batch, GenerateReturn, LoglikelihoodReturn, LoglikelihoodSingleTokenReturn @@ -1138,7 +1139,7 @@ def greedy_until( # automatic (variable) batch size detection for vectorization # pull longest context sample from request for request in requests: - request.stop_sequence = list(request.stop_sequence) + [self.tokenizer.eos_token] + request.stop_sequence = as_list(request.stop_sequence) + [self.tokenizer.eos_token] request.tokenized_context = self.tok_encode(request.context) dataset = GenerativeTaskDatasetNanotron(requests=requests, dataset_splits=dataset_splits) @@ -1161,13 +1162,20 @@ def greedy_until( dataset.split_start = subset_start dataset.split_end = min(subset_start + subset_length, total_length) - context_enc = dataset[0][1].tokenized_context - max_gen = max(item[1].generation_size for item in dataset) - max_input_length = min(len(context_enc) + max_gen, self.max_length) + if dataset[0].generation_size is None: + # No constraints on the generation size: max length allowed is the max model context + max_input_length = self.max_length + else: + # Longest context in the current split is the first item (since we sort reversed) + context_enc = dataset[0][1].tokenized_context + max_gen = max(item[1].generation_size for item in dataset) + max_input_length = min(len(context_enc) + max_gen, self.max_length) + batch_size = self._get_batch_size( override_bs=override_bs, max_input_length=max_input_length, starting_batch_size=starting_batch_size ) - starting_batch_size = batch_size * 2 # for the next round + # For next iteration, since the batch will be smaller, we'll test a bigger batch size + starting_batch_size = batch_size * 2 # For the DP replicas distributed_sampler = GenDistributedSampler( @@ -1188,7 +1196,7 @@ def greedy_until( ) tq = tqdm(dataloader, desc=f"greedy in subset {s} Node {dist.get_rank(self.parallel_context.world_pg)}") - for j, all_batch in enumerate(tq): + for j, indexed_batch in enumerate(tq): if j < 3: log_rank( f"Memory usage: {torch.cuda.memory_allocated() / 1024**2:.2f}MB. 
Peak reserved memory: {torch.cuda.max_memory_reserved() / 1024**2:.2f}MB", logger=logger, level=logging.INFO, group=self.parallel_context.world_pg, rank=0, ) iteration_start_time = time.time() - example_index, batch_data = zip(*all_batch) - context = [c.tokenized_context for c in batch_data] - # we take the longest asked generation in the batch - # Multiple request may have different max generation length - max_tokens = max(d.generation_size for d in batch_data) # d[1][1] - if max_tokens <= 0: - raise ValueError("Greedy generation requires a positive value for max generation but we got -1") - - max_context = self.max_length - max_tokens - padding_length = min(len(context[0]), max_context) - batch_model = self.prepare_batch( + example_index, batch = zip(*indexed_batch) + + # NOTE: we are assuming all items in a batch behave similarly (same + # stop_tokens and max_tokens generated) which is not necessarily + # the case! Because of that we should only use a batch size of 1 + + # Since items are sorted by inverse length, the first one always has + # the maximum allowed generation size for the batch, unless we want to force truncation + # need to pass them somewhere ! stop_tokens = batch[0].stop_sequence + max_new_tokens = batch[0].generation_size + + # The main question for this step is the following: + # Would we rather truncate the prompt to allow generation to go to max_new_tokens, at the risk + # of losing some meaning, or have some generations that are exceedingly short? + # The choice we go for here is to avoid truncating the prompt if we can, since it + # should have been managed by the prompt creator/few shot manager if requested by the user. + context = [c.context for c in batch] # or tokenized context? + smallest_context = min(len(c) for c in context) + biggest_context = max(len(c) for c in context) + if smallest_context > self.max_length: + hlog_warn( + f"The smallest context of your batch ({smallest_context}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in" + + str({i.task_name for i in batch}) + + ". This is likely to lead to some errors." 
# noqa C401 + ) + + if ( + biggest_context > self.max_length + ): # There will be truncation of at least one sample, maximum generation size will be one + max_new_tokens = 1 + else: # We can't allow generation of more than max_length + max_new_tokens = min(self.max_length - biggest_context, max_new_tokens) + + # See doc https://huggingface.co/docs/transformers/v4.38.2/en/pad_truncation#padding-and-truncation + # Will do left truncation and padding, as defined when creating the tokenizer + tokenized = self.tokenizer( context, - padding_length=padding_length, - max_context=max_context, - pad_on_left=True, - full_attention_masks=False, + truncation="longest_first", # we truncate to the model max length if needed + padding="longest", # we pad to the longest sequence + return_tensors="pt", + max_length=self.max_length - 1, # we always allow minimum one token of generation + add_special_tokens=self.add_special_tokens, + ).to(self.device) + + batch_model = Batch( + input_ids=tokenized["input_ids"], + input_lengths=[len(item == 1) for item in tokenized["attention_mask"]], + input_mask=tokenized["attention_mask"], + truncated=[ + len(c) - tokenized["input_ids"].shape[1] if len(c) > tokenized["input_ids"].shape[1] else 0 + for c in context + ], + padded=[sum(mask == 0) for mask in tokenized["attention_mask"]], ) # responses, logits and input_ids have all been gathered accross GPUs already @@ -1222,10 +1267,9 @@ def greedy_until( outputs = decode_tokenized( input_ids=batch_model.input_ids, input_mask=batch_model.input_mask, - # TODO @thomasw21: From ModelWithLoss extract the model. model=self.model, parallel_context=self.parallel_context, - max_new_tokens=max_tokens, + max_new_tokens=max_new_tokens, max_micro_batch_size=batch_size, # ok for PP=1 for PP>1 we'll need to split the batch returns_logits=returns_logits, generation_config=self.generation_config, @@ -1241,19 +1285,6 @@ def greedy_until( logits = torch.stack([o.logits for o in outputs]) logits, len_logits = self.pad_and_gather(logits) - # if returns_logits: - # # Used input_ids to get its max_length - # transition_scores, len_logits = self.pad_and_gather(transition_scores) - # else: - # transition_scores, len_logits = None, None - - # responses, logits, input_ids, len_resps, len_logits, len_ids = self._model_generate( - # input_ids=batched_inputs, - # attention_mask=attention_masks, - # max_tokens=max_tokens, - # stop=stop_tokens, - # returns_logits=returns_logits, - # ) if dist.get_rank(self.parallel_context.pp_pg) == self.output_pp_rank: generations = batch_generations.numpy(force=True) input_ids = batch_input_ids.numpy(force=True) From 90ed9e39d91c6fe6d65f1d6574078960b3ab787f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Mon, 15 Apr 2024 17:32:38 +0200 Subject: [PATCH 02/10] fix --- src/lighteval/models/nanotron_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 64ce85bd..69ad420f 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -1162,7 +1162,7 @@ def greedy_until( dataset.split_start = subset_start dataset.split_end = min(subset_start + subset_length, total_length) - if dataset[0].generation_size is None: + if dataset[0][1].generation_size is None: # No constraints on the generation size: max length allowed is the max model context max_input_length = self.max_length else: From 114a98199118938079cf85105a9c0ee35fdc7f13 Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: 
Thu, 18 Apr 2024 13:13:00 +0000 Subject: [PATCH 03/10] nanotron tests --- run_evals_nanotron.py | 6 + src/lighteval/main_nanotron.py | 20 ++- src/lighteval/metrics/metrics.py | 58 ++++---- src/lighteval/models/nanotron_model.py | 5 +- tests/config/README.md | 24 ++++ .../lighteval_config_override_custom.yaml | 30 ++++ .../reference_task_scores_nanotron.py | 48 +++++++ tests/test_main_nanotron.py | 129 ++++++++++++++++++ 8 files changed, 288 insertions(+), 32 deletions(-) mode change 100644 => 100755 run_evals_nanotron.py create mode 100644 tests/config/README.md create mode 100644 tests/config/lighteval_config_override_custom.yaml create mode 100644 tests/reference_scores/reference_task_scores_nanotron.py create mode 100644 tests/test_main_nanotron.py diff --git a/run_evals_nanotron.py b/run_evals_nanotron.py old mode 100644 new mode 100755 index 3a4a2a42..8d354dfc --- a/run_evals_nanotron.py +++ b/run_evals_nanotron.py @@ -45,6 +45,12 @@ def get_parser(): default=None, help="Cache directory", ) + parser.add_argument( + "--max_samples", + type=int, + required=True, + help="number of samples used for evaluation", + ) return parser diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 4610ea86..701e8922 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -21,6 +21,7 @@ # SOFTWARE. # flake8: noqa: C901 +from argparse import Namespace import os import random from typing import Optional, Type @@ -37,6 +38,7 @@ from lighteval.tasks.registry import Registry, get_custom_tasks, taskinfo_selector from lighteval.utils import NO_NANOTRON_ERROR_MSG, is_nanotron_available from lighteval.utils_parallelism import test_all_gather +import torch if not is_nanotron_available(): @@ -48,7 +50,6 @@ from nanotron.parallel.context import ParallelContext from nanotron.utils import local_ranks_zero_first - logger = get_logger(__name__) SEED = 1234 @@ -56,7 +57,7 @@ CACHE_DIR = os.getenv("HF_HOME", "/scratch") -@htrack() +# @htrack() def main( checkpoint_config_path: str, lighteval_config_path: Optional[str] = None, @@ -64,7 +65,13 @@ def main( config_cls: Type = Config, model_config_cls: Optional[Type] = None, model_cls: Optional[Type] = None, + args: Optional[Namespace] = None # accept args for more flexibility ): + if args is not None: + checkpoint_config_path= args.checkpoint_config_path if checkpoint_config_path==None else checkpoint_config_path + lighteval_config_path= args.lighteval_override if lighteval_config_path==None else lighteval_config_path + cache_dir=args.cache_dir if cache_dir==None else cache_dir + if cache_dir is None: cache_dir = CACHE_DIR @@ -89,6 +96,9 @@ def main( nanotron_config.lighteval = lighteval_config else: lighteval_config = nanotron_config.lighteval + + if args.max_samples is not None: + lighteval_config.tasks.max_samples=args.max_samples parallel_context = ParallelContext( tensor_parallel_size=lighteval_config.parallelism.tp, @@ -157,8 +167,13 @@ def main( with htrack_block("Setting seeds and waiting for all processes"): hlog(f"setting seed to {SEED} for random and numpy") + + torch.manual_seed(SEED) + if torch.cuda.is_available(): + torch.cuda.manual_seed(SEED) random.seed(SEED) np.random.seed(SEED) + dist.barrier() with htrack_block("Evaluation"): @@ -192,3 +207,4 @@ def main( hlog(make_results_table(final_dict)) return final_dict + diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index e685a450..d57e2303 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py 
@@ -225,35 +225,35 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - llm_judge_multi_turn = SampleLevelMetricGrouping( - metric=["single_turn", "multi_turn"], - higher_is_better=True, - category=MetricCategory.GENERATIVE_MULTI_TURN, - use_case=MetricUseCase.SUMMARIZATION, - sample_level_fn=JudgeLLM( - judge_model_name="gpt-3.5-turbo", - template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", - multi_turn=True, - ).compute, - corpus_level_fn={ - "single_turn": np.mean, - "multi_turn": np.mean, - }, - ) - llm_judge = SampleLevelMetricGrouping( - metric=["judge_score"], - higher_is_better=True, - category=MetricCategory.GENERATIVE, - use_case=MetricUseCase.SUMMARIZATION, - sample_level_fn=JudgeLLM( - judge_model_name="gpt-3.5-turbo", - template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", - multi_turn=False, - ).compute, - corpus_level_fn={ - "judge_score": np.mean, - }, - ) + # llm_judge_multi_turn = SampleLevelMetricGrouping( + # metric=["single_turn", "multi_turn"], + # higher_is_better=True, + # category=MetricCategory.GENERATIVE_MULTI_TURN, + # use_case=MetricUseCase.SUMMARIZATION, + # sample_level_fn=JudgeLLM( + # judge_model_name="gpt-3.5-turbo", + # template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", + # multi_turn=True, + # ).compute, + # corpus_level_fn={ + # "single_turn": np.mean, + # "multi_turn": np.mean, + # }, + # ) + # llm_judge = SampleLevelMetricGrouping( + # metric=["judge_score"], + # higher_is_better=True, + # category=MetricCategory.GENERATIVE, + # use_case=MetricUseCase.SUMMARIZATION, + # sample_level_fn=JudgeLLM( + # judge_model_name="gpt-3.5-turbo", + # template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", + # multi_turn=False, + # ).compute, + # corpus_level_fn={ + # "judge_score": np.mean, + # }, + # ) loglikelihood_acc = SampleLevelMetric( metric="acc", sample_level_fn=LoglikelihoodAcc().compute, diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 69ad420f..34645891 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -585,6 +585,8 @@ def prepare_batch( # since in _collate we make sure length is descending, the longest is always the first one. padding_length = padding_length if padding_length is not None else inplen + # if padding_length - inplen < 0: + # print("padding_length,inplen: ",padding_length,inplen) if padding_length - inplen < 0: raise ValueError("Negative padding") padded.append(padding_length - inplen) @@ -670,7 +672,7 @@ def _get_subsets(self, dataset, dataset_splits): def _loglikelihood_single_token( self, requests, disable_tqdm: bool = False, override_bs: int = -1, dataset_splits: int = 1 ) -> List[LoglikelihoodSingleTokenReturn]: - dataset = LoglikelihoodSingleTokenDataset(requests=requests) + dataset = LoglikelihoodSingleTokenDataset(requests=requests, dataset_splits=dataset_splits) res = [] # Dataset is sorted in descending size. 
@@ -985,6 +987,7 @@ def _loglikelihood_tokens( dist.all_gather(gathered_out, out, group=self.parallel_context.tp_pg, async_op=False) out = torch.cat(gathered_out, dim=-1) + ## debug: multi_logits is different, but similar out = out.transpose(0, 1) # [batch, seq_length, vocab] multi_logits = F.log_softmax(out, dim=-1) # [batch, padding_length, vocab] diff --git a/tests/config/README.md b/tests/config/README.md new file mode 100644 index 00000000..335de91f --- /dev/null +++ b/tests/config/README.md @@ -0,0 +1,24 @@ +# Nanotron tests guide +## How it works: +First select some tasks and then use the model to generate reference scores and save them in reference_task_scores_nanotron.py file, it has been done, but if you want to add a new task, you need to re-run it. + +After that, each time a test need to be conducted, the evaluation will be run and the results are compared to the previous reference score. + +## To run nanotron test: +``` +pytest tests/test_main_nanotron.py -sv +``` + +## Choose your own tasks for evaluation: +Modify the tasks.tasks in config file(lighteval/tests/config/lighteval_config_override_custom.yaml) to set the tasks. +Example: +``` +tasks: + custom_tasks: null + dataset_loading_processes: 1 + max_samples: 10 + multichoice_continuations_start_space: null + no_multichoice_continuations_start_space: null + num_fewshot_seeds: null + tasks: lighteval|anli:r1|0|0,lighteval|blimp:adjunct_island|0|0,... +``` diff --git a/tests/config/lighteval_config_override_custom.yaml b/tests/config/lighteval_config_override_custom.yaml new file mode 100644 index 00000000..71d75261 --- /dev/null +++ b/tests/config/lighteval_config_override_custom.yaml @@ -0,0 +1,30 @@ +batch_size: 16 +checkpoints_path: null +generation: null +logging: + hub_repo_details: null + hub_repo_results: null + hub_repo_tensorboard: zzhhjjj/debug-nanotron + local_output_path: /scratch/haojun/lighteval/nanotron-119M-seed-6-3188821 + push_details_to_hub: null + push_results_to_hub: null + push_results_to_tensorboard: true + tensorboard_metric_prefix: e +parallelism: + dp: 1 + expert_parallel_size: 1 + pp: 1 + pp_engine: 1f1b + tp: 1 + tp_linear_async_communication: false + tp_mode: ALL_REDUCE +slurm_script_dir: /fsx/haojun/logs_evals +slurm_template: /fsx/haojun/brrr/examples/get-started-kit/run_eval.slurm.jinja +tasks: + custom_tasks: null + dataset_loading_processes: 1 + max_samples: 10 + multichoice_continuations_start_space: null + no_multichoice_continuations_start_space: null + num_fewshot_seeds: null + tasks: lighteval|anli:r1|0|0,lighteval|blimp:adjunct_island|0|0,lighteval|blimp:ellipsis_n_bar_1|0|0,leaderboard|arc:challenge|25|0,leaderboard|hellaswag|10|0,leaderboard|mmlu:abstract_algebra|5|0,leaderboard|mmlu:college_chemistry|5|0,leaderboard|mmlu:computer_security|5|0,leaderboard|mmlu:us_foreign_policy|5|0,leaderboard|truthfulqa:mc|0|0,helm|mmlu:abstract_algebra|5|0,helm|mmlu:college_chemistry|5|0,helm|mmlu:computer_security|5|0,helm|mmlu:us_foreign_policy|5|0,helm|boolq|5|0,helm|hellaswag|5|0,leaderboard|gsm8k|5|0 diff --git a/tests/reference_scores/reference_task_scores_nanotron.py b/tests/reference_scores/reference_task_scores_nanotron.py new file mode 100644 index 00000000..e042de23 --- /dev/null +++ b/tests/reference_scores/reference_task_scores_nanotron.py @@ -0,0 +1,48 @@ +RESULTS_NANOTRON_LITE = { + "LLama-119M": { + "helm:boolq:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.0, 'pem_stderr': 0.0, 'pqem': 0.0, 'pqem_stderr': 0.0}, + "helm:hellaswag:5": {'em': 0.0, 
'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.0, 'pem_stderr': 0.0, 'pqem': 0.0, 'pqem_stderr': 0.0}, + "helm:mmlu:abstract_algebra:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.25, 'qem_stderr': 0.25, 'pem': 0.0, 'pem_stderr': 0.0, 'pqem': 0.25, 'pqem_stderr': 0.25}, + "helm:mmlu:college_chemistry:5": {'em': 0.25, 'em_stderr': 0.25, 'qem': 0.25, 'qem_stderr': 0.25, 'pem': 0.25, 'pem_stderr': 0.25, 'pqem': 0.25, 'pqem_stderr': 0.25}, + "helm:mmlu:computer_security:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.0, 'pem_stderr': 0.0, 'pqem': 0.0, 'pqem_stderr': 0.0}, + "helm:mmlu:us_foreign_policy:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.0, 'pem_stderr': 0.0, 'pqem': 0.25, 'pqem_stderr': 0.25}, + "leaderboard:gsm8k:5": {'qem': 0.0, 'qem_stderr': 0.0}, + "leaderboard:arc:challenge:25": {'acc': 0.5, 'acc_stderr': 0.28867513459481287, 'acc_norm': 0.5, 'acc_norm_stderr': 0.28867513459481287}, + "leaderboard:hellaswag:10": {'acc': 0.0, 'acc_stderr': 0.0, 'acc_norm': 0.0, 'acc_norm_stderr': 0.0}, + "leaderboard:mmlu:abstract_algebra:5": {'acc': 0.25, 'acc_stderr': 0.25}, + "leaderboard:mmlu:college_chemistry:5": {'acc': 0.0, 'acc_stderr': 0.0}, + "leaderboard:mmlu:computer_security:5": {'acc': 0.25, 'acc_stderr': 0.25}, + "leaderboard:mmlu:us_foreign_policy:5": {'acc': 0.25, 'acc_stderr': 0.25}, + "leaderboard:truthfulqa:mc:0": {'truthfulqa_mc1': 0.5, 'truthfulqa_mc1_stderr': 0.28867513459481287, 'truthfulqa_mc2': 0.4317633664159167, 'truthfulqa_mc2_stderr': 0.25500097927438214}, + "lighteval:blimp:adjunct_island:0": {'acc': 0.5, 'acc_stderr': 0.28867513459481287}, + "lighteval:blimp:ellipsis_n_bar_1:0": {'acc': 0.25, 'acc_stderr': 0.25}, + "lighteval:anli:r1:0": {'acc': 0.25, 'acc_stderr': 0.25}, + "helm:mmlu:_average:5": {'em': 0.0625, 'em_stderr': 0.0625, 'qem': 0.125, 'qem_stderr': 0.125, 'pem': 0.0625, 'pem_stderr': 0.0625, 'pqem': 0.1875, 'pqem_stderr': 0.1875}, + "leaderboard:mmlu:_average:5": {'acc': 0.1875, 'acc_stderr': 0.1875}, + "lighteval:blimp:_average:0": {'acc': 0.375, 'acc_stderr': 0.26933756729740643}, + } +} +RESULTS_NANOTRON_FULL = { + "LLama-119M": { + "helm:boolq:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0006116207951070336, 'qem_stderr': 0.0004324150578206582, 'pem': 0.0003058103975535168, 'pem_stderr': 0.00030581039755354006, 'pqem': 0.0024464831804281344, 'pqem_stderr': 0.0008640358432108371}, + "helm:hellaswag:5": {'em': 0.0016928898625771759, 'em_stderr': 0.00041025884285982294, 'qem': 0.0016928898625771759, 'qem_stderr': 0.00041025884285982294, 'pem': 0.0016928898625771759, 'pem_stderr': 0.00041025884285982294, 'pqem': 0.0016928898625771759, 'pqem_stderr': 0.00041025884285982294}, + "helm:mmlu:abstract_algebra:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.12, 'pem_stderr': 0.03265986323710906, 'pqem': 0.36, 'pqem_stderr': 0.04824181513244218}, + "helm:mmlu:college_chemistry:5": {'em': 0.02, 'em_stderr': 0.014070529413628952, 'qem': 0.02, 'qem_stderr': 0.014070529413628952, 'pem': 0.02, 'pem_stderr': 0.014070529413628952, 'pqem': 0.22, 'pqem_stderr': 0.04163331998932269}, + "helm:mmlu:computer_security:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.01, 'qem_stderr': 0.009999999999999998, 'pem': 0.07, 'pem_stderr': 0.025643239997624283, 'pqem': 0.35, 'pqem_stderr': 0.04793724854411019}, + "helm:mmlu:us_foreign_policy:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.02, 'pem_stderr': 0.014070529413628954, 'pqem': 0.32, 'pqem_stderr': 0.046882617226215034}, 
+ "leaderboard:gsm8k:5": {'qem': 0.0, 'qem_stderr': 0.0}, + "leaderboard:arc:challenge:25": {'acc': 0.20733788395904437, 'acc_stderr': 0.011846905782971364, 'acc_norm': 0.24829351535836178, 'acc_norm_stderr': 0.012624912868089772}, + "leaderboard:hellaswag:10": {'acc': 0.2577175861382195, 'acc_stderr': 0.004364838000335622, 'acc_norm': 0.26030671181039633, 'acc_norm_stderr': 0.00437905135702414}, + "leaderboard:mmlu:abstract_algebra:5": {'acc': 0.29, 'acc_stderr': 0.045604802157206845}, + "leaderboard:mmlu:college_chemistry:5": {'acc': 0.2, 'acc_stderr': 0.04020151261036846}, + "leaderboard:mmlu:computer_security:5": {'acc': 0.32, 'acc_stderr': 0.04688261722621503}, + "leaderboard:mmlu:us_foreign_policy:5": {'acc': 0.24, 'acc_stderr': 0.042923469599092816}, + "leaderboard:truthfulqa:mc:0": {'truthfulqa_mc1': 0.23011015911872704, 'truthfulqa_mc1_stderr': 0.01473455795980776, 'truthfulqa_mc2': 0.4796459449168539, 'truthfulqa_mc2_stderr': 0.016677952132527703}, + "lighteval:blimp:adjunct_island:0": {'acc': 0.506, 'acc_stderr': 0.015818160898606715}, + "lighteval:blimp:ellipsis_n_bar_1:0": {'acc': 0.513, 'acc_stderr': 0.015813952101896622}, + "lighteval:anli:r1:0": {'acc': 0.315, 'acc_stderr': 0.014696631960792496}, + "helm:mmlu:_average:5": {'em': 0.005, 'em_stderr': 0.003517632353407238, 'qem': 0.0075, 'qem_stderr': 0.006017632353407238, 'pem': 0.057499999999999996, 'pem_stderr': 0.021611040515497813, 'pqem': 0.3125, 'pqem_stderr': 0.046173750223022524}, + "leaderboard:mmlu:_average:5": {'acc': 0.2625, 'acc_stderr': 0.04390310039822079}, + "lighteval:blimp:_average:0": {'acc': 0.5095000000000001, 'acc_stderr': 0.01581605650025167}, + } +} diff --git a/tests/test_main_nanotron.py b/tests/test_main_nanotron.py new file mode 100644 index 00000000..824c0160 --- /dev/null +++ b/tests/test_main_nanotron.py @@ -0,0 +1,129 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +"""This file should be launched using `pytest tests/test_main_nanotron.py -sv`. 
It must stay at the same level or above as main""" +import os +import sys +import pytest +from pytest import approx +import json + +from lighteval.main_nanotron import main # noqa: E402 +from nanotron.config import LightEvalConfig, get_config_from_file +from run_evals_nanotron import get_parser + +from tests.reference_scores.reference_tasks import ALL_SUBSETS +from tests.reference_scores.reference_task_scores_nanotron import RESULTS_NANOTRON_LITE , RESULTS_NANOTRON_FULL + +# Set env var for deterministic run of models +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" + +# Set cache for github actions +os.environ["HF_DATASETS_CACHE"] = "cache/datasets/" +os.environ["HF_HOME"] = "cache/models/" + +# To add new models or tasks, change here +# ! The correct results must be present in reference_task_scores +MODELS=[{"name":'LLama-119M', "config_path":'/fsx/haojun/lighteval_evaluation_model/config.yaml'}] +LIGHTEVAL_CONFIG_PATH="/fsx/haojun/lighteval/tests/config/lighteval_config_override_custom.yaml" # define tasks +SAVE_RESULTS=False # whether you want to save the results in json format, and update reference_tasks_scores_nanotron.py later +RESULTS_DIRECTORY="/fsx/haojun/lighteval/tests" +FULL_TEST = os.environ.get("LIGHTEVAL_FULL_TEST", True) # Full evaluation or Lite evaluation + +# set env variables as nanotron need them +os.environ["MASTER_ADDR"] = "localhost" +os.environ["MASTER_PORT"] = "29400" +os.environ["WORLD_SIZE"] = "1" +os.environ["RANK"] = "0" + + +def run_model_predictions_full(config_path: str, lighteval_config_path: str): + """Runs the full main as a black box, using the input model and tasks, on all samples without parallelism""" + lighteval_args = ["--checkpoint-config-path", f"{config_path}", "--lighteval-override", f"{lighteval_config_path}"] + lighteval_args += ["--max_samples","10000000"] + parser = get_parser() + args = parser.parse_args(lighteval_args) + results = main(args.checkpoint_config_path,args=args) + return results + +def run_model_predictions_lite(config_path: str, lighteval_config_path: str): + """Runs the full main as a black box, using the input model and tasks, on 10 samples without parallelism""" + lighteval_args = ["--checkpoint-config-path", f"{config_path}", "--lighteval-override", f"{lighteval_config_path}"] + lighteval_args += ["--max_samples","4"] + parser = get_parser() + args = parser.parse_args(lighteval_args) + results = main(args.checkpoint_config_path,args=args) + return results + + +def pytest_generate_tests(metafunc: pytest.Metafunc): + """Initializes the main test setup. This function is automatically called by pytest and + should not be called manually. + + Every function with "model_input" as arguments will be sent the "parameters". + This function will be run only once, ensuring that each model is run only once on the selected tasks. + (This is better than using fixtures as fixtures are re-run once for each test, which is not a behavior we want). 
+ """ + parameters = [] + + # If model_input is a test function argument + # (= the function requires a fixture) + if "model_input" in metafunc.fixturenames: + # tasks = TASKS # must be a list not a file name + for model in MODELS: + if FULL_TEST: + predictions_full = run_model_predictions_full(model['config_path'],LIGHTEVAL_CONFIG_PATH) + ## store the results + if SAVE_RESULTS: + with open(f'{RESULTS_DIRECTORY}/predictions_full.json', 'w') as file: + json.dump(predictions_full["results"], file, indent=4) + ## end + tasks=list(RESULTS_NANOTRON_FULL[model['name']].keys()) + for eval_name in tasks: + for metric, reference in RESULTS_NANOTRON_FULL[model['name']][eval_name].items(): + if len(eval_name.split("|")) == 4: + eval_name = "|".join(eval_name.split("|")[:-1]) + prediction = predictions_full["results"][eval_name.replace("|", ":")][metric] + parameters.append((model, "all", eval_name, metric, prediction, reference)) + else: + predictions_lite = run_model_predictions_lite(model['config_path'],LIGHTEVAL_CONFIG_PATH) + ## store the results + if SAVE_RESULTS: + with open(f'{RESULTS_DIRECTORY}/predictions_lite.json', 'w') as file: + json.dump(predictions_lite["results"], file, indent=4) + # end + tasks=list(RESULTS_NANOTRON_LITE[model['name']].keys()) + for eval_name in tasks: + for metric, reference in RESULTS_NANOTRON_LITE[model['name']][eval_name].items(): + if len(eval_name.split("|")) == 4: + eval_name = "|".join(eval_name.split("|")[:-1]) + prediction = predictions_lite["results"][eval_name.replace("|", ":")][metric] + parameters.append((model, "lite", eval_name, metric, prediction, reference)) + metafunc.parametrize("model_input", parameters, scope="session") + + +def test_model_prediction(model_input: tuple): + """Evaluates a model on a full task - is parametrized using pytest_generate_test""" + model_name, test_type, eval_name, metric, prediction, source = model_input + assert source == approx( + prediction, rel=1e-4 + ), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect" From c2322c8802486ae079a00b4e1a6447ef3e43c8e2 Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Thu, 18 Apr 2024 18:37:48 +0000 Subject: [PATCH 04/10] args --- tests/test_main_nanotron.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_main_nanotron.py b/tests/test_main_nanotron.py index 824c0160..6885e014 100644 --- a/tests/test_main_nanotron.py +++ b/tests/test_main_nanotron.py @@ -47,7 +47,7 @@ LIGHTEVAL_CONFIG_PATH="/fsx/haojun/lighteval/tests/config/lighteval_config_override_custom.yaml" # define tasks SAVE_RESULTS=False # whether you want to save the results in json format, and update reference_tasks_scores_nanotron.py later RESULTS_DIRECTORY="/fsx/haojun/lighteval/tests" -FULL_TEST = os.environ.get("LIGHTEVAL_FULL_TEST", True) # Full evaluation or Lite evaluation +FULL_TEST = os.environ.get("LIGHTEVAL_FULL_TEST", False) # Full evaluation or Lite evaluation # set env variables as nanotron need them os.environ["MASTER_ADDR"] = "localhost" From b423117f5e463ac8ad55f61ca14ecd7a825bc3e9 Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Thu, 18 Apr 2024 18:50:48 +0000 Subject: [PATCH 05/10] remove useless code --- src/lighteval/main_nanotron.py | 2 +- src/lighteval/metrics/metrics.py | 59 +++++++++++++------------- src/lighteval/models/nanotron_model.py | 4 +- 3 files changed, 32 insertions(+), 33 deletions(-) diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 701e8922..4cdc0292 100644 --- a/src/lighteval/main_nanotron.py 
+++ b/src/lighteval/main_nanotron.py @@ -57,7 +57,7 @@ CACHE_DIR = os.getenv("HF_HOME", "/scratch") -# @htrack() +@htrack() def main( checkpoint_config_path: str, lighteval_config_path: Optional[str] = None, diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index d57e2303..7f526bae 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -225,35 +225,36 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - # llm_judge_multi_turn = SampleLevelMetricGrouping( - # metric=["single_turn", "multi_turn"], - # higher_is_better=True, - # category=MetricCategory.GENERATIVE_MULTI_TURN, - # use_case=MetricUseCase.SUMMARIZATION, - # sample_level_fn=JudgeLLM( - # judge_model_name="gpt-3.5-turbo", - # template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", - # multi_turn=True, - # ).compute, - # corpus_level_fn={ - # "single_turn": np.mean, - # "multi_turn": np.mean, - # }, - # ) - # llm_judge = SampleLevelMetricGrouping( - # metric=["judge_score"], - # higher_is_better=True, - # category=MetricCategory.GENERATIVE, - # use_case=MetricUseCase.SUMMARIZATION, - # sample_level_fn=JudgeLLM( - # judge_model_name="gpt-3.5-turbo", - # template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", - # multi_turn=False, - # ).compute, - # corpus_level_fn={ - # "judge_score": np.mean, - # }, - # ) + # this took me some time each time when I run the tests, even I don't need it + llm_judge_multi_turn = SampleLevelMetricGrouping( + metric=["single_turn", "multi_turn"], + higher_is_better=True, + category=MetricCategory.GENERATIVE_MULTI_TURN, + use_case=MetricUseCase.SUMMARIZATION, + sample_level_fn=JudgeLLM( + judge_model_name="gpt-3.5-turbo", + template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", + multi_turn=True, + ).compute, + corpus_level_fn={ + "single_turn": np.mean, + "multi_turn": np.mean, + }, + ) + llm_judge = SampleLevelMetricGrouping( + metric=["judge_score"], + higher_is_better=True, + category=MetricCategory.GENERATIVE, + use_case=MetricUseCase.SUMMARIZATION, + sample_level_fn=JudgeLLM( + judge_model_name="gpt-3.5-turbo", + template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl", + multi_turn=False, + ).compute, + corpus_level_fn={ + "judge_score": np.mean, + }, + ) loglikelihood_acc = SampleLevelMetric( metric="acc", sample_level_fn=LoglikelihoodAcc().compute, diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 34645891..1e9bff86 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -585,8 +585,7 @@ def prepare_batch( # since in _collate we make sure length is descending, the longest is always the first one. 
padding_length = padding_length if padding_length is not None else inplen - # if padding_length - inplen < 0: - # print("padding_length,inplen: ",padding_length,inplen) + if padding_length - inplen < 0: raise ValueError("Negative padding") padded.append(padding_length - inplen) @@ -987,7 +986,6 @@ def _loglikelihood_tokens( dist.all_gather(gathered_out, out, group=self.parallel_context.tp_pg, async_op=False) out = torch.cat(gathered_out, dim=-1) - ## debug: multi_logits is different, but similar out = out.transpose(0, 1) # [batch, seq_length, vocab] multi_logits = F.log_softmax(out, dim=-1) # [batch, padding_length, vocab] From bd0fd11f39820d24de55b0e961c321027b246ba2 Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Fri, 19 Apr 2024 09:37:16 +0000 Subject: [PATCH 06/10] readme update --- tests/config/README.md | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tests/config/README.md b/tests/config/README.md index 335de91f..474c53f1 100644 --- a/tests/config/README.md +++ b/tests/config/README.md @@ -10,7 +10,7 @@ pytest tests/test_main_nanotron.py -sv ``` ## Choose your own tasks for evaluation: -Modify the tasks.tasks in config file(lighteval/tests/config/lighteval_config_override_custom.yaml) to set the tasks. +Modify the **tasks.tasks** in config file(lighteval/tests/config/lighteval_config_override_custom.yaml) to set the tasks. Example: ``` tasks: @@ -22,3 +22,34 @@ tasks: num_fewshot_seeds: null tasks: lighteval|anli:r1|0|0,lighteval|blimp:adjunct_island|0|0,... ``` + +## Randomized results +Please make sure to set **for_inference** to true. This will load model with a fixed output layer norm implementation. It's set to true by default for training +``` +model: + ddp_bucket_cap_mb: 25 + dtype: float64 + init_method: + std: 0.02 + make_vocab_size_divisible_by: 1 + model_config: + bos_token_id: 1 + eos_token_id: 2 + hidden_act: silu + hidden_size: 512 + initializer_range: 0.02 + intermediate_size: 2048 + is_llama_config: true + max_position_embeddings: 2048 + num_attention_heads: 16 + num_hidden_layers: 16 + num_key_value_heads: 16 + pad_token_id: null + pretraining_tp: 1 + rms_norm_eps: 1.0e-05 + rope_scaling: null + tie_word_embeddings: true + use_cache: true + vocab_size: 50272 + for_inference: true +``` From 0c804573614cd12f199a2e6065c6baf61a97e430 Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Fri, 19 Apr 2024 10:01:43 +0000 Subject: [PATCH 07/10] linting --- .github/workflows/tests.yaml | 2 +- src/lighteval/main_nanotron.py | 26 +- src/lighteval/metrics/metrics.py | 2 +- .../extended/mt_bench/judge_prompts.jsonl | 2 +- tests/config/README.md | 22 +- .../reference_task_scores_nanotron.py | 253 +++++++++++++++--- tests/test_main_nanotron.py | 152 +++++++---- 7 files changed, 341 insertions(+), 118 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 233c4672..93faed61 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -26,7 +26,7 @@ jobs: cache: 'pip' - name: Install lighteval in editable mode run: | - pip install -e .[dev,extended_tasks] + pip install -e .[dev,nanotron,extended_tasks] - name: Get cached files uses: actions/cache@v2 id: get-cache diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 4cdc0292..6b0d39bd 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -21,12 +21,13 @@ # SOFTWARE. 
# flake8: noqa: C901 -from argparse import Namespace import os import random +from argparse import Namespace from typing import Optional, Type import numpy as np +import torch from lighteval.evaluator import evaluate, make_results_table from lighteval.logging.evaluation_tracker import EvaluationTracker @@ -38,7 +39,6 @@ from lighteval.tasks.registry import Registry, get_custom_tasks, taskinfo_selector from lighteval.utils import NO_NANOTRON_ERROR_MSG, is_nanotron_available from lighteval.utils_parallelism import test_all_gather -import torch if not is_nanotron_available(): @@ -50,6 +50,7 @@ from nanotron.parallel.context import ParallelContext from nanotron.utils import local_ranks_zero_first + logger = get_logger(__name__) SEED = 1234 @@ -65,12 +66,14 @@ def main( config_cls: Type = Config, model_config_cls: Optional[Type] = None, model_cls: Optional[Type] = None, - args: Optional[Namespace] = None # accept args for more flexibility + args: Optional[Namespace] = None, # accept args for more flexibility ): - if args is not None: - checkpoint_config_path= args.checkpoint_config_path if checkpoint_config_path==None else checkpoint_config_path - lighteval_config_path= args.lighteval_override if lighteval_config_path==None else lighteval_config_path - cache_dir=args.cache_dir if cache_dir==None else cache_dir + if args is not None: + checkpoint_config_path = ( + args.checkpoint_config_path if checkpoint_config_path is None else checkpoint_config_path + ) + lighteval_config_path = args.lighteval_override if lighteval_config_path is None else lighteval_config_path + cache_dir = args.cache_dir if cache_dir is None else cache_dir if cache_dir is None: cache_dir = CACHE_DIR @@ -96,9 +99,9 @@ def main( nanotron_config.lighteval = lighteval_config else: lighteval_config = nanotron_config.lighteval - - if args.max_samples is not None: - lighteval_config.tasks.max_samples=args.max_samples + + if args.max_samples is not None: + lighteval_config.tasks.max_samples = args.max_samples parallel_context = ParallelContext( tensor_parallel_size=lighteval_config.parallelism.tp, @@ -173,7 +176,7 @@ def main( torch.cuda.manual_seed(SEED) random.seed(SEED) np.random.seed(SEED) - + dist.barrier() with htrack_block("Evaluation"): @@ -207,4 +210,3 @@ def main( hlog(make_results_table(final_dict)) return final_dict - diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py index 78a7953a..24d49656 100644 --- a/src/lighteval/metrics/metrics.py +++ b/src/lighteval/metrics/metrics.py @@ -225,7 +225,7 @@ class Metrics(Enum): corpus_level_fn=np.mean, higher_is_better=True, ) - # this took me some time each time when I run the tests, even I don't need it + # this took me some time each time when I run the tests, even I don't need it llm_judge_multi_turn = SampleLevelMetricGrouping( metric=["single_turn", "multi_turn"], higher_is_better=True, diff --git a/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl b/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl index 86854fff..4ec7524c 100644 --- a/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl +++ b/src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl @@ -5,4 +5,4 @@ {"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. 
Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} {"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} {"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} -{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. 
After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} \ No newline at end of file +{"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} diff --git a/tests/config/README.md b/tests/config/README.md index 474c53f1..2c04c822 100644 --- a/tests/config/README.md +++ b/tests/config/README.md @@ -1,25 +1,25 @@ # Nanotron tests guide -## How it works: +## How it works: First select some tasks and then use the model to generate reference scores and save them in reference_task_scores_nanotron.py file, it has been done, but if you want to add a new task, you need to re-run it. After that, each time a test need to be conducted, the evaluation will be run and the results are compared to the previous reference score. -## To run nanotron test: +## To run nanotron test: ``` pytest tests/test_main_nanotron.py -sv ``` ## Choose your own tasks for evaluation: -Modify the **tasks.tasks** in config file(lighteval/tests/config/lighteval_config_override_custom.yaml) to set the tasks. -Example: +Modify the **tasks.tasks** in config file(lighteval/tests/config/lighteval_config_override_custom.yaml) to set the tasks. +Example: ``` -tasks: - custom_tasks: null - dataset_loading_processes: 1 - max_samples: 10 - multichoice_continuations_start_space: null - no_multichoice_continuations_start_space: null - num_fewshot_seeds: null +tasks: + custom_tasks: null + dataset_loading_processes: 1 + max_samples: 10 + multichoice_continuations_start_space: null + no_multichoice_continuations_start_space: null + num_fewshot_seeds: null tasks: lighteval|anli:r1|0|0,lighteval|blimp:adjunct_island|0|0,... 
``` diff --git a/tests/reference_scores/reference_task_scores_nanotron.py b/tests/reference_scores/reference_task_scores_nanotron.py index e042de23..21aa0195 100644 --- a/tests/reference_scores/reference_task_scores_nanotron.py +++ b/tests/reference_scores/reference_task_scores_nanotron.py @@ -1,48 +1,221 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + RESULTS_NANOTRON_LITE = { "LLama-119M": { - "helm:boolq:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.0, 'pem_stderr': 0.0, 'pqem': 0.0, 'pqem_stderr': 0.0}, - "helm:hellaswag:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.0, 'pem_stderr': 0.0, 'pqem': 0.0, 'pqem_stderr': 0.0}, - "helm:mmlu:abstract_algebra:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.25, 'qem_stderr': 0.25, 'pem': 0.0, 'pem_stderr': 0.0, 'pqem': 0.25, 'pqem_stderr': 0.25}, - "helm:mmlu:college_chemistry:5": {'em': 0.25, 'em_stderr': 0.25, 'qem': 0.25, 'qem_stderr': 0.25, 'pem': 0.25, 'pem_stderr': 0.25, 'pqem': 0.25, 'pqem_stderr': 0.25}, - "helm:mmlu:computer_security:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.0, 'pem_stderr': 0.0, 'pqem': 0.0, 'pqem_stderr': 0.0}, - "helm:mmlu:us_foreign_policy:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.0, 'pem_stderr': 0.0, 'pqem': 0.25, 'pqem_stderr': 0.25}, - "leaderboard:gsm8k:5": {'qem': 0.0, 'qem_stderr': 0.0}, - "leaderboard:arc:challenge:25": {'acc': 0.5, 'acc_stderr': 0.28867513459481287, 'acc_norm': 0.5, 'acc_norm_stderr': 0.28867513459481287}, - "leaderboard:hellaswag:10": {'acc': 0.0, 'acc_stderr': 0.0, 'acc_norm': 0.0, 'acc_norm_stderr': 0.0}, - "leaderboard:mmlu:abstract_algebra:5": {'acc': 0.25, 'acc_stderr': 0.25}, - "leaderboard:mmlu:college_chemistry:5": {'acc': 0.0, 'acc_stderr': 0.0}, - "leaderboard:mmlu:computer_security:5": {'acc': 0.25, 'acc_stderr': 0.25}, - "leaderboard:mmlu:us_foreign_policy:5": {'acc': 0.25, 'acc_stderr': 0.25}, - "leaderboard:truthfulqa:mc:0": {'truthfulqa_mc1': 0.5, 'truthfulqa_mc1_stderr': 0.28867513459481287, 'truthfulqa_mc2': 0.4317633664159167, 'truthfulqa_mc2_stderr': 0.25500097927438214}, - "lighteval:blimp:adjunct_island:0": {'acc': 0.5, 'acc_stderr': 0.28867513459481287}, - "lighteval:blimp:ellipsis_n_bar_1:0": {'acc': 0.25, 'acc_stderr': 0.25}, - "lighteval:anli:r1:0": {'acc': 0.25, 'acc_stderr': 0.25}, - "helm:mmlu:_average:5": {'em': 0.0625, 'em_stderr': 0.0625, 'qem': 0.125, 'qem_stderr': 0.125, 
'pem': 0.0625, 'pem_stderr': 0.0625, 'pqem': 0.1875, 'pqem_stderr': 0.1875}, - "leaderboard:mmlu:_average:5": {'acc': 0.1875, 'acc_stderr': 0.1875}, - "lighteval:blimp:_average:0": {'acc': 0.375, 'acc_stderr': 0.26933756729740643}, + "helm:boolq:5": { + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.0, + "qem_stderr": 0.0, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.0, + "pqem_stderr": 0.0, + }, + "helm:hellaswag:5": { + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.0, + "qem_stderr": 0.0, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.0, + "pqem_stderr": 0.0, + }, + "helm:mmlu:abstract_algebra:5": { + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.25, + "qem_stderr": 0.25, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.25, + "pqem_stderr": 0.25, + }, + "helm:mmlu:college_chemistry:5": { + "em": 0.25, + "em_stderr": 0.25, + "qem": 0.25, + "qem_stderr": 0.25, + "pem": 0.25, + "pem_stderr": 0.25, + "pqem": 0.25, + "pqem_stderr": 0.25, + }, + "helm:mmlu:computer_security:5": { + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.0, + "qem_stderr": 0.0, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.0, + "pqem_stderr": 0.0, + }, + "helm:mmlu:us_foreign_policy:5": { + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.0, + "qem_stderr": 0.0, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.25, + "pqem_stderr": 0.25, + }, + "leaderboard:gsm8k:5": {"qem": 0.0, "qem_stderr": 0.0}, + "leaderboard:arc:challenge:25": { + "acc": 0.5, + "acc_stderr": 0.28867513459481287, + "acc_norm": 0.5, + "acc_norm_stderr": 0.28867513459481287, + }, + "leaderboard:hellaswag:10": {"acc": 0.0, "acc_stderr": 0.0, "acc_norm": 0.0, "acc_norm_stderr": 0.0}, + "leaderboard:mmlu:abstract_algebra:5": {"acc": 0.25, "acc_stderr": 0.25}, + "leaderboard:mmlu:college_chemistry:5": {"acc": 0.0, "acc_stderr": 0.0}, + "leaderboard:mmlu:computer_security:5": {"acc": 0.25, "acc_stderr": 0.25}, + "leaderboard:mmlu:us_foreign_policy:5": {"acc": 0.25, "acc_stderr": 0.25}, + "leaderboard:truthfulqa:mc:0": { + "truthfulqa_mc1": 0.5, + "truthfulqa_mc1_stderr": 0.28867513459481287, + "truthfulqa_mc2": 0.4317633664159167, + "truthfulqa_mc2_stderr": 0.25500097927438214, + }, + "lighteval:blimp:adjunct_island:0": {"acc": 0.5, "acc_stderr": 0.28867513459481287}, + "lighteval:blimp:ellipsis_n_bar_1:0": {"acc": 0.25, "acc_stderr": 0.25}, + "lighteval:anli:r1:0": {"acc": 0.25, "acc_stderr": 0.25}, + "helm:mmlu:_average:5": { + "em": 0.0625, + "em_stderr": 0.0625, + "qem": 0.125, + "qem_stderr": 0.125, + "pem": 0.0625, + "pem_stderr": 0.0625, + "pqem": 0.1875, + "pqem_stderr": 0.1875, + }, + "leaderboard:mmlu:_average:5": {"acc": 0.1875, "acc_stderr": 0.1875}, + "lighteval:blimp:_average:0": {"acc": 0.375, "acc_stderr": 0.26933756729740643}, } } RESULTS_NANOTRON_FULL = { "LLama-119M": { - "helm:boolq:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0006116207951070336, 'qem_stderr': 0.0004324150578206582, 'pem': 0.0003058103975535168, 'pem_stderr': 0.00030581039755354006, 'pqem': 0.0024464831804281344, 'pqem_stderr': 0.0008640358432108371}, - "helm:hellaswag:5": {'em': 0.0016928898625771759, 'em_stderr': 0.00041025884285982294, 'qem': 0.0016928898625771759, 'qem_stderr': 0.00041025884285982294, 'pem': 0.0016928898625771759, 'pem_stderr': 0.00041025884285982294, 'pqem': 0.0016928898625771759, 'pqem_stderr': 0.00041025884285982294}, - "helm:mmlu:abstract_algebra:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.12, 'pem_stderr': 0.03265986323710906, 'pqem': 0.36, 'pqem_stderr': 0.04824181513244218}, - "helm:mmlu:college_chemistry:5": {'em': 0.02, 'em_stderr': 
0.014070529413628952, 'qem': 0.02, 'qem_stderr': 0.014070529413628952, 'pem': 0.02, 'pem_stderr': 0.014070529413628952, 'pqem': 0.22, 'pqem_stderr': 0.04163331998932269}, - "helm:mmlu:computer_security:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.01, 'qem_stderr': 0.009999999999999998, 'pem': 0.07, 'pem_stderr': 0.025643239997624283, 'pqem': 0.35, 'pqem_stderr': 0.04793724854411019}, - "helm:mmlu:us_foreign_policy:5": {'em': 0.0, 'em_stderr': 0.0, 'qem': 0.0, 'qem_stderr': 0.0, 'pem': 0.02, 'pem_stderr': 0.014070529413628954, 'pqem': 0.32, 'pqem_stderr': 0.046882617226215034}, - "leaderboard:gsm8k:5": {'qem': 0.0, 'qem_stderr': 0.0}, - "leaderboard:arc:challenge:25": {'acc': 0.20733788395904437, 'acc_stderr': 0.011846905782971364, 'acc_norm': 0.24829351535836178, 'acc_norm_stderr': 0.012624912868089772}, - "leaderboard:hellaswag:10": {'acc': 0.2577175861382195, 'acc_stderr': 0.004364838000335622, 'acc_norm': 0.26030671181039633, 'acc_norm_stderr': 0.00437905135702414}, - "leaderboard:mmlu:abstract_algebra:5": {'acc': 0.29, 'acc_stderr': 0.045604802157206845}, - "leaderboard:mmlu:college_chemistry:5": {'acc': 0.2, 'acc_stderr': 0.04020151261036846}, - "leaderboard:mmlu:computer_security:5": {'acc': 0.32, 'acc_stderr': 0.04688261722621503}, - "leaderboard:mmlu:us_foreign_policy:5": {'acc': 0.24, 'acc_stderr': 0.042923469599092816}, - "leaderboard:truthfulqa:mc:0": {'truthfulqa_mc1': 0.23011015911872704, 'truthfulqa_mc1_stderr': 0.01473455795980776, 'truthfulqa_mc2': 0.4796459449168539, 'truthfulqa_mc2_stderr': 0.016677952132527703}, - "lighteval:blimp:adjunct_island:0": {'acc': 0.506, 'acc_stderr': 0.015818160898606715}, - "lighteval:blimp:ellipsis_n_bar_1:0": {'acc': 0.513, 'acc_stderr': 0.015813952101896622}, - "lighteval:anli:r1:0": {'acc': 0.315, 'acc_stderr': 0.014696631960792496}, - "helm:mmlu:_average:5": {'em': 0.005, 'em_stderr': 0.003517632353407238, 'qem': 0.0075, 'qem_stderr': 0.006017632353407238, 'pem': 0.057499999999999996, 'pem_stderr': 0.021611040515497813, 'pqem': 0.3125, 'pqem_stderr': 0.046173750223022524}, - "leaderboard:mmlu:_average:5": {'acc': 0.2625, 'acc_stderr': 0.04390310039822079}, - "lighteval:blimp:_average:0": {'acc': 0.5095000000000001, 'acc_stderr': 0.01581605650025167}, + "helm:boolq:5": { + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.0006116207951070336, + "qem_stderr": 0.0004324150578206582, + "pem": 0.0003058103975535168, + "pem_stderr": 0.00030581039755354006, + "pqem": 0.0024464831804281344, + "pqem_stderr": 0.0008640358432108371, + }, + "helm:hellaswag:5": { + "em": 0.0016928898625771759, + "em_stderr": 0.00041025884285982294, + "qem": 0.0016928898625771759, + "qem_stderr": 0.00041025884285982294, + "pem": 0.0016928898625771759, + "pem_stderr": 0.00041025884285982294, + "pqem": 0.0016928898625771759, + "pqem_stderr": 0.00041025884285982294, + }, + "helm:mmlu:abstract_algebra:5": { + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.0, + "qem_stderr": 0.0, + "pem": 0.12, + "pem_stderr": 0.03265986323710906, + "pqem": 0.36, + "pqem_stderr": 0.04824181513244218, + }, + "helm:mmlu:college_chemistry:5": { + "em": 0.02, + "em_stderr": 0.014070529413628952, + "qem": 0.02, + "qem_stderr": 0.014070529413628952, + "pem": 0.02, + "pem_stderr": 0.014070529413628952, + "pqem": 0.22, + "pqem_stderr": 0.04163331998932269, + }, + "helm:mmlu:computer_security:5": { + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.01, + "qem_stderr": 0.009999999999999998, + "pem": 0.07, + "pem_stderr": 0.025643239997624283, + "pqem": 0.35, + "pqem_stderr": 0.04793724854411019, + }, + 
"helm:mmlu:us_foreign_policy:5": { + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.0, + "qem_stderr": 0.0, + "pem": 0.02, + "pem_stderr": 0.014070529413628954, + "pqem": 0.32, + "pqem_stderr": 0.046882617226215034, + }, + "leaderboard:gsm8k:5": {"qem": 0.0, "qem_stderr": 0.0}, + "leaderboard:arc:challenge:25": { + "acc": 0.20733788395904437, + "acc_stderr": 0.011846905782971364, + "acc_norm": 0.24829351535836178, + "acc_norm_stderr": 0.012624912868089772, + }, + "leaderboard:hellaswag:10": { + "acc": 0.2577175861382195, + "acc_stderr": 0.004364838000335622, + "acc_norm": 0.26030671181039633, + "acc_norm_stderr": 0.00437905135702414, + }, + "leaderboard:mmlu:abstract_algebra:5": {"acc": 0.29, "acc_stderr": 0.045604802157206845}, + "leaderboard:mmlu:college_chemistry:5": {"acc": 0.2, "acc_stderr": 0.04020151261036846}, + "leaderboard:mmlu:computer_security:5": {"acc": 0.32, "acc_stderr": 0.04688261722621503}, + "leaderboard:mmlu:us_foreign_policy:5": {"acc": 0.24, "acc_stderr": 0.042923469599092816}, + "leaderboard:truthfulqa:mc:0": { + "truthfulqa_mc1": 0.23011015911872704, + "truthfulqa_mc1_stderr": 0.01473455795980776, + "truthfulqa_mc2": 0.4796459449168539, + "truthfulqa_mc2_stderr": 0.016677952132527703, + }, + "lighteval:blimp:adjunct_island:0": {"acc": 0.506, "acc_stderr": 0.015818160898606715}, + "lighteval:blimp:ellipsis_n_bar_1:0": {"acc": 0.513, "acc_stderr": 0.015813952101896622}, + "lighteval:anli:r1:0": {"acc": 0.315, "acc_stderr": 0.014696631960792496}, + "helm:mmlu:_average:5": { + "em": 0.005, + "em_stderr": 0.003517632353407238, + "qem": 0.0075, + "qem_stderr": 0.006017632353407238, + "pem": 0.057499999999999996, + "pem_stderr": 0.021611040515497813, + "pqem": 0.3125, + "pqem_stderr": 0.046173750223022524, + }, + "leaderboard:mmlu:_average:5": {"acc": 0.2625, "acc_stderr": 0.04390310039822079}, + "lighteval:blimp:_average:0": {"acc": 0.5095000000000001, "acc_stderr": 0.01581605650025167}, } } diff --git a/tests/test_main_nanotron.py b/tests/test_main_nanotron.py index 6885e014..ac63b1a5 100644 --- a/tests/test_main_nanotron.py +++ b/tests/test_main_nanotron.py @@ -21,18 +21,16 @@ # SOFTWARE. """This file should be launched using `pytest tests/test_main_nanotron.py -sv`. It must stay at the same level or above as main""" +import json import os -import sys + import pytest from pytest import approx -import json from lighteval.main_nanotron import main # noqa: E402 -from nanotron.config import LightEvalConfig, get_config_from_file from run_evals_nanotron import get_parser +from tests.reference_scores.reference_task_scores_nanotron import RESULTS_NANOTRON_FULL, RESULTS_NANOTRON_LITE -from tests.reference_scores.reference_tasks import ALL_SUBSETS -from tests.reference_scores.reference_task_scores_nanotron import RESULTS_NANOTRON_LITE , RESULTS_NANOTRON_FULL # Set env var for deterministic run of models os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" @@ -43,11 +41,13 @@ # To add new models or tasks, change here # ! 
The correct results must be present in reference_task_scores -MODELS=[{"name":'LLama-119M', "config_path":'/fsx/haojun/lighteval_evaluation_model/config.yaml'}] -LIGHTEVAL_CONFIG_PATH="/fsx/haojun/lighteval/tests/config/lighteval_config_override_custom.yaml" # define tasks -SAVE_RESULTS=False # whether you want to save the results in json format, and update reference_tasks_scores_nanotron.py later -RESULTS_DIRECTORY="/fsx/haojun/lighteval/tests" -FULL_TEST = os.environ.get("LIGHTEVAL_FULL_TEST", False) # Full evaluation or Lite evaluation +MODELS = [{"name": "LLama-119M", "config_path": "/fsx/haojun/lighteval_evaluation_model/config.yaml"}] +LIGHTEVAL_CONFIG_PATH = "/fsx/haojun/lighteval/tests/config/lighteval_config_override_custom.yaml" # define tasks +SAVE_RESULTS = ( + False # whether you want to save the results in json format, and update reference_tasks_scores_nanotron.py later +) +RESULTS_DIRECTORY = "/fsx/haojun/lighteval/tests" +FULL_TEST = os.environ.get("LIGHTEVAL_FULL_TEST", False) # Full evaluation or Lite evaluation # set env variables as nanotron need them os.environ["MASTER_ADDR"] = "localhost" @@ -59,71 +59,119 @@ def run_model_predictions_full(config_path: str, lighteval_config_path: str): """Runs the full main as a black box, using the input model and tasks, on all samples without parallelism""" lighteval_args = ["--checkpoint-config-path", f"{config_path}", "--lighteval-override", f"{lighteval_config_path}"] - lighteval_args += ["--max_samples","10000000"] + lighteval_args += ["--max_samples", "10000000"] parser = get_parser() args = parser.parse_args(lighteval_args) - results = main(args.checkpoint_config_path,args=args) + results = main(args.checkpoint_config_path, args=args) return results + def run_model_predictions_lite(config_path: str, lighteval_config_path: str): """Runs the full main as a black box, using the input model and tasks, on 10 samples without parallelism""" lighteval_args = ["--checkpoint-config-path", f"{config_path}", "--lighteval-override", f"{lighteval_config_path}"] - lighteval_args += ["--max_samples","4"] + lighteval_args += ["--max_samples", "4"] parser = get_parser() args = parser.parse_args(lighteval_args) - results = main(args.checkpoint_config_path,args=args) + results = main(args.checkpoint_config_path, args=args) return results -def pytest_generate_tests(metafunc: pytest.Metafunc): - """Initializes the main test setup. This function is automatically called by pytest and - should not be called manually. +# def pytest_generate_tests(metafunc: pytest.Metafunc): +# """Initializes the main test setup. This function is automatically called by pytest and +# should not be called manually. + +# Every function with "model_input" as arguments will be sent the "parameters". +# This function will be run only once, ensuring that each model is run only once on the selected tasks. +# (This is better than using fixtures as fixtures are re-run once for each test, which is not a behavior we want). 
+# """ +# parameters = [] + +# # If model_input is a test function argument +# # (= the function requires a fixture) +# if "model_input" in metafunc.fixturenames: +# # tasks = TASKS # must be a list not a file name +# for model in MODELS: +# if FULL_TEST: +# predictions_full = run_model_predictions_full(model["config_path"], LIGHTEVAL_CONFIG_PATH) +# # store the results +# if SAVE_RESULTS: +# with open(f"{RESULTS_DIRECTORY}/predictions_full.json", "w") as file: +# json.dump(predictions_full["results"], file, indent=4) + +# tasks = list(RESULTS_NANOTRON_FULL[model["name"]].keys()) +# for eval_name in tasks: +# for metric, reference in RESULTS_NANOTRON_FULL[model["name"]][eval_name].items(): +# if len(eval_name.split("|")) == 4: +# eval_name = "|".join(eval_name.split("|")[:-1]) +# prediction = predictions_full["results"][eval_name.replace("|", ":")][metric] +# parameters.append((model, "all", eval_name, metric, prediction, reference)) +# else: +# predictions_lite = run_model_predictions_lite(model["config_path"], LIGHTEVAL_CONFIG_PATH) +# # store the results +# if SAVE_RESULTS: +# with open(f"{RESULTS_DIRECTORY}/predictions_lite.json", "w") as file: +# json.dump(predictions_lite["results"], file, indent=4) + +# tasks = list(RESULTS_NANOTRON_LITE[model["name"]].keys()) +# for eval_name in tasks: +# for metric, reference in RESULTS_NANOTRON_LITE[model["name"]][eval_name].items(): +# if len(eval_name.split("|")) == 4: +# eval_name = "|".join(eval_name.split("|")[:-1]) +# prediction = predictions_lite["results"][eval_name.replace("|", ":")][metric] +# parameters.append((model, "lite", eval_name, metric, prediction, reference)) +# metafunc.parametrize("model_input", parameters, scope="session") + + +def generate_full_test_parameters(model, tasks, results_nanotron_full): + predictions_full = run_model_predictions_full(model["config_path"], LIGHTEVAL_CONFIG_PATH) + if SAVE_RESULTS: + with open(f"{RESULTS_DIRECTORY}/predictions_full.json", "w") as file: + json.dump(predictions_full["results"], file, indent=4) + + parameters = [] + for eval_name in tasks: + for metric, reference in results_nanotron_full[model["name"]][eval_name].items(): + if len(eval_name.split("|")) == 4: + eval_name = "|".join(eval_name.split("|")[:-1]) + prediction = predictions_full["results"][eval_name.replace("|", ":")][metric] + parameters.append((model, "all", eval_name, metric, prediction, reference)) + return parameters + + +def generate_lite_test_parameters(model, tasks, results_nanotron_lite): + predictions_lite = run_model_predictions_lite(model["config_path"], LIGHTEVAL_CONFIG_PATH) + if SAVE_RESULTS: + with open(f"{RESULTS_DIRECTORY}/predictions_lite.json", "w") as file: + json.dump(predictions_lite["results"], file, indent=4) - Every function with "model_input" as arguments will be sent the "parameters". - This function will be run only once, ensuring that each model is run only once on the selected tasks. - (This is better than using fixtures as fixtures are re-run once for each test, which is not a behavior we want). 
- """ + parameters = [] + for eval_name in tasks: + for metric, reference in results_nanotron_lite[model["name"]][eval_name].items(): + if len(eval_name.split("|")) == 4: + eval_name = "|".join(eval_name.split("|")[:-1]) + prediction = predictions_lite["results"][eval_name.replace("|", ":")][metric] + parameters.append((model, "lite", eval_name, metric, prediction, reference)) + return parameters + + +def pytest_generate_tests(metafunc: pytest.Metafunc): parameters = [] - # If model_input is a test function argument - # (= the function requires a fixture) if "model_input" in metafunc.fixturenames: - # tasks = TASKS # must be a list not a file name for model in MODELS: if FULL_TEST: - predictions_full = run_model_predictions_full(model['config_path'],LIGHTEVAL_CONFIG_PATH) - ## store the results - if SAVE_RESULTS: - with open(f'{RESULTS_DIRECTORY}/predictions_full.json', 'w') as file: - json.dump(predictions_full["results"], file, indent=4) - ## end - tasks=list(RESULTS_NANOTRON_FULL[model['name']].keys()) - for eval_name in tasks: - for metric, reference in RESULTS_NANOTRON_FULL[model['name']][eval_name].items(): - if len(eval_name.split("|")) == 4: - eval_name = "|".join(eval_name.split("|")[:-1]) - prediction = predictions_full["results"][eval_name.replace("|", ":")][metric] - parameters.append((model, "all", eval_name, metric, prediction, reference)) + tasks = list(RESULTS_NANOTRON_FULL[model["name"]].keys()) + parameters.extend(generate_full_test_parameters(model, tasks, RESULTS_NANOTRON_FULL)) else: - predictions_lite = run_model_predictions_lite(model['config_path'],LIGHTEVAL_CONFIG_PATH) - ## store the results - if SAVE_RESULTS: - with open(f'{RESULTS_DIRECTORY}/predictions_lite.json', 'w') as file: - json.dump(predictions_lite["results"], file, indent=4) - # end - tasks=list(RESULTS_NANOTRON_LITE[model['name']].keys()) - for eval_name in tasks: - for metric, reference in RESULTS_NANOTRON_LITE[model['name']][eval_name].items(): - if len(eval_name.split("|")) == 4: - eval_name = "|".join(eval_name.split("|")[:-1]) - prediction = predictions_lite["results"][eval_name.replace("|", ":")][metric] - parameters.append((model, "lite", eval_name, metric, prediction, reference)) - metafunc.parametrize("model_input", parameters, scope="session") + tasks = list(RESULTS_NANOTRON_LITE[model["name"]].keys()) + parameters.extend(generate_lite_test_parameters(model, tasks, RESULTS_NANOTRON_LITE)) + + metafunc.parametrize("model_input", parameters, scope="session") def test_model_prediction(model_input: tuple): """Evaluates a model on a full task - is parametrized using pytest_generate_test""" - model_name, test_type, eval_name, metric, prediction, source = model_input + model_name, test_type, eval_name, metric, prediction, source = model_input assert source == approx( prediction, rel=1e-4 ), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect" From 2a041ddad95ec437704ea1c0d0675865523a7229 Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Fri, 19 Apr 2024 10:03:55 +0000 Subject: [PATCH 08/10] typo --- tests/config/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/config/README.md b/tests/config/README.md index 2c04c822..d7b2e9be 100644 --- a/tests/config/README.md +++ b/tests/config/README.md @@ -24,7 +24,7 @@ tasks: ``` ## Randomized results -Please make sure to set **for_inference** to true. This will load model with a fixed output layer norm implementation. 
It's set to true by default for training +Please make sure to set **for_inference** to true. This will load model with a fixed output layer norm implementation. It's set to false by default for training ``` model: ddp_bucket_cap_mb: 25 From d07ca91e743de3b5f8bafc9bc9d4335a1454f03d Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Fri, 19 Apr 2024 10:05:25 +0000 Subject: [PATCH 09/10] remove useless code --- tests/test_main_nanotron.py | 46 ------------------------------------- 1 file changed, 46 deletions(-) diff --git a/tests/test_main_nanotron.py b/tests/test_main_nanotron.py index ac63b1a5..5daf9fa1 100644 --- a/tests/test_main_nanotron.py +++ b/tests/test_main_nanotron.py @@ -76,52 +76,6 @@ def run_model_predictions_lite(config_path: str, lighteval_config_path: str): return results -# def pytest_generate_tests(metafunc: pytest.Metafunc): -# """Initializes the main test setup. This function is automatically called by pytest and -# should not be called manually. - -# Every function with "model_input" as arguments will be sent the "parameters". -# This function will be run only once, ensuring that each model is run only once on the selected tasks. -# (This is better than using fixtures as fixtures are re-run once for each test, which is not a behavior we want). -# """ -# parameters = [] - -# # If model_input is a test function argument -# # (= the function requires a fixture) -# if "model_input" in metafunc.fixturenames: -# # tasks = TASKS # must be a list not a file name -# for model in MODELS: -# if FULL_TEST: -# predictions_full = run_model_predictions_full(model["config_path"], LIGHTEVAL_CONFIG_PATH) -# # store the results -# if SAVE_RESULTS: -# with open(f"{RESULTS_DIRECTORY}/predictions_full.json", "w") as file: -# json.dump(predictions_full["results"], file, indent=4) - -# tasks = list(RESULTS_NANOTRON_FULL[model["name"]].keys()) -# for eval_name in tasks: -# for metric, reference in RESULTS_NANOTRON_FULL[model["name"]][eval_name].items(): -# if len(eval_name.split("|")) == 4: -# eval_name = "|".join(eval_name.split("|")[:-1]) -# prediction = predictions_full["results"][eval_name.replace("|", ":")][metric] -# parameters.append((model, "all", eval_name, metric, prediction, reference)) -# else: -# predictions_lite = run_model_predictions_lite(model["config_path"], LIGHTEVAL_CONFIG_PATH) -# # store the results -# if SAVE_RESULTS: -# with open(f"{RESULTS_DIRECTORY}/predictions_lite.json", "w") as file: -# json.dump(predictions_lite["results"], file, indent=4) - -# tasks = list(RESULTS_NANOTRON_LITE[model["name"]].keys()) -# for eval_name in tasks: -# for metric, reference in RESULTS_NANOTRON_LITE[model["name"]][eval_name].items(): -# if len(eval_name.split("|")) == 4: -# eval_name = "|".join(eval_name.split("|")[:-1]) -# prediction = predictions_lite["results"][eval_name.replace("|", ":")][metric] -# parameters.append((model, "lite", eval_name, metric, prediction, reference)) -# metafunc.parametrize("model_input", parameters, scope="session") - - def generate_full_test_parameters(model, tasks, results_nanotron_full): predictions_full = run_model_predictions_full(model["config_path"], LIGHTEVAL_CONFIG_PATH) if SAVE_RESULTS: From 6f629b6d3a6cc951739bfcf6bba06769f303fa5f Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Fri, 19 Apr 2024 13:27:56 +0000 Subject: [PATCH 10/10] quick fix for nanotron main branch --- run_evals_nanotron.py | 2 +- src/lighteval/main_nanotron.py | 1 + .../reference_task_scores_nanotron.py | 186 +++++++++--------- 3 files changed, 95 insertions(+), 94 deletions(-) diff 
--git a/run_evals_nanotron.py b/run_evals_nanotron.py index 8d354dfc..3c92a3e5 100755 --- a/run_evals_nanotron.py +++ b/run_evals_nanotron.py @@ -48,7 +48,7 @@ def get_parser(): parser.add_argument( "--max_samples", type=int, - required=True, + default=10, help="number of samples used for evaluation", ) diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 6b0d39bd..02ae131f 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -92,6 +92,7 @@ def main( model_config_class=model_config_cls, skip_unused_config_keys=True, skip_null_keys=True, + igonore_all_unused_keys=True, ) if lighteval_config_path: diff --git a/tests/reference_scores/reference_task_scores_nanotron.py b/tests/reference_scores/reference_task_scores_nanotron.py index 21aa0195..01b6a5e2 100644 --- a/tests/reference_scores/reference_task_scores_nanotron.py +++ b/tests/reference_scores/reference_task_scores_nanotron.py @@ -45,22 +45,22 @@ "helm:mmlu:abstract_algebra:5": { "em": 0.0, "em_stderr": 0.0, - "qem": 0.25, - "qem_stderr": 0.25, + "qem": 0.0, + "qem_stderr": 0.0, "pem": 0.0, "pem_stderr": 0.0, "pqem": 0.25, "pqem_stderr": 0.25, }, "helm:mmlu:college_chemistry:5": { - "em": 0.25, - "em_stderr": 0.25, - "qem": 0.25, - "qem_stderr": 0.25, - "pem": 0.25, - "pem_stderr": 0.25, - "pqem": 0.25, - "pqem_stderr": 0.25, + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.0, + "qem_stderr": 0.0, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.0, + "pqem_stderr": 0.0, }, "helm:mmlu:computer_security:5": { "em": 0.0, @@ -89,32 +89,32 @@ "acc_norm": 0.5, "acc_norm_stderr": 0.28867513459481287, }, - "leaderboard:hellaswag:10": {"acc": 0.0, "acc_stderr": 0.0, "acc_norm": 0.0, "acc_norm_stderr": 0.0}, - "leaderboard:mmlu:abstract_algebra:5": {"acc": 0.25, "acc_stderr": 0.25}, + "leaderboard:hellaswag:10": {"acc": 0.0, "acc_stderr": 0.0, "acc_norm": 0.25, "acc_norm_stderr": 0.25}, + "leaderboard:mmlu:abstract_algebra:5": {"acc": 0.5, "acc_stderr": 0.28867513459481287}, "leaderboard:mmlu:college_chemistry:5": {"acc": 0.0, "acc_stderr": 0.0}, - "leaderboard:mmlu:computer_security:5": {"acc": 0.25, "acc_stderr": 0.25}, + "leaderboard:mmlu:computer_security:5": {"acc": 1.0, "acc_stderr": 0.0}, "leaderboard:mmlu:us_foreign_policy:5": {"acc": 0.25, "acc_stderr": 0.25}, "leaderboard:truthfulqa:mc:0": { - "truthfulqa_mc1": 0.5, - "truthfulqa_mc1_stderr": 0.28867513459481287, - "truthfulqa_mc2": 0.4317633664159167, - "truthfulqa_mc2_stderr": 0.25500097927438214, + "truthfulqa_mc1": 0.0, + "truthfulqa_mc1_stderr": 0.0, + "truthfulqa_mc2": 0.2509311177276107, + "truthfulqa_mc2_stderr": 0.1476758333226878, }, "lighteval:blimp:adjunct_island:0": {"acc": 0.5, "acc_stderr": 0.28867513459481287}, - "lighteval:blimp:ellipsis_n_bar_1:0": {"acc": 0.25, "acc_stderr": 0.25}, + "lighteval:blimp:ellipsis_n_bar_1:0": {"acc": 0.5, "acc_stderr": 0.28867513459481287}, "lighteval:anli:r1:0": {"acc": 0.25, "acc_stderr": 0.25}, "helm:mmlu:_average:5": { - "em": 0.0625, - "em_stderr": 0.0625, - "qem": 0.125, - "qem_stderr": 0.125, - "pem": 0.0625, - "pem_stderr": 0.0625, - "pqem": 0.1875, - "pqem_stderr": 0.1875, - }, - "leaderboard:mmlu:_average:5": {"acc": 0.1875, "acc_stderr": 0.1875}, - "lighteval:blimp:_average:0": {"acc": 0.375, "acc_stderr": 0.26933756729740643}, + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.0, + "qem_stderr": 0.0, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.125, + "pqem_stderr": 0.125, + }, + "leaderboard:mmlu:_average:5": {"acc": 0.4375, "acc_stderr": 0.13466878364870322}, + 
"lighteval:blimp:_average:0": {"acc": 0.5, "acc_stderr": 0.28867513459481287}, } } RESULTS_NANOTRON_FULL = { @@ -122,40 +122,40 @@ "helm:boolq:5": { "em": 0.0, "em_stderr": 0.0, - "qem": 0.0006116207951070336, - "qem_stderr": 0.0004324150578206582, - "pem": 0.0003058103975535168, - "pem_stderr": 0.00030581039755354006, - "pqem": 0.0024464831804281344, - "pqem_stderr": 0.0008640358432108371, + "qem": 0.0, + "qem_stderr": 0.0, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.0009174311926605505, + "pqem_stderr": 0.0005295170903140158, }, "helm:hellaswag:5": { - "em": 0.0016928898625771759, - "em_stderr": 0.00041025884285982294, - "qem": 0.0016928898625771759, - "qem_stderr": 0.00041025884285982294, - "pem": 0.0016928898625771759, - "pem_stderr": 0.00041025884285982294, - "pqem": 0.0016928898625771759, - "pqem_stderr": 0.00041025884285982294, + "em": 0.0008962358095996813, + "em_stderr": 0.00029862623598600317, + "qem": 0.0008962358095996813, + "qem_stderr": 0.00029862623598600317, + "pem": 0.0008962358095996813, + "pem_stderr": 0.00029862623598600317, + "pqem": 0.0008962358095996813, + "pqem_stderr": 0.00029862623598600317, }, "helm:mmlu:abstract_algebra:5": { "em": 0.0, "em_stderr": 0.0, "qem": 0.0, "qem_stderr": 0.0, - "pem": 0.12, - "pem_stderr": 0.03265986323710906, - "pqem": 0.36, - "pqem_stderr": 0.04824181513244218, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.24, + "pqem_stderr": 0.042923469599092816, }, "helm:mmlu:college_chemistry:5": { - "em": 0.02, - "em_stderr": 0.014070529413628952, - "qem": 0.02, - "qem_stderr": 0.014070529413628952, - "pem": 0.02, - "pem_stderr": 0.014070529413628952, + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.01, + "qem_stderr": 0.009999999999999998, + "pem": 0.0, + "pem_stderr": 0.0, "pqem": 0.22, "pqem_stderr": 0.04163331998932269, }, @@ -164,58 +164,58 @@ "em_stderr": 0.0, "qem": 0.01, "qem_stderr": 0.009999999999999998, - "pem": 0.07, - "pem_stderr": 0.025643239997624283, - "pqem": 0.35, - "pqem_stderr": 0.04793724854411019, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.28, + "pqem_stderr": 0.045126085985421276, }, "helm:mmlu:us_foreign_policy:5": { "em": 0.0, "em_stderr": 0.0, "qem": 0.0, "qem_stderr": 0.0, - "pem": 0.02, - "pem_stderr": 0.014070529413628954, - "pqem": 0.32, - "pqem_stderr": 0.046882617226215034, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.28, + "pqem_stderr": 0.04512608598542128, }, "leaderboard:gsm8k:5": {"qem": 0.0, "qem_stderr": 0.0}, "leaderboard:arc:challenge:25": { - "acc": 0.20733788395904437, - "acc_stderr": 0.011846905782971364, - "acc_norm": 0.24829351535836178, - "acc_norm_stderr": 0.012624912868089772, + "acc": 0.21331058020477817, + "acc_stderr": 0.011970971742326334, + "acc_norm": 0.2593856655290102, + "acc_norm_stderr": 0.012808273573927092, }, "leaderboard:hellaswag:10": { - "acc": 0.2577175861382195, - "acc_stderr": 0.004364838000335622, - "acc_norm": 0.26030671181039633, - "acc_norm_stderr": 0.00437905135702414, - }, - "leaderboard:mmlu:abstract_algebra:5": {"acc": 0.29, "acc_stderr": 0.045604802157206845}, - "leaderboard:mmlu:college_chemistry:5": {"acc": 0.2, "acc_stderr": 0.04020151261036846}, - "leaderboard:mmlu:computer_security:5": {"acc": 0.32, "acc_stderr": 0.04688261722621503}, - "leaderboard:mmlu:us_foreign_policy:5": {"acc": 0.24, "acc_stderr": 0.042923469599092816}, + "acc": 0.25712009559848636, + "acc_stderr": 0.004361529679492746, + "acc_norm": 0.25941047600079664, + "acc_norm_stderr": 0.004374153847826758, + }, + "leaderboard:mmlu:abstract_algebra:5": {"acc": 0.21, "acc_stderr": 
0.040936018074033256}, + "leaderboard:mmlu:college_chemistry:5": {"acc": 0.18, "acc_stderr": 0.03861229196653694}, + "leaderboard:mmlu:computer_security:5": {"acc": 0.31, "acc_stderr": 0.04648231987117316}, + "leaderboard:mmlu:us_foreign_policy:5": {"acc": 0.23, "acc_stderr": 0.04229525846816505}, "leaderboard:truthfulqa:mc:0": { - "truthfulqa_mc1": 0.23011015911872704, - "truthfulqa_mc1_stderr": 0.01473455795980776, - "truthfulqa_mc2": 0.4796459449168539, - "truthfulqa_mc2_stderr": 0.016677952132527703, - }, - "lighteval:blimp:adjunct_island:0": {"acc": 0.506, "acc_stderr": 0.015818160898606715}, - "lighteval:blimp:ellipsis_n_bar_1:0": {"acc": 0.513, "acc_stderr": 0.015813952101896622}, - "lighteval:anli:r1:0": {"acc": 0.315, "acc_stderr": 0.014696631960792496}, + "truthfulqa_mc1": 0.23745410036719705, + "truthfulqa_mc1_stderr": 0.014896277441041843, + "truthfulqa_mc2": 0.47183673282563937, + "truthfulqa_mc2_stderr": 0.01683985739593103, + }, + "lighteval:blimp:adjunct_island:0": {"acc": 0.531, "acc_stderr": 0.015788865959539006}, + "lighteval:blimp:ellipsis_n_bar_1:0": {"acc": 0.489, "acc_stderr": 0.01581547119529269}, + "lighteval:anli:r1:0": {"acc": 0.366, "acc_stderr": 0.015240612726405756}, "helm:mmlu:_average:5": { - "em": 0.005, - "em_stderr": 0.003517632353407238, - "qem": 0.0075, - "qem_stderr": 0.006017632353407238, - "pem": 0.057499999999999996, - "pem_stderr": 0.021611040515497813, - "pqem": 0.3125, - "pqem_stderr": 0.046173750223022524, - }, - "leaderboard:mmlu:_average:5": {"acc": 0.2625, "acc_stderr": 0.04390310039822079}, - "lighteval:blimp:_average:0": {"acc": 0.5095000000000001, "acc_stderr": 0.01581605650025167}, + "em": 0.0, + "em_stderr": 0.0, + "qem": 0.005, + "qem_stderr": 0.004999999999999999, + "pem": 0.0, + "pem_stderr": 0.0, + "pqem": 0.255, + "pqem_stderr": 0.04370224038981452, + }, + "leaderboard:mmlu:_average:5": {"acc": 0.23249999999999998, "acc_stderr": 0.042081472094977104}, + "lighteval:blimp:_average:0": {"acc": 0.51, "acc_stderr": 0.01580216857741585}, } }
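
---

Note for maintainers updating the reference values: when `SAVE_RESULTS = True` in `tests/test_main_nanotron.py`, the run dumps `predictions_full.json` (or `predictions_lite.json`) into `RESULTS_DIRECTORY`. The sketch below shows one way that dump could be folded back into the dictionaries of `tests/reference_scores/reference_task_scores_nanotron.py`. It is not part of the patch series; the model name and path are assumptions mirroring the `MODELS` and `RESULTS_DIRECTORY` constants, and the file layout is only what `json.dump(predictions_full["results"], ...)` implies.

```
# Hypothetical helper, not included in these patches: rebuild the reference
# dict literal from a predictions dump produced with SAVE_RESULTS = True.
import json
import pprint

MODEL_NAME = "LLama-119M"  # assumed: MODELS[0]["name"]
PREDICTIONS_PATH = "tests/predictions_full.json"  # assumed: f"{RESULTS_DIRECTORY}/predictions_full.json"

with open(PREDICTIONS_PATH) as f:
    # The dump is the "results" sub-dict: {"helm:boolq:5": {"em": ..., ...}, ...}
    results = json.load(f)

# reference_task_scores_nanotron.py keys scores by model name, then task, then metric.
reference = {MODEL_NAME: results}

# Paste the printed literal over RESULTS_NANOTRON_FULL (or RESULTS_NANOTRON_LITE
# when the dump came from the 4-sample lite run).
print("RESULTS_NANOTRON_FULL =", pprint.pformat(reference, sort_dicts=False))
```

After regenerating the reference values, the suite can be re-run against them with `pytest tests/test_main_nanotron.py -sv`; exporting `LIGHTEVAL_FULL_TEST` to any non-empty value selects the full evaluation instead of the lite one.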