From 654533e23c8c598b6cb7484c51cf563308d84586 Mon Sep 17 00:00:00 2001 From: Erik Ritter Date: Wed, 3 Jan 2024 10:20:49 -0800 Subject: [PATCH] Improve MMMU performance with prompt engineering (#1450) With this improvement, we now have a 0-shot performance of 59.6% (averaged over 3 eval runs) on the MMMU validation set, which beats the 56.8% reported in the [MMMU paper](https://arxiv.org/pdf/2311.16502.pdf). --- evals/elsuite/mmmu/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/elsuite/mmmu/eval.py b/evals/elsuite/mmmu/eval.py index 53cd4c7866..f338ba667e 100644 --- a/evals/elsuite/mmmu/eval.py +++ b/evals/elsuite/mmmu/eval.py @@ -88,7 +88,7 @@ def eval_sample(self, sample: Sample, rng): rng=rng, ) prompt = sample.question + "\n" + options - system_prompt = f'You are an expert in {self.subject} whose job is to answer questions from the user using images. First, reason about the correct answer. Then write the answer in the following format where X is exactly one of A,B,C,D: "ANSWER: X"' + system_prompt = f'You are an expert in {self.subject} whose job is to answer questions from the user using images. First, reason about the correct answer. Then write the answer in the following format where X is exactly one of A,B,C,D: "ANSWER: X". If you are uncertain of the correct answer, guess the most likely one.' else: correct_answer = sample.label prompt = sample.question