From 654533e23c8c598b6cb7484c51cf563308d84586 Mon Sep 17 00:00:00 2001
From: Erik Ritter <erik.t.ritter@gmail.com>
Date: Wed, 3 Jan 2024 10:20:49 -0800
Subject: [PATCH] Improve MMMU performance with prompt engineering (#1450)

With this improvement (the system prompt now tells the model to guess the
most likely answer when it is uncertain), we now have a 0-shot performance
of 59.6% (averaged over 3 eval runs) on the MMMU validation set, which beats
the 56.8% reported in the [MMMU paper](https://arxiv.org/pdf/2311.16502.pdf).
---
 evals/elsuite/mmmu/eval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/elsuite/mmmu/eval.py b/evals/elsuite/mmmu/eval.py
index 53cd4c7866..f338ba667e 100644
--- a/evals/elsuite/mmmu/eval.py
+++ b/evals/elsuite/mmmu/eval.py
@@ -88,7 +88,7 @@ def eval_sample(self, sample: Sample, rng):
                 rng=rng,
             )
             prompt = sample.question + "\n" + options
-            system_prompt = f'You are an expert in {self.subject} whose job is to answer questions from the user using images. First, reason about the correct answer. Then write the answer in the following format where X is exactly one of A,B,C,D: "ANSWER: X"'
+            system_prompt = f'You are an expert in {self.subject} whose job is to answer questions from the user using images. First, reason about the correct answer. Then write the answer in the following format where X is exactly one of A,B,C,D: "ANSWER: X". If you are uncertain of the correct answer, guess the most likely one.'
         else:
             correct_answer = sample.label
             prompt = sample.question