Randomly select MMMU answer when none is returned from the model (openai#1447)

This is the behavior the MMMU authors used for evaluation, so we should match it here. As an example, this increased the mmmu-music benchmark score from `0.3666` to `0.4`, because multiple questions in that benchmark were left unanswered by the model.
rem2akhmad · Dec 24, 2023 · ded9382 · ded9382
1 parent 02f35cc
commit ded9382
Showing 1 changed file with 8 additions and 0 deletions.
diff --git a/evals/elsuite/mmmu/eval.py b/evals/elsuite/mmmu/eval.py
@@ -159,6 +159,14 @@ def eval_sample(self, sample: Sample, rng):
 
  match = sampled.find(f"ANSWER: {correct_answer}") != -1
 
+ if not match and sampled.find("ANSWER") == -1 and sample.question_type == "multiple-choice":
+ # The model didn't answer anything, so randomly pick an answer
+ # This matches the behavior described in section 4.1 of the MMMU paper: https://arxiv.org/pdf/2311.16502.pdf
+ logging.info("No answer found for multiple choice so picking a random answer.")
+ answer_idx = rng.randint(0, len(sample.answers) - 1)
+ answer_letter = chr(ord("A") + answer_idx)
+ match = correct_answer == answer_letter
+
  record_match(
  match,
  expected=correct_answer,