From 917b02d5eecf6a683087c598a5e988d86c440243 Mon Sep 17 00:00:00 2001
From: MingxuanXia <xiamingxuan@zju.edu.cn>
Date: Tue, 19 Mar 2024 03:42:02 +0000
Subject: [PATCH] Update multi modal evaluation

---
 docs/examples/multimodal.md     |  32 ++--
 examples/multimodal.ipynb       |  37 ++---
 promptbench/dataload/dataset.py | 262 ++++++++++++++++----------------
 3 files changed, 152 insertions(+), 179 deletions(-)
diff --git a/docs/examples/multimodal.md b/docs/examples/multimodal.md
index c360cd8..d1a8a78 100644
--- a/docs/examples/multimodal.md
+++ b/docs/examples/multimodal.md
@@ -28,32 +28,18 @@ print(pb.SUPPORTED_DATASETS_VLM)
 dataset = pb.DatasetLoader.load_dataset("mmmu")
 
 # print the first 5 examples
-dataset[:5]
+for idx in range(5):
+    print(dataset[idx])
 ```
 
     All supported datasets: 
     ['vqav2', 'nocaps', 'science_qa', 'math_vista', 'ai2d', 'mmmu', 'chart_qa']
-
-
-
-
-
-    [{'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=733x237>],
-      'answer': 'B',
-      'question': '<image 1> Baxter Company has a relevant range of production between 15,000 and 30,000 units. The following cost data represents average variable costs per unit for 25,000 units of production. If 30,000 units are produced, what are the per unit manufacturing overhead costs incurred?\nA: $6\nB: $7\nC: $8\nD: $9'},
-     {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=342x310>],
-      'answer': 'C',
-      'question': 'Assume accounts have normal balances, solve for the one missing account balance: Dividends. Equipment was recently purchased, so there is neither depreciation expense nor accumulated depreciation. <image 1>\nA: $194,815\nB: $182,815\nC: $12,000\nD: $9,000'},
-     {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=336x169>],
-      'answer': 'B',
-      'question': 'Maxwell Software, Inc., has the following mutually exclusive projects.Suppose the company uses the NPV rule to rank these two projects.<image 1> Which project should be chosen if the appropriate discount rate is 15 percent?\nA: Project A\nB: Project B'},
-     {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1222x237>],
-      'answer': 'D',
-      'question': "Each situation below relates to an independent company's Owners' Equity. <image 1> Calculate the missing values of company 2.\nA: $1,620\nB: $12,000\nC: $51,180\nD: $0"},
-     {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1219x217>],
-      'answer': 'B',
-      'question': 'The following data show the units in beginning work in process inventory, the number of units started, the number of units transferred, and the percent completion of the ending work in process for conversion. Given that materials are added at the beginning of the process, what are the equivalent units for conversion costs for each quarter using the weighted-average method? Assume that the quarters are independent.<image 1>\nA: 132,625\nB: 134,485\nC: 135,332\nD: 132,685'}]
-
+    Images already saved to local, loading file:  /home/v-mingxia/promptbench/promptbench/data/mmmu/validation.json
+    {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=733x237 at 0x7F13BA2CD160>], 'image_paths': ['/home/v-mingxia/promptbench/promptbench/data/mmmu/validation/0_image_1.png'], 'answer': 'B', 'question': '<image 1> Baxter Company has a relevant range of production between 15,000 and 30,000 units. The following cost data represents average variable costs per unit for 25,000 units of production. If 30,000 units are produced, what are the per unit manufacturing overhead costs incurred?\nA: $6\nB: $7\nC: $8\nD: $9'}
+    {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=342x310 at 0x7F13BA2CD550>], 'image_paths': ['/home/v-mingxia/promptbench/promptbench/data/mmmu/validation/1_image_1.png'], 'answer': 'C', 'question': 'Assume accounts have normal balances, solve for the one missing account balance: Dividends. Equipment was recently purchased, so there is neither depreciation expense nor accumulated depreciation. <image 1>\nA: $194,815\nB: $182,815\nC: $12,000\nD: $9,000'}
+    {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=336x169 at 0x7F13BA2CD130>], 'image_paths': ['/home/v-mingxia/promptbench/promptbench/data/mmmu/validation/2_image_1.png'], 'answer': 'B', 'question': 'Maxwell Software, Inc., has the following mutually exclusive projects.Suppose the company uses the NPV rule to rank these two projects.<image 1> Which project should be chosen if the appropriate discount rate is 15 percent?\nA: Project A\nB: Project B'}
+    {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1222x237 at 0x7F13BA2CD460>], 'image_paths': ['/home/v-mingxia/promptbench/promptbench/data/mmmu/validation/3_image_1.png'], 'answer': 'D', 'question': "Each situation below relates to an independent company's Owners' Equity. <image 1> Calculate the missing values of company 2.\nA: $1,620\nB: $12,000\nC: $51,180\nD: $0"}
+    {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1219x217 at 0x7F13BA2CD400>], 'image_paths': ['/home/v-mingxia/promptbench/promptbench/data/mmmu/validation/4_image_1.png'], 'answer': 'B', 'question': 'The following data show the units in beginning work in process inventory, the number of units started, the number of units transferred, and the percent completion of the ending work in process for conversion. Given that materials are added at the beginning of the process, what are the equivalent units for conversion costs for each quarter using the weighted-average method? Assume that the quarters are independent.<image 1>\nA: 132,625\nB: 134,485\nC: 135,332\nD: 132,685'}
 
 
 ## Load models
@@ -104,7 +90,7 @@ for prompt in prompts:
     for data in tqdm(dataset):
         # process input
         input_text = pb.InputProcess.basic_format(prompt, data)
-        input_images = data['images']
+        input_images = data['images']  # please use data['image_paths'] instead of data['images'] for models that only support image path/url, such as GPT-4v
         label = data['answer']
         raw_pred = model(input_images, input_text)
         # process output
diff --git a/examples/multimodal.ipynb b/examples/multimodal.ipynb
index adb3014..9a072a3 100644
--- a/examples/multimodal.ipynb
+++ b/examples/multimodal.ipynb
@@ -51,32 +51,14 @@
      "output_type": "stream",
      "text": [
       "All supported datasets: \n",
-      "['vqav2', 'nocaps', 'science_qa', 'math_vista', 'ai2d', 'mmmu', 'chart_qa']\n"
+      "['vqav2', 'nocaps', 'science_qa', 'math_vista', 'ai2d', 'mmmu', 'chart_qa']\n",
+      "Images already saved to local, loading file:  /home/v-mingxia/promptbench/promptbench/data/mmmu/validation.json\n",
+      "{'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=733x237 at 0x7F13BA2CD160>], 'image_paths': ['/home/v-mingxia/promptbench/promptbench/data/mmmu/validation/0_image_1.png'], 'answer': 'B', 'question': '<image 1> Baxter Company has a relevant range of production between 15,000 and 30,000 units. The following cost data represents average variable costs per unit for 25,000 units of production. If 30,000 units are produced, what are the per unit manufacturing overhead costs incurred?\\nA: $6\\nB: $7\\nC: $8\\nD: $9'}\n",
+      "{'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=342x310 at 0x7F13BA2CD550>], 'image_paths': ['/home/v-mingxia/promptbench/promptbench/data/mmmu/validation/1_image_1.png'], 'answer': 'C', 'question': 'Assume accounts have normal balances, solve for the one missing account balance: Dividends. Equipment was recently purchased, so there is neither depreciation expense nor accumulated depreciation. <image 1>\\nA: $194,815\\nB: $182,815\\nC: $12,000\\nD: $9,000'}\n",
+      "{'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=336x169 at 0x7F13BA2CD130>], 'image_paths': ['/home/v-mingxia/promptbench/promptbench/data/mmmu/validation/2_image_1.png'], 'answer': 'B', 'question': 'Maxwell Software, Inc., has the following mutually exclusive projects.Suppose the company uses the NPV rule to rank these two projects.<image 1> Which project should be chosen if the appropriate discount rate is 15 percent?\\nA: Project A\\nB: Project B'}\n",
+      "{'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1222x237 at 0x7F13BA2CD460>], 'image_paths': ['/home/v-mingxia/promptbench/promptbench/data/mmmu/validation/3_image_1.png'], 'answer': 'D', 'question': \"Each situation below relates to an independent company's Owners' Equity. <image 1> Calculate the missing values of company 2.\\nA: $1,620\\nB: $12,000\\nC: $51,180\\nD: $0\"}\n",
+      "{'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1219x217 at 0x7F13BA2CD400>], 'image_paths': ['/home/v-mingxia/promptbench/promptbench/data/mmmu/validation/4_image_1.png'], 'answer': 'B', 'question': 'The following data show the units in beginning work in process inventory, the number of units started, the number of units transferred, and the percent completion of the ending work in process for conversion. Given that materials are added at the beginning of the process, what are the equivalent units for conversion costs for each quarter using the weighted-average method? Assume that the quarters are independent.<image 1>\\nA: 132,625\\nB: 134,485\\nC: 135,332\\nD: 132,685'}\n"
      ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "[{'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=733x237>],\n",
-       "  'answer': 'B',\n",
-       "  'question': '<image 1> Baxter Company has a relevant range of production between 15,000 and 30,000 units. The following cost data represents average variable costs per unit for 25,000 units of production. If 30,000 units are produced, what are the per unit manufacturing overhead costs incurred?\\nA: $6\\nB: $7\\nC: $8\\nD: $9'},\n",
-       " {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=342x310>],\n",
-       "  'answer': 'C',\n",
-       "  'question': 'Assume accounts have normal balances, solve for the one missing account balance: Dividends. Equipment was recently purchased, so there is neither depreciation expense nor accumulated depreciation. <image 1>\\nA: $194,815\\nB: $182,815\\nC: $12,000\\nD: $9,000'},\n",
-       " {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=336x169>],\n",
-       "  'answer': 'B',\n",
-       "  'question': 'Maxwell Software, Inc., has the following mutually exclusive projects.Suppose the company uses the NPV rule to rank these two projects.<image 1> Which project should be chosen if the appropriate discount rate is 15 percent?\\nA: Project A\\nB: Project B'},\n",
-       " {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1222x237>],\n",
-       "  'answer': 'D',\n",
-       "  'question': \"Each situation below relates to an independent company's Owners' Equity. <image 1> Calculate the missing values of company 2.\\nA: $1,620\\nB: $12,000\\nC: $51,180\\nD: $0\"},\n",
-       " {'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1219x217>],\n",
-       "  'answer': 'B',\n",
-       "  'question': 'The following data show the units in beginning work in process inventory, the number of units started, the number of units transferred, and the percent completion of the ending work in process for conversion. Given that materials are added at the beginning of the process, what are the equivalent units for conversion costs for each quarter using the weighted-average method? Assume that the quarters are independent.<image 1>\\nA: 132,625\\nB: 134,485\\nC: 135,332\\nD: 132,685'}]"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -89,7 +71,8 @@
     "dataset = pb.DatasetLoader.load_dataset(\"mmmu\")\n",
     "\n",
     "# print the first 5 examples\n",
-    "dataset[:5]"
+    "for idx in range(5):\n",
+    "    print(dataset[idx])"
    ]
   },
   {
@@ -219,7 +202,7 @@
     "    for data in tqdm(dataset):\n",
     "        # process input\n",
     "        input_text = pb.InputProcess.basic_format(prompt, data)\n",
-    "        input_images = data['images']\n",
+    "        input_images = data['images']  # please use data['image_paths'] instead of data['images'] for models that only support image path/url, such as GPT-4v\n",
     "        label = data['answer']\n",
     "        raw_pred = model(input_images, input_text)\n",
     "        # process output\n",
diff --git a/promptbench/dataload/dataset.py b/promptbench/dataload/dataset.py
index 03c3803..88066ad 100644
--- a/promptbench/dataload/dataset.py
+++ b/promptbench/dataload/dataset.py
@@ -6,6 +6,8 @@
 import random
 import requests
 import json
+from PIL import Image as PILImage
+from tqdm import tqdm
 
 from promptbench.config import *
 from datasets import load_dataset
@@ -61,6 +63,42 @@ def __getitem__(self, idx):
     def extract_answer(self, output): 
         return output
 
+    def save_images_to_local(self, dataset, split, key_list):
+        """Save the images of ``self.data`` as PNG files and record their paths.
+
+        Images are written to ``<package>/data/<dataset>/<split>/<idx>_<key>.png`` and
+        ``<package>/data/<dataset>/<split>.json`` maps ``str(idx)`` to that sample's
+        paths. If the JSON index already exists it is loaded and nothing is re-saved.
+
+        Args:
+            dataset: directory name for this dataset, e.g. ``'mmmu'``.
+            split: split name; names the image directory and the JSON index.
+            key_list: sample fields holding an image; ``None`` values are skipped.
+        """
+        package_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        self.data_dir = os.path.join(package_dir, 'data')
+        self.dataset_dir = os.path.join(self.data_dir, dataset)
+        self.split_dir = os.path.join(self.dataset_dir, split)
+        os.makedirs(self.split_dir, exist_ok=True)  # also creates missing parents
+        self.filepath = os.path.join(self.dataset_dir, f"{split}.json")
+        if not os.path.exists(self.filepath):
+            local_path_dict = {}
+            print('Saving images to local path: ', self.split_dir)
+            for idx in tqdm(range(len(self.data))):
+                row = self.data[idx]  # fetch the sample once, not twice per key
+                paths = local_path_dict[str(idx)] = []
+                for key in key_list:
+                    if row[key] is not None:
+                        paths.append(os.path.join(self.split_dir, f'{idx}_{key}.png'))
+                        row[key].save(paths[-1])
+            print('Saving file: ', self.filepath)
+            with open(self.filepath, 'w') as f:
+                json.dump(local_path_dict, f)
+        else:
+            print('Images already saved to local, loading file: ', self.filepath)
+            with open(self.filepath, 'r') as f:
+                local_path_dict = json.load(f)
+        self.local_path_dict = local_path_dict
 
 class BoolLogic(Dataset):
     """
@@ -663,32 +701,30 @@ class VQAv2(Dataset):
 
     Example data format:
     {
-        'question_type': 'what is',
-        'multiple_choice_answer': 'picnic table',
-        'answers': [{'answer': 'table', 'answer_confidence': 'yes', 'answer_id': 1},
-        {'answer': 'table', 'answer_confidence': 'yes', 'answer_id': 2},
-        {'answer': 'table', 'answer_confidence': 'yes', 'answer_id': 3},
-        {'answer': 'picnic table', 'answer_confidence': 'yes', 'answer_id': 4},
-        {'answer': 'picnic table', 'answer_confidence': 'yes', 'answer_id': 5},
-        {'answer': 'picnic table', 'answer_confidence': 'yes', 'answer_id': 6},
-        {'answer': 'picnic table', 'answer_confidence': 'yes', 'answer_id': 7},
-        {'answer': 'picnic table', 'answer_confidence': 'yes', 'answer_id': 8},
-        {'answer': 'skateboard', 'answer_confidence': 'yes', 'answer_id': 9},
-        {'answer': 'picnic table', 'answer_confidence': 'yes', 'answer_id': 10}],
-        'image_id': 262148,
-        'answer_type': 'other',
-        'question_id': 262148002,
-        'question': 'What is he on top of?',
-        'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x512>
+        'images': [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x512>],
+        'image_paths': ['/Users/username/.cache/huggingface/datasets/downloads/extracted/adf50b3f63fdc93e9e2d368b11520a5796a4e904a80852e73a19a7112fca3592/val2014/COCO_val2014_000000262148.jpg'],
+        'answers': [{'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 1},
+        {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 2},
+        {'answer': 'at table', 'answer_confidence': 'yes', 'answer_id': 3},
+        {'answer': 'skateboard', 'answer_confidence': 'yes', 'answer_id': 4},
+        {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 5},
+        {'answer': 'table', 'answer_confidence': 'yes', 'answer_id': 6},
+        {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 7},
+        {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 8},
+        {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 9},
+        {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 10}],
+        'question': 'Where is he looking?'
     }
     """
     def __init__(self):
-        data = load_dataset("HuggingFaceM4/VQAv2", split="validation")
+        from datasets import Image
+        data = load_dataset("HuggingFaceM4/VQAv2", split="validation").cast_column("image", Image(decode=False))
         self.data = data
     
     def __getitem__(self, idx):
         assert len(self.data) > 0, "Empty dataset. Please load data first."
-        return {"images": [self.data[idx]['image']],
+        return {"images": [PILImage.open(self.data[idx]['image']['path'])],
+                "image_paths": [self.data[idx]['image']['path']],
                 "answers": self.data[idx]['answers'],
                 "question": self.data[idx]['question'],}
 
@@ -703,17 +739,9 @@ class NoCaps(Dataset):
 
     Example data format:
     {
-        'image': <PIL.JpegImagePlugin.JpegImageFile image mode=L size=732x1024>,
-        'image_coco_url': 'https://s3.amazonaws.com/nocaps/val/0013ea2087020901.jpg',
-        'image_date_captured': '2018-11-06 11:04:33',
-        'image_file_name': '0013ea2087020901.jpg',
-        'image_height': 1024,
-        'image_width': 732,
-        'image_id': 0,
-        'image_license': 0,
-        'image_open_images_id': '0013ea2087020901',
-        'annotations_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
-        'annotations_captions': ['A baby is standing in front of a house.',
+        'images': [<PIL.JpegImagePlugin.JpegImageFile image mode=L size=732x1024>],
+        'image_paths': ['https://s3.amazonaws.com/nocaps/val/0013ea2087020901.jpg'],
+        'answers': ['A baby is standing in front of a house.',
         'A little girl in a white jacket and sandals.',
         'A young child stands in front of a house.',
         'A child is wearing a white shirt and standing on a side walk. ',
@@ -726,12 +754,16 @@ class NoCaps(Dataset):
     }
     """
     def __init__(self):
-        data = load_dataset("HuggingFaceM4/NoCaps", split="validation")
+        from datasets import Image
+        data1 = load_dataset("HuggingFaceM4/NoCaps", split="validation")  # load once; images decoded on access
+        data = data1.cast_column("image", Image(decode=False))  # undecoded view of the same rows: ['image']['path']
         self.data = data
+        self.data1 = data1
     
     def __getitem__(self, idx):
         assert len(self.data) > 0, "Empty dataset. Please load data first."
-        return {"images": [self.data[idx]['image']],
+        return {"images": [self.data1[idx]['image']],
+                "image_paths": [self.data[idx]['image']['path']],
                 "answers": self.data[idx]['annotations_captions']}
 
 class MathVista(Dataset):
@@ -745,37 +777,22 @@ class MathVista(Dataset):
 
     Example data format:
     {
-        'pid': '1',
-        'question': "When a spring does work on an object, we cannot find the work by simply multiplying the spring force by the object's displacement. The reason is that there is no one value for the force-it changes. However, we can split the displacement up into an infinite number of tiny parts and then approximate the force in each as being constant. Integration sums the work done in all those parts. Here we use the generic result of the integration.\r\n\r\nIn Figure, a cumin canister of mass $m=0.40 \\mathrm{~kg}$ slides across a horizontal frictionless counter with speed $v=0.50 \\mathrm{~m} / \\mathrm{s}$. It then runs into and compresses a spring of spring constant $k=750 \\mathrm{~N} / \\mathrm{m}$. When the canister is momentarily stopped by the spring, by what distance $d$ is the spring compressed?",
-        'image': 'images/1.jpg',
-        'decoded_image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1514x720>,
-        'choices': None,
-        'unit': None,
-        'precision': 1.0,
+        'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1514x720>],
+        'image_paths': ['/Users/username/promptbench/promptbench/data/math_vista/testmini/0_decoded_image.png'],
         'answer': '1.2',
-        'question_type': 'free_form',
-        'answer_type': 'float',
-        'metadata': {'category': 'math-targeted-vqa',
-        'context': 'scientific figure',
-        'grade': 'college',
-        'img_height': 720,
-        'img_width': 1514,
-        'language': 'english',
-        'skills': ['scientific reasoning'],
-        'source': 'SciBench',
-        'split': 'testmini',
-        'task': 'textbook question answering'},
-        'query': "Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end.\nQuestion: When a spring does work on an object, we cannot find the work by simply multiplying the spring force by the object's displacement. The reason is that there is no one value for the force-it changes. However, we can split the displacement up into an infinite number of tiny parts and then approximate the force in each as being constant. Integration sums the work done in all those parts. Here we use the generic result of the integration.\r\n\r\nIn Figure, a cumin canister of mass $m=0.40 \\mathrm{~kg}$ slides across a horizontal frictionless counter with speed $v=0.50 \\mathrm{~m} / \\mathrm{s}$. It then runs into and compresses a spring of spring constant $k=750 \\mathrm{~N} / \\mathrm{m}$. When the canister is momentarily stopped by the spring, by what distance $d$ is the spring compressed?"
+        'question': "When a spring does work on an object, we cannot find the work by simply multiplying the spring force by the object's displacement. The reason is that there is no one value for the force-it changes. However, we can split the displacement up into an infinite number of tiny parts and then approximate the force in each as being constant. Integration sums the work done in all those parts. Here we use the generic result of the integration.\r\n\r\nIn Figure, a cumin canister of mass $m=0.40 \\mathrm{~kg}$ slides across a horizontal frictionless counter with speed $v=0.50 \\mathrm{~m} / \\mathrm{s}$. It then runs into and compresses a spring of spring constant $k=750 \\mathrm{~N} / \\mathrm{m}$. When the canister is momentarily stopped by the spring, by what distance $d$ is the spring compressed?\nANSWER TYPE: float"
     }
         
     """
     def __init__(self):
         data = load_dataset("AI4Math/MathVista", split="testmini")
         self.data = data
+        self.save_images_to_local(dataset='math_vista', split='testmini', key_list=['decoded_image'])
 
     def __getitem__(self, idx):
         assert len(self.data) > 0, "Empty dataset. Please load data first."
         return {"images": [self.data[idx]['decoded_image']],
+                "image_paths": self.local_path_dict[str(idx)],
                 "answer": self.data[idx]['answer'],
                 "question":  self.data[idx]['question'] + "\nANSWER TYPE: " + self.data[idx]['answer_type'],}
 
@@ -790,27 +807,27 @@ class AI2D(Dataset):
 
     Example data format:
     {
-        'question': 'which of these define dairy item',
-        'options': ['c', 'D', 'b', 'a'],
-        'answer': '1',
-        'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=600x449>
+        'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGB size=600x449>],
+        'image_paths': ['/Users/username/promptbench/promptbench/data/ai2d/test/0_image.png'],
+        'question': 'which of these define dairy item\n0: c\n1: D\n2: b\n3: a',
+        'answer': '1'
     }
     """
     def __init__(self):
         data = load_dataset("lmms-lab/ai2d", split="test")
-        self.data = []
-
-        for d in data:
-            choices_dict = dict(enumerate(d['options']))
-            choices = ''
-            for k, v in choices_dict.items():
-                choices += f"\n{k}: {v}"
+        self.data = data
+        self.save_images_to_local(dataset='ai2d', split='test', key_list=['image'])
 
-            self.data.append({
-                "images": [d['image']],
-                "question": d['question'] + choices,
-                "answer": d['answer']
-            })
+    def __getitem__(self, idx):
+        """Return sample ``idx``; the question gets one '<index>: <option>' line appended per option."""
+        assert len(self.data) > 0, "Empty dataset. Please load data first."
+        row = self.data[idx]  # fetch the row once instead of once per field
+        # Options are labelled by their position: 0, 1, 2, ...
+        choices = ''.join(f"\n{k}: {v}" for k, v in enumerate(row['options']))
+        return {"images": [row['image']],
+                "image_paths": self.local_path_dict[str(idx)],
+                "question": row['question'] + choices,
+                "answer": row['answer']}
 
 class ChartQA(Dataset):
     """
@@ -823,19 +840,21 @@ class ChartQA(Dataset):
 
     Example data format:
     {
-        'type': 'human_test',
-        'question': 'How many food item is shown in the bar graph?',
+        'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=850x600>],
+        'image_paths': ['/Users/username/promptbench/promptbench/data/chart_qa/test/0_image.png'],
         'answer': '14',
-        'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=850x600>
+        'question': 'How many food item is shown in the bar graph?'
     }
     """
     def __init__(self):
         data = load_dataset("lmms-lab/ChartQA", split="test")
         self.data = data
+        self.save_images_to_local(dataset='chart_qa', split='test', key_list=['image'])
     
     def __getitem__(self, idx):
         assert len(self.data) > 0, "Empty dataset. Please load data first."
         return {"images": [self.data[idx]['image']],
+                "image_paths": self.local_path_dict[str(idx)],
                 "answer": self.data[idx]['answer'],
                 "question":  self.data[idx]['question'],}
 
@@ -850,38 +869,31 @@ class ScienceQA(Dataset):
 
     Example data format:
     {
-        'image': None,
-        'question': 'Which figure of speech is used in this text?\nSing, O goddess, the anger of Achilles son of Peleus, that brought countless ills upon the Achaeans.\n—Homer, The Iliad',
-        'choices': ['chiasmus', 'apostrophe'],
+        'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGB size=202x202>],
+        'image_paths': ['/Users/username/promptbench/promptbench/data/science_qa/validation/0_image.png'],
         'answer': 1,
-        'hint': '',
-        'task': 'closed choice',
-        'grade': 'grade11',
-        'subject': 'language science',
-        'topic': 'figurative-language',
-        'category': 'Literary devices',
-        'skill': 'Classify the figure of speech: anaphora, antithesis, apostrophe, assonance, chiasmus, understatement',
-        'lecture': 'Figures of speech are words or phrases that use language in a nonliteral or unusual way. They can make writing more expressive.\nAnaphora is the repetition of the same word or words at the beginning of several phrases or clauses.\nWe are united. We are powerful. We are winners.\nAntithesis involves contrasting opposing ideas within a parallel grammatical structure.\nI want to help, not to hurt.\nApostrophe is a direct address to an absent person or a nonhuman entity.\nOh, little bird, what makes you sing so beautifully?\nAssonance is the repetition of a vowel sound in a series of nearby words.\nTry to light the fire.\nChiasmus is an expression in which the second half parallels the first but reverses the order of words.\nNever let a fool kiss you or a kiss fool you.\nUnderstatement involves deliberately representing something as less serious or important than it really is.\nAs you know, it can get a little cold in the Antarctic.',
-        'solution': 'The text uses apostrophe, a direct address to an absent person or a nonhuman entity.\nO goddess is a direct address to a goddess, a nonhuman entity.'}
+        'question': "Which animal's mouth is also adapted for bottom feeding?\n0: discus\n1: armored catfish"
     }
     """
     def __init__(self):
         data = load_dataset("derek-thomas/ScienceQA", split="validation")
-        self.data = []
-
+        data_with_images = []
         for d in data:
             if d['image'] is not None:
+                data_with_images.append(d)
+        self.data = data_with_images
+        self.save_images_to_local(dataset='science_qa', split='validation', key_list=['image'])
 
-                choices_dict = dict(enumerate(d['choices']))
-                choices = ''
-                for k, v in choices_dict.items():
-                    choices += f"\n{k}: {v}"
-
-                self.data.append({
-                    "images": [d['image']],
-                    "question": d['question'] + choices,
-                    "answer": d['answer']
-                })
+    def __getitem__(self, idx):
+        """Return sample ``idx``; each choice is appended to the question on its own '<index>: <choice>' line."""
+        assert len(self.data) > 0, "Empty dataset. Please load data first."
+        sample = self.data[idx]
+        # Choices are labelled by their position: 0, 1, 2, ...
+        numbered = [f"\n{pos}: {text}" for pos, text in enumerate(sample['choices'])]
+        return {"images": [sample['image']],
+                "image_paths": self.local_path_dict[str(idx)],
+                "answer": sample['answer'],
+                "question":  sample['question'] + ''.join(numbered),}
 
 class MMMU(Dataset):
     """
@@ -893,43 +905,35 @@ class MMMU(Dataset):
     MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI (https://arxiv.org/abs/2311.16502)
 
     {
-        'id': 'validation_Accounting_1',
-        'question': '<image 1> Baxter Company has a relevant range of production between 15,000 and 30,000 units. The following cost data represents average variable costs per unit for 25,000 units of production. If 30,000 units are produced, what are the per unit manufacturing overhead costs incurred?',
-        'options': "['$6', '$7', '$8', '$9']",
-        'explanation': '',
-        'image_1': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=733x237>,
-        'image_2': None,
-        'image_3': None,
-        'image_4': None,
-        'image_5': None,
-        'image_6': None,
-        'image_7': None,
-        'img_type': "['Tables']",
+        'images': [<PIL.PngImagePlugin.PngImageFile image mode=RGBA size=733x237>],
+        'image_paths': ['/Users/username/promptbench/promptbench/data/mmmu/validation/0_image_1.png'],
         'answer': 'B',
-        'topic_difficulty': 'Medium',
-        'question_type': 'multiple-choice',
-        'subfield': 'Managerial Accounting'
+        'question': '<image 1> Baxter Company has a relevant range of production between 15,000 and 30,000 units. The following cost data represents average variable costs per unit for 25,000 units of production. If 30,000 units are produced, what are the per unit manufacturing overhead costs incurred?\nA: $6\nB: $7\nC: $8\nD: $9'
     }
     """
     def __init__(self):
         data = load_dataset("lmms-lab/MMMU", split="validation")
-        self.data = []
+        self.data = data
+        self.save_images_to_local(dataset='mmmu', split='validation', key_list=['image_1', 'image_2', 'image_3', 'image_4', 'image_5', 'image_6', 'image_7'])
 
-        for d in data:
-                
-            choices_dict = dict(enumerate(eval(d['options'])))
-            choices = ''
-            for k, v in choices_dict.items():
-                choices += f"\n{chr(ord('A') + int(k))}: {v}"
-            question = d['question'] + choices
-
-            images = []
-            for i in range(1, 7):
-                if f'image {i}' in question:
-                    if d[f'image_{i}'].mode == 'P':
-                        d[f'image_{i}'] = d[f'image_{i}'].convert('RGBA')
-                    images.append(d[f'image_{i}'])
-
-            self.data.append({"images": images,
-                              "answer": d['answer'],
-                              "question": question})
\ No newline at end of file
+    def __getitem__(self, idx):
+        """Return sample ``idx``: lettered options appended to the question, plus the images it references."""
+        from ast import literal_eval  # 'options' is a stringified list; parse it without executing code
+        assert len(self.data) > 0, "Empty dataset. Please load data first."
+
+        d = self.data[idx]
+        options = literal_eval(d['options'])
+        question = d['question'] + ''.join(f"\n{chr(ord('A') + k)}: {v}" for k, v in enumerate(options))
+
+        images = []
+        # Samples carry image_1 .. image_7 (the keys saved in __init__), so the scan must reach 7.
+        for i in range(1, 8):
+            if f'image {i}' in question:
+                if d[f'image_{i}'].mode == 'P':
+                    d[f'image_{i}'] = d[f'image_{i}'].convert('RGBA')
+                images.append(d[f'image_{i}'])
+
+        return {"images": images,
+                "image_paths": self.local_path_dict[str(idx)],
+                "answer": d['answer'],
+                "question": question,}
\ No newline at end of file