From 67b1a96039ef22166c3d038a497d8c3b0164b0fc Mon Sep 17 00:00:00 2001 From: KainingYing Date: Mon, 17 Jun 2024 20:24:55 +0800 Subject: [PATCH 1/5] update mmt-bench --- run.py | 6 + vlmeval/evaluate/__init__.py | 1 + vlmeval/evaluate/mmtbench_eval.py | 364 ++++++++++++++++++++++++++++++ vlmeval/utils/dataset_config.py | 17 ++ 4 files changed, 388 insertions(+) create mode 100644 vlmeval/evaluate/mmtbench_eval.py diff --git a/run.py b/run.py index ba3153a4..fe1eb268 100644 --- a/run.py +++ b/run.py @@ -119,6 +119,12 @@ def main(): judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']): judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] + + if rank == 0: + if 'MMT-Bench_ALL' in dataset_name: + submission_file = MMTBench_result_transfer(result_file, dataset=dataset_name, **judge_kwargs) + logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation (https://eval.ai/web/challenges/challenge-page/2328/overview), submission file saved in {submission_file}') # noqa: E501 + continue if rank == 0 and args.mode == 'all': if DATASET_TYPE(dataset_name) == 'multi-choice': diff --git a/vlmeval/evaluate/__init__.py b/vlmeval/evaluate/__init__.py index 10248c4b..fad24540 100644 --- a/vlmeval/evaluate/__init__.py +++ b/vlmeval/evaluate/__init__.py @@ -7,3 +7,4 @@ from .llavabench import LLaVABench_eval from .misc import build_judge from .OCRBench import OCRBench_eval +from .mmtbench_eval import * diff --git a/vlmeval/evaluate/mmtbench_eval.py b/vlmeval/evaluate/mmtbench_eval.py new file mode 100644 index 00000000..4deb70dd --- /dev/null +++ b/vlmeval/evaluate/mmtbench_eval.py @@ -0,0 +1,364 @@ +import os.path as osp + +import pandas as pd +from tqdm import tqdm +import numpy as np + +from vlmeval.evaluate.misc import build_judge +from vlmeval.utils import can_infer, track_progress_rich, TSVDataset +from vlmeval.smp import * + + +abbrs = { + "visual_recognition": "VR", + "localization": "Loc", + "ocr": "OCR", + "counting": "Count", + "hallucination": "HLN", + "image_retrieval": "IR", + "threed": "3D", + "visual_captioning": "VC", + "visual_grounding": "VG", + "doc_understanding": "DU", + "action_recognition": "AR", + "pixel_level_perception": "PLP", + "image-to-image_translation": "I2IT", + "relation_reasoning": "RR", + "intelligence_quotient_test": "IQT", + "emotion": "Emo", + "visual_illusion": "VI", + "meme_understanding": "MemU", + "visual_prompt_understanding": "VPU", + "anomaly_detection": "AND", + "keypoint_detection": "KD", + "visual_commonsense_reasoning": "VCR", + "image_evaluation_judgement": "IEJ", + "multiple_image_analysis": "MIA", + "cross_image_matching": "CIM", + "temporal_understanding": "TU", + "visual_code": "VP", + "medical_understanding": "MedU", + "autonomous_driving": "AUD", + "discipline_knowledge_reasoning": "DKR", + "embodied_ai": "EA", + "gui_navigation": "GN" +} + + +def report_acc(df): + # assert group in [None, 'category', 'l2-category'] + res = defaultdict(list) + res['split'] = list() + res['Overall'] = list() + for _, name in abbrs.items(): + res[name] = list() + + if 'split' in df: + splits = list(set(df['split'])) + res['split'] = splits + + else: + df['split'] = ['none'] * len(df) + res['split'] = ['none'] + + for group in [None, 'category', 'l2-category']: + if group is None: + res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']] + res['Overall'].extend([np.mean(df['hit'])]) + elif group not in df: + continue + 
elif group == "category": + abilities = list(set(df[group])) + abilities.sort() + for ab in abilities: + ab_name = ab + sub_df = df[df[group] == ab] + res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']] + res[ab_name].extend([np.mean(sub_df['hit'])]) + else: + abilities = list(set(df[group])) + abilities.sort() + for ab in abilities: + sub_task_name_list = df[df['l2-category'] == ab]['category'].unique() + sub_task_acc = [] + for sub_task_name in sub_task_name_list: + sub_df = df[df['category'] == sub_task_name] + sub_task_acc.append([np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]) + + new_acc = [] + for i in range(len(sub_task_acc[0])): + new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) + ab_name = abbrs[ab] if ab in abbrs else ab + res[ab_name] = new_acc + + sub_task_acc = [] + for sub_task_name in sub_task_name_list: + sub_df = df[df['category'] == sub_task_name] + sub_task_acc.append([np.mean(sub_df['hit'])]) + new_acc = [] + for i in range(len(sub_task_acc[0])): + new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) + + res[ab_name].extend(new_acc) + + res['split'].append('ALL') + return pd.DataFrame(res) + + +def build_choices(item): + ret = {} + for ch in string.ascii_uppercase: + if ch in item and (not pd.isna(item[ch])): + ret[ch] = item[ch] + return ret + + +def prefetch_answer(item): + choices = build_choices(item) + return can_infer(item['prediction'], choices) + + +def prefetch_sub_data(sub_data, answer_map, verbose=False): + lt = len(sub_data) + GT, PRED = [], [] + for i in range(lt): + item = sub_data.iloc[i] + idx = item['index'] + GT.append(answer_map[idx]) + PRED.append(prefetch_answer(item)) + if PRED[-1] and (PRED[-1] in string.ascii_uppercase): + + return dict(opt=PRED[-1]) + flag = True + for p in PRED: + if not p: + flag = False + ret = (dict(opt=PRED[-1]), ) if flag else (None, ) + ret = ret + (GT, PRED) if verbose else ret + return ret if len(ret) > 1 else ret[0] + + +def eval_sub_data(model, sub_data, answer_map): + res, GT, PRED = prefetch_sub_data(sub_data, answer_map, verbose=True) + if res is not None: + return res + + lt = len(sub_data) + log = '' + for i in range(lt): + if PRED[i]: + log += f'Rolling {i} Matched.\n' + else: + res = extract_answer_from_item(model, sub_data.iloc[i]) + opt, match_log = res['opt'], res['log'] + PRED[i] = opt + return dict(opt = opt) + # if PRED[i] != GT[i]: + # # log += ( + # # f"Failed in Rolling {i}: Answer is {GT[i]}; Prediction is {sub_data.iloc[i]['prediction']}; " + # # f'Pre-fetched is {PRED[i]}; Match Log is {match_log}.\n' + # # ) + # return dict(opt = opt) + # else: + # # log += ( + # # f"Rolling {i}: Answer is {GT[i]}, Prediction is {sub_data.iloc[i]['prediction']}, " + # # f'Pre-fetched is {PRED[i]}.\n' + # # ) + pass + + return dict(opt = opt) + + +def extract_answer_from_item(model, item): + logger = get_logger('Evaluation') + # It will return: (pred, raw, llm_time) + choices = build_choices(item) + option_str = build_option_str(choices) + + if cn_string(item['question']): + prompt = build_prompt_cn(item['question'], option_str, item['prediction']) + else: + prompt = build_prompt(item['question'], option_str, item['prediction']) + retry = 3 + + ret = can_infer(item['prediction'], choices) + if ret: + return dict(opt=ret, log=item['prediction']) + + while retry: + ans = model.generate(prompt) + if 'Failed to obtain answer via API' in ans: + logger.warning('GPT API failed to answer. 
') + else: + ret = can_infer(ans, choices) + if ret: + return dict(opt=ret, log=ans) + else: + logger.warning(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}') + retry -= 1 + + if retry == 0: + options = list(choices) + ['Z'] if 'Z' not in choices else [] + return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. ') + + +def build_prompt(question, options, prediction): + tmpl = ( + 'You are an AI assistant who will help me to match ' + 'an answer with several options of a single-choice question. ' + 'You are provided with a question, several options, and an answer, ' + 'and you need to find which option is most similar to the answer. ' + 'If the meaning of all options are significantly different from the answer, output Z. ' + 'Your should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n' + 'Example 1: \n' + 'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n' + 'Answer: a cute teddy bear\nYour output: A\n' + 'Example 2: \n' + 'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n' + 'Answer: Spider\nYour output: Z\n' + 'Example 3: \n' + 'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: ' + ) + return tmpl.format(question, options, prediction) + + +def eval_data_groups(model, data_groups, answer_map, result, result_file, nproc=16): + prefetched = [prefetch_sub_data(g, answer_map, verbose=False) for g in data_groups] + remain = [] + for dg, pf in zip(data_groups, prefetched): + if pf: + result[dg.iloc[0]['index'] % 1e6] = pf + else: + remain.append(dg) + dump(result, result_file) + tups = [(model, x, answer_map) for x in remain] + keys = [x.iloc[0]['index'] % 1e6 for x in remain] + if len(tups) == 0: + return + + assert model + + res = track_progress_rich( + eval_sub_data, + tups, + nproc=nproc, + chunksize=nproc, + save=result_file, + keys=keys) + result = load(result_file) + for k, v in zip(keys, res): + if k in result: + # assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log'] + pass + else: + result[k] = v + dump(result, result_file) + + + +def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs): + logger = get_logger('Evaluation') + INTERNAL = os.environ.get('INTERNAL', 0) + + nproc = judge_kwargs.pop('nproc', 4) + + rd.seed(2680) + suffix = eval_file.split('.')[-1] + model = judge_kwargs['model'] + assert model in ['chatgpt-0613', 'exact_matching', 'gpt-4-0125'] + name_str_map = { + 'chatgpt-0613': 'openai', + 'gpt-4-0125': 'gpt4' + } + name_str = name_str_map[model] if model in name_str_map else model + + if model == 'exact_matching': + model = None + else: + if INTERNAL or gpt_key_set(): + model = build_judge(**judge_kwargs) + else: + logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + + logger.info(f'Evaluating {eval_file}') + result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl') + result = {} + if osp.exists(result_file): + result = load(result_file) + + data = load(eval_file) + data = data.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + for k in data.keys(): + data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) + + if dataset != 'default': + meta = TSVDataset(dataset).data + else: + logger.warning('Dataset is not provided, try to use the original `eval_file` as meta data. 
') + meta = load(eval_file) + assert 'index' in meta and 'answer' in meta, 'Essentail columns missing in the eval_file.' + + answer_map = {i: 'A' for i, c in zip(meta['index'], meta['index'])} # 123 + cate_map = {i: c for i, c in zip(meta['index'], meta['category'])} if 'category' in meta else None + l2_cate_map = {i: c for i, c in zip(meta['index'], meta['l2-category'])} if 'l2-category' in meta else None + split_map = {i: c for i, c in zip(meta['index'], meta['split'])} if 'split' in meta else None + + if cate_map is not None and np.all([pd.isna(x) for x in cate_map.values()]): + cate_map = None + if l2_cate_map is not None and np.all([pd.isna(x) for x in l2_cate_map.values()]): + l2_cate_map = None + if split_map is not None and np.all([pd.isna(x) for x in split_map.values()]): + split_map = None + + data = data[data['index'].isin(cate_map)] + data_main = data[data['index'] < int(1e6)] + meta_idx_set = set(meta['index']) + data_main = data_main[data_main['index'].isin(meta_idx_set)] + + lt = len(data_main) + + data_groups = [] + for i in tqdm(range(lt)): + # Dealing with the normal part + item_main = data_main.iloc[i] + idx = item_main['index'] + + if idx in result: + continue + + sub_data = data[data['index'] % int(1e6) == idx] + data_groups.append(sub_data) + + if len(data_groups): + eval_data_groups( + model=model, + data_groups=data_groups, + answer_map=answer_map, + nproc=nproc, + result=result, + result_file=result_file) + + tmp_pth = f'/tmp/{timestr()}.xlsx' + dump(data_main, tmp_pth) + data_main = load(tmp_pth) + + res = load(result_file) + indices = data_main['index'] + + data_main['opt'] = [res[i]['opt'] for i in indices] + # data_main['log'] = [res[i]['log'] for i in indices] + + main_idx = data_main['index'] + if cate_map is not None: + data_main['category'] = [cate_map[i] for i in main_idx] + if l2_cate_map is not None: + data_main['l2-category'] = [l2_cate_map[i] for i in main_idx] + if split_map is not None: + data_main['split'] = [split_map[i] for i in indices] + + # load split + output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv') + dump(data_main, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')) + return output_path diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py index 8b13eab5..9f2e7a6f 100644 --- a/vlmeval/utils/dataset_config.py +++ b/vlmeval/utils/dataset_config.py @@ -44,6 +44,11 @@ 'MMStar': 'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv', 'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv', 'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv', + # MMT-Bench + 'MMT-Bench_ALL_MI': 'xxxxxxxxxxxxxxxxxx', # FULL Split, evaluated on server. Multi-image samples are inputted as separate images. + 'MMT-Bench_ALL': 'xxxxxxxxxxxxxxxxxxxxx', # FULL Split, evaluated on server. ulti-image samples are merged into a single large image for processing. + 'MMT-Bench_VAL_MI': 'xxxxxxxxxxxxxxxxxx', # VAL Split (10%), evaluated on local. Multi-image samples are inputted as separate images. + 'MMT-Bench_VAL': 'xxxxxxxxxxxxxxxxxxxxxx' # VAL Split (10%), evaluated on local. ulti-image samples are merged into a single large image for processing. 
} dataset_md5_dict = { @@ -89,6 +94,11 @@ 'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e', 'RealWorldQA': '92321028d2bc29040284b6674721e48f', 'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5', + # MMT-Bench + 'MMT-Bench_ALL_MI': '5272157097e19cdd7cb41e412ab3b7c7', + 'MMT-Bench_ALL': 'b273a2f4c596fe4f2605de0494cd632f', + 'MMT-Bench_VAL_MI': 'c7d7b998eb5cd9aa36c7d4f721472462', + 'MMT-Bench_VAL': '8dd4b730f53dbf9c3aed90ca31c928e0' } img_root_map = {k: k for k in dataset_URLs} @@ -116,6 +126,11 @@ 'MathVista_MINI': 'MathVista', 'HallusionBench': 'Hallusion', 'DocVQA_VAL': 'DocVQA', + # MMT-Bench + 'MMT-Bench_ALL_MI': 'MMT-Bench', + 'MMT-Bench_ALL': 'MMT-Bench', + 'MMT-Bench_VAL_MI': 'MMT-Bench', + 'MMT-Bench_VAL': 'MMT-Bench' }) assert set(dataset_URLs) == set(img_root_map) @@ -126,6 +141,8 @@ def DATASET_TYPE(dataset): dataset = dataset.lower() if listinstr(['mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d', 'mmstar', 'realworldqa'], dataset): return 'multi-choice' + elif 'mmt-bench' in dataset: + return 'multi-choice' elif listinstr(['mme', 'hallusion', 'pope'], dataset): return 'Y/N' elif 'coco' in dataset: From 7b03d4a8a1c582d1b846e189a96fbeac6d8bca43 Mon Sep 17 00:00:00 2001 From: KainingYing Date: Mon, 17 Jun 2024 21:05:04 +0800 Subject: [PATCH 2/5] remove the comments --- vlmeval/utils/dataset_config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py index 9f2e7a6f..b2df6796 100644 --- a/vlmeval/utils/dataset_config.py +++ b/vlmeval/utils/dataset_config.py @@ -45,10 +45,10 @@ 'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv', 'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv', # MMT-Bench - 'MMT-Bench_ALL_MI': 'xxxxxxxxxxxxxxxxxx', # FULL Split, evaluated on server. Multi-image samples are inputted as separate images. - 'MMT-Bench_ALL': 'xxxxxxxxxxxxxxxxxxxxx', # FULL Split, evaluated on server. ulti-image samples are merged into a single large image for processing. - 'MMT-Bench_VAL_MI': 'xxxxxxxxxxxxxxxxxx', # VAL Split (10%), evaluated on local. Multi-image samples are inputted as separate images. - 'MMT-Bench_VAL': 'xxxxxxxxxxxxxxxxxxxxxx' # VAL Split (10%), evaluated on local. ulti-image samples are merged into a single large image for processing. 
+ 'MMT-Bench_ALL_MI': 'xxxxxxxxxxxxxxxxxx', + 'MMT-Bench_ALL': 'xxxxxxxxxxxxxxxxxxxxx', + 'MMT-Bench_VAL_MI': 'xxxxxxxxxxxxxxxxxx', + 'MMT-Bench_VAL': 'xxxxxxxxxxxxxxxxxxxxxx' } dataset_md5_dict = { From 41a9b7038d6cb2b45a1a5dedc9157b79595252ea Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Mon, 17 Jun 2024 22:25:01 +0800 Subject: [PATCH 3/5] update MMTBench --- run.py | 12 +- vlmeval/evaluate/__init__.py | 1 - vlmeval/evaluate/mmtbench_eval.py | 364 ---------------------------- vlmeval/evaluate/multiple_choice.py | 115 ++++++++- vlmeval/utils/__init__.py | 5 +- vlmeval/utils/dataset.py | 24 -- vlmeval/utils/dataset_config.py | 15 +- vlmeval/utils/result_transfer.py | 97 ++++++++ 8 files changed, 225 insertions(+), 408 deletions(-) delete mode 100644 vlmeval/evaluate/mmtbench_eval.py create mode 100644 vlmeval/utils/result_transfer.py diff --git a/run.py b/run.py index fe1eb268..e572b724 100644 --- a/run.py +++ b/run.py @@ -4,7 +4,7 @@ from vlmeval.evaluate import * from vlmeval.inference import infer_data_job from vlmeval.config import supported_VLM -from vlmeval.utils import dataset_URLs, DATASET_TYPE, abbr2full, MMMU_result_transfer +from vlmeval.utils import dataset_URLs, DATASET_TYPE, abbr2full, MMMU_result_transfer, MMTBench_result_transfer def parse_args(): @@ -90,6 +90,10 @@ def main(): result_json = MMMU_result_transfer(result_file) logger.info(f'Transfer MMMU_TEST result to json for official evaluation, json file saved in {result_json}') # noqa: E501 continue + elif 'MMT-Bench_ALL' in dataset_name: + submission_file = MMTBench_result_transfer(result_file, **judge_kwargs) + logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation (https://eval.ai/web/challenges/challenge-page/2328/overview), submission file saved in {submission_file}') # noqa: E501 + continue if dataset_name in [ 'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN' @@ -119,12 +123,6 @@ def main(): judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']): judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] - - if rank == 0: - if 'MMT-Bench_ALL' in dataset_name: - submission_file = MMTBench_result_transfer(result_file, dataset=dataset_name, **judge_kwargs) - logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation (https://eval.ai/web/challenges/challenge-page/2328/overview), submission file saved in {submission_file}') # noqa: E501 - continue if rank == 0 and args.mode == 'all': if DATASET_TYPE(dataset_name) == 'multi-choice': diff --git a/vlmeval/evaluate/__init__.py b/vlmeval/evaluate/__init__.py index fad24540..10248c4b 100644 --- a/vlmeval/evaluate/__init__.py +++ b/vlmeval/evaluate/__init__.py @@ -7,4 +7,3 @@ from .llavabench import LLaVABench_eval from .misc import build_judge from .OCRBench import OCRBench_eval -from .mmtbench_eval import * diff --git a/vlmeval/evaluate/mmtbench_eval.py b/vlmeval/evaluate/mmtbench_eval.py deleted file mode 100644 index 4deb70dd..00000000 --- a/vlmeval/evaluate/mmtbench_eval.py +++ /dev/null @@ -1,364 +0,0 @@ -import os.path as osp - -import pandas as pd -from tqdm import tqdm -import numpy as np - -from vlmeval.evaluate.misc import build_judge -from vlmeval.utils import can_infer, track_progress_rich, TSVDataset -from vlmeval.smp import * - - -abbrs = { - "visual_recognition": "VR", - "localization": "Loc", - "ocr": "OCR", - "counting": "Count", - "hallucination": "HLN", - "image_retrieval": 
"IR", - "threed": "3D", - "visual_captioning": "VC", - "visual_grounding": "VG", - "doc_understanding": "DU", - "action_recognition": "AR", - "pixel_level_perception": "PLP", - "image-to-image_translation": "I2IT", - "relation_reasoning": "RR", - "intelligence_quotient_test": "IQT", - "emotion": "Emo", - "visual_illusion": "VI", - "meme_understanding": "MemU", - "visual_prompt_understanding": "VPU", - "anomaly_detection": "AND", - "keypoint_detection": "KD", - "visual_commonsense_reasoning": "VCR", - "image_evaluation_judgement": "IEJ", - "multiple_image_analysis": "MIA", - "cross_image_matching": "CIM", - "temporal_understanding": "TU", - "visual_code": "VP", - "medical_understanding": "MedU", - "autonomous_driving": "AUD", - "discipline_knowledge_reasoning": "DKR", - "embodied_ai": "EA", - "gui_navigation": "GN" -} - - -def report_acc(df): - # assert group in [None, 'category', 'l2-category'] - res = defaultdict(list) - res['split'] = list() - res['Overall'] = list() - for _, name in abbrs.items(): - res[name] = list() - - if 'split' in df: - splits = list(set(df['split'])) - res['split'] = splits - - else: - df['split'] = ['none'] * len(df) - res['split'] = ['none'] - - for group in [None, 'category', 'l2-category']: - if group is None: - res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']] - res['Overall'].extend([np.mean(df['hit'])]) - elif group not in df: - continue - elif group == "category": - abilities = list(set(df[group])) - abilities.sort() - for ab in abilities: - ab_name = ab - sub_df = df[df[group] == ab] - res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']] - res[ab_name].extend([np.mean(sub_df['hit'])]) - else: - abilities = list(set(df[group])) - abilities.sort() - for ab in abilities: - sub_task_name_list = df[df['l2-category'] == ab]['category'].unique() - sub_task_acc = [] - for sub_task_name in sub_task_name_list: - sub_df = df[df['category'] == sub_task_name] - sub_task_acc.append([np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]) - - new_acc = [] - for i in range(len(sub_task_acc[0])): - new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) - ab_name = abbrs[ab] if ab in abbrs else ab - res[ab_name] = new_acc - - sub_task_acc = [] - for sub_task_name in sub_task_name_list: - sub_df = df[df['category'] == sub_task_name] - sub_task_acc.append([np.mean(sub_df['hit'])]) - new_acc = [] - for i in range(len(sub_task_acc[0])): - new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) - - res[ab_name].extend(new_acc) - - res['split'].append('ALL') - return pd.DataFrame(res) - - -def build_choices(item): - ret = {} - for ch in string.ascii_uppercase: - if ch in item and (not pd.isna(item[ch])): - ret[ch] = item[ch] - return ret - - -def prefetch_answer(item): - choices = build_choices(item) - return can_infer(item['prediction'], choices) - - -def prefetch_sub_data(sub_data, answer_map, verbose=False): - lt = len(sub_data) - GT, PRED = [], [] - for i in range(lt): - item = sub_data.iloc[i] - idx = item['index'] - GT.append(answer_map[idx]) - PRED.append(prefetch_answer(item)) - if PRED[-1] and (PRED[-1] in string.ascii_uppercase): - - return dict(opt=PRED[-1]) - flag = True - for p in PRED: - if not p: - flag = False - ret = (dict(opt=PRED[-1]), ) if flag else (None, ) - ret = ret + (GT, PRED) if verbose else ret - return ret if len(ret) > 1 else ret[0] - - -def eval_sub_data(model, sub_data, answer_map): - res, GT, PRED = 
prefetch_sub_data(sub_data, answer_map, verbose=True) - if res is not None: - return res - - lt = len(sub_data) - log = '' - for i in range(lt): - if PRED[i]: - log += f'Rolling {i} Matched.\n' - else: - res = extract_answer_from_item(model, sub_data.iloc[i]) - opt, match_log = res['opt'], res['log'] - PRED[i] = opt - return dict(opt = opt) - # if PRED[i] != GT[i]: - # # log += ( - # # f"Failed in Rolling {i}: Answer is {GT[i]}; Prediction is {sub_data.iloc[i]['prediction']}; " - # # f'Pre-fetched is {PRED[i]}; Match Log is {match_log}.\n' - # # ) - # return dict(opt = opt) - # else: - # # log += ( - # # f"Rolling {i}: Answer is {GT[i]}, Prediction is {sub_data.iloc[i]['prediction']}, " - # # f'Pre-fetched is {PRED[i]}.\n' - # # ) - pass - - return dict(opt = opt) - - -def extract_answer_from_item(model, item): - logger = get_logger('Evaluation') - # It will return: (pred, raw, llm_time) - choices = build_choices(item) - option_str = build_option_str(choices) - - if cn_string(item['question']): - prompt = build_prompt_cn(item['question'], option_str, item['prediction']) - else: - prompt = build_prompt(item['question'], option_str, item['prediction']) - retry = 3 - - ret = can_infer(item['prediction'], choices) - if ret: - return dict(opt=ret, log=item['prediction']) - - while retry: - ans = model.generate(prompt) - if 'Failed to obtain answer via API' in ans: - logger.warning('GPT API failed to answer. ') - else: - ret = can_infer(ans, choices) - if ret: - return dict(opt=ret, log=ans) - else: - logger.warning(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}') - retry -= 1 - - if retry == 0: - options = list(choices) + ['Z'] if 'Z' not in choices else [] - return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. ') - - -def build_prompt(question, options, prediction): - tmpl = ( - 'You are an AI assistant who will help me to match ' - 'an answer with several options of a single-choice question. ' - 'You are provided with a question, several options, and an answer, ' - 'and you need to find which option is most similar to the answer. ' - 'If the meaning of all options are significantly different from the answer, output Z. ' - 'Your should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n' - 'Example 1: \n' - 'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n' - 'Answer: a cute teddy bear\nYour output: A\n' - 'Example 2: \n' - 'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. 
dog\n' - 'Answer: Spider\nYour output: Z\n' - 'Example 3: \n' - 'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: ' - ) - return tmpl.format(question, options, prediction) - - -def eval_data_groups(model, data_groups, answer_map, result, result_file, nproc=16): - prefetched = [prefetch_sub_data(g, answer_map, verbose=False) for g in data_groups] - remain = [] - for dg, pf in zip(data_groups, prefetched): - if pf: - result[dg.iloc[0]['index'] % 1e6] = pf - else: - remain.append(dg) - dump(result, result_file) - tups = [(model, x, answer_map) for x in remain] - keys = [x.iloc[0]['index'] % 1e6 for x in remain] - if len(tups) == 0: - return - - assert model - - res = track_progress_rich( - eval_sub_data, - tups, - nproc=nproc, - chunksize=nproc, - save=result_file, - keys=keys) - result = load(result_file) - for k, v in zip(keys, res): - if k in result: - # assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log'] - pass - else: - result[k] = v - dump(result, result_file) - - - -def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs): - logger = get_logger('Evaluation') - INTERNAL = os.environ.get('INTERNAL', 0) - - nproc = judge_kwargs.pop('nproc', 4) - - rd.seed(2680) - suffix = eval_file.split('.')[-1] - model = judge_kwargs['model'] - assert model in ['chatgpt-0613', 'exact_matching', 'gpt-4-0125'] - name_str_map = { - 'chatgpt-0613': 'openai', - 'gpt-4-0125': 'gpt4' - } - name_str = name_str_map[model] if model in name_str_map else model - - if model == 'exact_matching': - model = None - else: - if INTERNAL or gpt_key_set(): - model = build_judge(**judge_kwargs) - else: - logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') - model = None - - logger.info(f'Evaluating {eval_file}') - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl') - result = {} - if osp.exists(result_file): - result = load(result_file) - - data = load(eval_file) - data = data.sort_values(by='index') - data['prediction'] = [str(x) for x in data['prediction']] - for k in data.keys(): - data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) - - if dataset != 'default': - meta = TSVDataset(dataset).data - else: - logger.warning('Dataset is not provided, try to use the original `eval_file` as meta data. ') - meta = load(eval_file) - assert 'index' in meta and 'answer' in meta, 'Essentail columns missing in the eval_file.' 
- - answer_map = {i: 'A' for i, c in zip(meta['index'], meta['index'])} # 123 - cate_map = {i: c for i, c in zip(meta['index'], meta['category'])} if 'category' in meta else None - l2_cate_map = {i: c for i, c in zip(meta['index'], meta['l2-category'])} if 'l2-category' in meta else None - split_map = {i: c for i, c in zip(meta['index'], meta['split'])} if 'split' in meta else None - - if cate_map is not None and np.all([pd.isna(x) for x in cate_map.values()]): - cate_map = None - if l2_cate_map is not None and np.all([pd.isna(x) for x in l2_cate_map.values()]): - l2_cate_map = None - if split_map is not None and np.all([pd.isna(x) for x in split_map.values()]): - split_map = None - - data = data[data['index'].isin(cate_map)] - data_main = data[data['index'] < int(1e6)] - meta_idx_set = set(meta['index']) - data_main = data_main[data_main['index'].isin(meta_idx_set)] - - lt = len(data_main) - - data_groups = [] - for i in tqdm(range(lt)): - # Dealing with the normal part - item_main = data_main.iloc[i] - idx = item_main['index'] - - if idx in result: - continue - - sub_data = data[data['index'] % int(1e6) == idx] - data_groups.append(sub_data) - - if len(data_groups): - eval_data_groups( - model=model, - data_groups=data_groups, - answer_map=answer_map, - nproc=nproc, - result=result, - result_file=result_file) - - tmp_pth = f'/tmp/{timestr()}.xlsx' - dump(data_main, tmp_pth) - data_main = load(tmp_pth) - - res = load(result_file) - indices = data_main['index'] - - data_main['opt'] = [res[i]['opt'] for i in indices] - # data_main['log'] = [res[i]['log'] for i in indices] - - main_idx = data_main['index'] - if cate_map is not None: - data_main['category'] = [cate_map[i] for i in main_idx] - if l2_cate_map is not None: - data_main['l2-category'] = [l2_cate_map[i] for i in main_idx] - if split_map is not None: - data_main['split'] = [split_map[i] for i in indices] - - # load split - output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv') - dump(data_main, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')) - return output_path diff --git a/vlmeval/evaluate/multiple_choice.py b/vlmeval/evaluate/multiple_choice.py index 50d52bf2..23104d5f 100644 --- a/vlmeval/evaluate/multiple_choice.py +++ b/vlmeval/evaluate/multiple_choice.py @@ -8,7 +8,7 @@ INTERNAL = os.environ.get('INTERNAL', 0) -abbrs = { +MMB_abbrs = { 'coarse_perception': 'CP', 'finegrained_perception (instance-level)': 'FP-S', 'finegrained_perception (cross-instance)': 'FP-C', @@ -17,6 +17,41 @@ 'attribute_reasoning': 'AR' } +MMT_abbrs = { + 'visual_recognition': 'VR', + 'localization': 'Loc', + 'ocr': 'OCR', + 'counting': 'Count', + 'hallucination': 'HLN', + 'image_retrieval': 'IR', + 'threed': '3D', + 'visual_captioning': 'VC', + 'visual_grounding': 'VG', + 'doc_understanding': 'DU', + 'action_recognition': 'AR', + 'pixel_level_perception': 'PLP', + 'image-to-image_translation': 'I2IT', + 'relation_reasoning': 'RR', + 'intelligence_quotient_test': 'IQT', + 'emotion': 'Emo', + 'visual_illusion': 'VI', + 'meme_understanding': 'MemU', + 'visual_prompt_understanding': 'VPU', + 'anomaly_detection': 'AND', + 'keypoint_detection': 'KD', + 'visual_commonsense_reasoning': 'VCR', + 'image_evaluation_judgement': 'IEJ', + 'multiple_image_analysis': 'MIA', + 'cross_image_matching': 'CIM', + 'temporal_understanding': 'TU', + 'visual_code': 'VP', + 'medical_understanding': 'MedU', + 'autonomous_driving': 'AUD', + 'discipline_knowledge_reasoning': 'DKR', + 'embodied_ai': 'EA', + 'gui_navigation': 'GN' +} + def 
MMMU_preproc(data): logger = get_logger('Evaluation') @@ -54,9 +89,69 @@ def report_acc(df): abilities = list(set(df[group])) abilities.sort() for ab in abilities: - ab_name = abbrs[ab] if ab in abbrs else ab + ab_name = MMB_abbrs[ab] if ab in MMB_abbrs else ab + sub_df = df[df[group] == ab] + res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']] + return pd.DataFrame(res) + + +def report_acc_MMT(df): + # assert group in [None, 'category', 'l2-category'] + res = defaultdict(list) + res['split'] = list() + res['Overall'] = list() + for _, name in MMT_abbrs.items(): + res[name] = list() + + if 'split' in df: + splits = list(set(df['split'])) + res['split'] = splits + + else: + df['split'] = ['none'] * len(df) + res['split'] = ['none'] + + for group in [None, 'category', 'l2-category']: + if group is None: + res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']] + res['Overall'].extend([np.mean(df['hit'])]) + elif group not in df: + continue + elif group == 'category': + abilities = list(set(df[group])) + abilities.sort() + for ab in abilities: + ab_name = ab sub_df = df[df[group] == ab] res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']] + res[ab_name].extend([np.mean(sub_df['hit'])]) + else: + abilities = list(set(df[group])) + abilities.sort() + for ab in abilities: + sub_task_name_list = df[df['l2-category'] == ab]['category'].unique() + sub_task_acc = [] + for sub_task_name in sub_task_name_list: + sub_df = df[df['category'] == sub_task_name] + sub_task_acc.append([np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]) + + new_acc = [] + for i in range(len(sub_task_acc[0])): + new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) + ab_name = MMT_abbrs[ab] if ab in MMT_abbrs else ab + res[ab_name] = new_acc + + sub_task_acc = [] + for sub_task_name in sub_task_name_list: + sub_df = df[df['category'] == sub_task_name] + sub_task_acc.append([np.mean(sub_df['hit'])]) + new_acc = [] + for i in range(len(sub_task_acc[0])): + new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) + + res[ab_name].extend(new_acc) + + res['split'].append('ALL') return pd.DataFrame(res) @@ -142,6 +237,7 @@ def extract_answer_from_item(model, item): return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. 
') +# For Circular Evaluation def prefetch_sub_data(sub_data, answer_map, verbose=False): lt = len(sub_data) GT, PRED = [], [] @@ -165,6 +261,7 @@ def prefetch_sub_data(sub_data, answer_map, verbose=False): return ret if len(ret) > 1 else ret[0] +# For Circular Evaluation def eval_sub_data(model, sub_data, answer_map): res, GT, PRED = prefetch_sub_data(sub_data, answer_map, verbose=True) if res is not None: @@ -194,6 +291,7 @@ def eval_sub_data(model, sub_data, answer_map): return dict(hit=1, log=log) +# For Circular Evaluation def eval_data_groups(model, data_groups, answer_map, result, result_file, nproc=16): prefetched = [prefetch_sub_data(g, answer_map, verbose=False) for g in data_groups] remain = [] @@ -269,6 +367,7 @@ def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs): logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None + # Load finished evaluation results logger.info(f'Evaluating {eval_file}') result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') result = {} @@ -278,9 +377,11 @@ def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs): data = load(eval_file) data = data.sort_values(by='index') data['prediction'] = [str(x) for x in data['prediction']] + # If not choice label, then use lower case for k in data.keys(): data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) + # Load meta data: when dataset is `default`, will use eval_file as meta data if dataset != 'default': meta = TSVDataset(dataset).data else: @@ -288,6 +389,7 @@ def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs): meta = load(eval_file) assert 'index' in meta and 'answer' in meta, 'Essentail columns missing in the eval_file.' + # Build Answer / Category / L2-Category / Split Map answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])} cate_map = {i: c for i, c in zip(meta['index'], meta['category'])} if 'category' in meta else None l2_cate_map = {i: c for i, c in zip(meta['index'], meta['l2-category'])} if 'l2-category' in meta else None @@ -300,10 +402,12 @@ def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs): if split_map is not None and np.all([pd.isna(x) for x in split_map.values()]): split_map = None + # Change MMMU open-ended questions to multiple-choice ones for evaluation if listinstr(['MMMU'], dataset): data = MMMU_preproc(data) answer_map = {k: (v if v in list(string.ascii_uppercase) else 'A') for k, v in answer_map.items()} + # Only keep those lines in the meta data data = data[data['index'].isin(answer_map)] data_main = data[data['index'] < int(1e6)] meta_idx_set = set(meta['index']) @@ -359,7 +463,12 @@ def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs): dump(data_main, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) data_main = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) - acc = report_acc(data_main) + # May have different report acc functions for different datasets + if 'MMT' in dataset: + acc = report_acc_MMT(data_main) + else: + acc = report_acc(data_main) + score_file = eval_file.replace(f'.{suffix}', '_acc.csv') dump(acc, score_file) logger.info(f'multiple_choice_eval successfully finished evaluating {eval_file}, results saved in {score_file}') diff --git a/vlmeval/utils/__init__.py b/vlmeval/utils/__init__.py index 93036789..32fdd38b 100644 --- a/vlmeval/utils/__init__.py +++ b/vlmeval/utils/__init__.py @@ -2,11 +2,12 @@ from .mp_util import track_progress_rich 
from .custom_prompt import CustomPrompt from .dataset_config import dataset_URLs, img_root_map, DATASET_TYPE, abbr2full -from .dataset import TSVDataset, split_MMMU, MMMU_result_transfer +from .dataset import TSVDataset, split_MMMU +from .result_transfer import MMMU_result_transfer, MMTBench_result_transfer __all__ = [ 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 'TSVDataset', 'dataset_URLs', 'img_root_map', 'DATASET_TYPE', 'CustomPrompt', - 'split_MMMU', 'abbr2full' + 'split_MMMU', 'abbr2full', 'MMMU_result_transfer', 'MMTBench_result_transfer' ] diff --git a/vlmeval/utils/dataset.py b/vlmeval/utils/dataset.py index bf4909fa..6b903602 100644 --- a/vlmeval/utils/dataset.py +++ b/vlmeval/utils/dataset.py @@ -3,7 +3,6 @@ from ..smp import * from .dataset_config import dataset_URLs, dataset_md5_dict, DATASET_TYPE from .custom_prompt import CustomPrompt -from .matching_util import can_infer def isliststr(s): @@ -46,29 +45,6 @@ def split_MMMU(msgs): return segs -def MMMU_result_transfer(result_path): - res = {} - result_data = load(result_path) - mcq = result_data['A'].notna() - lt = len(result_data) - for i in range(lt): - line = result_data.iloc[i] - if mcq[i]: - options = { - cand: line[cand] - for cand in string.ascii_uppercase - if cand in line and not pd.isna(line[cand]) - } - prediction = line['prediction'] - infer_prediction = can_infer(prediction, options) - res[line['id']] = infer_prediction - else: - res[line['id']] = line['prediction'] - result_json = result_path.replace('.xlsx', '.json') - dump(res, result_json) - return result_json - - class TSVDataset(CustomPrompt): def __init__(self, dataset='MMBench', skip_noimg=True): diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py index b2df6796..813c0380 100644 --- a/vlmeval/utils/dataset_config.py +++ b/vlmeval/utils/dataset_config.py @@ -45,10 +45,10 @@ 'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv', 'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv', # MMT-Bench - 'MMT-Bench_ALL_MI': 'xxxxxxxxxxxxxxxxxx', - 'MMT-Bench_ALL': 'xxxxxxxxxxxxxxxxxxxxx', - 'MMT-Bench_VAL_MI': 'xxxxxxxxxxxxxxxxxx', - 'MMT-Bench_VAL': 'xxxxxxxxxxxxxxxxxxxxxx' + 'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL_MI.tsv', + 'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL.tsv', + 'MMT-Bench_VAL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL_MI.tsv', + 'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL.tsv', } dataset_md5_dict = { @@ -139,9 +139,10 @@ def DATASET_TYPE(dataset): # Dealing with Custom Dataset dataset = dataset.lower() - if listinstr(['mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d', 'mmstar', 'realworldqa'], dataset): - return 'multi-choice' - elif 'mmt-bench' in dataset: + if listinstr([ + 'mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d', + 'mmstar', 'realworldqa', 'mmt-bench' + ], dataset): return 'multi-choice' elif listinstr(['mme', 'hallusion', 'pope'], dataset): return 'Y/N' diff --git a/vlmeval/utils/result_transfer.py b/vlmeval/utils/result_transfer.py new file mode 100644 index 00000000..7de633eb --- /dev/null +++ b/vlmeval/utils/result_transfer.py @@ -0,0 +1,97 @@ +from ..evaluate.misc import build_judge +from ..evaluate.multiple_choice import extract_answer_from_item + +from ..smp import * +from .matching_util import can_infer +from .mp_util import track_progress_rich + + +def 
MMMU_result_transfer(result_path): + res = {} + result_data = load(result_path) + mcq = result_data['A'].notna() + lt = len(result_data) + for i in range(lt): + line = result_data.iloc[i] + if mcq[i]: + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + prediction = line['prediction'] + infer_prediction = can_infer(prediction, options) + res[line['id']] = infer_prediction + else: + res[line['id']] = line['prediction'] + result_json = result_path.replace('.xlsx', '.json') + dump(res, result_json) + return result_json + + +def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs): + logger = get_logger('Evaluation') + INTERNAL = os.environ.get('INTERNAL', 0) + nproc = judge_kwargs.pop('nproc', 4) + + rd.seed(2680) + suffix = eval_file.split('.')[-1] + model = judge_kwargs['model'] + assert model in ['chatgpt-0613', 'exact_matching', 'gpt-4-0125'] + name_str_map = { + 'chatgpt-0613': 'openai', + 'gpt-4-0125': 'gpt4' + } + name_str = name_str_map[model] if model in name_str_map else model + + if model == 'exact_matching': + model = None + else: + if INTERNAL or gpt_key_set(): + model = build_judge(**judge_kwargs) + else: + logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + + logger.info(f'Evaluating {eval_file}') + result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl') + result = {} + if osp.exists(result_file): + result = load(result_file) + + data = load(eval_file) + assert 'index' in data, 'Essentail columns missing in the eval_file.' + + data = data.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + for k in data.keys(): + data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) + + idx2lines = {data.iloc[i]['index']: data.iloc[i] for i in range(len(data))} + idx2lines = {k: v for k, v in idx2lines.items() if k not in result} + + indices = list(idx2lines.keys()) + lines = [idx2lines[i] for i in indices] + tups = [(model, line) for line in lines] + res = track_progress_rich( + extract_answer_from_item, + tups, + nproc=nproc, + chunksize=nproc, + save=result_file, + keys=indices) + + for i, r in zip(indices, res): + if i in result: + assert result[i]['opt'] == r['opt'] and result[i]['log'] == r['log'] + else: + result[i] = r + + indices = list(data['index']) + data['opt'] = [result[i]['opt'] for i in data['index']] + data['log'] = [result[i]['log'] for i in data['index']] + + # load split + output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv') + dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')) + return output_path From acc2f23bdb80a0caf45514f9c59e49cef1641b7d Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Tue, 18 Jun 2024 10:10:46 +0800 Subject: [PATCH 4/5] update --- run.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/run.py b/run.py index e572b724..35a1da3b 100644 --- a/run.py +++ b/run.py @@ -85,6 +85,25 @@ def main(): api_nproc=args.nproc, ignore_failed=args.ignore) + # Set the judge kwargs first before evaluation or dumping + judge_kwargs = { + 'nproc': args.nproc, + 'verbose': args.verbose, + } + if args.retry is not None: + judge_kwargs['retry'] = args.retry + if args.judge is not None: + judge_kwargs['model'] = args.judge + else: + if DATASET_TYPE(dataset_name) in ['multi-choice', 'Y/N']: + judge_kwargs['model'] = 'chatgpt-0613' + elif listinstr(['MMVet', 
'MathVista', 'LLaVABench'], dataset_name): + judge_kwargs['model'] = 'gpt-4-turbo' + if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']): + judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] + if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']): + judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] + if rank == 0: if dataset_name in ['MMMU_TEST']: result_json = MMMU_result_transfer(result_file) @@ -106,24 +125,6 @@ def main(): ) continue - judge_kwargs = { - 'nproc': args.nproc, - 'verbose': args.verbose, - } - if args.retry is not None: - judge_kwargs['retry'] = args.retry - if args.judge is not None: - judge_kwargs['model'] = args.judge - else: - if DATASET_TYPE(dataset_name) in ['multi-choice', 'Y/N']: - judge_kwargs['model'] = 'chatgpt-0613' - elif listinstr(['MMVet', 'MathVista', 'LLaVABench'], dataset_name): - judge_kwargs['model'] = 'gpt-4-turbo' - if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']): - judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] - if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']): - judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] - if rank == 0 and args.mode == 'all': if DATASET_TYPE(dataset_name) == 'multi-choice': dataset_name = 'default' if custom_flag else dataset_name From 6004766d510dcce49591e4e9bbea1af2a4f41d0a Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Tue, 18 Jun 2024 11:00:28 +0800 Subject: [PATCH 5/5] update README --- README.md | 20 +++++++++++--------- README_zh-CN.md | 10 ++++++---- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 9b09a664..40fa8366 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,8 @@ English | [็ฎ€ไฝ“ไธญๆ–‡] ## ๐Ÿ†• News +- **[2024-06-18]** We have supported [**MMT-Bench**](https://mmt-bench.github.io), thanks to [**KainingYing**](https://github.com/KainingYing)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ +- **[2024-06-05]** We have supported [**WeMM**](https://github.com/scenarios/WeMM), thanks to [**scenarios**](https://github.com/scenarios)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-27]** We have supported [**Mini InternVL**](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5), thanks to [**czczup**](https://github.com/czczup)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-25]** We have supported [**SEEDBench2_Plus**](https://arxiv.org/abs/2404.16790), thanks to [**Bohao-Lee**](https://github.com/Bohao-Lee)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-24]** We have supported [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) and [**CogVLM2-Llama3-chat**](https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ @@ -33,8 +35,6 @@ English | [็ฎ€ไฝ“ไธญๆ–‡] - **[2024-05-15]** We have supported [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448), a versatile and lightweight vision-language model released by Google ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-14]** We have supported [**GPT-4o**](https://openai.com/index/hello-gpt-4o/) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-07]** We have supported [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py), thanks to [**YJY123**](https://github.com/YJY123) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ -- **[2024-05-06]** We have launched a discord channel for VLMEvalKit users: https://discord.gg/evDT4GZmxN. 
Latest updates and discussion will be posted here -- **[2024-05-06]** We have supported 2 VLMs based on Llama3 ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ: Bunny-llama3-8B (SigLIP, image size 384) and llava-llama-3-8b (CLIP-L, image size 336), you can now evaluate both models on dozens of datasets we supported ## ๐Ÿ“Š Datasets, Models, and Evaluation Results @@ -50,7 +50,7 @@ English | [็ฎ€ไฝ“ไธญๆ–‡] | ------------------------------------------------------------ | ------------------------------------------------------ | --------- | --------- | --------- | --------- | | [**MMBench Series**](https://github.com/open-compass/mmbench/):
MMBench, MMBench-CN, CCBench | MMBench_DEV_[EN/CN]<br>MMBench_TEST_[EN/CN]<br>MMBench_DEV_[EN/CN]_V11<br>MMBench_TEST_[EN/CN]_V11<br>CCBench | Multi-choice <br>
Question (MCQ) | [**MMStar**](https://github.com/MMStar-Benchmark/MMStar) | MMStar | MCQ | | [**MME**](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) | MME | Yes or No (Y/N) | [**SEEDBench Series**](https://github.com/AILab-CVC/SEED-Bench) | SEEDBench_IMG, SEEDBench2_Plus | MCQ | -| [**MM-Vet**](https://github.com/yuweihao/MM-Vet) | MMVet | VQA | [**MMMU**](https://mmmu-benchmark.github.io) | MMMU_DEV_VAL/MMMU_TEST | MCQ | +| [**MM-Vet**](https://github.com/yuweihao/MM-Vet) | MMVet | VQA | [**MMMU**](https://mmmu-benchmark.github.io) | MMMU_[DEV_VAL/TEST] | MCQ | | [**MathVista**](https://mathvista.github.io) | MathVista_MINI | VQA | [**ScienceQA_IMG**](https://scienceqa.github.io) | ScienceQA_[VAL/TEST] | MCQ | | [**COCO Caption**](https://cocodataset.org) | COCO_VAL | Caption | [**HallusionBench**](https://github.com/tianyi-lab/HallusionBench) | HallusionBench | Y/N | | [**OCRVQA**](https://ocr-vqa.github.io)* | OCRVQA_[TESTCORE/TEST] | VQA | [**TextVQA**](https://textvqa.org)* | TextVQA_VAL | VQA | @@ -59,6 +59,7 @@ English | [็ฎ€ไฝ“ไธญๆ–‡] | [**InfoVQA**](https://www.docvqa.org/datasets/infographicvqa)+ | InfoVQA_[VAL/TEST] | VQA | [**OCRBench**](https://github.com/Yuliang-Liu/MultimodalOCR) | OCRBench | VQA | | [**RealWorldQA**](https://x.ai/blog/grok-1.5v) | RealWorldQA | MCQ | [**POPE**](https://github.com/AoiDragon/POPE) | POPE | Y/N | | [**Core-MM**](https://github.com/core-mm/core-mm)- | CORE_MM | VQA | [**SEEDBench2_Plus**](https://arxiv.org/abs/2404.16790) | SEEDBench2_Plus | MCQ | +| [**MMT-Bench**](https://mmt-bench.github.io) | MMT-Bench_[VAL/VAL_MI/ALL/ALL_MI] | MCQ | | | | **\*** We only provide a subset of the evaluation results, since some VLMs do not yield reasonable results under the zero-shot setting @@ -80,11 +81,11 @@ VLMEvalKit will use an **judge LLM** to extract answer from the output if you se | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | | [**mPLUG-Owl2**](https://github.com/X-PLUG/mPLUG-Owl/tree/main/mPLUG-Owl2)๐ŸŽž๏ธ | [**OpenFlamingo-v2**](https://github.com/mlfoundations/open_flamingo)๐ŸŽž๏ธ | [**PandaGPT-13B**](https://github.com/yxuansu/PandaGPT) | [**Qwen-VL**](https://huggingface.co/Qwen/Qwen-VL)๐ŸŽž๏ธ๐Ÿš…, [**Qwen-VL-Chat**](https://huggingface.co/Qwen/Qwen-VL-Chat)๐ŸŽž๏ธ**๐Ÿš…** | | [**VisualGLM-6B**](https://huggingface.co/THUDM/visualglm-6b)๐Ÿš… | [**InternLM-XComposer-7B**](https://huggingface.co/internlm/internlm-xcomposer-7b)๐Ÿš…๐ŸŽž๏ธ | [**ShareGPT4V-[7B/13B]**](https://sharegpt4v.github.io)๐Ÿš… | [**TransCore-M**](https://github.com/PCIResearch/TransCore-M) | -| [**LLaVA (XTuner)**](https://huggingface.co/xtuner/llava-internlm-7b)๐Ÿš… | [**CogVLM-[Chat/Llama3]**](https://huggingface.co/THUDM/cogvlm-chat-hf)๐Ÿš… | [**SharedCaptioner**](https://huggingface.co/spaces/Lin-Chen/Share-Captioner)๐Ÿš… | [**CogVLM-Grounding-Generalist**](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf)๐Ÿš… | +| [**LLaVA (XTuner)**](https://huggingface.co/xtuner/llava-internlm-7b)๐Ÿš… | [**CogVLM-[Chat/Llama3]**](https://huggingface.co/THUDM/cogvlm-chat-hf)๐Ÿš… | [**ShareCaptioner**](https://huggingface.co/spaces/Lin-Chen/Share-Captioner)๐Ÿš… | [**CogVLM-Grounding-Generalist**](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf)๐Ÿš… | | [**Monkey**](https://github.com/Yuliang-Liu/Monkey)๐Ÿš…, 
[**Monkey-Chat**](https://github.com/Yuliang-Liu/Monkey)๐Ÿš… | [**EMU2-Chat**](https://github.com/baaivision/Emu)๐Ÿš…๐ŸŽž๏ธ | [**Yi-VL-[6B/34B]**](https://huggingface.co/01-ai/Yi-VL-6B) | [**MMAlaya**](https://huggingface.co/DataCanvas/MMAlaya)๐Ÿš… | -| [**InternLM-XComposer2-[1.8B/7B]**](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b)๐Ÿš…๐ŸŽž๏ธ | [**MiniCPM-[V1/V2/V2.5]**](https://huggingface.co/openbmb/MiniCPM-V)๐Ÿš… | [**OmniLMM-12B**](https://huggingface.co/openbmb/OmniLMM-12B) | [**InternVL-Chat-[V1-1/V1-2/V1-2-Plus/V1-5]**](https://github.com/OpenGVLab/InternVL)๐Ÿš…, [**Mini-InternVL-Chat-2B-V1-5**](https://github.com/OpenGVLab/InternVL)๐Ÿš… | +| [**InternLM-XComposer2-[1.8B/7B]**](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b)๐Ÿš…๐ŸŽž๏ธ | [**MiniCPM-[V1/V2/V2.5]**](https://huggingface.co/openbmb/MiniCPM-V)๐Ÿš… | [**OmniLMM-12B**](https://huggingface.co/openbmb/OmniLMM-12B) | [**InternVL-Chat-[V1-1/V1-2/V1-2-Plus/V1-5]**](https://github.com/OpenGVLab/InternVL)๐Ÿš…,
[**Mini-InternVL-Chat-2B-V1-5**](https://github.com/OpenGVLab/InternVL)๐Ÿš… | | [**DeepSeek-VL**](https://github.com/deepseek-ai/DeepSeek-VL/tree/main)๐ŸŽž๏ธ | [**LLaVA-NeXT**](https://llava-vl.github.io/blog/2024-01-30-llava-next/)๐Ÿš… | [**Bunny-Llama3**](https://huggingface.co/BAAI/Bunny-Llama-3-8B-V)๐Ÿš… | [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py) | -| [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) ๐Ÿš… | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B) ๐Ÿš… | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)๐Ÿš… | | +| [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) ๐Ÿš… | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B) ๐Ÿš… | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)๐Ÿš… | [**WeMM**](https://github.com/scenarios/WeMM)๐Ÿš… | ๐ŸŽž๏ธ: Support multiple images as inputs. @@ -94,9 +95,10 @@ VLMEvalKit will use an **judge LLM** to extract answer from the output if you se Note that some VLMs may not be able to run under certain transformer versions, we recommend the following settings to evaluate each VLM: -- **Please use** `transformers==4.33.0` **for**: `Qwen series`, `Monkey series`, `InternLM-XComposer Series`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICS series`, `VisualGLM`, `MMAlaya`, `SharedCaptioner`, `MiniGPT-4 series`, `InstructBLIP series`, `PandaGPT`, `VXVERSE`. -- **Please use** `transformers==4.37.0` **for**: `LLaVA series`, `ShareGPT4V series`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLM Series`, `EMU2 Series`, `Yi-VL Series`, `MiniCPM-V (v1, v2)`, `OmniLMM-12B`, `DeepSeek-VL series`, `InternVL series`. -- **Please use** `transformers==4.40.0` **for**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `LLaVA-Next series`, `360VL-70B`, `Phi-3-Vision`. +- **Please use** `transformers==4.33.0` **for**: `Qwen series`, `Monkey series`, `InternLM-XComposer Series`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICS series`, `VisualGLM`, `MMAlaya`, `ShareCaptioner`, `MiniGPT-4 series`, `InstructBLIP series`, `PandaGPT`, `VXVERSE`. +- **Please use** `transformers==4.37.0` **for**: `LLaVA series`, `ShareGPT4V series`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLM Series`, `EMU2 Series`, `Yi-VL Series`, `MiniCPM-[V1/V2]`, `OmniLMM-12B`, `DeepSeek-VL series`, `InternVL series`. +- **Please use** `transformers==4.40.0` **for**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `LLaVA-Next series`, `360VL-70B`, `Phi-3-Vision`, `WeMM`. +- **Please use** `transformers==latest` **for**: `PaliGemma-3B`. 
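For reference, predictions on the MMT-Bench ALL split are scored on the eval.ai server rather than locally: `run.py` passes the prediction file to `MMTBench_result_transfer`, which extracts one option letter per question and writes a `*_submission.tsv`. The snippet below is a minimal sketch of calling that helper directly; the prediction path is a placeholder, and choosing a judge model such as `chatgpt-0613` assumes an OpenAI key is configured.

```python
# Minimal sketch: convert a finished MMT-Bench_ALL prediction file into a submission TSV.
# The .xlsx path is a placeholder for your own inference output.
from vlmeval.utils import MMTBench_result_transfer

submission = MMTBench_result_transfer(
    'outputs/my_model/my_model_MMT-Bench_ALL.xlsx',   # hypothetical prediction file
    model='chatgpt-0613',   # accepted values: 'chatgpt-0613', 'gpt-4-0125', 'exact_matching'
)
print(submission)   # path of the *_submission.tsv to upload to the eval.ai challenge page
```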
```python # Demo diff --git a/README_zh-CN.md b/README_zh-CN.md index 7bb1d15a..cc808b5e 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -23,6 +23,8 @@ ## ๐Ÿ†• ๆ›ดๆ–ฐ +- **[2024-06-18]** ๆ”ฏๆŒไบ† [**MMT-Bench**](https://mmt-bench.github.io)๏ผŒๆ„Ÿ่ฐข [**KainingYing**](https://github.com/KainingYing)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ +- **[2024-06-05]** ๆ”ฏๆŒไบ† [**WeMM**](https://github.com/scenarios/WeMM)๏ผŒๆ„Ÿ่ฐข [**scenarios**](https://github.com/scenarios)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-27]** ๆ”ฏๆŒไบ† [**Mini InternVL**](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5), ๆ„Ÿ่ฐข [**czczup**](https://github.com/czczup)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-25]** ๆ”ฏๆŒไบ† [**SEEDBench2_Plus**](https://arxiv.org/abs/2404.16790)๏ผŒๆ„Ÿ่ฐข [**Bohao-Lee**](https://github.com/Bohao-Lee)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-24]** ๆ”ฏๆŒไบ† [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) ๅ’Œ [**CogVLM2-Llama3-chat**](https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ @@ -31,8 +33,6 @@ - **[2024-05-15]** ๆ”ฏๆŒไบ† [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448), ไธ€ไธช่ฐทๆญŒๅผ€ๆบ็š„ 3B ๅคšๆจกๆ€ๆจกๅž‹ใ€‚ ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-14]** ๆ”ฏๆŒไบ† [**GPT-4o**](https://openai.com/index/hello-gpt-4o/) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-07]** ๆ”ฏๆŒไบ† [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py), ๆ„Ÿ่ฐข [**YJY123**](https://github.com/YJY123) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ -- **[2024-05-06]** ๆˆ็ซ‹ไบ† VLMEvalKit ็”จๆˆท็พค็ป„็š„ Discord ้ข‘้“: https://discord.gg/evDT4GZmxN๏ผŒๅฐ†ๅœจ่ฟ™้‡Œๅˆ†ไบซๅ…ณไบŽ VLMEvalKit ็š„ๆ›ดๆ–ฐๅนถ่ฟ›่กŒ่ฎจ่ฎบ -- **[2024-05-06]** ๆ”ฏๆŒไบ†ไธคไธชๅŸบไบŽ Llama3 ็š„ VLM ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ: Bunny-llama3-8B (SigLIP, ่พ“ๅ…ฅๅ›พๅƒๅคงๅฐ 384) ๅ’Œ llava-llama-3-8b (CLIP-L, ่พ“ๅ…ฅๅ›พๅƒๅคงๅฐ 336), ็”จๆˆทๅฏๅœจๆˆ‘ไปฌๆ”ฏๆŒ็š„ๆ•ฐๅไธชๆต‹่ฏ•ๅŸบๅ‡†ไธŠๆต‹่ฏ•่ฟ™ไธคไธชๆจกๅž‹ ## ๐Ÿ“Š ่ฏ„ๆต‹็ป“ๆžœ๏ผŒๆ”ฏๆŒ็š„ๆ•ฐๆฎ้›†ๅ’Œๆจกๅž‹ ### ่ฏ„ๆต‹็ป“ๆžœ @@ -56,6 +56,7 @@ | [**InfoVQA**](https://www.docvqa.org/datasets/infographicvqa)+ | InfoVQA_[VAL/TEST] | VQA | [**OCRBench**](https://github.com/Yuliang-Liu/MultimodalOCR) | OCRBench | VQA | | [**RealWorldQA**](https://x.ai/blog/grok-1.5v) | RealWorldQA | MCQ | [**POPE**](https://github.com/AoiDragon/POPE) | POPE | Y/N | | [**Core-MM**](https://github.com/core-mm/core-mm)- | CORE_MM | VQA | [**SEEDBench2_Plus**](https://arxiv.org/abs/2404.16790) | SEEDBench2_Plus | MCQ | +| [**MMT-Bench**](https://mmt-bench.github.io) | MMT-Bench_[VAL/VAL_MI/ALL/ALL_MI] | MCQ | | | | **\*** ๆˆ‘ไปฌๅชๆไพ›ไบ†้ƒจๅˆ†ๆจกๅž‹ไธŠ็š„ๆต‹่ฏ•็ป“ๆžœ๏ผŒๅ‰ฉไฝ™ๆจกๅž‹ๆ— ๆณ•ๅœจ zero-shot ่ฎพๅฎšไธ‹ๆต‹่ฏ•ๅ‡บๅˆ็†็š„็ฒพๅบฆ @@ -82,7 +83,7 @@ | [**Monkey**](https://github.com/Yuliang-Liu/Monkey)๐Ÿš…, [**Monkey-Chat**](https://github.com/Yuliang-Liu/Monkey)๐Ÿš… | [**EMU2-Chat**](https://github.com/baaivision/Emu)๐Ÿš…๐ŸŽž๏ธ | [**Yi-VL-[6B/34B]**](https://huggingface.co/01-ai/Yi-VL-6B) | [**MMAlaya**](https://huggingface.co/DataCanvas/MMAlaya)๐Ÿš… | | [**InternLM-XComposer2-[1.8B/7B]**](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b)๐Ÿš…๐ŸŽž๏ธ | [**MiniCPM-[V1/V2/V2.5]**](https://huggingface.co/openbmb/MiniCPM-V)๐Ÿš… | [**OmniLMM-12B**](https://huggingface.co/openbmb/OmniLMM-12B) | [**InternVL-Chat-[V1-1/V1-2/V1-2-Plus/V1-5]**](https://github.com/OpenGVLab/InternVL)๐Ÿš…, [**Mini-InternVL-Chat-2B-V1-5**](https://github.com/OpenGVLab/InternVL)๐Ÿš… | | [**DeepSeek-VL**](https://github.com/deepseek-ai/DeepSeek-VL/tree/main)๐ŸŽž๏ธ | [**LLaVA-NeXT**](https://llava-vl.github.io/blog/2024-01-30-llava-next/)๐Ÿš… | 
[**Bunny-Llama3**](https://huggingface.co/BAAI/Bunny-Llama-3-8B-V)๐Ÿš… | [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py) | -| [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) ๐Ÿš… | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B)๐Ÿš… | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) ๐Ÿš… | | +| [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) ๐Ÿš… | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B)๐Ÿš… | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) ๐Ÿš… | [**WeMM**](https://github.com/scenarios/WeMM)๐Ÿš… | ๐ŸŽž๏ธ ่กจ็คบๆ”ฏๆŒๅคšๅ›พ็‰‡่พ“ๅ…ฅใ€‚ @@ -96,7 +97,8 @@ - **่ฏท็”จ** `transformers==4.33.0` **ๆฅ่ฟ่กŒ**: `Qwen series`, `Monkey series`, `InternLM-XComposer Series`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICS series`, `VisualGLM`, `MMAlaya`, `SharedCaptioner`, `MiniGPT-4 series`, `InstructBLIP series`, `PandaGPT`, `VXVERSE`. - **่ฏท็”จ** `transformers==4.37.0 ` **ๆฅ่ฟ่กŒ**: `LLaVA series`, `ShareGPT4V series`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLM Series`, `EMU2 Series`, `Yi-VL Series`, `MiniCPM-V (v1, v2)`, `OmniLMM-12B`, `DeepSeek-VL series`, `InternVL series`. -- **่ฏท็”จ** `transformers==4.40.0 ` **ๆฅ่ฟ่กŒ**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `LLaVA-Next series`, `360VL-70B`๏ผŒ `Phi-3-Vision`. +- **่ฏท็”จ** `transformers==4.40.0 ` **ๆฅ่ฟ่กŒ**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `LLaVA-Next series`, `360VL-70B`๏ผŒ `Phi-3-Vision`๏ผŒ`WeMM`. +- **่ฏท็”จ** `transformers==latest` **ๆฅ่ฟ่กŒ**: `PaliGemma-3B`. **ๅฆ‚ไฝ•ๆต‹่ฏ•ไธ€ไธช VLM ๆ˜ฏๅฆๅฏไปฅๆญฃๅธธ่ฟ่กŒ:**