From 67b1a96039ef22166c3d038a497d8c3b0164b0fc Mon Sep 17 00:00:00 2001 From: KainingYing Date: Mon, 17 Jun 2024 20:24:55 +0800 Subject: [PATCH 1/5] update mmt-bench --- run.py | 6 + vlmeval/evaluate/__init__.py | 1 + vlmeval/evaluate/mmtbench_eval.py | 364 ++++++++++++++++++++++++++++++ vlmeval/utils/dataset_config.py | 17 ++ 4 files changed, 388 insertions(+) create mode 100644 vlmeval/evaluate/mmtbench_eval.py diff --git a/run.py b/run.py index ba3153a4..fe1eb268 100644 --- a/run.py +++ b/run.py @@ -119,6 +119,12 @@ def main(): judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']): judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] + + if rank == 0: + if 'MMT-Bench_ALL' in dataset_name: + submission_file = MMTBench_result_transfer(result_file, dataset=dataset_name, **judge_kwargs) + logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation (https://eval.ai/web/challenges/challenge-page/2328/overview), submission file saved in {submission_file}') # noqa: E501 + continue if rank == 0 and args.mode == 'all': if DATASET_TYPE(dataset_name) == 'multi-choice': diff --git a/vlmeval/evaluate/__init__.py b/vlmeval/evaluate/__init__.py index 10248c4b..fad24540 100644 --- a/vlmeval/evaluate/__init__.py +++ b/vlmeval/evaluate/__init__.py @@ -7,3 +7,4 @@ from .llavabench import LLaVABench_eval from .misc import build_judge from .OCRBench import OCRBench_eval +from .mmtbench_eval import * diff --git a/vlmeval/evaluate/mmtbench_eval.py b/vlmeval/evaluate/mmtbench_eval.py new file mode 100644 index 00000000..4deb70dd --- /dev/null +++ b/vlmeval/evaluate/mmtbench_eval.py @@ -0,0 +1,364 @@ +import os.path as osp + +import pandas as pd +from tqdm import tqdm +import numpy as np + +from vlmeval.evaluate.misc import build_judge +from vlmeval.utils import can_infer, track_progress_rich, TSVDataset +from vlmeval.smp import * + + +abbrs = { + "visual_recognition": "VR", + "localization": "Loc", + "ocr": "OCR", + "counting": "Count", + "hallucination": "HLN", + "image_retrieval": "IR", + "threed": "3D", + "visual_captioning": "VC", + "visual_grounding": "VG", + "doc_understanding": "DU", + "action_recognition": "AR", + "pixel_level_perception": "PLP", + "image-to-image_translation": "I2IT", + "relation_reasoning": "RR", + "intelligence_quotient_test": "IQT", + "emotion": "Emo", + "visual_illusion": "VI", + "meme_understanding": "MemU", + "visual_prompt_understanding": "VPU", + "anomaly_detection": "AND", + "keypoint_detection": "KD", + "visual_commonsense_reasoning": "VCR", + "image_evaluation_judgement": "IEJ", + "multiple_image_analysis": "MIA", + "cross_image_matching": "CIM", + "temporal_understanding": "TU", + "visual_code": "VP", + "medical_understanding": "MedU", + "autonomous_driving": "AUD", + "discipline_knowledge_reasoning": "DKR", + "embodied_ai": "EA", + "gui_navigation": "GN" +} + + +def report_acc(df): + # assert group in [None, 'category', 'l2-category'] + res = defaultdict(list) + res['split'] = list() + res['Overall'] = list() + for _, name in abbrs.items(): + res[name] = list() + + if 'split' in df: + splits = list(set(df['split'])) + res['split'] = splits + + else: + df['split'] = ['none'] * len(df) + res['split'] = ['none'] + + for group in [None, 'category', 'l2-category']: + if group is None: + res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']] + res['Overall'].extend([np.mean(df['hit'])]) + elif group not in df: + continue + 
elif group == "category": + abilities = list(set(df[group])) + abilities.sort() + for ab in abilities: + ab_name = ab + sub_df = df[df[group] == ab] + res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']] + res[ab_name].extend([np.mean(sub_df['hit'])]) + else: + abilities = list(set(df[group])) + abilities.sort() + for ab in abilities: + sub_task_name_list = df[df['l2-category'] == ab]['category'].unique() + sub_task_acc = [] + for sub_task_name in sub_task_name_list: + sub_df = df[df['category'] == sub_task_name] + sub_task_acc.append([np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]) + + new_acc = [] + for i in range(len(sub_task_acc[0])): + new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) + ab_name = abbrs[ab] if ab in abbrs else ab + res[ab_name] = new_acc + + sub_task_acc = [] + for sub_task_name in sub_task_name_list: + sub_df = df[df['category'] == sub_task_name] + sub_task_acc.append([np.mean(sub_df['hit'])]) + new_acc = [] + for i in range(len(sub_task_acc[0])): + new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) + + res[ab_name].extend(new_acc) + + res['split'].append('ALL') + return pd.DataFrame(res) + + +def build_choices(item): + ret = {} + for ch in string.ascii_uppercase: + if ch in item and (not pd.isna(item[ch])): + ret[ch] = item[ch] + return ret + + +def prefetch_answer(item): + choices = build_choices(item) + return can_infer(item['prediction'], choices) + + +def prefetch_sub_data(sub_data, answer_map, verbose=False): + lt = len(sub_data) + GT, PRED = [], [] + for i in range(lt): + item = sub_data.iloc[i] + idx = item['index'] + GT.append(answer_map[idx]) + PRED.append(prefetch_answer(item)) + if PRED[-1] and (PRED[-1] in string.ascii_uppercase): + + return dict(opt=PRED[-1]) + flag = True + for p in PRED: + if not p: + flag = False + ret = (dict(opt=PRED[-1]), ) if flag else (None, ) + ret = ret + (GT, PRED) if verbose else ret + return ret if len(ret) > 1 else ret[0] + + +def eval_sub_data(model, sub_data, answer_map): + res, GT, PRED = prefetch_sub_data(sub_data, answer_map, verbose=True) + if res is not None: + return res + + lt = len(sub_data) + log = '' + for i in range(lt): + if PRED[i]: + log += f'Rolling {i} Matched.\n' + else: + res = extract_answer_from_item(model, sub_data.iloc[i]) + opt, match_log = res['opt'], res['log'] + PRED[i] = opt + return dict(opt = opt) + # if PRED[i] != GT[i]: + # # log += ( + # # f"Failed in Rolling {i}: Answer is {GT[i]}; Prediction is {sub_data.iloc[i]['prediction']}; " + # # f'Pre-fetched is {PRED[i]}; Match Log is {match_log}.\n' + # # ) + # return dict(opt = opt) + # else: + # # log += ( + # # f"Rolling {i}: Answer is {GT[i]}, Prediction is {sub_data.iloc[i]['prediction']}, " + # # f'Pre-fetched is {PRED[i]}.\n' + # # ) + pass + + return dict(opt = opt) + + +def extract_answer_from_item(model, item): + logger = get_logger('Evaluation') + # It will return: (pred, raw, llm_time) + choices = build_choices(item) + option_str = build_option_str(choices) + + if cn_string(item['question']): + prompt = build_prompt_cn(item['question'], option_str, item['prediction']) + else: + prompt = build_prompt(item['question'], option_str, item['prediction']) + retry = 3 + + ret = can_infer(item['prediction'], choices) + if ret: + return dict(opt=ret, log=item['prediction']) + + while retry: + ans = model.generate(prompt) + if 'Failed to obtain answer via API' in ans: + logger.warning('GPT API failed to answer. 
') + else: + ret = can_infer(ans, choices) + if ret: + return dict(opt=ret, log=ans) + else: + logger.warning(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}') + retry -= 1 + + if retry == 0: + options = list(choices) + ['Z'] if 'Z' not in choices else [] + return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. ') + + +def build_prompt(question, options, prediction): + tmpl = ( + 'You are an AI assistant who will help me to match ' + 'an answer with several options of a single-choice question. ' + 'You are provided with a question, several options, and an answer, ' + 'and you need to find which option is most similar to the answer. ' + 'If the meaning of all options are significantly different from the answer, output Z. ' + 'Your should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n' + 'Example 1: \n' + 'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n' + 'Answer: a cute teddy bear\nYour output: A\n' + 'Example 2: \n' + 'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n' + 'Answer: Spider\nYour output: Z\n' + 'Example 3: \n' + 'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: ' + ) + return tmpl.format(question, options, prediction) + + +def eval_data_groups(model, data_groups, answer_map, result, result_file, nproc=16): + prefetched = [prefetch_sub_data(g, answer_map, verbose=False) for g in data_groups] + remain = [] + for dg, pf in zip(data_groups, prefetched): + if pf: + result[dg.iloc[0]['index'] % 1e6] = pf + else: + remain.append(dg) + dump(result, result_file) + tups = [(model, x, answer_map) for x in remain] + keys = [x.iloc[0]['index'] % 1e6 for x in remain] + if len(tups) == 0: + return + + assert model + + res = track_progress_rich( + eval_sub_data, + tups, + nproc=nproc, + chunksize=nproc, + save=result_file, + keys=keys) + result = load(result_file) + for k, v in zip(keys, res): + if k in result: + # assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log'] + pass + else: + result[k] = v + dump(result, result_file) + + + +def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs): + logger = get_logger('Evaluation') + INTERNAL = os.environ.get('INTERNAL', 0) + + nproc = judge_kwargs.pop('nproc', 4) + + rd.seed(2680) + suffix = eval_file.split('.')[-1] + model = judge_kwargs['model'] + assert model in ['chatgpt-0613', 'exact_matching', 'gpt-4-0125'] + name_str_map = { + 'chatgpt-0613': 'openai', + 'gpt-4-0125': 'gpt4' + } + name_str = name_str_map[model] if model in name_str_map else model + + if model == 'exact_matching': + model = None + else: + if INTERNAL or gpt_key_set(): + model = build_judge(**judge_kwargs) + else: + logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + + logger.info(f'Evaluating {eval_file}') + result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl') + result = {} + if osp.exists(result_file): + result = load(result_file) + + data = load(eval_file) + data = data.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + for k in data.keys(): + data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) + + if dataset != 'default': + meta = TSVDataset(dataset).data + else: + logger.warning('Dataset is not provided, try to use the original `eval_file` as meta data. 
') + meta = load(eval_file) + assert 'index' in meta and 'answer' in meta, 'Essentail columns missing in the eval_file.' + + answer_map = {i: 'A' for i, c in zip(meta['index'], meta['index'])} # 123 + cate_map = {i: c for i, c in zip(meta['index'], meta['category'])} if 'category' in meta else None + l2_cate_map = {i: c for i, c in zip(meta['index'], meta['l2-category'])} if 'l2-category' in meta else None + split_map = {i: c for i, c in zip(meta['index'], meta['split'])} if 'split' in meta else None + + if cate_map is not None and np.all([pd.isna(x) for x in cate_map.values()]): + cate_map = None + if l2_cate_map is not None and np.all([pd.isna(x) for x in l2_cate_map.values()]): + l2_cate_map = None + if split_map is not None and np.all([pd.isna(x) for x in split_map.values()]): + split_map = None + + data = data[data['index'].isin(cate_map)] + data_main = data[data['index'] < int(1e6)] + meta_idx_set = set(meta['index']) + data_main = data_main[data_main['index'].isin(meta_idx_set)] + + lt = len(data_main) + + data_groups = [] + for i in tqdm(range(lt)): + # Dealing with the normal part + item_main = data_main.iloc[i] + idx = item_main['index'] + + if idx in result: + continue + + sub_data = data[data['index'] % int(1e6) == idx] + data_groups.append(sub_data) + + if len(data_groups): + eval_data_groups( + model=model, + data_groups=data_groups, + answer_map=answer_map, + nproc=nproc, + result=result, + result_file=result_file) + + tmp_pth = f'/tmp/{timestr()}.xlsx' + dump(data_main, tmp_pth) + data_main = load(tmp_pth) + + res = load(result_file) + indices = data_main['index'] + + data_main['opt'] = [res[i]['opt'] for i in indices] + # data_main['log'] = [res[i]['log'] for i in indices] + + main_idx = data_main['index'] + if cate_map is not None: + data_main['category'] = [cate_map[i] for i in main_idx] + if l2_cate_map is not None: + data_main['l2-category'] = [l2_cate_map[i] for i in main_idx] + if split_map is not None: + data_main['split'] = [split_map[i] for i in indices] + + # load split + output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv') + dump(data_main, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')) + return output_path diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py index 8b13eab5..9f2e7a6f 100644 --- a/vlmeval/utils/dataset_config.py +++ b/vlmeval/utils/dataset_config.py @@ -44,6 +44,11 @@ 'MMStar': 'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv', 'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv', 'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv', + # MMT-Bench + 'MMT-Bench_ALL_MI': 'xxxxxxxxxxxxxxxxxx', # FULL Split, evaluated on server. Multi-image samples are inputted as separate images. + 'MMT-Bench_ALL': 'xxxxxxxxxxxxxxxxxxxxx', # FULL Split, evaluated on server. ulti-image samples are merged into a single large image for processing. + 'MMT-Bench_VAL_MI': 'xxxxxxxxxxxxxxxxxx', # VAL Split (10%), evaluated on local. Multi-image samples are inputted as separate images. + 'MMT-Bench_VAL': 'xxxxxxxxxxxxxxxxxxxxxx' # VAL Split (10%), evaluated on local. ulti-image samples are merged into a single large image for processing. 
} dataset_md5_dict = { @@ -89,6 +94,11 @@ 'MMStar': 'e1ecd2140806c1b1bbf54b43372efb9e', 'RealWorldQA': '92321028d2bc29040284b6674721e48f', 'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5', + # MMT-Bench + 'MMT-Bench_ALL_MI': '5272157097e19cdd7cb41e412ab3b7c7', + 'MMT-Bench_ALL': 'b273a2f4c596fe4f2605de0494cd632f', + 'MMT-Bench_VAL_MI': 'c7d7b998eb5cd9aa36c7d4f721472462', + 'MMT-Bench_VAL': '8dd4b730f53dbf9c3aed90ca31c928e0' } img_root_map = {k: k for k in dataset_URLs} @@ -116,6 +126,11 @@ 'MathVista_MINI': 'MathVista', 'HallusionBench': 'Hallusion', 'DocVQA_VAL': 'DocVQA', + # MMT-Bench + 'MMT-Bench_ALL_MI': 'MMT-Bench', + 'MMT-Bench_ALL': 'MMT-Bench', + 'MMT-Bench_VAL_MI': 'MMT-Bench', + 'MMT-Bench_VAL': 'MMT-Bench' }) assert set(dataset_URLs) == set(img_root_map) @@ -126,6 +141,8 @@ def DATASET_TYPE(dataset): dataset = dataset.lower() if listinstr(['mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d', 'mmstar', 'realworldqa'], dataset): return 'multi-choice' + elif 'mmt-bench' in dataset: + return 'multi-choice' elif listinstr(['mme', 'hallusion', 'pope'], dataset): return 'Y/N' elif 'coco' in dataset: From 7b03d4a8a1c582d1b846e189a96fbeac6d8bca43 Mon Sep 17 00:00:00 2001 From: KainingYing Date: Mon, 17 Jun 2024 21:05:04 +0800 Subject: [PATCH 2/5] remove the comments --- vlmeval/utils/dataset_config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py index 9f2e7a6f..b2df6796 100644 --- a/vlmeval/utils/dataset_config.py +++ b/vlmeval/utils/dataset_config.py @@ -45,10 +45,10 @@ 'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv', 'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv', # MMT-Bench - 'MMT-Bench_ALL_MI': 'xxxxxxxxxxxxxxxxxx', # FULL Split, evaluated on server. Multi-image samples are inputted as separate images. - 'MMT-Bench_ALL': 'xxxxxxxxxxxxxxxxxxxxx', # FULL Split, evaluated on server. ulti-image samples are merged into a single large image for processing. - 'MMT-Bench_VAL_MI': 'xxxxxxxxxxxxxxxxxx', # VAL Split (10%), evaluated on local. Multi-image samples are inputted as separate images. - 'MMT-Bench_VAL': 'xxxxxxxxxxxxxxxxxxxxxx' # VAL Split (10%), evaluated on local. ulti-image samples are merged into a single large image for processing. 
+ 'MMT-Bench_ALL_MI': 'xxxxxxxxxxxxxxxxxx', + 'MMT-Bench_ALL': 'xxxxxxxxxxxxxxxxxxxxx', + 'MMT-Bench_VAL_MI': 'xxxxxxxxxxxxxxxxxx', + 'MMT-Bench_VAL': 'xxxxxxxxxxxxxxxxxxxxxx' } dataset_md5_dict = { From 41a9b7038d6cb2b45a1a5dedc9157b79595252ea Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Mon, 17 Jun 2024 22:25:01 +0800 Subject: [PATCH 3/5] update MMTBench --- run.py | 12 +- vlmeval/evaluate/__init__.py | 1 - vlmeval/evaluate/mmtbench_eval.py | 364 ---------------------------- vlmeval/evaluate/multiple_choice.py | 115 ++++++++- vlmeval/utils/__init__.py | 5 +- vlmeval/utils/dataset.py | 24 -- vlmeval/utils/dataset_config.py | 15 +- vlmeval/utils/result_transfer.py | 97 ++++++++ 8 files changed, 225 insertions(+), 408 deletions(-) delete mode 100644 vlmeval/evaluate/mmtbench_eval.py create mode 100644 vlmeval/utils/result_transfer.py diff --git a/run.py b/run.py index fe1eb268..e572b724 100644 --- a/run.py +++ b/run.py @@ -4,7 +4,7 @@ from vlmeval.evaluate import * from vlmeval.inference import infer_data_job from vlmeval.config import supported_VLM -from vlmeval.utils import dataset_URLs, DATASET_TYPE, abbr2full, MMMU_result_transfer +from vlmeval.utils import dataset_URLs, DATASET_TYPE, abbr2full, MMMU_result_transfer, MMTBench_result_transfer def parse_args(): @@ -90,6 +90,10 @@ def main(): result_json = MMMU_result_transfer(result_file) logger.info(f'Transfer MMMU_TEST result to json for official evaluation, json file saved in {result_json}') # noqa: E501 continue + elif 'MMT-Bench_ALL' in dataset_name: + submission_file = MMTBench_result_transfer(result_file, **judge_kwargs) + logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation (https://eval.ai/web/challenges/challenge-page/2328/overview), submission file saved in {submission_file}') # noqa: E501 + continue if dataset_name in [ 'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN' @@ -119,12 +123,6 @@ def main(): judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']): judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] - - if rank == 0: - if 'MMT-Bench_ALL' in dataset_name: - submission_file = MMTBench_result_transfer(result_file, dataset=dataset_name, **judge_kwargs) - logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation (https://eval.ai/web/challenges/challenge-page/2328/overview), submission file saved in {submission_file}') # noqa: E501 - continue if rank == 0 and args.mode == 'all': if DATASET_TYPE(dataset_name) == 'multi-choice': diff --git a/vlmeval/evaluate/__init__.py b/vlmeval/evaluate/__init__.py index fad24540..10248c4b 100644 --- a/vlmeval/evaluate/__init__.py +++ b/vlmeval/evaluate/__init__.py @@ -7,4 +7,3 @@ from .llavabench import LLaVABench_eval from .misc import build_judge from .OCRBench import OCRBench_eval -from .mmtbench_eval import * diff --git a/vlmeval/evaluate/mmtbench_eval.py b/vlmeval/evaluate/mmtbench_eval.py deleted file mode 100644 index 4deb70dd..00000000 --- a/vlmeval/evaluate/mmtbench_eval.py +++ /dev/null @@ -1,364 +0,0 @@ -import os.path as osp - -import pandas as pd -from tqdm import tqdm -import numpy as np - -from vlmeval.evaluate.misc import build_judge -from vlmeval.utils import can_infer, track_progress_rich, TSVDataset -from vlmeval.smp import * - - -abbrs = { - "visual_recognition": "VR", - "localization": "Loc", - "ocr": "OCR", - "counting": "Count", - "hallucination": "HLN", - "image_retrieval": 
"IR", - "threed": "3D", - "visual_captioning": "VC", - "visual_grounding": "VG", - "doc_understanding": "DU", - "action_recognition": "AR", - "pixel_level_perception": "PLP", - "image-to-image_translation": "I2IT", - "relation_reasoning": "RR", - "intelligence_quotient_test": "IQT", - "emotion": "Emo", - "visual_illusion": "VI", - "meme_understanding": "MemU", - "visual_prompt_understanding": "VPU", - "anomaly_detection": "AND", - "keypoint_detection": "KD", - "visual_commonsense_reasoning": "VCR", - "image_evaluation_judgement": "IEJ", - "multiple_image_analysis": "MIA", - "cross_image_matching": "CIM", - "temporal_understanding": "TU", - "visual_code": "VP", - "medical_understanding": "MedU", - "autonomous_driving": "AUD", - "discipline_knowledge_reasoning": "DKR", - "embodied_ai": "EA", - "gui_navigation": "GN" -} - - -def report_acc(df): - # assert group in [None, 'category', 'l2-category'] - res = defaultdict(list) - res['split'] = list() - res['Overall'] = list() - for _, name in abbrs.items(): - res[name] = list() - - if 'split' in df: - splits = list(set(df['split'])) - res['split'] = splits - - else: - df['split'] = ['none'] * len(df) - res['split'] = ['none'] - - for group in [None, 'category', 'l2-category']: - if group is None: - res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']] - res['Overall'].extend([np.mean(df['hit'])]) - elif group not in df: - continue - elif group == "category": - abilities = list(set(df[group])) - abilities.sort() - for ab in abilities: - ab_name = ab - sub_df = df[df[group] == ab] - res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']] - res[ab_name].extend([np.mean(sub_df['hit'])]) - else: - abilities = list(set(df[group])) - abilities.sort() - for ab in abilities: - sub_task_name_list = df[df['l2-category'] == ab]['category'].unique() - sub_task_acc = [] - for sub_task_name in sub_task_name_list: - sub_df = df[df['category'] == sub_task_name] - sub_task_acc.append([np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]) - - new_acc = [] - for i in range(len(sub_task_acc[0])): - new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) - ab_name = abbrs[ab] if ab in abbrs else ab - res[ab_name] = new_acc - - sub_task_acc = [] - for sub_task_name in sub_task_name_list: - sub_df = df[df['category'] == sub_task_name] - sub_task_acc.append([np.mean(sub_df['hit'])]) - new_acc = [] - for i in range(len(sub_task_acc[0])): - new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) - - res[ab_name].extend(new_acc) - - res['split'].append('ALL') - return pd.DataFrame(res) - - -def build_choices(item): - ret = {} - for ch in string.ascii_uppercase: - if ch in item and (not pd.isna(item[ch])): - ret[ch] = item[ch] - return ret - - -def prefetch_answer(item): - choices = build_choices(item) - return can_infer(item['prediction'], choices) - - -def prefetch_sub_data(sub_data, answer_map, verbose=False): - lt = len(sub_data) - GT, PRED = [], [] - for i in range(lt): - item = sub_data.iloc[i] - idx = item['index'] - GT.append(answer_map[idx]) - PRED.append(prefetch_answer(item)) - if PRED[-1] and (PRED[-1] in string.ascii_uppercase): - - return dict(opt=PRED[-1]) - flag = True - for p in PRED: - if not p: - flag = False - ret = (dict(opt=PRED[-1]), ) if flag else (None, ) - ret = ret + (GT, PRED) if verbose else ret - return ret if len(ret) > 1 else ret[0] - - -def eval_sub_data(model, sub_data, answer_map): - res, GT, PRED = 
prefetch_sub_data(sub_data, answer_map, verbose=True) - if res is not None: - return res - - lt = len(sub_data) - log = '' - for i in range(lt): - if PRED[i]: - log += f'Rolling {i} Matched.\n' - else: - res = extract_answer_from_item(model, sub_data.iloc[i]) - opt, match_log = res['opt'], res['log'] - PRED[i] = opt - return dict(opt = opt) - # if PRED[i] != GT[i]: - # # log += ( - # # f"Failed in Rolling {i}: Answer is {GT[i]}; Prediction is {sub_data.iloc[i]['prediction']}; " - # # f'Pre-fetched is {PRED[i]}; Match Log is {match_log}.\n' - # # ) - # return dict(opt = opt) - # else: - # # log += ( - # # f"Rolling {i}: Answer is {GT[i]}, Prediction is {sub_data.iloc[i]['prediction']}, " - # # f'Pre-fetched is {PRED[i]}.\n' - # # ) - pass - - return dict(opt = opt) - - -def extract_answer_from_item(model, item): - logger = get_logger('Evaluation') - # It will return: (pred, raw, llm_time) - choices = build_choices(item) - option_str = build_option_str(choices) - - if cn_string(item['question']): - prompt = build_prompt_cn(item['question'], option_str, item['prediction']) - else: - prompt = build_prompt(item['question'], option_str, item['prediction']) - retry = 3 - - ret = can_infer(item['prediction'], choices) - if ret: - return dict(opt=ret, log=item['prediction']) - - while retry: - ans = model.generate(prompt) - if 'Failed to obtain answer via API' in ans: - logger.warning('GPT API failed to answer. ') - else: - ret = can_infer(ans, choices) - if ret: - return dict(opt=ret, log=ans) - else: - logger.warning(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}') - retry -= 1 - - if retry == 0: - options = list(choices) + ['Z'] if 'Z' not in choices else [] - return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. ') - - -def build_prompt(question, options, prediction): - tmpl = ( - 'You are an AI assistant who will help me to match ' - 'an answer with several options of a single-choice question. ' - 'You are provided with a question, several options, and an answer, ' - 'and you need to find which option is most similar to the answer. ' - 'If the meaning of all options are significantly different from the answer, output Z. ' - 'Your should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n' - 'Example 1: \n' - 'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n' - 'Answer: a cute teddy bear\nYour output: A\n' - 'Example 2: \n' - 'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. 
dog\n' - 'Answer: Spider\nYour output: Z\n' - 'Example 3: \n' - 'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: ' - ) - return tmpl.format(question, options, prediction) - - -def eval_data_groups(model, data_groups, answer_map, result, result_file, nproc=16): - prefetched = [prefetch_sub_data(g, answer_map, verbose=False) for g in data_groups] - remain = [] - for dg, pf in zip(data_groups, prefetched): - if pf: - result[dg.iloc[0]['index'] % 1e6] = pf - else: - remain.append(dg) - dump(result, result_file) - tups = [(model, x, answer_map) for x in remain] - keys = [x.iloc[0]['index'] % 1e6 for x in remain] - if len(tups) == 0: - return - - assert model - - res = track_progress_rich( - eval_sub_data, - tups, - nproc=nproc, - chunksize=nproc, - save=result_file, - keys=keys) - result = load(result_file) - for k, v in zip(keys, res): - if k in result: - # assert result[k]['hit'] == v['hit'] and result[k]['log'] == v['log'] - pass - else: - result[k] = v - dump(result, result_file) - - - -def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs): - logger = get_logger('Evaluation') - INTERNAL = os.environ.get('INTERNAL', 0) - - nproc = judge_kwargs.pop('nproc', 4) - - rd.seed(2680) - suffix = eval_file.split('.')[-1] - model = judge_kwargs['model'] - assert model in ['chatgpt-0613', 'exact_matching', 'gpt-4-0125'] - name_str_map = { - 'chatgpt-0613': 'openai', - 'gpt-4-0125': 'gpt4' - } - name_str = name_str_map[model] if model in name_str_map else model - - if model == 'exact_matching': - model = None - else: - if INTERNAL or gpt_key_set(): - model = build_judge(**judge_kwargs) - else: - logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') - model = None - - logger.info(f'Evaluating {eval_file}') - result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl') - result = {} - if osp.exists(result_file): - result = load(result_file) - - data = load(eval_file) - data = data.sort_values(by='index') - data['prediction'] = [str(x) for x in data['prediction']] - for k in data.keys(): - data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) - - if dataset != 'default': - meta = TSVDataset(dataset).data - else: - logger.warning('Dataset is not provided, try to use the original `eval_file` as meta data. ') - meta = load(eval_file) - assert 'index' in meta and 'answer' in meta, 'Essentail columns missing in the eval_file.' 
- - answer_map = {i: 'A' for i, c in zip(meta['index'], meta['index'])} # 123 - cate_map = {i: c for i, c in zip(meta['index'], meta['category'])} if 'category' in meta else None - l2_cate_map = {i: c for i, c in zip(meta['index'], meta['l2-category'])} if 'l2-category' in meta else None - split_map = {i: c for i, c in zip(meta['index'], meta['split'])} if 'split' in meta else None - - if cate_map is not None and np.all([pd.isna(x) for x in cate_map.values()]): - cate_map = None - if l2_cate_map is not None and np.all([pd.isna(x) for x in l2_cate_map.values()]): - l2_cate_map = None - if split_map is not None and np.all([pd.isna(x) for x in split_map.values()]): - split_map = None - - data = data[data['index'].isin(cate_map)] - data_main = data[data['index'] < int(1e6)] - meta_idx_set = set(meta['index']) - data_main = data_main[data_main['index'].isin(meta_idx_set)] - - lt = len(data_main) - - data_groups = [] - for i in tqdm(range(lt)): - # Dealing with the normal part - item_main = data_main.iloc[i] - idx = item_main['index'] - - if idx in result: - continue - - sub_data = data[data['index'] % int(1e6) == idx] - data_groups.append(sub_data) - - if len(data_groups): - eval_data_groups( - model=model, - data_groups=data_groups, - answer_map=answer_map, - nproc=nproc, - result=result, - result_file=result_file) - - tmp_pth = f'/tmp/{timestr()}.xlsx' - dump(data_main, tmp_pth) - data_main = load(tmp_pth) - - res = load(result_file) - indices = data_main['index'] - - data_main['opt'] = [res[i]['opt'] for i in indices] - # data_main['log'] = [res[i]['log'] for i in indices] - - main_idx = data_main['index'] - if cate_map is not None: - data_main['category'] = [cate_map[i] for i in main_idx] - if l2_cate_map is not None: - data_main['l2-category'] = [l2_cate_map[i] for i in main_idx] - if split_map is not None: - data_main['split'] = [split_map[i] for i in indices] - - # load split - output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv') - dump(data_main, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')) - return output_path diff --git a/vlmeval/evaluate/multiple_choice.py b/vlmeval/evaluate/multiple_choice.py index 50d52bf2..23104d5f 100644 --- a/vlmeval/evaluate/multiple_choice.py +++ b/vlmeval/evaluate/multiple_choice.py @@ -8,7 +8,7 @@ INTERNAL = os.environ.get('INTERNAL', 0) -abbrs = { +MMB_abbrs = { 'coarse_perception': 'CP', 'finegrained_perception (instance-level)': 'FP-S', 'finegrained_perception (cross-instance)': 'FP-C', @@ -17,6 +17,41 @@ 'attribute_reasoning': 'AR' } +MMT_abbrs = { + 'visual_recognition': 'VR', + 'localization': 'Loc', + 'ocr': 'OCR', + 'counting': 'Count', + 'hallucination': 'HLN', + 'image_retrieval': 'IR', + 'threed': '3D', + 'visual_captioning': 'VC', + 'visual_grounding': 'VG', + 'doc_understanding': 'DU', + 'action_recognition': 'AR', + 'pixel_level_perception': 'PLP', + 'image-to-image_translation': 'I2IT', + 'relation_reasoning': 'RR', + 'intelligence_quotient_test': 'IQT', + 'emotion': 'Emo', + 'visual_illusion': 'VI', + 'meme_understanding': 'MemU', + 'visual_prompt_understanding': 'VPU', + 'anomaly_detection': 'AND', + 'keypoint_detection': 'KD', + 'visual_commonsense_reasoning': 'VCR', + 'image_evaluation_judgement': 'IEJ', + 'multiple_image_analysis': 'MIA', + 'cross_image_matching': 'CIM', + 'temporal_understanding': 'TU', + 'visual_code': 'VP', + 'medical_understanding': 'MedU', + 'autonomous_driving': 'AUD', + 'discipline_knowledge_reasoning': 'DKR', + 'embodied_ai': 'EA', + 'gui_navigation': 'GN' +} + def 
MMMU_preproc(data): logger = get_logger('Evaluation') @@ -54,9 +89,69 @@ def report_acc(df): abilities = list(set(df[group])) abilities.sort() for ab in abilities: - ab_name = abbrs[ab] if ab in abbrs else ab + ab_name = MMB_abbrs[ab] if ab in MMB_abbrs else ab + sub_df = df[df[group] == ab] + res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']] + return pd.DataFrame(res) + + +def report_acc_MMT(df): + # assert group in [None, 'category', 'l2-category'] + res = defaultdict(list) + res['split'] = list() + res['Overall'] = list() + for _, name in MMT_abbrs.items(): + res[name] = list() + + if 'split' in df: + splits = list(set(df['split'])) + res['split'] = splits + + else: + df['split'] = ['none'] * len(df) + res['split'] = ['none'] + + for group in [None, 'category', 'l2-category']: + if group is None: + res['Overall'] = [np.mean(df[df['split'] == sp]['hit']) for sp in res['split']] + res['Overall'].extend([np.mean(df['hit'])]) + elif group not in df: + continue + elif group == 'category': + abilities = list(set(df[group])) + abilities.sort() + for ab in abilities: + ab_name = ab sub_df = df[df[group] == ab] res[ab_name] = [np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']] + res[ab_name].extend([np.mean(sub_df['hit'])]) + else: + abilities = list(set(df[group])) + abilities.sort() + for ab in abilities: + sub_task_name_list = df[df['l2-category'] == ab]['category'].unique() + sub_task_acc = [] + for sub_task_name in sub_task_name_list: + sub_df = df[df['category'] == sub_task_name] + sub_task_acc.append([np.mean(sub_df[sub_df['split'] == sp]['hit']) for sp in res['split']]) + + new_acc = [] + for i in range(len(sub_task_acc[0])): + new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) + ab_name = MMT_abbrs[ab] if ab in MMT_abbrs else ab + res[ab_name] = new_acc + + sub_task_acc = [] + for sub_task_name in sub_task_name_list: + sub_df = df[df['category'] == sub_task_name] + sub_task_acc.append([np.mean(sub_df['hit'])]) + new_acc = [] + for i in range(len(sub_task_acc[0])): + new_acc.append(sum([_[i] for _ in sub_task_acc]) / len([_ for _ in sub_task_acc])) + + res[ab_name].extend(new_acc) + + res['split'].append('ALL') return pd.DataFrame(res) @@ -142,6 +237,7 @@ def extract_answer_from_item(model, item): return dict(opt=rd.choice(options), log='Failed to predict, thus randomly generate one. 
') +# For Circular Evaluation def prefetch_sub_data(sub_data, answer_map, verbose=False): lt = len(sub_data) GT, PRED = [], [] @@ -165,6 +261,7 @@ def prefetch_sub_data(sub_data, answer_map, verbose=False): return ret if len(ret) > 1 else ret[0] +# For Circular Evaluation def eval_sub_data(model, sub_data, answer_map): res, GT, PRED = prefetch_sub_data(sub_data, answer_map, verbose=True) if res is not None: @@ -194,6 +291,7 @@ def eval_sub_data(model, sub_data, answer_map): return dict(hit=1, log=log) +# For Circular Evaluation def eval_data_groups(model, data_groups, answer_map, result, result_file, nproc=16): prefetched = [prefetch_sub_data(g, answer_map, verbose=False) for g in data_groups] remain = [] @@ -269,6 +367,7 @@ def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs): logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') model = None + # Load finished evaluation results logger.info(f'Evaluating {eval_file}') result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl') result = {} @@ -278,9 +377,11 @@ def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs): data = load(eval_file) data = data.sort_values(by='index') data['prediction'] = [str(x) for x in data['prediction']] + # If not choice label, then use lower case for k in data.keys(): data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) + # Load meta data: when dataset is `default`, will use eval_file as meta data if dataset != 'default': meta = TSVDataset(dataset).data else: @@ -288,6 +389,7 @@ def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs): meta = load(eval_file) assert 'index' in meta and 'answer' in meta, 'Essentail columns missing in the eval_file.' + # Build Answer / Category / L2-Category / Split Map answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])} cate_map = {i: c for i, c in zip(meta['index'], meta['category'])} if 'category' in meta else None l2_cate_map = {i: c for i, c in zip(meta['index'], meta['l2-category'])} if 'l2-category' in meta else None @@ -300,10 +402,12 @@ def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs): if split_map is not None and np.all([pd.isna(x) for x in split_map.values()]): split_map = None + # Change MMMU open-ended questions to multiple-choice ones for evaluation if listinstr(['MMMU'], dataset): data = MMMU_preproc(data) answer_map = {k: (v if v in list(string.ascii_uppercase) else 'A') for k, v in answer_map.items()} + # Only keep those lines in the meta data data = data[data['index'].isin(answer_map)] data_main = data[data['index'] < int(1e6)] meta_idx_set = set(meta['index']) @@ -359,7 +463,12 @@ def multiple_choice_eval(eval_file, dataset='default', **judge_kwargs): dump(data_main, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) data_main = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}')) - acc = report_acc(data_main) + # May have different report acc functions for different datasets + if 'MMT' in dataset: + acc = report_acc_MMT(data_main) + else: + acc = report_acc(data_main) + score_file = eval_file.replace(f'.{suffix}', '_acc.csv') dump(acc, score_file) logger.info(f'multiple_choice_eval successfully finished evaluating {eval_file}, results saved in {score_file}') diff --git a/vlmeval/utils/__init__.py b/vlmeval/utils/__init__.py index 93036789..32fdd38b 100644 --- a/vlmeval/utils/__init__.py +++ b/vlmeval/utils/__init__.py @@ -2,11 +2,12 @@ from .mp_util import track_progress_rich 
from .custom_prompt import CustomPrompt from .dataset_config import dataset_URLs, img_root_map, DATASET_TYPE, abbr2full -from .dataset import TSVDataset, split_MMMU, MMMU_result_transfer +from .dataset import TSVDataset, split_MMMU +from .result_transfer import MMMU_result_transfer, MMTBench_result_transfer __all__ = [ 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich', 'TSVDataset', 'dataset_URLs', 'img_root_map', 'DATASET_TYPE', 'CustomPrompt', - 'split_MMMU', 'abbr2full' + 'split_MMMU', 'abbr2full', 'MMMU_result_transfer', 'MMTBench_result_transfer' ] diff --git a/vlmeval/utils/dataset.py b/vlmeval/utils/dataset.py index bf4909fa..6b903602 100644 --- a/vlmeval/utils/dataset.py +++ b/vlmeval/utils/dataset.py @@ -3,7 +3,6 @@ from ..smp import * from .dataset_config import dataset_URLs, dataset_md5_dict, DATASET_TYPE from .custom_prompt import CustomPrompt -from .matching_util import can_infer def isliststr(s): @@ -46,29 +45,6 @@ def split_MMMU(msgs): return segs -def MMMU_result_transfer(result_path): - res = {} - result_data = load(result_path) - mcq = result_data['A'].notna() - lt = len(result_data) - for i in range(lt): - line = result_data.iloc[i] - if mcq[i]: - options = { - cand: line[cand] - for cand in string.ascii_uppercase - if cand in line and not pd.isna(line[cand]) - } - prediction = line['prediction'] - infer_prediction = can_infer(prediction, options) - res[line['id']] = infer_prediction - else: - res[line['id']] = line['prediction'] - result_json = result_path.replace('.xlsx', '.json') - dump(res, result_json) - return result_json - - class TSVDataset(CustomPrompt): def __init__(self, dataset='MMBench', skip_noimg=True): diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py index b2df6796..813c0380 100644 --- a/vlmeval/utils/dataset_config.py +++ b/vlmeval/utils/dataset_config.py @@ -45,10 +45,10 @@ 'RealWorldQA': 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv', 'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv', # MMT-Bench - 'MMT-Bench_ALL_MI': 'xxxxxxxxxxxxxxxxxx', - 'MMT-Bench_ALL': 'xxxxxxxxxxxxxxxxxxxxx', - 'MMT-Bench_VAL_MI': 'xxxxxxxxxxxxxxxxxx', - 'MMT-Bench_VAL': 'xxxxxxxxxxxxxxxxxxxxxx' + 'MMT-Bench_ALL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL_MI.tsv', + 'MMT-Bench_ALL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_ALL.tsv', + 'MMT-Bench_VAL_MI': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL_MI.tsv', + 'MMT-Bench_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/MMT-Bench_VAL.tsv', } dataset_md5_dict = { @@ -139,9 +139,10 @@ def DATASET_TYPE(dataset): # Dealing with Custom Dataset dataset = dataset.lower() - if listinstr(['mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d', 'mmstar', 'realworldqa'], dataset): - return 'multi-choice' - elif 'mmt-bench' in dataset: + if listinstr([ + 'mmbench', 'seedbench', 'ccbench', 'mmmu', 'scienceqa', 'ai2d', + 'mmstar', 'realworldqa', 'mmt-bench' + ], dataset): return 'multi-choice' elif listinstr(['mme', 'hallusion', 'pope'], dataset): return 'Y/N' diff --git a/vlmeval/utils/result_transfer.py b/vlmeval/utils/result_transfer.py new file mode 100644 index 00000000..7de633eb --- /dev/null +++ b/vlmeval/utils/result_transfer.py @@ -0,0 +1,97 @@ +from ..evaluate.misc import build_judge +from ..evaluate.multiple_choice import extract_answer_from_item + +from ..smp import * +from .matching_util import can_infer +from .mp_util import track_progress_rich + + +def 
MMMU_result_transfer(result_path): + res = {} + result_data = load(result_path) + mcq = result_data['A'].notna() + lt = len(result_data) + for i in range(lt): + line = result_data.iloc[i] + if mcq[i]: + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + prediction = line['prediction'] + infer_prediction = can_infer(prediction, options) + res[line['id']] = infer_prediction + else: + res[line['id']] = line['prediction'] + result_json = result_path.replace('.xlsx', '.json') + dump(res, result_json) + return result_json + + +def MMTBench_result_transfer(eval_file, dataset='default', **judge_kwargs): + logger = get_logger('Evaluation') + INTERNAL = os.environ.get('INTERNAL', 0) + nproc = judge_kwargs.pop('nproc', 4) + + rd.seed(2680) + suffix = eval_file.split('.')[-1] + model = judge_kwargs['model'] + assert model in ['chatgpt-0613', 'exact_matching', 'gpt-4-0125'] + name_str_map = { + 'chatgpt-0613': 'openai', + 'gpt-4-0125': 'gpt4' + } + name_str = name_str_map[model] if model in name_str_map else model + + if model == 'exact_matching': + model = None + else: + if INTERNAL or gpt_key_set(): + model = build_judge(**judge_kwargs) + else: + logger.error('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + + logger.info(f'Evaluating {eval_file}') + result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_option.pkl') + result = {} + if osp.exists(result_file): + result = load(result_file) + + data = load(eval_file) + assert 'index' in data, 'Essentail columns missing in the eval_file.' + + data = data.sort_values(by='index') + data['prediction'] = [str(x) for x in data['prediction']] + for k in data.keys(): + data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k) + + idx2lines = {data.iloc[i]['index']: data.iloc[i] for i in range(len(data))} + idx2lines = {k: v for k, v in idx2lines.items() if k not in result} + + indices = list(idx2lines.keys()) + lines = [idx2lines[i] for i in indices] + tups = [(model, line) for line in lines] + res = track_progress_rich( + extract_answer_from_item, + tups, + nproc=nproc, + chunksize=nproc, + save=result_file, + keys=indices) + + for i, r in zip(indices, res): + if i in result: + assert result[i]['opt'] == r['opt'] and result[i]['log'] == r['log'] + else: + result[i] = r + + indices = list(data['index']) + data['opt'] = [result[i]['opt'] for i in data['index']] + data['log'] = [result[i]['log'] for i in data['index']] + + # load split + output_path = eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv') + dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_submission.tsv')) + return output_path From acc2f23bdb80a0caf45514f9c59e49cef1641b7d Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Tue, 18 Jun 2024 10:10:46 +0800 Subject: [PATCH 4/5] update --- run.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/run.py b/run.py index e572b724..35a1da3b 100644 --- a/run.py +++ b/run.py @@ -85,6 +85,25 @@ def main(): api_nproc=args.nproc, ignore_failed=args.ignore) + # Set the judge kwargs first before evaluation or dumping + judge_kwargs = { + 'nproc': args.nproc, + 'verbose': args.verbose, + } + if args.retry is not None: + judge_kwargs['retry'] = args.retry + if args.judge is not None: + judge_kwargs['model'] = args.judge + else: + if DATASET_TYPE(dataset_name) in ['multi-choice', 'Y/N']: + judge_kwargs['model'] = 'chatgpt-0613' + elif listinstr(['MMVet', 
'MathVista', 'LLaVABench'], dataset_name): + judge_kwargs['model'] = 'gpt-4-turbo' + if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']): + judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] + if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']): + judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] + if rank == 0: if dataset_name in ['MMMU_TEST']: result_json = MMMU_result_transfer(result_file) @@ -106,24 +125,6 @@ def main(): ) continue - judge_kwargs = { - 'nproc': args.nproc, - 'verbose': args.verbose, - } - if args.retry is not None: - judge_kwargs['retry'] = args.retry - if args.judge is not None: - judge_kwargs['model'] = args.judge - else: - if DATASET_TYPE(dataset_name) in ['multi-choice', 'Y/N']: - judge_kwargs['model'] = 'chatgpt-0613' - elif listinstr(['MMVet', 'MathVista', 'LLaVABench'], dataset_name): - judge_kwargs['model'] = 'gpt-4-turbo' - if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']): - judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE'] - if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']): - judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE'] - if rank == 0 and args.mode == 'all': if DATASET_TYPE(dataset_name) == 'multi-choice': dataset_name = 'default' if custom_flag else dataset_name From 6004766d510dcce49591e4e9bbea1af2a4f41d0a Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Tue, 18 Jun 2024 11:00:28 +0800 Subject: [PATCH 5/5] update README --- README.md | 20 +++++++++++--------- README_zh-CN.md | 10 ++++++---- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 9b09a664..40fa8366 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,8 @@ English | [็ฎ€ไฝ“ไธญๆ–‡] ## ๐Ÿ†• News +- **[2024-06-18]** We have supported [**MMT-Bench**](https://mmt-bench.github.io), thanks to [**KainingYing**](https://github.com/KainingYing)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ +- **[2024-06-05]** We have supported [**WeMM**](https://github.com/scenarios/WeMM), thanks to [**scenarios**](https://github.com/scenarios)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-27]** We have supported [**Mini InternVL**](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5), thanks to [**czczup**](https://github.com/czczup)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-25]** We have supported [**SEEDBench2_Plus**](https://arxiv.org/abs/2404.16790), thanks to [**Bohao-Lee**](https://github.com/Bohao-Lee)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-24]** We have supported [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) and [**CogVLM2-Llama3-chat**](https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ @@ -33,8 +35,6 @@ English | [็ฎ€ไฝ“ไธญๆ–‡] - **[2024-05-15]** We have supported [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448), a versatile and lightweight vision-language model released by Google ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-14]** We have supported [**GPT-4o**](https://openai.com/index/hello-gpt-4o/) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-07]** We have supported [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py), thanks to [**YJY123**](https://github.com/YJY123) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ -- **[2024-05-06]** We have launched a discord channel for VLMEvalKit users: https://discord.gg/evDT4GZmxN. 
Latest updates and discussion will be posted here -- **[2024-05-06]** We have supported 2 VLMs based on Llama3 ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ: Bunny-llama3-8B (SigLIP, image size 384) and llava-llama-3-8b (CLIP-L, image size 336), you can now evaluate both models on dozens of datasets we supported ## ๐Ÿ“Š Datasets, Models, and Evaluation Results @@ -50,7 +50,7 @@ English | [็ฎ€ไฝ“ไธญๆ–‡] | ------------------------------------------------------------ | ------------------------------------------------------ | --------- | --------- | --------- | --------- | | [**MMBench Series**](https://github.com/open-compass/mmbench/):
MMBench, MMBench-CN, CCBench | MMBench_DEV_[EN/CN]<br>MMBench_TEST_[EN/CN]<br>MMBench_DEV_[EN/CN]_V11<br>MMBench_TEST_[EN/CN]_V11<br>CCBench | Multi-choice <br>
Question (MCQ) | [**MMStar**](https://github.com/MMStar-Benchmark/MMStar) | MMStar | MCQ | | [**MME**](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) | MME | Yes or No (Y/N) | [**SEEDBench Series**](https://github.com/AILab-CVC/SEED-Bench) | SEEDBench_IMG, SEEDBench2_Plus | MCQ | -| [**MM-Vet**](https://github.com/yuweihao/MM-Vet) | MMVet | VQA | [**MMMU**](https://mmmu-benchmark.github.io) | MMMU_DEV_VAL/MMMU_TEST | MCQ | +| [**MM-Vet**](https://github.com/yuweihao/MM-Vet) | MMVet | VQA | [**MMMU**](https://mmmu-benchmark.github.io) | MMMU_[DEV_VAL/TEST] | MCQ | | [**MathVista**](https://mathvista.github.io) | MathVista_MINI | VQA | [**ScienceQA_IMG**](https://scienceqa.github.io) | ScienceQA_[VAL/TEST] | MCQ | | [**COCO Caption**](https://cocodataset.org) | COCO_VAL | Caption | [**HallusionBench**](https://github.com/tianyi-lab/HallusionBench) | HallusionBench | Y/N | | [**OCRVQA**](https://ocr-vqa.github.io)* | OCRVQA_[TESTCORE/TEST] | VQA | [**TextVQA**](https://textvqa.org)* | TextVQA_VAL | VQA | @@ -59,6 +59,7 @@ English | [็ฎ€ไฝ“ไธญๆ–‡] | [**InfoVQA**](https://www.docvqa.org/datasets/infographicvqa)+ | InfoVQA_[VAL/TEST] | VQA | [**OCRBench**](https://github.com/Yuliang-Liu/MultimodalOCR) | OCRBench | VQA | | [**RealWorldQA**](https://x.ai/blog/grok-1.5v) | RealWorldQA | MCQ | [**POPE**](https://github.com/AoiDragon/POPE) | POPE | Y/N | | [**Core-MM**](https://github.com/core-mm/core-mm)- | CORE_MM | VQA | [**SEEDBench2_Plus**](https://arxiv.org/abs/2404.16790) | SEEDBench2_Plus | MCQ | +| [**MMT-Bench**](https://mmt-bench.github.io) | MMT-Bench_[VAL/VAL_MI/ALL/ALL_MI] | MCQ | | | | **\*** We only provide a subset of the evaluation results, since some VLMs do not yield reasonable results under the zero-shot setting @@ -80,11 +81,11 @@ VLMEvalKit will use an **judge LLM** to extract answer from the output if you se | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | | [**mPLUG-Owl2**](https://github.com/X-PLUG/mPLUG-Owl/tree/main/mPLUG-Owl2)๐ŸŽž๏ธ | [**OpenFlamingo-v2**](https://github.com/mlfoundations/open_flamingo)๐ŸŽž๏ธ | [**PandaGPT-13B**](https://github.com/yxuansu/PandaGPT) | [**Qwen-VL**](https://huggingface.co/Qwen/Qwen-VL)๐ŸŽž๏ธ๐Ÿš…, [**Qwen-VL-Chat**](https://huggingface.co/Qwen/Qwen-VL-Chat)๐ŸŽž๏ธ**๐Ÿš…** | | [**VisualGLM-6B**](https://huggingface.co/THUDM/visualglm-6b)๐Ÿš… | [**InternLM-XComposer-7B**](https://huggingface.co/internlm/internlm-xcomposer-7b)๐Ÿš…๐ŸŽž๏ธ | [**ShareGPT4V-[7B/13B]**](https://sharegpt4v.github.io)๐Ÿš… | [**TransCore-M**](https://github.com/PCIResearch/TransCore-M) | -| [**LLaVA (XTuner)**](https://huggingface.co/xtuner/llava-internlm-7b)๐Ÿš… | [**CogVLM-[Chat/Llama3]**](https://huggingface.co/THUDM/cogvlm-chat-hf)๐Ÿš… | [**SharedCaptioner**](https://huggingface.co/spaces/Lin-Chen/Share-Captioner)๐Ÿš… | [**CogVLM-Grounding-Generalist**](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf)๐Ÿš… | +| [**LLaVA (XTuner)**](https://huggingface.co/xtuner/llava-internlm-7b)๐Ÿš… | [**CogVLM-[Chat/Llama3]**](https://huggingface.co/THUDM/cogvlm-chat-hf)๐Ÿš… | [**ShareCaptioner**](https://huggingface.co/spaces/Lin-Chen/Share-Captioner)๐Ÿš… | [**CogVLM-Grounding-Generalist**](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf)๐Ÿš… | | [**Monkey**](https://github.com/Yuliang-Liu/Monkey)๐Ÿš…, 
[**Monkey-Chat**](https://github.com/Yuliang-Liu/Monkey)๐Ÿš… | [**EMU2-Chat**](https://github.com/baaivision/Emu)๐Ÿš…๐ŸŽž๏ธ | [**Yi-VL-[6B/34B]**](https://huggingface.co/01-ai/Yi-VL-6B) | [**MMAlaya**](https://huggingface.co/DataCanvas/MMAlaya)๐Ÿš… | -| [**InternLM-XComposer2-[1.8B/7B]**](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b)๐Ÿš…๐ŸŽž๏ธ | [**MiniCPM-[V1/V2/V2.5]**](https://huggingface.co/openbmb/MiniCPM-V)๐Ÿš… | [**OmniLMM-12B**](https://huggingface.co/openbmb/OmniLMM-12B) | [**InternVL-Chat-[V1-1/V1-2/V1-2-Plus/V1-5]**](https://github.com/OpenGVLab/InternVL)๐Ÿš…, [**Mini-InternVL-Chat-2B-V1-5**](https://github.com/OpenGVLab/InternVL)๐Ÿš… | +| [**InternLM-XComposer2-[1.8B/7B]**](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b)๐Ÿš…๐ŸŽž๏ธ | [**MiniCPM-[V1/V2/V2.5]**](https://huggingface.co/openbmb/MiniCPM-V)๐Ÿš… | [**OmniLMM-12B**](https://huggingface.co/openbmb/OmniLMM-12B) | [**InternVL-Chat-[V1-1/V1-2/V1-2-Plus/V1-5]**](https://github.com/OpenGVLab/InternVL)๐Ÿš…,
[**Mini-InternVL-Chat-2B-V1-5**](https://github.com/OpenGVLab/InternVL)๐Ÿš… | | [**DeepSeek-VL**](https://github.com/deepseek-ai/DeepSeek-VL/tree/main)๐ŸŽž๏ธ | [**LLaVA-NeXT**](https://llava-vl.github.io/blog/2024-01-30-llava-next/)๐Ÿš… | [**Bunny-Llama3**](https://huggingface.co/BAAI/Bunny-Llama-3-8B-V)๐Ÿš… | [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py) | -| [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) ๐Ÿš… | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B) ๐Ÿš… | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)๐Ÿš… | | +| [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) ๐Ÿš… | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B) ๐Ÿš… | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)๐Ÿš… | [**WeMM**](https://github.com/scenarios/WeMM)๐Ÿš… | ๐ŸŽž๏ธ: Support multiple images as inputs. @@ -94,9 +95,10 @@ VLMEvalKit will use an **judge LLM** to extract answer from the output if you se Note that some VLMs may not be able to run under certain transformer versions, we recommend the following settings to evaluate each VLM: -- **Please use** `transformers==4.33.0` **for**: `Qwen series`, `Monkey series`, `InternLM-XComposer Series`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICS series`, `VisualGLM`, `MMAlaya`, `SharedCaptioner`, `MiniGPT-4 series`, `InstructBLIP series`, `PandaGPT`, `VXVERSE`. -- **Please use** `transformers==4.37.0` **for**: `LLaVA series`, `ShareGPT4V series`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLM Series`, `EMU2 Series`, `Yi-VL Series`, `MiniCPM-V (v1, v2)`, `OmniLMM-12B`, `DeepSeek-VL series`, `InternVL series`. -- **Please use** `transformers==4.40.0` **for**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `LLaVA-Next series`, `360VL-70B`, `Phi-3-Vision`. +- **Please use** `transformers==4.33.0` **for**: `Qwen series`, `Monkey series`, `InternLM-XComposer Series`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICS series`, `VisualGLM`, `MMAlaya`, `ShareCaptioner`, `MiniGPT-4 series`, `InstructBLIP series`, `PandaGPT`, `VXVERSE`. +- **Please use** `transformers==4.37.0` **for**: `LLaVA series`, `ShareGPT4V series`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLM Series`, `EMU2 Series`, `Yi-VL Series`, `MiniCPM-[V1/V2]`, `OmniLMM-12B`, `DeepSeek-VL series`, `InternVL series`. +- **Please use** `transformers==4.40.0` **for**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `LLaVA-Next series`, `360VL-70B`, `Phi-3-Vision`, `WeMM`. +- **Please use** `transformers==latest` **for**: `PaliGemma-3B`. 
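For reference, predictions on the MMT-Bench ALL split are scored on the eval.ai server rather than locally: `run.py` passes the prediction file to `MMTBench_result_transfer`, which extracts one option letter per question and writes a `*_submission.tsv`. The snippet below is a minimal sketch of calling that helper directly; the prediction path is a placeholder, and choosing a judge model such as `chatgpt-0613` assumes an OpenAI key is configured.

```python
# Minimal sketch: convert a finished MMT-Bench_ALL prediction file into a submission TSV.
# The .xlsx path is a placeholder for your own inference output.
from vlmeval.utils import MMTBench_result_transfer

submission = MMTBench_result_transfer(
    'outputs/my_model/my_model_MMT-Bench_ALL.xlsx',   # hypothetical prediction file
    model='chatgpt-0613',   # accepted values: 'chatgpt-0613', 'gpt-4-0125', 'exact_matching'
)
print(submission)   # path of the *_submission.tsv to upload to the eval.ai challenge page
```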
```python # Demo diff --git a/README_zh-CN.md b/README_zh-CN.md index 7bb1d15a..cc808b5e 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -23,6 +23,8 @@ ## ๐Ÿ†• ๆ›ดๆ–ฐ +- **[2024-06-18]** ๆ”ฏๆŒไบ† [**MMT-Bench**](https://mmt-bench.github.io)๏ผŒๆ„Ÿ่ฐข [**KainingYing**](https://github.com/KainingYing)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ +- **[2024-06-05]** ๆ”ฏๆŒไบ† [**WeMM**](https://github.com/scenarios/WeMM)๏ผŒๆ„Ÿ่ฐข [**scenarios**](https://github.com/scenarios)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-27]** ๆ”ฏๆŒไบ† [**Mini InternVL**](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5), ๆ„Ÿ่ฐข [**czczup**](https://github.com/czczup)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-25]** ๆ”ฏๆŒไบ† [**SEEDBench2_Plus**](https://arxiv.org/abs/2404.16790)๏ผŒๆ„Ÿ่ฐข [**Bohao-Lee**](https://github.com/Bohao-Lee)๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-24]** ๆ”ฏๆŒไบ† [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) ๅ’Œ [**CogVLM2-Llama3-chat**](https://huggingface.co/THUDM/cogvlm2-llama3-chat-19B) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ @@ -31,8 +33,6 @@ - **[2024-05-15]** ๆ”ฏๆŒไบ† [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448), ไธ€ไธช่ฐทๆญŒๅผ€ๆบ็š„ 3B ๅคšๆจกๆ€ๆจกๅž‹ใ€‚ ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-14]** ๆ”ฏๆŒไบ† [**GPT-4o**](https://openai.com/index/hello-gpt-4o/) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ - **[2024-05-07]** ๆ”ฏๆŒไบ† [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py), ๆ„Ÿ่ฐข [**YJY123**](https://github.com/YJY123) ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ -- **[2024-05-06]** ๆˆ็ซ‹ไบ† VLMEvalKit ็”จๆˆท็พค็ป„็š„ Discord ้ข‘้“: https://discord.gg/evDT4GZmxN๏ผŒๅฐ†ๅœจ่ฟ™้‡Œๅˆ†ไบซๅ…ณไบŽ VLMEvalKit ็š„ๆ›ดๆ–ฐๅนถ่ฟ›่กŒ่ฎจ่ฎบ -- **[2024-05-06]** ๆ”ฏๆŒไบ†ไธคไธชๅŸบไบŽ Llama3 ็š„ VLM ๐Ÿ”ฅ๐Ÿ”ฅ๐Ÿ”ฅ: Bunny-llama3-8B (SigLIP, ่พ“ๅ…ฅๅ›พๅƒๅคงๅฐ 384) ๅ’Œ llava-llama-3-8b (CLIP-L, ่พ“ๅ…ฅๅ›พๅƒๅคงๅฐ 336), ็”จๆˆทๅฏๅœจๆˆ‘ไปฌๆ”ฏๆŒ็š„ๆ•ฐๅไธชๆต‹่ฏ•ๅŸบๅ‡†ไธŠๆต‹่ฏ•่ฟ™ไธคไธชๆจกๅž‹ ## ๐Ÿ“Š ่ฏ„ๆต‹็ป“ๆžœ๏ผŒๆ”ฏๆŒ็š„ๆ•ฐๆฎ้›†ๅ’Œๆจกๅž‹ ### ่ฏ„ๆต‹็ป“ๆžœ @@ -56,6 +56,7 @@ | [**InfoVQA**](https://www.docvqa.org/datasets/infographicvqa)+ | InfoVQA_[VAL/TEST] | VQA | [**OCRBench**](https://github.com/Yuliang-Liu/MultimodalOCR) | OCRBench | VQA | | [**RealWorldQA**](https://x.ai/blog/grok-1.5v) | RealWorldQA | MCQ | [**POPE**](https://github.com/AoiDragon/POPE) | POPE | Y/N | | [**Core-MM**](https://github.com/core-mm/core-mm)- | CORE_MM | VQA | [**SEEDBench2_Plus**](https://arxiv.org/abs/2404.16790) | SEEDBench2_Plus | MCQ | +| [**MMT-Bench**](https://mmt-bench.github.io) | MMT-Bench_[VAL/VAL_MI/ALL/ALL_MI] | MCQ | | | | **\*** ๆˆ‘ไปฌๅชๆไพ›ไบ†้ƒจๅˆ†ๆจกๅž‹ไธŠ็š„ๆต‹่ฏ•็ป“ๆžœ๏ผŒๅ‰ฉไฝ™ๆจกๅž‹ๆ— ๆณ•ๅœจ zero-shot ่ฎพๅฎšไธ‹ๆต‹่ฏ•ๅ‡บๅˆ็†็š„็ฒพๅบฆ @@ -82,7 +83,7 @@ | [**Monkey**](https://github.com/Yuliang-Liu/Monkey)๐Ÿš…, [**Monkey-Chat**](https://github.com/Yuliang-Liu/Monkey)๐Ÿš… | [**EMU2-Chat**](https://github.com/baaivision/Emu)๐Ÿš…๐ŸŽž๏ธ | [**Yi-VL-[6B/34B]**](https://huggingface.co/01-ai/Yi-VL-6B) | [**MMAlaya**](https://huggingface.co/DataCanvas/MMAlaya)๐Ÿš… | | [**InternLM-XComposer2-[1.8B/7B]**](https://huggingface.co/internlm/internlm-xcomposer2-vl-7b)๐Ÿš…๐ŸŽž๏ธ | [**MiniCPM-[V1/V2/V2.5]**](https://huggingface.co/openbmb/MiniCPM-V)๐Ÿš… | [**OmniLMM-12B**](https://huggingface.co/openbmb/OmniLMM-12B) | [**InternVL-Chat-[V1-1/V1-2/V1-2-Plus/V1-5]**](https://github.com/OpenGVLab/InternVL)๐Ÿš…, [**Mini-InternVL-Chat-2B-V1-5**](https://github.com/OpenGVLab/InternVL)๐Ÿš… | | [**DeepSeek-VL**](https://github.com/deepseek-ai/DeepSeek-VL/tree/main)๐ŸŽž๏ธ | [**LLaVA-NeXT**](https://llava-vl.github.io/blog/2024-01-30-llava-next/)๐Ÿš… | 
[**Bunny-Llama3**](https://huggingface.co/BAAI/Bunny-Llama-3-8B-V)๐Ÿš… | [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py) | -| [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) ๐Ÿš… | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B)๐Ÿš… | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) ๐Ÿš… | | +| [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) ๐Ÿš… | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B)๐Ÿš… | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) ๐Ÿš… | [**WeMM**](https://github.com/scenarios/WeMM)๐Ÿš… | ๐ŸŽž๏ธ ่กจ็คบๆ”ฏๆŒๅคšๅ›พ็‰‡่พ“ๅ…ฅใ€‚ @@ -96,7 +97,8 @@ - **่ฏท็”จ** `transformers==4.33.0` **ๆฅ่ฟ่กŒ**: `Qwen series`, `Monkey series`, `InternLM-XComposer Series`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICS series`, `VisualGLM`, `MMAlaya`, `SharedCaptioner`, `MiniGPT-4 series`, `InstructBLIP series`, `PandaGPT`, `VXVERSE`. - **่ฏท็”จ** `transformers==4.37.0 ` **ๆฅ่ฟ่กŒ**: `LLaVA series`, `ShareGPT4V series`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLM Series`, `EMU2 Series`, `Yi-VL Series`, `MiniCPM-V (v1, v2)`, `OmniLMM-12B`, `DeepSeek-VL series`, `InternVL series`. -- **่ฏท็”จ** `transformers==4.40.0 ` **ๆฅ่ฟ่กŒ**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `LLaVA-Next series`, `360VL-70B`๏ผŒ `Phi-3-Vision`. +- **่ฏท็”จ** `transformers==4.40.0 ` **ๆฅ่ฟ่กŒ**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `LLaVA-Next series`, `360VL-70B`๏ผŒ `Phi-3-Vision`๏ผŒ`WeMM`. +- **่ฏท็”จ** `transformers==latest` **ๆฅ่ฟ่กŒ**: `PaliGemma-3B`. **ๅฆ‚ไฝ•ๆต‹่ฏ•ไธ€ไธช VLM ๆ˜ฏๅฆๅฏไปฅๆญฃๅธธ่ฟ่กŒ:**