From 26b573f4609dd5720623cef4001e82b4722d5df5 Mon Sep 17 00:00:00 2001
From: echo840 <1731396519@qq.com>
Date: Mon, 19 Feb 2024 14:49:21 +0800
Subject: [PATCH 1/2] Add OCRBench

---
 run.py                          |  4 ++-
 vlmeval/evaluate/OCRBench.py    | 44 +++++++++++++++++++++++++++++++++
 vlmeval/evaluate/__init__.py    |  3 ++-
 vlmeval/utils/dataset_config.py |  7 ++++--
 4 files changed, 54 insertions(+), 4 deletions(-)
 create mode 100644 vlmeval/evaluate/OCRBench.py

diff --git a/run.py b/run.py
index 1a91ff0b..b46aca84 100644
--- a/run.py
+++ b/run.py
@@ -1,7 +1,7 @@
 import torch
 import torch.distributed as dist
 from vlmeval.smp import *
-from vlmeval.evaluate import COCO_eval, YOrN_eval, MMVet_eval, multiple_choice_eval, VQAEval, MathVista_eval, LLaVABench_eval
+from vlmeval.evaluate import COCO_eval, YOrN_eval, MMVet_eval, multiple_choice_eval, VQAEval, MathVista_eval, LLaVABench_eval, OCRBench_eval
 from vlmeval.inference import infer_data_job, prefetch_acc
 from vlmeval.config import supported_VLM
 from vlmeval.utils import dataset_URLs, DATASET_TYPE, abbr2full
@@ -86,6 +86,8 @@ def main():
                 COCO_eval(result_file)
             elif dataset_name == 'MMVet':
                 MMVet_eval(result_file, model='gpt-4-turbo', nproc=args.nproc, verbose=args.verbose)
+            elif dataset_name == 'OCRBench':
+                OCRBench_eval(result_file)
             elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA'], dataset_name):
                 VQAEval(result_file, dataset_name)
             elif listinstr(['MathVista'], dataset_name):
diff --git a/vlmeval/evaluate/OCRBench.py b/vlmeval/evaluate/OCRBench.py
new file mode 100644
index 00000000..06841bc2
--- /dev/null
+++ b/vlmeval/evaluate/OCRBench.py
@@ -0,0 +1,44 @@
+from vlmeval.smp import *
+OCRBench_score = {"Regular Text Recognition":0,"Irregular Text Recognition":0,"Artistic Text Recognition":0,"Handwriting Recognition":0,
+"Digit String Recognition":0,"Non-Semantic Text Recognition":0,"Scene Text-centric VQA":0,"Doc-oriented VQA":0,
+"Key Information Extraction":0,"Handwritten Mathematical Expression Recognition":0}
+def OCRBench_eval(eval_file):
+    logger = get_logger('Evaluation')
+
+    data = load(eval_file)
+    lt = len(data)
+    lines = [data.iloc[i] for i in range(lt)]
+    for i in tqdm(range(len(lines))):
+        line = lines[i]
+        predict = str(line['prediction'])
+        answers = eval(line['answer'])
+        category = line['category']
+        if category == "Handwritten Mathematical Expression Recognition":
+            for j in range(len(answers)):
+                answer = answers[j].strip().replace("\n"," ").replace(" ","")
+                predict = predict.strip().replace("\n"," ").replace(" ","")
+                if answer in predict:
+                    OCRBench_score[category]+= 1
+                    break
+        else:
+            for j in range(len(answers)):
+                answer = answers[j].lower().strip().replace("\n"," ")
+                predict = predict.lower().strip().replace("\n"," ")
+                if answer in predict:
+                    OCRBench_score[category]+= 1
+                    break
+    final_score_dict = {}
+    final_score_dict['Text Recognition']=OCRBench_score['Regular Text Recognition']+OCRBench_score['Irregular Text Recognition']+OCRBench_score['Artistic Text Recognition']+OCRBench_score['Handwriting Recognition']+OCRBench_score['Digit String Recognition']+OCRBench_score['Non-Semantic Text Recognition']
+    final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
+    final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
+    final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
+    final_score_dict['Handwritten Mathematical Expression Recognition'] = OCRBench_score['Handwritten Mathematical Expression Recognition']
+    final_score_dict['Final Score'] = final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] + final_score_dict['Handwritten Mathematical Expression Recognition']
+    final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score'])/10
+    score_pth = eval_file.replace('.xlsx','_score.json')
+    dump(final_score_dict, score_pth)
+    logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
+    logger.info(f'Score: ')
+    for key, value in final_score_dict.items():
+        logger.info('{}:{}'.format(key, value))
+
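The scorer added above is plain substring matching: a sample counts toward its category as soon as any ground-truth answer appears inside the prediction. For the handwritten-math category all whitespace is stripped before comparison; every other category compares case-insensitively with newlines folded to spaces. The same rule in isolation, as a minimal sketch (a hypothetical helper, not part of the patch; it parses the answer list with ast.literal_eval as a safer stand-in for the eval() call in the diff):

    import ast

    def ocrbench_match(prediction: str, answer_field: str, category: str) -> bool:
        # The 'answer' column stores a Python list literal, e.g. "['ABC', 'abc']".
        answers = ast.literal_eval(answer_field)
        if category == "Handwritten Mathematical Expression Recognition":
            # LaTeX-style answers: drop all whitespace before comparing.
            norm = lambda s: s.strip().replace("\n", " ").replace(" ", "")
        else:
            # Everything else: lowercase, fold newlines to spaces.
            norm = lambda s: s.lower().strip().replace("\n", " ")
        pred = norm(prediction)
        return any(norm(a) in pred for a in answers)
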
diff --git a/vlmeval/evaluate/__init__.py b/vlmeval/evaluate/__init__.py
index 2bb3f310..a12673f2 100644
--- a/vlmeval/evaluate/__init__.py
+++ b/vlmeval/evaluate/__init__.py
@@ -5,4 +5,5 @@
 from .vqa_eval import VQAEval
 from .mathvista_eval import MathVista_eval
 from .llavabench import LLaVABench_eval
-from .misc import build_judge
\ No newline at end of file
+from .misc import build_judge
+from .OCRBench import OCRBench_eval
\ No newline at end of file
diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py
index f79225be..f4b7b69c 100644
--- a/vlmeval/utils/dataset_config.py
+++ b/vlmeval/utils/dataset_config.py
@@ -26,6 +26,7 @@
     "DocVQA_VAL": "https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv",
     'AI2D_TEST': "https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv",
     "LLaVABench": "https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv",
+    "OCRBench": 'OCRBench',
 }
 
 dataset_md5_dict = {
@@ -53,7 +54,8 @@
     'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
     "DocVQA_VAL": 'ee0d8ae5527439438d08e154ef65d735',
     "AI2D_TEST": "0f593e0d1c7df9a3d69bf1f947e71975",
-    "LLaVABench": "d382a093f749a697820d3dadd61c8428"
+    "LLaVABench": "d382a093f749a697820d3dadd61c8428",
+    "OCRBench": 'OCRBench',
 }
 
 img_root_map = {k: k for k in dataset_URLs}
@@ -73,6 +75,7 @@
     'ChartQA_VALTEST_HUMAN': 'ChartQA',
     'HallusionBench': 'Hallusion',
     'DocVQA_VAL': 'DocVQA',
+    "OCRBench": 'OCRBench',
 })
 
 assert set(dataset_URLs) == set(img_root_map) == set(dataset_md5_dict)
@@ -85,7 +88,7 @@ def DATASET_TYPE(dataset):
         return 'Y/N'
     elif 'coco' in dataset:
         return 'Caption'
-    elif listinstr(['ocrvqa', 'textvqa', 'chartqa', 'mathvista', 'docvqa', 'llavabench', 'mmvet'], dataset):
+    elif listinstr(['ocrvqa', 'textvqa', 'chartqa', 'mathvista', 'docvqa', 'llavabench', 'mmvet', 'OCRBench'], dataset):
         return 'VQA'
     else:
         return 'QA'
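With the hooks from this first commit in place, OCRBench runs like any other dataset: run.py dispatches it to OCRBench_eval, which writes its scores to a JSON file next to the predictions (the .xlsx suffix replaced by _score.json). A typical invocation, assuming the standard VLMEvalKit CLI (the model name is a placeholder):

    python run.py --data OCRBench --model qwen_chat

OCRBench contains 1000 samples, so 'Final Score' tops out at 1000 and 'Final Score Norm' (the sum divided by 10) reads roughly as a 0-100 score.
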
From 292f0232ebe9070a8d84dc7fb4fca6f7f5010b71 Mon Sep 17 00:00:00 2001
From: kennymckormick
Date: Sat, 24 Feb 2024 21:32:42 +0800
Subject: [PATCH 2/2] update dataset config

---
 vlmeval/utils/dataset_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py
index f4b7b69c..c27208ce 100644
--- a/vlmeval/utils/dataset_config.py
+++ b/vlmeval/utils/dataset_config.py
@@ -26,7 +26,7 @@
     "DocVQA_VAL": "https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv",
     'AI2D_TEST': "https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv",
     "LLaVABench": "https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv",
-    "OCRBench": 'OCRBench',
+    "OCRBench": 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv',
 }
 
 dataset_md5_dict = {
@@ -55,7 +55,7 @@
     "DocVQA_VAL": 'ee0d8ae5527439438d08e154ef65d735',
     "AI2D_TEST": "0f593e0d1c7df9a3d69bf1f947e71975",
     "LLaVABench": "d382a093f749a697820d3dadd61c8428",
-    "OCRBench": 'OCRBench',
+    "OCRBench": 'e953d98a987cc6e26ef717b61260b778',
 }
 
 img_root_map = {k: k for k in dataset_URLs}
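
The follow-up commit replaces the placeholder URL and checksum with the hosted TSV and its real MD5, which lets the framework verify the download before use. The check itself is an ordinary streamed MD5 comparison; an illustrative sketch (VLMEvalKit's actual download helper may differ):

    import hashlib

    def md5_matches(path: str, expected: str) -> bool:
        # Hash in 1 MiB chunks so large TSV files are not read into memory at once.
        h = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                h.update(chunk)
        return h.hexdigest() == expected

    # e.g. md5_matches('OCRBench.tsv', 'e953d98a987cc6e26ef717b61260b778')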