From 26b573f4609dd5720623cef4001e82b4722d5df5 Mon Sep 17 00:00:00 2001
From: echo840 <1731396519@qq.com>
Date: Mon, 19 Feb 2024 14:49:21 +0800
Subject: [PATCH 1/2] Add OCRBench

---
 run.py                          |  4 ++-
 vlmeval/evaluate/OCRBench.py    | 44 +++++++++++++++++++++++++++++++++
 vlmeval/evaluate/__init__.py    |  3 ++-
 vlmeval/utils/dataset_config.py |  7 ++++--
 4 files changed, 54 insertions(+), 4 deletions(-)
 create mode 100644 vlmeval/evaluate/OCRBench.py

diff --git a/run.py b/run.py
index 1a91ff0b..b46aca84 100644
--- a/run.py
+++ b/run.py
@@ -1,7 +1,7 @@
 import torch
 import torch.distributed as dist
 from vlmeval.smp import *
-from vlmeval.evaluate import COCO_eval, YOrN_eval, MMVet_eval, multiple_choice_eval, VQAEval, MathVista_eval, LLaVABench_eval
+from vlmeval.evaluate import COCO_eval, YOrN_eval, MMVet_eval, multiple_choice_eval, VQAEval, MathVista_eval, LLaVABench_eval, OCRBench_eval
 from vlmeval.inference import infer_data_job, prefetch_acc
 from vlmeval.config import supported_VLM
 from vlmeval.utils import dataset_URLs, DATASET_TYPE, abbr2full
@@ -86,6 +86,8 @@ def main():
                 COCO_eval(result_file)
             elif dataset_name == 'MMVet':
                 MMVet_eval(result_file, model='gpt-4-turbo', nproc=args.nproc, verbose=args.verbose)
+            elif dataset_name == 'OCRBench':
+                OCRBench_eval(result_file)
             elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA'], dataset_name):
                 VQAEval(result_file, dataset_name)
             elif listinstr(['MathVista'], dataset_name):
diff --git a/vlmeval/evaluate/OCRBench.py b/vlmeval/evaluate/OCRBench.py
new file mode 100644
index 00000000..06841bc2
--- /dev/null
+++ b/vlmeval/evaluate/OCRBench.py
@@ -0,0 +1,44 @@
+from vlmeval.smp import *
+OCRBench_score = {"Regular Text Recognition":0,"Irregular Text Recognition":0,"Artistic Text Recognition":0,"Handwriting Recognition":0,
+"Digit String Recognition":0,"Non-Semantic Text Recognition":0,"Scene Text-centric VQA":0,"Doc-oriented VQA":0,
+"Key Information Extraction":0,"Handwritten Mathematical Expression Recognition":0}
+def OCRBench_eval(eval_file):
+    logger = get_logger('Evaluation')
+
+    data = load(eval_file)
+    lt = len(data)
+    lines = [data.iloc[i] for i in range(lt)]
+    for i in tqdm(range(len(lines))):
+        line = lines[i]
+        predict = str(line['prediction'])
+        answers = eval(line['answer'])
+        category = line['category']
+        if category == "Handwritten Mathematical Expression Recognition":
+            for j in range(len(answers)):
+                answer = answers[j].strip().replace("\n"," ").replace(" ","")
+                predict = predict.strip().replace("\n"," ").replace(" ","")
+                if answer in predict:
+                    OCRBench_score[category]+= 1
+                    break
+        else:
+            for j in range(len(answers)):
+                answer = answers[j].lower().strip().replace("\n"," ")
+                predict = predict.lower().strip().replace("\n"," ")
+                if answer in predict:
+                    OCRBench_score[category]+= 1
+                    break
+    final_score_dict = {}
+    final_score_dict['Text Recognition']=OCRBench_score['Regular Text Recognition']+OCRBench_score['Irregular Text Recognition']+OCRBench_score['Artistic Text Recognition']+OCRBench_score['Handwriting Recognition']+OCRBench_score['Digit String Recognition']+OCRBench_score['Non-Semantic Text Recognition']
+    final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
+    final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
+    final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
+    final_score_dict['Handwritten Mathematical Expression Recognition'] = OCRBench_score['Handwritten Mathematical Expression Recognition']
+    final_score_dict['Final Score'] = final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] + final_score_dict['Handwritten Mathematical Expression Recognition']
+    final_score_dict['Final Score Norm'] = float(final_score_dict['Final Score'])/10
+    score_pth = eval_file.replace('.xlsx','_score.json')
+    dump(final_score_dict, score_pth)
+    logger.info(f'OCRBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
+    logger.info(f'Score: ')
+    for key, value in final_score_dict.items():
+        logger.info('{}:{}'.format(key, value))
+
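The scorer added above is plain substring matching: a sample counts toward its category as soon as any ground-truth answer appears inside the prediction. For the handwritten-math category all whitespace is stripped before comparison; every other category compares case-insensitively with newlines folded to spaces. The same rule in isolation, as a minimal sketch (a hypothetical helper, not part of the patch; it parses the answer list with ast.literal_eval as a safer stand-in for the eval() call in the diff):

    import ast

    def ocrbench_match(prediction: str, answer_field: str, category: str) -> bool:
        # The 'answer' column stores a Python list literal, e.g. "['ABC', 'abc']".
        answers = ast.literal_eval(answer_field)
        if category == "Handwritten Mathematical Expression Recognition":
            # LaTeX-style answers: drop all whitespace before comparing.
            norm = lambda s: s.strip().replace("\n", " ").replace(" ", "")
        else:
            # Everything else: lowercase, fold newlines to spaces.
            norm = lambda s: s.lower().strip().replace("\n", " ")
        pred = norm(prediction)
        return any(norm(a) in pred for a in answers)
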
diff --git a/vlmeval/evaluate/__init__.py b/vlmeval/evaluate/__init__.py
index 2bb3f310..a12673f2 100644
--- a/vlmeval/evaluate/__init__.py
+++ b/vlmeval/evaluate/__init__.py
@@ -5,4 +5,5 @@
 from .vqa_eval import VQAEval
 from .mathvista_eval import MathVista_eval
 from .llavabench import LLaVABench_eval
-from .misc import build_judge
\ No newline at end of file
+from .misc import build_judge
+from .OCRBench import OCRBench_eval
\ No newline at end of file
diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py
index f79225be..f4b7b69c 100644
--- a/vlmeval/utils/dataset_config.py
+++ b/vlmeval/utils/dataset_config.py
@@ -26,6 +26,7 @@
     "DocVQA_VAL": "https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv",
     'AI2D_TEST': "https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv",
     "LLaVABench": "https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv",
+    "OCRBench": 'OCRBench',
 }
 
 dataset_md5_dict = {
@@ -53,7 +54,8 @@
     'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
     "DocVQA_VAL": 'ee0d8ae5527439438d08e154ef65d735',
     "AI2D_TEST": "0f593e0d1c7df9a3d69bf1f947e71975",
-    "LLaVABench": "d382a093f749a697820d3dadd61c8428"
+    "LLaVABench": "d382a093f749a697820d3dadd61c8428",
+    "OCRBench": 'OCRBench',
 }
 
 img_root_map = {k: k for k in dataset_URLs}
@@ -73,6 +75,7 @@
     'ChartQA_VALTEST_HUMAN': 'ChartQA',
     'HallusionBench': 'Hallusion',
     'DocVQA_VAL': 'DocVQA',
+    "OCRBench": 'OCRBench',
 })
 
 assert set(dataset_URLs) == set(img_root_map) == set(dataset_md5_dict)
@@ -85,7 +88,7 @@ def DATASET_TYPE(dataset):
         return 'Y/N'
     elif 'coco' in dataset:
         return 'Caption'
-    elif listinstr(['ocrvqa', 'textvqa', 'chartqa', 'mathvista', 'docvqa', 'llavabench', 'mmvet'], dataset):
+    elif listinstr(['ocrvqa', 'textvqa', 'chartqa', 'mathvista', 'docvqa', 'llavabench', 'mmvet', 'OCRBench'], dataset):
         return 'VQA'
     else:
         return 'QA'
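With the hooks from this first commit in place, OCRBench runs like any other dataset: run.py dispatches it to OCRBench_eval, which writes its scores to a JSON file next to the predictions (the .xlsx suffix replaced by _score.json). A typical invocation, assuming the standard VLMEvalKit CLI (the model name is a placeholder):

    python run.py --data OCRBench --model qwen_chat

OCRBench contains 1000 samples, so 'Final Score' tops out at 1000 and 'Final Score Norm' (the sum divided by 10) reads roughly as a 0-100 score.
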
From 292f0232ebe9070a8d84dc7fb4fca6f7f5010b71 Mon Sep 17 00:00:00 2001
From: kennymckormick
Date: Sat, 24 Feb 2024 21:32:42 +0800
Subject: [PATCH 2/2] update dataset config

---
 vlmeval/utils/dataset_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vlmeval/utils/dataset_config.py b/vlmeval/utils/dataset_config.py
index f4b7b69c..c27208ce 100644
--- a/vlmeval/utils/dataset_config.py
+++ b/vlmeval/utils/dataset_config.py
@@ -26,7 +26,7 @@
     "DocVQA_VAL": "https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv",
     'AI2D_TEST': "https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv",
     "LLaVABench": "https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv",
-    "OCRBench": 'OCRBench',
+    "OCRBench": 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv',
 }
 
 dataset_md5_dict = {
@@ -55,7 +55,7 @@
     "DocVQA_VAL": 'ee0d8ae5527439438d08e154ef65d735',
     "AI2D_TEST": "0f593e0d1c7df9a3d69bf1f947e71975",
     "LLaVABench": "d382a093f749a697820d3dadd61c8428",
-    "OCRBench": 'OCRBench',
+    "OCRBench": 'e953d98a987cc6e26ef717b61260b778',
 }
 
 img_root_map = {k: k for k in dataset_URLs}
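
The follow-up commit replaces the placeholder URL and checksum with the hosted TSV and its real MD5, which lets the framework verify the download before use. The check itself is an ordinary streamed MD5 comparison; an illustrative sketch (VLMEvalKit's actual download helper may differ):

    import hashlib

    def md5_matches(path: str, expected: str) -> bool:
        # Hash in 1 MiB chunks so large TSV files are not read into memory at once.
        h = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                h.update(chunk)
        return h.hexdigest() == expected

    # e.g. md5_matches('OCRBench.tsv', 'e953d98a987cc6e26ef717b61260b778')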