diff --git a/.gitignore b/.gitignore
index 1f5ca8f..5153994 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,6 @@ baselines/
 dataset_funcs/
 mmocr-dev-1.x/work_dirs
 add_data/
-mmocr-0.x/
\ No newline at end of file
+mmocr-0.x/
+mae/output_dir
+*.pyc
\ No newline at end of file
diff --git a/README.md b/README.md
index eb9f388..f9d8f38 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,24 @@
-/# Union14M Dataset
+
Union14M is a large scene text recognition (STR) dataset collected from 17 publicly available datasets. It contains 4M labeled images (Union14M-L) and 10M unlabeled images (Union14M-U), and is intended to enable a deeper analysis of STR for the community
+ +
- arXiv •
Introduction •
Download •
MAERec •
@@ -26,21 +37,26 @@
- To explore the challenges that STR models still face, we consolidate a large-scale STR dataset for analysis and identify seven open challenges. Furthermore, we propose a challenge-driven benchmark to facilitate the future development of STR. Additionally, we reveal that utilizing massive unlabeled data through self-supervised pre-training can remarkably enhance the performance of STR models in real-world scenarios, suggesting a practical solution for STR from a data perspective. We hope this work can spark future research beyond the realm of existing data paradigms.
## 2. Contents
-- [1. Introduction](#1-introduction)
-- [2. Contents](#2-contents)
-- [3. Union14M Dataset](#3-union14m-dataset)
- - [3.1. Union14M-L](#31-union14m-l)
- - [3.2. Union14M-U](#32-union14m-u)
- - [3.3. Union14M-Benchmark](#33-union14m-benchmark)
- - [3.4. Download](#34-download)
-- [4. STR Models trained on Union14M-L](#4-str-models-trained-on-union14m-l)
- - [4.1. Checkpoints](#41-checkpoints)
-- [5. MAERec](#5-maerec)
- - [5.1. Pre-training](#51-pre-training)
- - [5.2. Fine-tuning](#52-fine-tuning)
- - [5.3 Inferencing](#53-inferencing)
-- [6. QAs](#6-qas)
-- [7. License](#7-license)
+- [Rethinking Scene Text Recognition: A Data Perspective](#rethinking-scene-text-recognition-a-data-perspective)
+ - [1. Introduction](#1-introduction)
+ - [2. Contents](#2-contents)
+ - [3. Union14M Dataset](#3-union14m-dataset)
+ - [3.1. Union14M-L](#31-union14m-l)
+ - [3.2. Union14M-U](#32-union14m-u)
+ - [3.3. Union14M-Benchmark](#33-union14m-benchmark)
+ - [3.4. Download](#34-download)
+ - [4. STR Models trained on Union14M-L](#4-str-models-trained-on-union14m-l)
+ - [4.1. Checkpoints](#41-checkpoints)
+ - [5. MAERec](#5-maerec)
+ - [5.1. Pre-training](#51-pre-training)
+ - [5.2. Fine-tuning](#52-fine-tuning)
+ - [5.3. Evaluation](#53-evaluation)
+ - [5.4. Inferencing](#54-inferencing)
+ - [5.5. ONNX Conversion](#55-onnx-conversion)
+ - [6. QAs](#6-qas)
+ - [7. License](#7-license)
+ - [8. Acknowledgement](#8-acknowledgement)
+ - [9. Citation](#9-citation)
## 3. Union14M Dataset
### 3.1. Union14M-L
@@ -73,6 +89,7 @@
| Union14M-U (36.63GB) | [Google Drive (8 GB)]() | [Baidu Netdisk]() |
| 6 Common Benchmarks (17.6MB) | [Google Drive (8 GB)]() | [Baidu Netdisk](https://pan.baidu.com/s/1XifQS0v-0YxEXkGTfWMDWQ?pwd=35cz) |
+
- The Structure of Union14M will be organized as follows:
@@ -109,7 +126,7 @@
Structure of Union14M-U
- We store images in LMDB format, and the structure of Union14M-U will be organized as belows. Here is an example of using [LMDB Example]()
+ We store images in [LMDB](https://github.com/Mountchicken/Efficient-Deep-Learning/blob/main/Efficient_DataProcessing.md#21-efficient-data-storage-methods) format, and the structure of Union14M-U is organized as below. An example of reading the LMDB files can be found in [LMDB Example]()
```text
|--Union14M-U
|--book32_lmdb
@@ -122,7 +139,7 @@
- We train several STR models on Union14M-L using [MMOCR-1.0](https://github.com/open-mmlab/mmocr/tree/dev-1.x)
### 4.1. Checkpoints
-- Evaluated on both common benchmarks and Union14M-Benchmark. Accuracy (WAICS) in $\color{grey}{grey}$ are original implementation (Trained on synthtic datasest), and accuracay in $\color{green}{green}$ are trained on Union14M-L. Our models are trained to predict **upper & lower text, symbols and space.**
+- Evaluated on both common benchmarks and Union14M-Benchmark. Accuracy (WAICS) in $\color{grey}{grey}$ is from the original implementations (trained on synthetic datasets), and accuracy in $\color{green}{green}$ is from models re-trained on Union14M-L. All the re-trained models are trained to predict **upper- & lower-case text, symbols, and space.**
| Models | Checkpoint | IIIT5K | SVT | IC13-1015 | IC15-2077 | SVTP | CUTE80 | Avg. |
| :---------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------: | :--------------------------------------------: | :--------------------------------------------: | :--------------------------------------------: | :--------------------------------------------: | :--------------------------------------------: | :--------------------------------------------: |
@@ -155,29 +172,42 @@
### 5.1. Pre-training
-- Pre-trained ViT
+- ViT pretrained on Union14M-U.
- | Variants | Input Size | Patch Size | Embedding | Depth | Heads | Parameters | Download |
- | --------- | ---------- | ---------- | --------- | ----- | ----- | ---------- | --------------------------------------------------------------------------------------- |
- | ViT-Small | 32x128 | 4x4 | 384 | 12 | 6 | | [Google Drive]() / [BaiduYun](https://pan.baidu.com/s/1nZL5veMyWhxpk8DGj0UZMw?pwd=xecv) |
- | ViT-Base | 32x128 | 4x4 | 768 | 12 | 12 | | [Google Drive]() / [BaiduYun](https://pan.baidu.com/s/17CjAOV-1kf1__a2RBo9NUg?pwd=3rvx) |
+ | Variants | Input Size | Patch Size | Embedding | Depth | Heads | Parameters | Download |
+ | -------- | ---------- | ---------- | --------- | ----- | ----- | ---------- | --------------------------------------------------------------------------------------- |
+ | ViT-S | 32x128 | 4x4 | 384 | 12 | 6 | 21M | [Google Drive]() / [BaiduYun](https://pan.baidu.com/s/1nZL5veMyWhxpk8DGj0UZMw?pwd=xecv) |
+ | ViT-B | 32x128 | 4x4 | 768 | 12 | 12 | 85M | [Google Drive]() / [BaiduYun](https://pan.baidu.com/s/17CjAOV-1kf1__a2RBo9NUg?pwd=3rvx) |
- If you want to pre-train the ViT backbone on your own dataset, check [pre-training](docs/pretrain.md)
+
+
### 5.2. Fine-tuning
-- Fine-tuned MAERec
+- MAERec finetuned on Union14M-L
- | Variants | Acc on Common Benchmarks | Acc on Union14M-Benchmarks | Download |
- | ------------ | ------------------------ | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
- | MAERec-Small | 95.1 | 78.6 | [Google Drive](https://drive.google.com/file/d/1dKLS_r3_ysWK155pSmkm7NBf5ALsEJYd/view?usp=sharing) / [BaiduYun](https://pan.baidu.com/s/1wFhLQLrn9dm77TMpdxyNAg?pwd=trg4) |
- | MAERec-Base | 96.2 | 85.2 | [Google Drive](https://drive.google.com/file/d/13E0cmvksKwvjNuR62xZhwkg8eQJfb_Hp/view?usp=sharing) / [BaiduYun](https://pan.baidu.com/s/1EhoJ-2WqkzOQFCNg55-KcA?pwd=5yx1) |
+ | Variants | Acc on Common Benchmarks | Acc on Union14M-Benchmarks | Download |
+ | -------- | ------------------------ | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+ | MAERec-S | 95.1 | 78.6 | [Google Drive](https://drive.google.com/file/d/1dKLS_r3_ysWK155pSmkm7NBf5ALsEJYd/view?usp=sharing) / [BaiduYun](https://pan.baidu.com/s/1wFhLQLrn9dm77TMpdxyNAg?pwd=trg4) |
+ | MAERec-B | 96.2 | 85.2 | [Google Drive](https://drive.google.com/file/d/13E0cmvksKwvjNuR62xZhwkg8eQJfb_Hp/view?usp=sharing) / [BaiduYun](https://pan.baidu.com/s/1EhoJ-2WqkzOQFCNg55-KcA?pwd=5yx1) |
- If you want to fine-tune MAERec on your own dataset, check [fine-tuning](docs/finetune.md)
-### 5.3 Inferencing
+### 5.3. Evaluation
+- If you want to evaluate MAERec on benchmarks, check [evaluation](docs/evaluation.md)
+
+### 5.4. Inferencing
- If you want to run inference with MAERec on your own images, check [inferencing](docs/inferencing.md)
+
+### 5.5. ONNX Conversion
+
## 6. QAs
## 7. License
+- The repository is released under the [MIT license](LICENSE).
+
+## 8. Acknowledgement
+- We sincerely thank all the creators of the 17 datasets used in Union14M, as well as the developers of MMOCR, a powerful toolbox for OCR research.
+## 9. Citation
diff --git a/docs/pretrain.md b/docs/pretrain.md
index 12f460e..28baf67 100644
--- a/docs/pretrain.md
+++ b/docs/pretrain.md
@@ -1,29 +1,54 @@
## Pre-training Using MAE
We adopt the framework of [MAE](http://openaccess.thecvf.com/content/CVPR2022/html/He_Masked_Autoencoders_Are_Scalable_Vision_Learners_CVPR_2022_paper.html) for pre-training. The code is heavily borrowed from [Masked Autoencoders: A PyTorch Implementation](https://github.com/facebookresearch/mae).
-### 1. Install
+### 1. Installation
```bash
-conda create -n mae python=3.7
+cd mae/
+conda create -n mae python=3.8
conda activate mae
pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html
pip install -r requirements.txt
```
-- **Attention**: This repo is based on `timm==0.3.2`, for which a [fix](https://github.com/huggingface/pytorch-image-models/issues/420#issuecomment-776459842) is needed to work with PyTorch 1.8.1+.
+- **Attention**: The pre-training code is based on `timm==0.3.2`, for which a [fix](https://github.com/huggingface/pytorch-image-models/issues/420#issuecomment-776459842) is needed to work with PyTorch 1.8.1+. Add the below code to `timm/models/layers/helpers.py`:
+ ```python
+ import torch
-### 2. Prepare dataset
-- You need to prepare the dataset(s) in torchvision.datasets.ImageFolder format. The basic structure of the dataset is as follows:
- ```text
- |--dataset
- |--subfolder1
- |--image1.jpg
- |--image2.jpg
- |--...
- |--subfolder2
- |--image1.jpg
- |--image2.jpg
- |--...
+ TORCH_MAJOR = int(torch.__version__.split('.')[0])
+ TORCH_MINOR = int(torch.__version__.split('.')[1])
+
+ if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
+     from torch._six import container_abcs
+ else:
+     import collections.abc as container_abcs
```
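+- Optionally, sanity-check the environment after applying the fix (a minimal check; the expected versions are the ones installed above):
+  ```python
+  import timm
+  import torch
+  import torchvision
+
+  print(torch.__version__)          # expect 1.8.1+cu111
+  print(torchvision.__version__)    # expect 0.9.1+cu111
+  print(timm.__version__)           # expect 0.3.2
+  print(torch.cuda.is_available())  # expect True on a GPU machine
+  ```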
-- You can aslo use Union14M-U for pre-training, which is organized in ImageFolder format.
+
+### 2. Prepare dataset
+- We support two types of datasets: ImageFolder and LMDB.
+ - torchvision.datasets.ImageFolder format:
+ ```text
+ |--dataset
+ |--book32
+ |--image1.jpg
+ |--image2.jpg
+ |--...
+ |--openvino
+ |--image1.jpg
+ |--image2.jpg
+ |--...
+ ```
+ - LMDB format. To learn more about the LMDB structure and how to create LMDB files, see this [repo](https://github.com/Mountchicken/Efficient-Deep-Learning/blob/main/Efficient_DataProcessing.md#21-efficient-data-storage-methods); a minimal creation sketch is also shown after the directory tree below.
+ ```text
+ |--dataset
+ |--book32
+ |--data.mdb
+ |--lock.mdb
+ |--openvino
+ |--data.mdb
+ |--lock.mdb
+ |--cc
+ |--data.mdb
+ |--lock.mdb
+ ```
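+    A minimal sketch of how such an LMDB folder could be created from raw images (illustrative only: `create_lmdb` and the paths are placeholder names introduced here, but the `image-%09d` / `num-samples` key layout matches the reader in `mae/datasets/lmdb_dataset.py`):
+    ```python
+    import lmdb
+    from pathlib import Path
+
+    def create_lmdb(image_dir: str, lmdb_path: str):
+        """Pack raw images into an LMDB readable by mae/datasets/lmdb_dataset.py."""
+        env = lmdb.open(lmdb_path, map_size=1 << 40)  # reserve up to 1 TB of map space
+        image_paths = sorted(Path(image_dir).glob('*.jpg'))
+        with env.begin(write=True) as txn:
+            for i, path in enumerate(image_paths, start=1):
+                # keys are 1-indexed and zero-padded to 9 digits
+                txn.put(('image-%09d' % i).encode(), path.read_bytes())
+            txn.put('num-samples'.encode(), str(len(image_paths)).encode())
+        env.close()
+
+    create_lmdb('raw_images/book32', 'dataset/book32')
+    ```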
### 3. Pre-training
- Pre-training ViT-Small on Union14M-U with 4 gpus:
@@ -38,8 +63,9 @@ pip install -r requirements.txt
--norm_pix_loss \
--blr 1.5e-4 \
--weight_decay 0.05 \
- --data_path Union14M-U/book32 Union14M-U/openvino /Union14M-U/CC
+ --data_path ../data/Union14M-U/book32_lmdb ../data/Union14M-U/cc_lmdb ../data/Union14M-U/openvino_lmdb
```
+- To pretrain ViT-Base, use `--model mae_vit_base_patch4`.
- Here the effective batch size is 256 (batch_size per gpu) * 1 (node) * 4 (gpus per node) = 1024. If memory or the number of GPUs is limited, use --accum_iter to maintain the effective batch size, which is batch_size (per gpu) * nodes * gpus (per node) * accum_iter; see the short check below.
- Here we use --norm_pix_loss as the target for better representation learning. To train a baseline model (e.g., for visualization), use pixel-based reconstruction and turn off --norm_pix_loss.
- To train ViT-Base set --model mae_vit_base_patch4
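+- A short check of the gradient-accumulation arithmetic (the variable names below are only illustrative, not command-line flags):
+  ```python
+  # keep the effective batch size at 256 * 1 * 4 = 1024, as in the command above
+  batch_size_per_gpu = 256
+  nodes = 1
+  gpus_per_node = 1  # e.g. only a single GPU is available
+  accum_iter = 4     # then accumulate gradients over 4 iterations
+  assert batch_size_per_gpu * nodes * gpus_per_node * accum_iter == 1024
+  ```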
diff --git a/mae/datasets/lmdb_dataset.py b/mae/datasets/lmdb_dataset.py
new file mode 100644
index 0000000..38cbe97
--- /dev/null
+++ b/mae/datasets/lmdb_dataset.py
@@ -0,0 +1,56 @@
+import sys
+
+import lmdb
+import six
+from PIL import Image
+from torch.utils.data import Dataset
+
+
+class lmdbDataset(Dataset):
+    """LMDB dataset for raw images.
+
+    Args:
+        root (str): Root path for the lmdb folder.
+        transform (callable, optional): A function/transform that takes in a
+            PIL image and returns a transformed version.
+    """
+
+    def __init__(self, root: str = None, transform=None):
+        self.env = lmdb.open(
+            root,
+            max_readers=1,
+            readonly=True,
+            lock=False,
+            readahead=False,
+            meminit=False)
+
+        if not self.env:
+            print('cannot create lmdb from %s' % (root))
+            sys.exit(0)
+
+        with self.env.begin(write=False) as txn:
+            self.nSamples = int(txn.get('num-samples'.encode()))
+        self.transform = transform
+
+    def __len__(self):
+        return self.nSamples
+
+    def __getitem__(self, index):
+        assert index < len(self), 'index range error'
+        # keys in the LMDB are 1-indexed
+        index += 1
+        with self.env.begin(write=False) as txn:
+            img_key = 'image-%09d' % index
+            imgbuf = txn.get(img_key.encode())
+
+        buf = six.BytesIO()
+        buf.write(imgbuf)
+        buf.seek(0)
+        try:
+            img = Image.open(buf).convert('RGB')
+        except IOError:
+            # fall back to a following sample if this image is corrupted
+            print('Corrupted image for %d' % index)
+            return self[index + 1]
+
+        if self.transform is not None:
+            img = self.transform(img)
+
+        # pre-training is unsupervised, so a dummy label is returned
+        return img, 'test'
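+
+
+if __name__ == '__main__':
+    # Minimal usage sketch (not part of the training pipeline): load one sample
+    # from an LMDB folder prepared as described in docs/pretrain.md. The path
+    # and transform below are only examples; adjust them to your data.
+    from torchvision import transforms
+
+    demo_transform = transforms.Compose([
+        transforms.Resize((32, 128)),
+        transforms.ToTensor(),
+    ])
+    dataset = lmdbDataset('../data/Union14M-U/book32_lmdb', demo_transform)
+    print('%d samples' % len(dataset))
+    img, _ = dataset[0]
+    print(img.shape)  # expected: torch.Size([3, 32, 128])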
diff --git a/mae/main_pretrain.py b/mae/main_pretrain.py
index 72d0a7c..3a56f12 100644
--- a/mae/main_pretrain.py
+++ b/mae/main_pretrain.py
@@ -21,6 +21,7 @@
import util.misc as misc
from engine_pretrain import train_one_epoch
from util.misc import NativeScalerWithGradNormCount as NativeScaler
+from datasets.lmdb_dataset import lmdbDataset
assert timm.__version__ == "0.3.2" # version check
@@ -172,15 +173,27 @@ def main(args):
transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])
+    # check whether the data path(s) contain LMDB files or image folders
     if isinstance(args.data_path, list):
-        dataset_train = datasets.ImageFolder(args.data_path[0],
-                                             transform_train)
+        base_path = args.data_path[0]
+    else:
+        base_path = args.data_path
+    dataset_type = None
+    for f in os.listdir(base_path):
+        if '.mdb' in f:
+            dataset_type = lmdbDataset
+            break
+        if os.path.isdir(os.path.join(base_path, f)):
+            dataset_type = datasets.ImageFolder
+            break
+    assert dataset_type is not None, \
+        f'cannot infer the dataset type from {base_path}'
+
+    if isinstance(args.data_path, list):
+        dataset_train = dataset_type(args.data_path[0], transform_train)
         for p in args.data_path[1:]:
             dataset_train = torch.utils.data.ConcatDataset(
                 [dataset_train,
-                 datasets.ImageFolder(p, transform_train)])
+                 dataset_type(p, transform_train)])
     else:
-        dataset_train = datasets.ImageFolder(
+        dataset_train = dataset_type(
             os.path.join(args.data_path), transform=transform_train)
     print(dataset_train)
@@ -273,8 +286,10 @@ def main(args):
epoch=epoch)
log_stats = {
- **{f'train_{k}': v
- for k, v in train_stats.items()},
+ **{
+ f'train_{k}': v
+ for k, v in train_stats.items()
+ },
'epoch': epoch,
}
diff --git a/mae/requirements.txt b/mae/requirements.txt
index 460e1d5..a19b8ba 100644
--- a/mae/requirements.txt
+++ b/mae/requirements.txt
@@ -1,2 +1,4 @@
timm==0.3.2
tensorboard==2.11.0
+lmdb==1.4.1
+numpy<=1.23.0
diff --git a/mae/util/__pycache__/lr_sched.cpython-38.pyc b/mae/util/__pycache__/lr_sched.cpython-38.pyc
index 13dcaf8..fd36c71 100644
Binary files a/mae/util/__pycache__/lr_sched.cpython-38.pyc and b/mae/util/__pycache__/lr_sched.cpython-38.pyc differ
diff --git a/mae/util/__pycache__/misc.cpython-38.pyc b/mae/util/__pycache__/misc.cpython-38.pyc
index ad6bf32..440d2a1 100644
Binary files a/mae/util/__pycache__/misc.cpython-38.pyc and b/mae/util/__pycache__/misc.cpython-38.pyc differ
diff --git a/mae/util/__pycache__/pos_embed.cpython-38.pyc b/mae/util/__pycache__/pos_embed.cpython-38.pyc
index 3d1ea60..2afb810 100644
Binary files a/mae/util/__pycache__/pos_embed.cpython-38.pyc and b/mae/util/__pycache__/pos_embed.cpython-38.pyc differ