Commit

update pr
smilelite committed Jun 12, 2022
1 parent a2ef524 commit 47cb274
Showing 10 changed files with 62 additions and 329 deletions.
25 changes: 13 additions & 12 deletions configs/rec/rec_r31_robustscanner.yml
@@ -15,7 +15,7 @@ Global:
infer_img: ./inference/rec_inference
# for data or label process
character_dict_path: ppocr/utils/dict90.txt
max_text_length: 40
max_text_length: &max_text_length 40
infer_mode: False
use_space_char: False
rm_symbol: True
@@ -38,7 +38,7 @@ Architecture:
algorithm: RobustScanner
Transform:
Backbone:
name: ResNet31V2
name: ResNet31
Head:
name: RobustScannerHead
enc_outchannles: 128
@@ -49,7 +49,7 @@ Architecture:
mask: True
padding_idx: 92
encode_value: False
max_seq_len: 40
max_text_length: *max_text_length

Loss:
name: SARLoss
@@ -64,8 +64,9 @@ Metric:

Train:
dataset:
name: LMDBDataSet
data_dir: I:/dataset/OCR/deep_text_recognition/data_lmdb/evaluation/CUTE80
name: SimpleDataSet
label_file_list: ['./train_data/train_list.txt']
data_dir: ./train_data/
transforms:
- DecodeImage: # load image
img_mode: BGR
@@ -74,35 +75,35 @@ Train:
- RobustScannerRecResizeImg:
image_shape: [3, 48, 48, 160] # h:48 w:[48,160]
width_downsample_ratio: 0.25
max_seq_len: 40
max_text_length: *max_text_length
- KeepKeys:
keep_keys: ['image', 'label', 'valid_ratio', 'word_positons'] # dataloader will return list in this order
loader:
shuffle: True
batch_size_per_card: 4
batch_size_per_card: 64
drop_last: True
num_workers: 0
num_workers: 8
use_shared_memory: False

Eval:
dataset:
name: LMDBDataSet
data_dir: I:/dataset/OCR/deep_text_recognition/data_lmdb/evaluation/CUTE80
data_dir: ./train_data/data_lmdb_release/evaluation/
transforms:
- DecodeImage: # load image
img_mode: BGR
channel_first: False
- SARLabelEncode: # Class handling label
- RobustScannerRecResizeImg:
image_shape: [3, 48, 48, 160]
max_seq_len: 40
max_seq_len: *max_text_length
width_downsample_ratio: 0.25
- KeepKeys:
keep_keys: ['image', 'label', 'valid_ratio', 'word_positons'] # dataloader will return list in this order
loader:
shuffle: False
drop_last: False
batch_size_per_card: 1
num_workers: 0
batch_size_per_card: 64
num_workers: 4
use_shared_memory: False
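The key change in this config is the YAML anchor: `max_text_length: &max_text_length 40` defines the value once, and the `*max_text_length` aliases reuse it in the head and the resize transforms, so the sequence length can no longer drift out of sync between sections. Below is a minimal sketch of how such anchors resolve; it is illustrative only (assumes PyYAML) and is not part of the commit.

```python
# Minimal sketch: YAML anchors (&name) and aliases (*name) resolve to the
# same value, which is how the config shares Global.max_text_length.
import yaml  # assumes PyYAML is installed

snippet = """
Global:
  max_text_length: &max_text_length 40
Architecture:
  Head:
    name: RobustScannerHead
    max_text_length: *max_text_length
"""

cfg = yaml.safe_load(snippet)
# Editing the anchored value once updates every alias that references it.
assert cfg["Global"]["max_text_length"] == 40
assert cfg["Architecture"]["Head"]["max_text_length"] == 40
```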

2 changes: 1 addition & 1 deletion doc/doc_ch/algorithm_overview.md
@@ -85,7 +85,7 @@
|SAR|Resnet31| 87.20% | rec_r31_sar | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_r31_sar_train.tar) |
|SEED|Aster_Resnet| 85.35% | rec_resnet_stn_bilstm_att | [trained model](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar) |
|SVTR|SVTR-Tiny| 89.25% | rec_svtr_tiny_none_ctc_en | [trained model](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/rec_svtr_tiny_none_ctc_en_train.tar) |
|RobustScanner|ResNet31V2| 87.77% | rec_r31_robustscanner | [trained model]() |
|RobustScanner|ResNet31V2| 87.77% | rec_r31_robustscanner | coming soon |


<a name="2"></a>
17 changes: 8 additions & 9 deletions doc/doc_ch/algorithm_rec_robustscanner.md
@@ -26,7 +26,7 @@ Zhang

|Model|Backbone|Config|Acc|Download link|
| --- | --- | --- | --- | --- |
|RobustScanner|ResNet31V2|[rec_r31_robustscanner.yml](../../configs/rec/rec_r31_robustscanner.yml)|87.77%|[trained model]()|
|RobustScanner|ResNet31|[rec_r31_robustscanner.yml](../../configs/rec/rec_r31_robustscanner.yml)|87.77%|coming soon|

Note: In addition to the two text recognition datasets MJSynth and SynthText, [SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg) data (extraction code: 627x) and some real data are also used for training; see the paper for details of the data.

@@ -71,7 +71,7 @@ python3 tools/infer_rec.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pr

<a name="4-1"></a>
### 4.1 Python Inference
First, convert the model saved during RobustScanner text recognition training into an inference model ([model download link]()); you can use the following command for the conversion:
First, convert the model saved during RobustScanner text recognition training into an inference model; you can use the following command for the conversion:

```
python3 tools/export_model.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/rec_r31_robustscanner
@@ -85,7 +85,7 @@ python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png"
<a name="4-2"></a>
### 4.2 C++ Inference

Not supported yet, because the C++ pre- and post-processing do not yet support SAR.
Not supported yet, because the C++ pre- and post-processing do not yet support RobustScanner.

<a name="4-3"></a>
### 4.3 Serving Deployment
@@ -104,11 +104,10 @@ python3 tools/infer/predict_rec.py --image_dir="./doc/imgs_words/en/word_1.png"
## Citation

```bibtex
@article{Li2019ShowAA,
title={Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition},
author={Hui Li and Peng Wang and Chunhua Shen and Guyu Zhang},
journal={ArXiv},
year={2019},
volume={abs/1811.00751}
@article{2020RobustScanner,
title={RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition},
author={Xiaoyu Yue and Zhanghui Kuang and Chenhao Lin and Hongbin Sun and Wayne Zhang},
journal={ECCV2020},
year={2020},
}
```
17 changes: 8 additions & 9 deletions doc/doc_en/algorithm_rec_robustscanner_en.md
@@ -1,4 +1,4 @@
# SAR
# RobustScanner

- [1. Introduction](#1)
- [2. Environment](#2)
@@ -26,7 +26,7 @@ Using MJSynth and SynthText two text recognition datasets for training, and eval

|Model|Backbone|config|Acc|Download link|
| --- | --- | --- | --- | --- |
|RobustScanner|ResNet31V2|[rec_r31_robustscanner.yml](../../configs/rec/rec_r31_robustscanner.yml)|87.77%|[train model]()|
|RobustScanner|ResNet31V2|[rec_r31_robustscanner.yml](../../configs/rec/rec_r31_robustscanner.yml)|87.77%|coming soon|

Note: In addition to the two text recognition datasets MJSynth and SynthText, [SynthAdd](https://pan.baidu.com/share/init?surl=uV0LtoNmcxbO-0YA7Ch4dg) data (extraction code: 627x) and some real data are used in training; the specific data details can be found in the paper.

@@ -71,7 +71,7 @@ python3 tools/infer_rec.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pr

<a name="4-1"></a>
### 4.1 Python Inference
First, the model saved during the RobustScanner text recognition training process is converted into an inference model. ( [Model download link]() ), you can use the following command to convert:
First, the model saved during the RobustScanner text recognition training process is converted into an inference model. You can use the following command to convert it:

```
python3 tools/export_model.py -c configs/rec/rec_r31_robustscanner.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=./inference/rec_r31_robustscanner
@@ -105,11 +105,10 @@ Not supported
## Citation

```bibtex
@article{Li2019ShowAA,
title={Show, Attend and Read: A Simple and Strong Baseline for Irregular Text Recognition},
author={Hui Li and Peng Wang and Chunhua Shen and Guyu Zhang},
journal={ArXiv},
year={2019},
volume={abs/1811.00751}
@article{2020RobustScanner,
title={RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition},
author={Xiaoyu Yue and Zhanghui Kuang and Chenhao Lin and Hongbin Sun and Wayne Zhang},
journal={ECCV2020},
year={2020},
}
```
9 changes: 3 additions & 6 deletions ppocr/data/imaug/rec_img_aug.py
@@ -268,16 +268,16 @@ def __call__(self, data):
return data

class RobustScannerRecResizeImg(object):
def __init__(self, image_shape, max_seq_len, width_downsample_ratio=0.25, **kwargs):
def __init__(self, image_shape, max_text_length, width_downsample_ratio=0.25, **kwargs):
self.image_shape = image_shape
self.width_downsample_ratio = width_downsample_ratio
self.max_seq_len = max_seq_len
self.max_text_length = max_text_length

def __call__(self, data):
img = data['image']
norm_img, resize_shape, pad_shape, valid_ratio = resize_norm_img_sar(
img, self.image_shape, self.width_downsample_ratio)
word_positons = robustscanner_other_inputs(self.max_seq_len)
word_positons = np.array(range(0, self.max_text_length)).astype('int64')
data['image'] = norm_img
data['resized_shape'] = resize_shape
data['pad_shape'] = pad_shape
@@ -429,9 +429,6 @@ def srn_other_inputs(image_shape, num_heads, max_text_length):
gsrm_slf_attn_bias2
]

def robustscanner_other_inputs(max_text_length):
word_pos = np.array(range(0, max_text_length)).astype('int64')
return word_pos

def flag():
"""
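The hunks above fold the former `robustscanner_other_inputs` helper into `RobustScannerRecResizeImg` itself and rename `max_seq_len` to `max_text_length`. A small illustration of what the inlined expression produces (shown only for clarity; not part of the commit):

```python
# Sketch: the inlined word-position input is simply the index sequence
# 0..max_text_length-1, cast to int64, which the RobustScanner head
# consumes downstream as positional input.
import numpy as np

max_text_length = 40  # value anchored in the config above
word_positons = np.array(range(0, max_text_length)).astype('int64')

print(word_positons.shape)  # (40,)
print(word_positons[:5])    # [0 1 2 3 4]
```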
1 change: 0 additions & 1 deletion ppocr/modeling/backbones/__init__.py
@@ -28,7 +28,6 @@ def build_backbone(config, model_type):
from .rec_mv1_enhance import MobileNetV1Enhance
from .rec_nrtr_mtb import MTB
from .rec_resnet_31 import ResNet31
from .rec_resnet_31_v2 import ResNet31V2
from .rec_resnet_aster import ResNet_ASTER
from .rec_micronet import MicroNet
from .rec_efficientb3_pren import EfficientNetb3_PREN
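Dropping the `ResNet31V2` import works together with the config change to `name: ResNet31`: the backbone is selected by the `name` field of the config. A rough sketch of that name-based dispatch follows; the exact PaddleOCR implementation may differ, and the registry dict and stub class here are hypothetical stand-ins used only for illustration.

```python
# Hypothetical sketch of name-based backbone construction: the config's
# "name" selects a class, and the remaining keys become constructor arguments.
def build_backbone_sketch(config, registry):
    config = dict(config)             # avoid mutating the caller's dict
    module_name = config.pop("name")  # e.g. "ResNet31"
    assert module_name in registry, f"unsupported backbone: {module_name}"
    return registry[module_name](**config)

class ResNet31Stub:
    def __init__(self, in_channels=3):
        self.in_channels = in_channels

backbone = build_backbone_sketch({"name": "ResNet31", "in_channels": 3},
                                 {"ResNet31": ResNet31Stub})
print(type(backbone).__name__)  # ResNet31Stub
```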
40 changes: 23 additions & 17 deletions ppocr/modeling/backbones/rec_resnet_31.py
@@ -27,16 +27,20 @@
import paddle.nn.functional as F
import numpy as np

__all__ = ["ResNet31"]
__all__ = ["ResNet31V2"]


conv_weight_attr = nn.initializer.KaimingNormal()
bn_weight_attr = ParamAttr(initializer=nn.initializer.Uniform(), learning_rate=1)

def conv3x3(in_channel, out_channel, stride=1):
return nn.Conv2D(
in_channel,
out_channel,
kernel_size=3,
stride=stride,
padding=1,
weight_attr=conv_weight_attr,
bias_attr=False)


@@ -46,10 +50,10 @@ class BasicBlock(nn.Layer):
def __init__(self, in_channels, channels, stride=1, downsample=False):
super().__init__()
self.conv1 = conv3x3(in_channels, channels, stride)
self.bn1 = nn.BatchNorm2D(channels)
self.bn1 = nn.BatchNorm2D(channels, weight_attr=bn_weight_attr)
self.relu = nn.ReLU()
self.conv2 = conv3x3(channels, channels)
self.bn2 = nn.BatchNorm2D(channels)
self.bn2 = nn.BatchNorm2D(channels, weight_attr=bn_weight_attr)
self.downsample = downsample
if downsample:
self.downsample = nn.Sequential(
@@ -58,8 +62,9 @@ def __init__(self, in_channels, channels, stride=1, downsample=False):
channels * self.expansion,
1,
stride,
weight_attr=conv_weight_attr,
bias_attr=False),
nn.BatchNorm2D(channels * self.expansion), )
nn.BatchNorm2D(channels * self.expansion, weight_attr=bn_weight_attr))
else:
self.downsample = nn.Sequential()
self.stride = stride
@@ -108,40 +113,40 @@ def __init__(self,

# conv 1 (Conv Conv)
self.conv1_1 = nn.Conv2D(
in_channels, channels[0], kernel_size=3, stride=1, padding=1)
self.bn1_1 = nn.BatchNorm2D(channels[0])
in_channels, channels[0], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr)
self.bn1_1 = nn.BatchNorm2D(channels[0], weight_attr=bn_weight_attr)
self.relu1_1 = nn.ReLU()

self.conv1_2 = nn.Conv2D(
channels[0], channels[1], kernel_size=3, stride=1, padding=1)
self.bn1_2 = nn.BatchNorm2D(channels[1])
channels[0], channels[1], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr)
self.bn1_2 = nn.BatchNorm2D(channels[1], weight_attr=bn_weight_attr)
self.relu1_2 = nn.ReLU()

# conv 2 (Max-pooling, Residual block, Conv)
self.pool2 = nn.MaxPool2D(
kernel_size=2, stride=2, padding=0, ceil_mode=True)
self.block2 = self._make_layer(channels[1], channels[2], layers[0])
self.conv2 = nn.Conv2D(
channels[2], channels[2], kernel_size=3, stride=1, padding=1)
self.bn2 = nn.BatchNorm2D(channels[2])
channels[2], channels[2], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr)
self.bn2 = nn.BatchNorm2D(channels[2], weight_attr=bn_weight_attr)
self.relu2 = nn.ReLU()

# conv 3 (Max-pooling, Residual block, Conv)
self.pool3 = nn.MaxPool2D(
kernel_size=2, stride=2, padding=0, ceil_mode=True)
self.block3 = self._make_layer(channels[2], channels[3], layers[1])
self.conv3 = nn.Conv2D(
channels[3], channels[3], kernel_size=3, stride=1, padding=1)
self.bn3 = nn.BatchNorm2D(channels[3])
channels[3], channels[3], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr)
self.bn3 = nn.BatchNorm2D(channels[3], weight_attr=bn_weight_attr)
self.relu3 = nn.ReLU()

# conv 4 (Max-pooling, Residual block, Conv)
self.pool4 = nn.MaxPool2D(
kernel_size=(2, 1), stride=(2, 1), padding=0, ceil_mode=True)
self.block4 = self._make_layer(channels[3], channels[4], layers[2])
self.conv4 = nn.Conv2D(
channels[4], channels[4], kernel_size=3, stride=1, padding=1)
self.bn4 = nn.BatchNorm2D(channels[4])
channels[4], channels[4], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr)
self.bn4 = nn.BatchNorm2D(channels[4], weight_attr=bn_weight_attr)
self.relu4 = nn.ReLU()

# conv 5 ((Max-pooling), Residual block, Conv)
@@ -151,8 +156,8 @@ def __init__(self,
kernel_size=2, stride=2, padding=0, ceil_mode=True)
self.block5 = self._make_layer(channels[4], channels[5], layers[3])
self.conv5 = nn.Conv2D(
channels[5], channels[5], kernel_size=3, stride=1, padding=1)
self.bn5 = nn.BatchNorm2D(channels[5])
channels[5], channels[5], kernel_size=3, stride=1, padding=1, weight_attr=conv_weight_attr)
self.bn5 = nn.BatchNorm2D(channels[5], weight_attr=bn_weight_attr)
self.relu5 = nn.ReLU()

self.out_channels = channels[-1]
@@ -168,8 +173,9 @@ def _make_layer(self, input_channels, output_channels, blocks):
output_channels,
kernel_size=1,
stride=1,
weight_attr=conv_weight_attr,
bias_attr=False),
nn.BatchNorm2D(output_channels), )
nn.BatchNorm2D(output_channels, weight_attr=bn_weight_attr))

layers.append(
BasicBlock(
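The remaining hunks thread two module-level attributes, `conv_weight_attr` (Kaiming-normal) and `bn_weight_attr` (uniform-initialized, learning rate 1), through every `Conv2D` and `BatchNorm2D` in the backbone. A minimal, self-contained sketch of that initializer pattern (illustrative only, not taken from the diff):

```python
# Sketch: passing initializers via weight_attr, as the diff does for ResNet31.
import paddle
from paddle import ParamAttr, nn

conv_weight_attr = nn.initializer.KaimingNormal()
bn_weight_attr = ParamAttr(initializer=nn.initializer.Uniform(), learning_rate=1)

conv = nn.Conv2D(3, 64, kernel_size=3, stride=1, padding=1,
                 weight_attr=conv_weight_attr, bias_attr=False)
bn = nn.BatchNorm2D(64, weight_attr=bn_weight_attr)

x = paddle.randn([1, 3, 48, 160])  # e.g. a 48x160 text crop in NCHW layout
y = bn(conv(x))                    # weights drawn from the attrs above
print(y.shape)                     # [1, 64, 48, 160]
```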

