fix infer_rec for attention

PaddlePaddle · Jun 3, 2020 · b722eb5
2 parents b4c5dac + ade18e1
commit b722eb5
Show file tree

Hide file tree

Showing 17 changed files with 108 additions and 49 deletions.
diff --git a/configs/rec/rec_chinese_lite_train.yml b/configs/rec/rec_chinese_lite_train.yml
@@ -1,21 +1,22 @@
 Global:
  algorithm: CRNN
- use_gpu: true
+ use_gpu: false
  epoch_num: 3000
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec_CRNN
  save_epoch_step: 3
  eval_batch_step: 2000
  train_batch_size_per_card: 256
+ drop_last: true
  test_batch_size_per_card: 256
  image_shape: [3, 32, 320]
  max_text_length: 25
  character_type: ch
  character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt
  loss_type: ctc
  reader_yml: ./configs/rec/rec_chinese_reader.yml
- pretrain_weights: 
+ pretrain_weights: output/rec_CRNN/rec_mv3_crnn/best_accuracy
  checkpoints:
  save_inference_dir:
  infer_img:

diff --git a/configs/rec/rec_icdar15_train.yml b/configs/rec/rec_icdar15_train.yml
@@ -8,13 +8,14 @@ Global:
  save_epoch_step: 300
  eval_batch_step: 500
  train_batch_size_per_card: 256
+ drop_last: true
  test_batch_size_per_card: 256
  image_shape: [3, 32, 100]
  max_text_length: 25
  character_type: en
  loss_type: ctc
  reader_yml: ./configs/rec/rec_icdar15_reader.yml
- pretrain_weights: ./pretrain_models/rec_mv3_none_bilstm_ctc/best_accuracy 
+ pretrain_weights:
  checkpoints:
  save_inference_dir:
  infer_img:

diff --git a/configs/rec/rec_mv3_none_bilstm_ctc.yml b/configs/rec/rec_mv3_none_bilstm_ctc.yml
@@ -1,20 +1,21 @@
 Global:
  algorithm: CRNN
- use_gpu: true
+ use_gpu: false
  epoch_num: 72
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: output/rec_CRNN
  save_epoch_step: 3
  eval_batch_step: 2000
  train_batch_size_per_card: 256
+ drop_last: true
  test_batch_size_per_card: 256
  image_shape: [3, 32, 100]
  max_text_length: 25
  character_type: en
  loss_type: ctc
  reader_yml: ./configs/rec/rec_benchmark_reader.yml
- pretrain_weights: ./output/rec_CRNN/rec_mv3_none_bilstm_ctc/best_accuracy
+ pretrain_weights:
  checkpoints:
  save_inference_dir:
  infer_img:

diff --git a/configs/rec/rec_mv3_none_none_ctc.yml b/configs/rec/rec_mv3_none_none_ctc.yml
@@ -8,6 +8,7 @@ Global:
  save_epoch_step: 3
  eval_batch_step: 2000
  train_batch_size_per_card: 256
+ drop_last: true
  test_batch_size_per_card: 256
  image_shape: [3, 32, 100]
  max_text_length: 25

diff --git a/configs/rec/rec_mv3_tps_bilstm_attn.yml b/configs/rec/rec_mv3_tps_bilstm_attn.yml
@@ -1,13 +1,14 @@
 Global:
  algorithm: RARE
- use_gpu: true
+ use_gpu: false
  epoch_num: 72
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: output/rec_RARE
  save_epoch_step: 3
  eval_batch_step: 2000
  train_batch_size_per_card: 256
+ drop_last: true
  test_batch_size_per_card: 256
  image_shape: [3, 32, 100]
  max_text_length: 25

diff --git a/configs/rec/rec_mv3_tps_bilstm_ctc.yml b/configs/rec/rec_mv3_tps_bilstm_ctc.yml
@@ -8,6 +8,7 @@ Global:
  save_epoch_step: 3
  eval_batch_step: 2000
  train_batch_size_per_card: 256
+ drop_last: true
  test_batch_size_per_card: 256
  image_shape: [3, 32, 100]
  max_text_length: 25

diff --git a/configs/rec/rec_r34_vd_none_bilstm_ctc.yml b/configs/rec/rec_r34_vd_none_bilstm_ctc.yml
@@ -8,6 +8,7 @@ Global:
  save_epoch_step: 3
  eval_batch_step: 2000
  train_batch_size_per_card: 256
+ drop_last: true
  test_batch_size_per_card: 256
  image_shape: [3, 32, 100]
  max_text_length: 25

diff --git a/configs/rec/rec_r34_vd_none_none_ctc.yml b/configs/rec/rec_r34_vd_none_none_ctc.yml
@@ -8,6 +8,7 @@ Global:
  save_epoch_step: 3
  eval_batch_step: 2000
  train_batch_size_per_card: 256
+ drop_last: true
  test_batch_size_per_card: 256
  image_shape: [3, 32, 100]
  max_text_length: 25

diff --git a/configs/rec/rec_r34_vd_tps_bilstm_attn.yml b/configs/rec/rec_r34_vd_tps_bilstm_attn.yml
@@ -8,6 +8,7 @@ Global:
  save_epoch_step: 3
  eval_batch_step: 2000
  train_batch_size_per_card: 256
+ drop_last: true
  test_batch_size_per_card: 256
  image_shape: [3, 32, 100]
  max_text_length: 25

diff --git a/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml b/configs/rec/rec_r34_vd_tps_bilstm_ctc.yml
@@ -8,6 +8,7 @@ Global:
  save_epoch_step: 3
  eval_batch_step: 2000
  train_batch_size_per_card: 256
+ drop_last: true
  test_batch_size_per_card: 256
  image_shape: [3, 32, 100]
  max_text_length: 25

diff --git a/ppocr/data/det/db_process.py b/ppocr/data/det/db_process.py
@@ -17,6 +17,8 @@
 import numpy as np
 import json
 import sys
+from ppocr.utils.utility import initial_logger
+logger = initial_logger()
 
 from .data_augment import AugmentData
 from .random_crop_data import RandomCropData
@@ -100,6 +102,7 @@ def __call__(self, label_infor):
  img_path, gt_label = self.convert_label_infor(label_infor)
  imgvalue = cv2.imread(img_path)
  if imgvalue is None:
+ logger.info("{} does not exist!".format(img_path))
  return None
  data = self.make_data_dict(imgvalue, gt_label)
  data = AugmentData(data)

diff --git a/ppocr/data/rec/dataset_traversal.py b/ppocr/data/rec/dataset_traversal.py
@@ -43,6 +43,7 @@ def __init__(self, params):
  self.mode = params['mode']
  if params['mode'] == 'train':
  self.batch_size = params['train_batch_size_per_card']
+ self.drop_last = params['drop_last']
  else:
  self.batch_size = params['test_batch_size_per_card']
  self.infer_img = params['infer_img']
@@ -99,7 +100,7 @@ def __call__(self, process_id):
  process_id = 0
 
  def sample_iter_reader():
- if self.infer_img is not None:
+ if self.mode != 'train' and self.infer_img is not None:
  image_file_list = get_image_file_list(self.infer_img)
  for single_img in image_file_list:
  img = cv2.imread(single_img)
@@ -146,10 +147,11 @@ def batch_iter_reader():
  if len(batch_outs) == self.batch_size:
  yield batch_outs
  batch_outs = []
- if len(batch_outs) != 0:
- yield batch_outs
+ if not self.drop_last:
+ if len(batch_outs) != 0:
+ yield batch_outs
 
- if self.infer_img is None:
+ if self.mode != 'train' and self.infer_img is None:
  return batch_iter_reader
  return sample_iter_reader
 
@@ -171,6 +173,7 @@ def __init__(self, params):
  self.infer_img = params['infer_img']
  if params['mode'] == 'train':
  self.batch_size = params['train_batch_size_per_card']
+ self.drop_last = params['drop_last']
  else:
  self.batch_size = params['test_batch_size_per_card']
 
@@ -226,8 +229,9 @@ def batch_iter_reader():
  if len(batch_outs) == self.batch_size:
  yield batch_outs
  batch_outs = []
- if len(batch_outs) != 0:
- yield batch_outs
+ if not self.drop_last:
+ if len(batch_outs) != 0:
+ yield batch_outs
 
  if self.infer_img is None:
  return batch_iter_reader

diff --git a/ppocr/data/rec/img_tools.py b/ppocr/data/rec/img_tools.py
@@ -51,7 +51,7 @@ def resize_norm_img(img, image_shape):
 def resize_norm_img_chinese(img, image_shape):
  imgC, imgH, imgW = image_shape
  # TODO: change to 0 and modify the image shape
- max_wh_ratio = 10
+ max_wh_ratio = 0
  h, w = img.shape[0], img.shape[1]
  ratio = w * 1.0 / h
  max_wh_ratio = max(max_wh_ratio, ratio)

diff --git a/ppocr/modeling/architectures/rec_model.py b/ppocr/modeling/architectures/rec_model.py
@@ -110,7 +110,11 @@ def __call__(self, mode):
  return loader, outputs
  elif mode == "export":
  predict = predicts['predict']
- predict = fluid.layers.softmax(predict)
+ if self.loss_type == "ctc":
+ predict = fluid.layers.softmax(predict)
  return [image, {'decoded_out': decoded_out, 'predicts': predict}]
  else:
- return loader, {'decoded_out': decoded_out}
+ predict = predicts['predict']
+ if self.loss_type == "ctc":
+ predict = fluid.layers.softmax(predict)
+ return loader, {'decoded_out': decoded_out, 'predicts': predict}
diff --git a/ppocr/modeling/heads/rec_attention_head.py b/ppocr/modeling/heads/rec_attention_head.py
@@ -123,6 +123,8 @@ def gru_attention_infer(self, decoder_boot, max_length, char_num,
 
  full_ids = fluid.layers.fill_constant_batch_size_like(
  input=init_state, shape=[-1, 1], dtype='int64', value=1)
+ full_scores = fluid.layers.fill_constant_batch_size_like(
+ input=init_state, shape=[-1, 1], dtype='float32', value=1)
 
  cond = layers.less_than(x=counter, y=array_len)
  while_op = layers.While(cond=cond)
@@ -171,6 +173,9 @@ def gru_attention_infer(self, decoder_boot, max_length, char_num,
  new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1)
  fluid.layers.assign(new_ids, full_ids)
 
+ new_scores = fluid.layers.concat([full_scores, topk_scores], axis=1)
+ fluid.layers.assign(new_scores, full_scores)
+
  layers.increment(x=counter, value=1, in_place=True)
 
  # update the memories
@@ -184,7 +189,7 @@ def gru_attention_infer(self, decoder_boot, max_length, char_num,
  length_cond = layers.less_than(x=counter, y=array_len)
  finish_cond = layers.logical_not(layers.is_empty(x=topk_indices))
  layers.logical_and(x=length_cond, y=finish_cond, out=cond)
- return full_ids
+ return full_ids, full_scores
 
  def __call__(self, inputs, labels=None, mode=None):
  encoder_features = self.encoder(inputs)
@@ -223,10 +228,10 @@ def __call__(self, inputs, labels=None, mode=None):
  decoder_size, char_num)
  _, decoded_out = layers.topk(input=predict, k=1)
  decoded_out = layers.lod_reset(decoded_out, y=label_out)
- predicts = {'predict': predict, 'decoded_out': decoded_out}
+ predicts = {'predict':predict, 'decoded_out':decoded_out}
  else:
- ids = self.gru_attention_infer(
+ ids, predict = self.gru_attention_infer(
  decoder_boot, self.max_length, char_num, word_vector_dim,
  encoded_vector, encoded_proj, decoder_size)
- predicts = {'decoded_out': ids}
+ predicts = {'predict':predict, 'decoded_out':ids}
  return predicts
diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py
@@ -80,26 +80,43 @@ def __call__(self, img_list):
  starttime = time.time()
  self.input_tensor.copy_from_cpu(norm_img_batch)
  self.predictor.zero_copy_run()
- rec_idx_batch = self.output_tensors[0].copy_to_cpu()
- rec_idx_lod = self.output_tensors[0].lod()[0]
- predict_batch = self.output_tensors[1].copy_to_cpu()
- predict_lod = self.output_tensors[1].lod()[0]
- elapse = time.time() - starttime
- predict_time += elapse
- starttime = time.time()
- for rno in range(len(rec_idx_lod) - 1):
- beg = rec_idx_lod[rno]
- end = rec_idx_lod[rno + 1]
- rec_idx_tmp = rec_idx_batch[beg:end, 0]
- preds_text = self.char_ops.decode(rec_idx_tmp)
- beg = predict_lod[rno]
- end = predict_lod[rno + 1]
- probs = predict_batch[beg:end, :]
- ind = np.argmax(probs, axis=1)
- blank = probs.shape[1]
- valid_ind = np.where(ind != (blank - 1))[0]
- score = np.mean(probs[valid_ind, ind[valid_ind]])
- rec_res.append([preds_text, score])
+
+ if args.rec_algorithm != "RARE":
+ rec_idx_batch = self.output_tensors[0].copy_to_cpu()
+ rec_idx_lod = self.output_tensors[0].lod()[0]
+ predict_batch = self.output_tensors[1].copy_to_cpu()
+ predict_lod = self.output_tensors[1].lod()[0]
+ elapse = time.time() - starttime
+ predict_time += elapse
+ for rno in range(len(rec_idx_lod) - 1):
+ beg = rec_idx_lod[rno]
+ end = rec_idx_lod[rno + 1]
+ rec_idx_tmp = rec_idx_batch[beg:end, 0]
+ preds_text = self.char_ops.decode(rec_idx_tmp)
+ beg = predict_lod[rno]
+ end = predict_lod[rno + 1]
+ probs = predict_batch[beg:end, :]
+ ind = np.argmax(probs, axis=1)
+ blank = probs.shape[1]
+ valid_ind = np.where(ind != (blank - 1))[0]
+ score = np.mean(probs[valid_ind, ind[valid_ind]])
+ rec_res.append([preds_text, score])
+ else:
+ rec_idx_batch = self.output_tensors[0].copy_to_cpu()
+ predict_batch = self.output_tensors[1].copy_to_cpu()
+ for rno in range(len(rec_idx_batch)):
+ end_pos = np.where(rec_idx_batch[rno, :] == 1)[0]
+ if len(end_pos) <= 1:
+ preds = rec_idx_batch[rno, 1:]
+ score = np.mean(predict_batch[rno, 1:])
+ else:
+ preds = rec_idx_batch[rno, 1:end_pos[1]]
+ score = np.mean(predict_batch[rno, 1:end_pos[1]])
+ # TODO: find out why the indices have an offset of 2
+ preds = preds - 2
+ preds_text = self.char_ops.decode(preds)
+ rec_res.append([preds_text, score])
+
  return rec_res, predict_time
 
 
@@ -116,7 +133,13 @@ def __call__(self, img_list):
  continue
  valid_image_file_list.append(image_file)
  img_list.append(img)
- rec_res, predict_time = text_recognizer(img_list)
+ try:
+ rec_res, predict_time = text_recognizer(img_list)
+ except:
+ logger.info(
+ "ERROR!! \nInput image shape is not equal with config. TPS does not support variable shape.\n"
+ "Please set --rec_image_shape=input_shape and --rec_char_type='ch' ")
+ exit()
  for ino in range(len(img_list)):
  print("Predicts of %s:%s" % (valid_image_file_list[ino], rec_res[ino]))
  print("Total predict time for %d images:%.3f" %

diff --git a/tools/infer_rec.py b/tools/infer_rec.py
@@ -55,6 +55,7 @@ def main():
  program.merge_config(FLAGS.opt)
  logger.info(config)
  char_ops = CharacterOps(config['Global'])
+ loss_type = config['Global']['loss_type']
  config['Global']['char_ops'] = char_ops
 
  # check if set use_gpu=True in paddlepaddle cpu version
@@ -85,29 +86,38 @@ def main():
  if len(infer_list) == 0:
  logger.info("Can not find img in infer_img dir.")
  for i in range(max_img_num):
- print("infer_img:", infer_list[i])
+ print("infer_img:%s" % infer_list[i])
  img = next(blobs)
  predict = exe.run(program=eval_prog,
  feed={"image": img},
  fetch_list=fetch_varname_list,
  return_numpy=False)
-
- preds = np.array(predict[0])
- if preds.shape[1] == 1:
+ if loss_type == "ctc":
+ preds = np.array(predict[0])
  preds = preds.reshape(-1)
  preds_lod = predict[0].lod()[0]
  preds_text = char_ops.decode(preds)
- else:
+ probs = np.array(predict[1])
+ ind = np.argmax(probs, axis=1)
+ blank = probs.shape[1]
+ valid_ind = np.where(ind != (blank - 1))[0]
+ score = np.mean(probs[valid_ind, ind[valid_ind]])
+ elif loss_type == "attention":
+ preds = np.array(predict[0])
+ probs = np.array(predict[1])
  end_pos = np.where(preds[0, :] == 1)[0]
  if len(end_pos) <= 1:
- preds_text = preds[0, 1:]
+ preds = preds[0, 1:]
+ score = np.mean(probs[0, 1:])
  else:
- preds_text = preds[0, 1:end_pos[1]]
- preds_text = preds_text.reshape(-1)
- preds_text = char_ops.decode(preds_text)
+ preds = preds[0, 1:end_pos[1]]
+ score = np.mean(probs[0, 1:end_pos[1]])
+ preds = preds.reshape(-1)
+ preds_text = char_ops.decode(preds)
 
  print("\t index:", preds)
  print("\t word :", preds_text)
+ print("\t score :", score)
 
  # save for inference model
  target_var = []