Cherry-picking PaddlePaddleGH-10217 and PaddlePaddleGH-10216 to PaddlePaddle:Release/2.7 (PaddlePaddle#10655)

* Don't break overall processing on a bad image
* Add preprocessing common to OCR tasks
* Add preprocessing to options
truongnn1106 · Aug 16, 2023 · b17c2f3 · b17c2f3
1 parent 6859e14
commit b17c2f3
Show file tree

Hide file tree

Showing 5 changed files with 93 additions and 21 deletions.
diff --git a/paddleocr.py b/paddleocr.py
@@ -46,7 +46,7 @@ def _import_file(module_name, file_path, make_importable=False):
 ppstructure = importlib.import_module('ppstructure', 'paddleocr')
 from ppocr.utils.logging import get_logger
 from tools.infer import predict_system
-from ppocr.utils.utility import check_and_read, get_image_file_list
+from ppocr.utils.utility import check_and_read, get_image_file_list, alpha_to_color, binarize_img
 from ppocr.utils.network import maybe_download, download_with_progressbar, is_link, confirm_model_dir_url
 from tools.infer.utility import draw_ocr, str2bool, check_gpu
 from ppstructure.utility import init_args, draw_structure_result
@@ -512,7 +512,7 @@ def get_model_config(type, version, model_type, lang):
 
 def img_decode(content: bytes):
  np_arr = np.frombuffer(content, dtype=np.uint8)
- return cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
+ return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)  # NOTE(review): UNCHANGED presumably preserves an alpha channel for later handling - confirm against OpenCV docs
 
 
 def check_img(img):
@@ -616,22 +616,25 @@ def __init__(self, **kwargs):
  super().__init__(params)
  self.page_num = params.page_num
 
- def ocr(self, img, det=True, rec=True, cls=True):
+ def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, alpha_color=(255, 255, 255)):
  """
- ocr with paddleocr
+ OCR with PaddleOCR
  args：
- img: img for ocr, support ndarray, img_path and list or ndarray
- det: use text detection or not. If false, only rec will be exec. Default is True
- rec: use text recognition or not. If false, only det will be exec. Default is True
- cls: use angle classifier or not. Default is True. If true, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
+ img: img for OCR, support ndarray, img_path and list or ndarray
+ det: use text detection or not. If False, only rec will be exec. Default is True
+ rec: use text recognition or not. If False, only det will be exec. Default is True
+ cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
+ bin: binarize image to black and white. Default is False.
+ inv: invert image colors. Default is False.
+ alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
  """
  assert isinstance(img, (np.ndarray, list, str, bytes))
  if isinstance(img, list) and det == True:
  logger.error('When input a list of images, det must be false')
  exit(0)
  if cls == True and self.use_angle_cls == False:
  logger.warning(
- 'Since the angle classifier is not initialized, the angle classifier will not be uesd during the forward process'
+ 'Since the angle classifier is not initialized, it will not be used during the forward process'
  )
 
  img = check_img(img)
@@ -642,18 +645,35 @@ def ocr(self, img, det=True, rec=True, cls=True):
  imgs = img[:self.page_num]
  else:
  imgs = [img]
+
+ def preprocess_image(_image):  # closure: reads alpha_color, inv and bin from the enclosing ocr() call
+ _image = alpha_to_color(_image, alpha_color)  # always runs first, before the optional steps
+ if inv:
+ _image = cv2.bitwise_not(_image)  # optional colour inversion
+ if bin:
+ _image = binarize_img(_image)  # optional binarization, applied last
+ return _image
+
  if det and rec:
  ocr_res = []
  for idx, img in enumerate(imgs):
+ img = preprocess_image(img)
  dt_boxes, rec_res, _ = self.__call__(img, cls)
+ if not dt_boxes and not rec_res:
+ ocr_res.append(None)
+ continue
  tmp_res = [[box.tolist(), res]
  for box, res in zip(dt_boxes, rec_res)]
  ocr_res.append(tmp_res)
  return ocr_res
  elif det and not rec:
  ocr_res = []
  for idx, img in enumerate(imgs):
+ img = preprocess_image(img)
  dt_boxes, elapse = self.text_detector(img)
+ if dt_boxes is None or len(dt_boxes) == 0:  # len()-based test works for None, list and ndarray; `not ndarray` raises ValueError when size > 1
+ ocr_res.append(None)
+ continue
  tmp_res = [box.tolist() for box in dt_boxes]
  ocr_res.append(tmp_res)
  return ocr_res
@@ -662,6 +682,7 @@ def ocr(self, img, det=True, rec=True, cls=True):
  cls_res = []
  for idx, img in enumerate(imgs):
  if not isinstance(img, list):
+ img = preprocess_image(img)
  img = [img]
  if self.use_angle_cls and cls:
  img, cls_res_tmp, elapse = self.text_classifier(img)
@@ -763,10 +784,15 @@ def main():
  img_name = os.path.basename(img_path).split('.')[0]
  logger.info('{}{}{}'.format('*' * 10, img_path, '*' * 10))
  if args.type == 'ocr':
- result = engine.ocr(img_path,
- det=args.det,
- rec=args.rec,
- cls=args.use_angle_cls)
+ result = engine.ocr(
+ img_path,
+ det=args.det,
+ rec=args.rec,
+ cls=args.use_angle_cls,
+ bin=args.binarize,
+ inv=args.invert,
+ alpha_color=args.alphacolor
+ )
  if result is not None:
  for idx in range(len(result)):
  res = result[idx]

diff --git a/ppocr/utils/utility.py b/ppocr/utils/utility.py
@@ -75,6 +75,25 @@ def get_image_file_list(img_file):
  imgs_lists = sorted(imgs_lists)
  return imgs_lists
 
+def binarize_img(img):
+    """Otsu-threshold a 3-channel BGR image to black/white BGR; any other shape is returned as-is."""
+    if len(img.shape) != 3 or img.shape[2] != 3:
+        return img
+    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    _, mask = cv2.threshold(grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+    return cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)
+
+def alpha_to_color(img, alpha_color=(255, 255, 255)):
+    """Blend a 4-channel (B, G, R, A) image onto the solid R,G,B ``alpha_color``; other shapes pass through."""
+    if len(img.shape) != 3 or img.shape[2] != 4:
+        return img
+    blue, green, red, opacity = cv2.split(img)
+    opacity = opacity / 255  # NOTE(review): the /255 scaling presumes 8-bit alpha - confirm for 16-bit input
+    blended = tuple(
+        (alpha_color[idx] * (1 - opacity) + plane * opacity).astype(np.uint8)
+        for idx, plane in ((2, blue), (1, green), (0, red))
+    )
+    return cv2.merge(blended)
 
 def check_and_read(img_path):
  if os.path.basename(img_path)[-3:] in ['gif', 'GIF']:

diff --git a/ppstructure/utility.py b/ppstructure/utility.py
@@ -15,7 +15,7 @@
 import ast
 from PIL import Image, ImageDraw, ImageFont
 import numpy as np
-from tools.infer.utility import draw_ocr_box_txt, str2bool, init_args as infer_args
+from tools.infer.utility import draw_ocr_box_txt, str2bool, str2int_tuple, init_args as infer_args
 
 
 def init_args():
@@ -98,6 +98,21 @@ def init_args():
  type=str2bool,
  default=False,
  help='Whether to use pdf2docx api')
+ parser.add_argument(
+ "--invert",
+ type=str2bool,
+ default=False,
+ help='Whether to invert image before processing')
+ parser.add_argument(
+ "--binarize",
+ type=str2bool,
+ default=False,
+ help='Whether to threshold binarize image before processing')
+ parser.add_argument(
+ "--alphacolor",
+ type=str2int_tuple,
+ default=(255, 255, 255),
+ help='Replacement color for the alpha channel, if the latter is present; R,G,B integers')
 
  return parser
 

diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py
@@ -65,15 +65,25 @@ def draw_crop_rec_res(self, output_dir, img_crop_list, rec_res):
  self.crop_image_res_index += bbox_num
 
  def __call__(self, img, cls=True):
- time_dict = {'det': 0, 'rec': 0, 'csl': 0, 'all': 0}
+ time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
+
+ if img is None:
+ logger.debug("no valid image provided")
+ return None, None, time_dict
+
  start = time.time()
  ori_im = img.copy()
  dt_boxes, elapse = self.text_detector(img)
  time_dict['det'] = elapse
- logger.debug("dt_boxes num : {}, elapse : {}".format(
- len(dt_boxes), elapse))
+
  if dt_boxes is None:
- return None, None
+ logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
+ end = time.time()
+ time_dict['all'] = end - start
+ return None, None, time_dict
+ else:
+ logger.debug("dt_boxes num : {}, elapsed : {}".format(
+ len(dt_boxes), elapse))
  img_crop_list = []
 
  dt_boxes = sorted_boxes(dt_boxes)
@@ -89,12 +99,12 @@ def __call__(self, img, cls=True):
  img_crop_list, angle_list, elapse = self.text_classifier(
  img_crop_list)
  time_dict['cls'] = elapse
- logger.debug("cls num : {}, elapse : {}".format(
+ logger.debug("cls num : {}, elapsed : {}".format(
  len(img_crop_list), elapse))
 
  rec_res, elapse = self.text_recognizer(img_crop_list)
  time_dict['rec'] = elapse
- logger.debug("rec_res num : {}, elapse : {}".format(
+ logger.debug("rec_res num : {}, elapsed : {}".format(
  len(rec_res), elapse))
  if self.args.save_crop_res:
  self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,

diff --git a/tools/infer/utility.py b/tools/infer/utility.py
@@ -28,8 +28,10 @@
 
 
 def str2bool(v):
- return v.lower() in ("true", "t", "1")
+ return v.lower() in ("true", "yes", "t", "y", "1")  # case-insensitive match; every other string yields False
 
+def str2int_tuple(v):
+    return tuple(int(piece.strip()) for piece in v.split(","))  # e.g. "255, 0,10" -> (255, 0, 10)
 
 def init_args():
  parser = argparse.ArgumentParser()