diff --git a/LICENSE b/LICENSE index 261eeb9e9f..5fe86943b3 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,5 @@ +Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -186,7 +188,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/PPOCRLabel/PPOCRLabel.py b/PPOCRLabel/PPOCRLabel.py index 0a3ae1cb3b..6c8154d1c9 100644 --- a/PPOCRLabel/PPOCRLabel.py +++ b/PPOCRLabel/PPOCRLabel.py @@ -1617,8 +1617,9 @@ def showBoundingBoxFromPPlabel(self, filePath): key_cls = 'None' if not self.kie_mode else box.get('key_cls', 'None') shapes.append((box['transcription'], box['points'], None, key_cls, box.get('difficult', False))) - self.loadLabels(shapes) - self.canvas.verified = False + if shapes != []: + self.loadLabels(shapes) + self.canvas.verified = False def validFilestate(self, filePath): if filePath not in self.fileStatedict.keys(): @@ -2203,7 +2204,7 @@ def reRecognition(self): msg = 'Can not recognise the detection box in ' + self.filePath + '. Please change manually' QMessageBox.information(self, "Information", msg) return - result = self.ocr.ocr(img_crop, cls=True, det=False) + result = self.ocr.ocr(img_crop, cls=True, det=False)[0] if result[0][0] != '': if shape.line_color == DEFAULT_LOCK_COLOR: shape.label = result[0][0] @@ -2264,7 +2265,7 @@ def singleRerecognition(self): msg = 'Can not recognise the detection box in ' + self.filePath + '. 
Please change manually' QMessageBox.information(self, "Information", msg) return - result = self.ocr.ocr(img_crop, cls=True, det=False) + result = self.ocr.ocr(img_crop, cls=True, det=False)[0] if result[0][0] != '': result.insert(0, box) print('result in reRec is ', result) @@ -2415,12 +2416,12 @@ def cellreRecognition(self): # merge the text result in the cell texts = '' probs = 0. # the probability of the cell is avgerage prob of every text box in the cell - bboxes = self.ocr.ocr(img_crop, det=True, rec=False, cls=False) + bboxes = self.ocr.ocr(img_crop, det=True, rec=False, cls=False)[0] if len(bboxes) > 0: bboxes.reverse() # top row text at first for _bbox in bboxes: patch = get_rotate_crop_image(img_crop, np.array(_bbox, np.float32)) - rec_res = self.ocr.ocr(patch, det=False, rec=True, cls=False) + rec_res = self.ocr.ocr(patch, det=False, rec=True, cls=False)[0] text = rec_res[0][0] if text != '': texts += text + ('' if text[0].isalpha() else ' ') # add space between english word diff --git a/PPOCRLabel/README.md b/PPOCRLabel/README.md index 9c483e1fef..ec933a8397 100644 --- a/PPOCRLabel/README.md +++ b/PPOCRLabel/README.md @@ -103,11 +103,11 @@ python PPOCRLabel.py --kie True # [KIE mode] for [detection + recognition + keyw ``` #### 1.2.3 Build and Install the Whl Package Locally -Compile and install a new whl package, where 1.0.2 is the version number, you can specify the new version in 'setup.py'. +Compile and install a new whl package, where 0.0.0 is the version number, you can specify the new version in 'setup.py'. 
```bash cd ./PPOCRLabel python3 setup.py bdist_wheel -pip3 install dist/PPOCRLabel-2.1.2-py2.py3-none-any.whl +pip3 install dist/PPOCRLabel-0.0.0-py2.py3-none-any.whl ``` diff --git a/PPOCRLabel/README_ch.md b/PPOCRLabel/README_ch.md index afe1a08ff9..5359afc4f1 100644 --- a/PPOCRLabel/README_ch.md +++ b/PPOCRLabel/README_ch.md @@ -101,12 +101,12 @@ python PPOCRLabel.py --lang ch #### 1.2.3 本地构建whl包并安装 -编译与安装新的whl包,其中1.0.2为版本号,可在 `setup.py` 中指定新版本。 +编译与安装新的whl包,其中0.0.0为版本号,可在 `setup.py` 中指定新版本。 ```bash cd ./PPOCRLabel python3 setup.py bdist_wheel -pip3 install dist/PPOCRLabel-2.1.2-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple +pip3 install dist/PPOCRLabel-0.0.0-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple ``` diff --git a/PPOCRLabel/libs/autoDialog.py b/PPOCRLabel/libs/autoDialog.py index 189a590de8..55636eec0f 100644 --- a/PPOCRLabel/libs/autoDialog.py +++ b/PPOCRLabel/libs/autoDialog.py @@ -40,7 +40,7 @@ def run(self): if self.model == 'paddle': h, w, _ = cv2.imdecode(np.fromfile(Imgpath, dtype=np.uint8), 1).shape if h > 32 and w > 32: - self.result_dic = self.ocr.ocr(Imgpath, cls=True, det=True) + self.result_dic = self.ocr.ocr(Imgpath, cls=True, det=True)[0] else: print('The size of', Imgpath, 'is too small to be recognised') self.result_dic = None diff --git a/PPOCRLabel/requirements.txt b/PPOCRLabel/requirements.txt index fd42a2de25..a10b3453a9 100644 --- a/PPOCRLabel/requirements.txt +++ b/PPOCRLabel/requirements.txt @@ -1,3 +1,3 @@ pyqt5 -paddleocr==2.6.0.0 +paddleocr xlrd==1.2.0 diff --git a/PPOCRLabel/setup.py b/PPOCRLabel/setup.py index a112df544e..9770b632bd 100644 --- a/PPOCRLabel/setup.py +++ b/PPOCRLabel/setup.py @@ -33,10 +33,10 @@ def readme(): package_dir={'PPOCRLabel': ''}, include_package_data=True, entry_points={"console_scripts": ["PPOCRLabel= PPOCRLabel.PPOCRLabel:main"]}, - version='2.1.2', + version='2.1.3', install_requires=requirements, license='Apache License 2.0', - description='PPOCRLabel is a 
semi-automatic graphic annotation tool suitable for OCR field, with built-in PPOCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models', + description='PPOCRLabelv2 is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PP-OCR model to automatically detect and re-recognize data. It is written in Python3 and PyQT5, supporting rectangular box, table, irregular text and key information annotation modes. Annotations can be directly used for the training of PP-OCR detection and recognition models.', long_description=readme(), long_description_content_type='text/markdown', url='https://github.com/PaddlePaddle/PaddleOCR', diff --git a/README.md b/README.md index 8e869f6de5..7ad7d02860 100644 --- a/README.md +++ b/README.md @@ -26,12 +26,10 @@ PaddleOCR aims to create multilingual, awesome, leading, and practical OCR tools ## 📣 Recent updates -- 💥 **Live Preview: Oct 24 - Oct 26, China Standard Time, 20:30**, Engineers@PaddleOCR will show PP-StructureV2 optimization strategy for 3 days. - - Scan the QR code below using WeChat, follow the PaddlePaddle official account and fill out the questionnaire to join the WeChat group, get the live link and 20G OCR learning materials (including PDF2Word application, 10 models in vertical scenarios, etc.) 
+- 🔨**2022.11 Add implementation of [4 cutting-edge algorithms](doc/doc_ch/algorithm_overview.md)**:Text Detection [DRRG](doc/doc_en/algorithm_det_drrg_en.md), Text Recognition [RFL](./doc/doc_en/algorithm_rec_rfl_en.md), Image Super-Resolution [Text Telescope](doc/doc_en/algorithm_sr_telescope_en.md),Handwrittem Mathematical Expression Recognition [CAN](doc/doc_en/algorithm_rec_can_en.md) +- **2022.10 release [optimized JS version PP-OCRv3 model](./deploy/paddlejs/README.md)** with 4.3M model size, 8x faster inference time, and a ready-to-use web demo +- 💥 **Live Playback: Introduction to PP-StructureV2 optimization strategy**. Scan [the QR code below](#Community) using WeChat, follow the PaddlePaddle official account and fill out the questionnaire to join the WeChat group, get the live link and 20G OCR learning materials (including PDF2Word application, 10 models in vertical scenarios, etc.) -
- -
- **🔥2022.8.24 Release PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)** - Release [PP-StructureV2](./ppstructure/),with functions and performance fully upgraded, adapted to Chinese scenes, and new support for [Layout Recovery](./ppstructure/recovery) and **one line command to convert PDF to Word**; @@ -74,6 +72,7 @@ PaddleOCR support a variety of cutting-edge algorithms related to OCR, and devel - [Dive Into OCR ](./doc/doc_en/ocr_book_en.md) + ## 👫 Community - For international developers, we regard [PaddleOCR Discussions](https://github.com/PaddlePaddle/PaddleOCR/discussions) as our international community platform. All ideas and questions can be discussed here in English. diff --git a/README_ch.md b/README_ch.md index 5fec27bd66..514f2f4622 100755 --- a/README_ch.md +++ b/README_ch.md @@ -27,29 +27,19 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ## 📣 近期更新 -- **💥 直播预告:10.24-10.26日每晚8点半**,PaddleOCR研发团队详解PP-StructureV2优化策略。微信扫描下方二维码,关注公众号并填写问卷后进入官方交流群,获取直播链接与20G重磅OCR学习大礼包(内含PDF转Word应用程序、10种垂类模型、《动手学OCR》电子书等) - -
- -
- +- 🔨**2022.11 新增实现[4种前沿算法](doc/doc_ch/algorithm_overview.md)**:文本检测 [DRRG](doc/doc_ch/algorithm_det_drrg.md), 文本识别 [RFL](doc/doc_ch/algorithm_rec_rfl.md), 文本超分[Text Telescope](doc/doc_ch/algorithm_sr_telescope.md),公式识别[CAN](doc/doc_ch/algorithm_rec_can.md.md) +- **2022.10 优化[JS版PP-OCRv3模型](./deploy/paddlejs/README_ch.md)**:模型大小仅4.3M,预测速度提升8倍,配套web demo开箱即用 +- **💥 直播回放:PaddleOCR研发团队详解PP-StructureV2优化策略**。微信扫描[下方二维码](#开源社区),关注公众号并填写问卷后进入官方交流群,获取直播回放链接与20G重磅OCR学习大礼包(内含PDF转Word应用程序、10种垂类模型、《动手学OCR》电子书等) - **🔥2022.8.24 发布 PaddleOCR [release/2.6](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.6)** - 发布[PP-StructureV2](./ppstructure/README_ch.md),系统功能性能全面升级,适配中文场景,新增支持[版面复原](./ppstructure/recovery/README_ch.md),支持**一行命令完成PDF转Word**; - [版面分析](./ppstructure/layout/README_ch.md)模型优化:模型存储减少95%,速度提升11倍,平均CPU耗时仅需41ms; - [表格识别](./ppstructure/table/README_ch.md)模型优化:设计3大优化策略,预测耗时不变情况下,模型精度提升6%; - [关键信息抽取](./ppstructure/kie/README_ch.md)模型优化:设计视觉无关模型结构,语义实体识别精度提升2.8%,关系抽取精度提升9.1%。 - -- **🔥2022.8 发布 [OCR场景应用集合](./applications)** - - - 包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等**9个垂类模型**,覆盖通用,制造、金融、交通行业的主要OCR垂类应用。 - - -- **2022.8 新增实现[8种前沿算法](doc/doc_ch/algorithm_overview.md)** - - 文本检测:[FCENet](doc/doc_ch/algorithm_det_fcenet.md), [DB++](doc/doc_ch/algorithm_det_db.md) - - 文本识别:[ViTSTR](doc/doc_ch/algorithm_rec_vitstr.md), [ABINet](doc/doc_ch/algorithm_rec_abinet.md), [VisionLAN](doc/doc_ch/algorithm_rec_visionlan.md), [SPIN](doc/doc_ch/algorithm_rec_spin.md), [RobustScanner](doc/doc_ch/algorithm_rec_robustscanner.md) - - 表格识别:[TableMaster](doc/doc_ch/algorithm_table_master.md) - - +- 🔥**2022.8 发布 [OCR场景应用集合](./applications)**:包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等**9个垂类模型**,覆盖通用,制造、金融、交通行业的主要OCR垂类应用。 +- **2022.8 新增实现[8种前沿算法](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_overview.md)** + - 文本检测:[FCENet](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_det_fcenet.md), 
[DB++](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_det_db.md) + - 文本识别:[ViTSTR](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_rec_vitstr.md), [ABINet](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_rec_abinet.md), [VisionLAN](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_rec_visionlan.md), [SPIN](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_rec_spin.md), [RobustScanner](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_rec_robustscanner.md) + - 表格识别:[TableMaster](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6rc/doc/doc_ch/algorithm_table_master.md) - **2022.5.9 发布 PaddleOCR [release/2.5](https://github.com/PaddlePaddle/PaddleOCR/tree/release/2.5)** - 发布[PP-OCRv3](./doc/doc_ch/ppocr_introduction.md#pp-ocrv3),速度可比情况下,中文场景效果相比于PP-OCRv2再提升5%,英文场景提升11%,80语种多语言模型平均识别准确率提升5%以上; - 发布半自动标注工具[PPOCRLabelv2](./PPOCRLabel):新增表格文字图像、图像关键信息抽取任务和不规则文字图像的标注功能; @@ -79,23 +69,23 @@ PaddleOCR旨在打造一套丰富、领先、且实用的OCR工具库,助力 ## 📚《动手学OCR》电子书 - [《动手学OCR》电子书](./doc/doc_ch/ocr_book.md) - + ## 👫 开源社区 - **📑项目合作:** 如果您是企业开发者且有明确的OCR垂类应用需求,填写[问卷](https://paddle.wjx.cn/vj/QwF7GKw.aspx)后可免费与官方团队展开不同层次的合作。 -- **👫加入社区:** 微信扫描二维码并填写问卷之后,加入交流群领取20G重磅OCR学习大礼包 - - **包括《动手学OCR》电子书** ,配套讲解视频和notebook项目;PaddleOCR历次发版直播课视频; +- **👫加入社区:** **微信扫描二维码并填写问卷之后,加入交流群领取20G重磅OCR学习大礼包** + - **包括《动手学OCR》电子书** ,配套讲解视频和notebook项目;**PaddleOCR历次发版直播课回放链接**; - **OCR场景应用模型集合:** 包含数码管、液晶屏、车牌、高精度SVTR模型、手写体识别等垂类模型,覆盖通用,制造、金融、交通行业的主要OCR垂类应用。 - PDF2Word应用程序;OCR社区优秀开发者项目分享视频。 - **🏅️社区项目**:[社区项目](./doc/doc_ch/thirdparty.md)文档中包含了社区用户**使用PaddleOCR开发的各种工具、应用**以及**为PaddleOCR贡献的功能、优化的文档与代码**等,是官方为社区开发者打造的荣誉墙,也是帮助优质项目宣传的广播站。 - **🎁社区常规赛**:社区常规赛是面向OCR开发者的积分赛事,覆盖文档、代码、模型和应用四大类型,以季度为单位评选并发放奖励,赛题详情与报名方法可参考[链接](https://github.com/PaddlePaddle/PaddleOCR/issues/4982)。
- + +

PaddleOCR官方交流群二维码

- ## 🛠️ PP-OCR系列模型列表(更新中) diff --git a/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml index ab48b99791..9bddcb29ed 100644 --- a/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml +++ b/configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml @@ -19,6 +19,7 @@ Global: use_space_char: true distributed: true save_res_path: ./output/rec/predicts_pp-OCRv2_distillation.txt + amp_custom_black_list: ['matmul','matmul_v2','elementwise_add'] Optimizer: diff --git a/deploy/avh/requirements.txt b/deploy/avh/requirements.txt index 1bf86ed110..a1a8626f19 100644 --- a/deploy/avh/requirements.txt +++ b/deploy/avh/requirements.txt @@ -1,3 +1,4 @@ paddlepaddle numpy -opencv-python \ No newline at end of file +opencv-python +typing-extensions diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md index 7f6919c13a..02a4cbad95 100755 --- a/doc/doc_ch/algorithm_overview.md +++ b/doc/doc_ch/algorithm_overview.md @@ -3,6 +3,8 @@ - [1. 两阶段OCR算法](#1) - [1.1 文本检测算法](#11) - [1.2 文本识别算法](#12) + - [1.3 文本超分辨率算法](#13) + - [1.4 公式识别算法](#14) - [2. 端到端OCR算法](#2) - [3. 表格识别算法](#3) - [4. 
关键信息抽取算法](#4) @@ -107,6 +109,34 @@ PaddleOCR将**持续新增**支持OCR领域前沿算法与模型,**欢迎广 |RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)| |RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar) | + + + +### 1.3 文本超分辨率算法 +已支持的文本超分辨率算法列表(戳链接获取使用教程): +- [x] [Text Gestalt](./algorithm_sr_gestalt.md) +- [x] [Text Telescope](./algorithm_sr_telescope.md) + +在TextZoom公开数据集上,算法效果如下: + +|模型|骨干网络|PSNR_Avg|SSIM_Avg|配置文件|下载链接| +|---|---|---|---|---|---| +|Text Gestalt|tsrn|19.28|0.6560| [configs/sr/sr_tsrn_transformer_strock.yml](../../configs/sr/sr_tsrn_transformer_strock.yml)|[训练模型](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar)| +|Text Telescope|tbsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[训练模型](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)| + + + +### 1.4 公式识别算法 + +已支持的公式识别算法列表(戳链接获取使用教程): +- [x] [CAN](./algorithm_rec_can.md.md) + +在CROHME手写公式数据集上,算法效果如下: + +|模型 |骨干网络|配置文件|ExpRate|下载链接| +| ----- | ----- | ----- | ----- | ----- | +|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[训练模型](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| + ## 2. 
端到端算法 diff --git a/doc/doc_ch/finetune.md b/doc/doc_ch/finetune.md index 973c4cb103..97ec99f288 100644 --- a/doc/doc_ch/finetune.md +++ b/doc/doc_ch/finetune.md @@ -26,21 +26,11 @@ PaddleOCR提供的PP-OCR系列模型在通用场景中性能优异,能够解 ### 2.2 模型选择 -建议选择PP-OCRv2模型(配置文件:[ch_PP-OCRv2_det_student.yml](../../configs/det/ch_PP-OCRv2/ch_PP-OCRv2_det_student.yml),预训练模型:[ch_PP-OCRv2_det_distill_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_det_distill_train.tar))进行微调,其精度与泛化性能是目前提供的最优预训练模型。 +建议选择PP-OCRv3模型(配置文件:[ch_PP-OCRv3_det_student.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml),预训练模型:[ch_PP-OCRv3_det_distill_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar))进行微调,其精度与泛化性能是目前提供的最优预训练模型。 -更多PP-OCR系列模型,请参考[PaddleOCR 首页说明文档](../../README_ch.md)。 +更多PP-OCR系列模型,请参考[PP-OCR 系列模型库](./models_list.md)。 -注意:在使用上述预训练模型的时候,由于保存的模型中包含教师模型,因此需要将其中的学生模型单独提取出来,再加载学生模型即可进行模型微调。 - -```python -import paddle -# 加载完整的检测预训练模型 -a = paddle.load("ch_PP-OCRv2_det_distill_train/best_accuracy.pdparams") -# 提取学生模型的参数 -b = {k[len("student_model."):]: a[k] for k in a if "student_model." 
in k} -# 保存模型,用于后续模型微调 -paddle.save(b, "ch_PP-OCRv2_det_student.pdparams") -``` +注意:在使用上述预训练模型的时候,需要使用文件夹中的`student.pdparams`文件作为预训练模型,即,仅使用学生模型。 ### 2.3 训练超参选择 @@ -49,7 +39,7 @@ paddle.save(b, "ch_PP-OCRv2_det_student.pdparams") ```yaml Global: - pretrained_model: ./pretrain_models/student.pdparams # 预训练模型路径 + pretrained_model: ./ch_PP-OCRv3_det_distill_train/student.pdparams # 预训练模型路径 Optimizer: lr: name: Cosine @@ -67,7 +57,7 @@ Train: num_workers: 4 ``` -上述配置文件中,首先需要将`pretrained_model`字段指定为2.2章节中提取出来的`ch_PP-OCRv2_det_student.pdparams`文件路径。 +上述配置文件中,首先需要将`pretrained_model`字段指定为`student.pdparams`文件路径。 PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8*8=64`)、且没有加载预训练模型情况下的配置文件,因此您的场景中,学习率与总的batch size需要对应线性调整,例如 @@ -88,7 +78,7 @@ PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8* | det_db_score_mode | str | "fast" | DB的检测结果得分计算方法,支持`fast`和`slow`,`fast`是根据polygon的外接矩形边框内的所有像素计算平均得分,`slow`是根据原始polygon内的所有像素计算平均得分,计算速度相对较慢一些,但是更加准确一些。 | -更多关于推理方法的介绍可以参考[Paddle Inference推理教程](./inference.md)。 +更多关于推理方法的介绍可以参考[Paddle Inference推理教程](././inference_ppocr.md)。 ## 3. 
文本识别模型微调 @@ -109,9 +99,9 @@ PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8* ### 3.2 模型选择 -建议选择PP-OCRv2模型(配置文件:[ch_PP-OCRv2_rec_distillation.yml](../../configs/rec/ch_PP-OCRv2/ch_PP-OCRv2_rec_distillation.yml),预训练模型:[ch_PP-OCRv2_rec_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv2/chinese/ch_PP-OCRv2_rec_train.tar))进行微调,其精度与泛化性能是目前提供的最优预训练模型。 +建议选择PP-OCRv3模型(配置文件:[ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml),预训练模型:[ch_PP-OCRv3_rec_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar))进行微调,其精度与泛化性能是目前提供的最优预训练模型。 -更多PP-OCR系列,模型请参考[PaddleOCR 首页说明文档](../../README_ch.md)。 +更多PP-OCR系列模型,请参考[PP-OCR 系列模型库](./models_list.md)。 ### 3.3 训练超参选择 @@ -147,7 +137,7 @@ Train: ``` -上述配置文件中,首先需要将`pretrained_model`字段指定为2.2章节中解压得到的`ch_PP-OCRv2_rec_train/best_accuracy.pdparams`文件路径。 +上述配置文件中,首先需要将`pretrained_model`字段指定为3.2章节中解压得到的`ch_PP-OCRv3_rec_train/best_accuracy.pdparams`文件路径。 PaddleOCR提供的配置文件是在8卡训练(相当于总的batch size是`8*128=1024`)、且没有加载预训练模型情况下的配置文件,因此您的场景中,学习率与总的batch size需要对应线性调整,例如: @@ -175,5 +165,4 @@ Train: ### 3.4 训练调优 -训练过程并非一蹴而就的,完成一个阶段的训练评估后,建议收集分析当前模型在真实场景中的 badcase,有针对性的调整训练数据比例,或者进一步新增合成数据。 -通过多次迭代训练,不断优化模型效果。 +训练过程并非一蹴而就的,完成一个阶段的训练评估后,建议收集分析当前模型在真实场景中的 badcase,有针对性的调整训练数据比例,或者进一步新增合成数据。通过多次迭代训练,不断优化模型效果。 diff --git a/doc/doc_ch/table_recognition.md b/doc/doc_ch/table_recognition.md index 156ba80e37..f09dedd038 100644 --- a/doc/doc_ch/table_recognition.md +++ b/doc/doc_ch/table_recognition.md @@ -14,6 +14,9 @@ - [2.5. 分布式训练](#25-分布式训练) - [2.6. 其他训练环境](#26-其他训练环境) - [2.7. 模型微调](#27-模型微调) + - [2.7.1 数据选择](#271-数据选择) + - [2.7.2 模型选择](#272-模型选择) + - [2.7.3 训练超参选择](#273-训练超参选择) - [3. 模型评估与预测](#3-模型评估与预测) - [3.1. 指标评估](#31-指标评估) - [3.2. 测试表格结构识别效果](#32-测试表格结构识别效果) @@ -219,7 +222,39 @@ DCU设备上运行需要设置环境变量 `export HIP_VISIBLE_DEVICES=0,1,2,3` ## 2.7. 
模型微调 -实际使用过程中,建议加载官方提供的预训练模型,在自己的数据集中进行微调,关于模型的微调方法,请参考:[模型微调教程](./finetune.md)。 +### 2.7.1 数据选择 + +数据量:建议至少准备2000张的表格识别数据集用于模型微调。 + +### 2.7.2 模型选择 + +建议选择SLANet模型(配置文件:[SLANet_ch.yml](../../configs/table/SLANet_ch.yml),预训练模型:[ch_ppstructure_mobile_v2.0_SLANet_train.tar](https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar))进行微调,其精度与泛化性能是目前提供的最优中文表格预训练模型。 + +更多表格识别模型,请参考[PP-Structure 系列模型库](../../ppstructure/docs/models_list.md)。 + +### 2.7.3 训练超参选择 + +在模型微调的时候,最重要的超参就是预训练模型路径`pretrained_model`, 学习率`learning_rate`,部分配置文件如下所示。 + +```yaml +Global: + pretrained_model: ./ch_ppstructure_mobile_v2.0_SLANet_train/best_accuracy.pdparams # 预训练模型路径 +Optimizer: + lr: + name: Cosine + learning_rate: 0.001 # + warmup_epoch: 0 + regularizer: + name: 'L2' + factor: 0 +``` + +上述配置文件中,首先需要将`pretrained_model`字段指定为`best_accuracy.pdparams`文件路径。 + +PaddleOCR提供的配置文件是在4卡训练(相当于总的batch size是`4*48=192`)、且没有加载预训练模型情况下的配置文件,因此您的场景中,学习率与总的batch size需要对应线性调整,例如 + +* 如果您的场景中是单卡训练,单卡batch_size=48,则总的batch_size=48,建议将学习率调整为`0.00025`左右。 +* 如果您的场景中是单卡训练,由于显存限制,只能设置单卡batch_size=32,则总的batch_size=32,建议将学习率调整为`0.00017`左右。 # 3. 模型评估与预测 diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md index 83f062801a..ba955c832b 100644 --- a/doc/doc_ch/whl.md +++ b/doc/doc_ch/whl.md @@ -294,7 +294,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls tru ## 3 自定义模型 -当内置模型无法满足需求时,需要使用到自己训练的模型。 首先,参照[inference.md](./inference.md) 第一节转换将检测、分类和识别模型转换为inference模型,然后按照如下方式使用 +当内置模型无法满足需求时,需要使用到自己训练的模型。 首先,参照[模型导出](./detection.md#4-模型导出与预测)将检测、分类和识别模型转换为inference模型,然后按照如下方式使用 ### 3.1 代码使用 diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index 309d074ed4..fad0fb8a72 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -3,6 +3,8 @@ - [1. 
Two-stage OCR Algorithms](#1) - [1.1 Text Detection Algorithms](#11) - [1.2 Text Recognition Algorithms](#12) + - [1.3 Text Super-Resolution Algorithms](#13) + - [1.4 Formula Recognition Algorithm](#14) - [2. End-to-end OCR Algorithms](#2) - [3. Table Recognition Algorithms](#3) - [4. Key Information Extraction Algorithms](#4) @@ -104,6 +106,36 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r |RobustScanner|ResNet31| 87.77% | rec_r31_robustscanner | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_r31_robustscanner.tar)| |RFL|ResNetRFL| 88.63% | rec_resnet_rfl_att | [trained model](https://paddleocr.bj.bcebos.com/contribution/rec_resnet_rfl_att_train.tar) | + + +### 1.3 Text Super-Resolution Algorithms + +Supported text super-resolution algorithms (Click the link to get the tutorial): +- [x] [Text Gestalt](./algorithm_sr_gestalt.md) +- [x] [Text Telescope](./algorithm_sr_telescope.md) + +On the TextZoom public dataset, the effect of the algorithm is as follows: + +|Model|Backbone|PSNR_Avg|SSIM_Avg|Config|Download link| +|---|---|---|---|---|---| +|Text Gestalt|tsrn|19.28|0.6560| [configs/sr/sr_tsrn_transformer_strock.yml](../../configs/sr/sr_tsrn_transformer_strock.yml)|[trained model](https://paddleocr.bj.bcebos.com/sr_tsrn_transformer_strock_train.tar)| +|Text Telescope|tbsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[trained model](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)| + + + +### 1.4 Formula Recognition Algorithm + +Supported formula recognition algorithms (Click the link to get the tutorial): + +- [x] [CAN](./algorithm_rec_can.md.md) + +On the CROHME handwritten formula dataset, the effect of the algorithm is as follows: + +|Model |Backbone|Config|ExpRate|Download link| +| ----- | ----- | ----- | ----- | ----- | +|CAN|DenseNet|[rec_d28_can.yml](../../configs/rec/rec_d28_can.yml)|51.72%|[trained 
model](https://paddleocr.bj.bcebos.com/contribution/rec_d28_can_train.tar)| + + ## 2. End-to-end OCR Algorithms @@ -122,7 +154,7 @@ On the PubTabNet dataset, the algorithm result is as follows: |Model|Backbone|Config|Acc|Download link| |---|---|---|---|---| -|TableMaster|TableResNetExtra|[configs/table/table_master.yml](../../configs/table/table_master.yml)|77.47%|[trained](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar) / [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_infer.tar)| +|TableMaster|TableResNetExtra|[configs/table/table_master.yml](../../configs/table/table_master.yml)|77.47%|[trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_train.tar) / [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/tablemaster/table_structure_tablemaster_infer.tar)| diff --git a/doc/doc_en/algorithm_sr_telescope_en.md b/doc/doc_en/algorithm_sr_telescope_en.md index 9acb524312..334b58b6e8 100644 --- a/doc/doc_en/algorithm_sr_telescope_en.md +++ b/doc/doc_en/algorithm_sr_telescope_en.md @@ -27,7 +27,7 @@ Paper: Referring to the [FudanOCR](https://github.com/FudanVI/FudanOCR/tree/main/scene-text-telescope) data download instructions, the effect of the super-score algorithm on the TextZoom test set is as follows: |Model|Backbone|config|Acc|Download link| -|---|---|---|---|---|---| +|---|---|---|---|---| |Text Gestalt|tsrn|21.56|0.7411| [configs/sr/sr_telescope.yml](../../configs/sr/sr_telescope.yml)|[train model](https://paddleocr.bj.bcebos.com/contribution/sr_telescope_train.tar)| The [TextZoom dataset](https://paddleocr.bj.bcebos.com/dataset/TextZoom.tar) comes from two superfraction data sets, RealSR and SR-RAW, both of which contain LR-HR pairs. TextZoom has 17367 pairs of training data and 4373 pairs of test data. 
diff --git a/doc/doc_en/detection_en.md b/doc/doc_en/detection_en.md index c215e1a466..ab2e868c54 100644 --- a/doc/doc_en/detection_en.md +++ b/doc/doc_en/detection_en.md @@ -13,6 +13,7 @@ This section uses the icdar2015 dataset as an example to introduce the training, * [2.5 Distributed Training](#25-distributed-training) * [2.6 Training with knowledge distillation](#26) * [2.7 Training on other platform(Windows/macOS/Linux DCU)](#27) + * [2.8 Fine-tuning](#28) - [3. Evaluation and Test](#3-evaluation-and-test) - [3.1 Evaluation](#31-evaluation) - [3.2 Test](#32-test) @@ -178,6 +179,10 @@ GPU mode is not supported, you need to set `use_gpu` to False in the configurati - Linux DCU Running on a DCU device requires setting the environment variable `export HIP_VISIBLE_DEVICES=0,1,2,3`, and the rest of the training and evaluation prediction commands are exactly the same as the Linux GPU. +### 2.8 Fine-tuning + +In actual use, it is recommended to load the official pre-trained model and fine-tune it in your own data set. For the fine-tuning method of the detection model, please refer to: [Model Fine-tuning Tutorial](./finetune_en.md). + ## 3. Evaluation and Test ### 3.1 Evaluation diff --git a/doc/doc_en/finetune_en.md b/doc/doc_en/finetune_en.md new file mode 100644 index 0000000000..54be93f4d9 --- /dev/null +++ b/doc/doc_en/finetune_en.md @@ -0,0 +1,167 @@ +# Fine-tune + +## 1. background and meaning + +The PP-OCR series models provided by PaddleOCR have excellent performance in general scenarios and can solve detection and recognition problems in most cases. In vertical scenarios, if you want to obtain better model, you can further improve the accuracy of the PP-OCR series detection and recognition models through fine-tune. + +This article mainly introduces some precautions when fine-tuning the text detection and recognition model. Finally, you can obtain a text detection and recognition model with higher accuracy through model fine-tuning in your own scenarios. 
+ +The core points of this article are as follows: + +1. The pre-trained model provided by PP-OCR has better generalization ability +2. Adding a small amount of real data (detection:>=500, recognition:>=5000) will greatly improve the detection and recognition effect of vertical scenes +3. When fine-tuning the model, adding real general scene data can further improve the model accuracy and generalization performance +4. In the text detection task, increasing the prediction shape of the image can further improve the detection effect of the smaller text area +5. When fine-tuning the model, it is necessary to properly adjust the hyperparameters (learning rate, batch size are the most important) to obtain a better fine-tuning effect. + +For more details, please refer to Chapter 2 and Chapter 3. + +## 2. Text detection model fine-tuning + +### 2.1 Dataset + +* Dataset: It is recommended to prepare at least 500 text detection datasets for model fine-tuning. + +* Dataset annotation: single-line text annotation format, it is recommended that the labeled detection frame be consistent with the actual semantic content. For example, in the train ticket scene, the surname and first name may be far apart, but they belong to the same detection field semantically. Here, the entire name also needs to be marked as a detection frame. + +### 2.2 Model + +It is recommended to choose the PP-OCRv3 model (configuration file: [ch_PP-OCRv3_det_student.yml](../../configs/det/ch_PP-OCRv3/ch_PP-OCRv3_det_student.yml), pre-trained model: [ch_PP-OCRv3_det_distill_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_det_distill_train.tar)), its accuracy and generalization performance is the best pre-training model currently available.
+ +For more PP-OCR series models, please refer to [PP-OCR Series Model Library](./models_list_en.md). + +Note: When using the above pre-trained model, you need to use the `student.pdparams` file in the folder as the pre-trained model, that is, only use the student model. + + +### 2.3 Training hyperparameter + +When fine-tuning the model, the most important hyperparameters are the pre-trained model path `pretrained_model`, `learning_rate` and `batch_size`, some hyperparameters are as follows: + +```yaml +Global: + pretrained_model: ./ch_PP-OCRv3_det_distill_train/student.pdparams # pre-training model path +Optimizer: + lr: + name: Cosine + learning_rate: 0.001 # learning_rate + warmup_epoch: 2 + regularizer: + name: 'L2' + factor: 0 + +Train: + loader: + shuffle: True + drop_last: False + batch_size_per_card: 8 # single gpu batch size + num_workers: 4 +``` + +In the above configuration file, you need to specify the `pretrained_model` field as the `student.pdparams` file path. + +The configuration file provided by PaddleOCR is for 8-gpu training (equivalent to a total batch size of `8*8=64`) and no pre-trained model is loaded. Therefore, in your scenario, the learning rate and the total batch size need to be adjusted linearly, for example + +* If your scenario is single-gpu training, single gpu batch_size=8, then the total batch_size=8, it is recommended to adjust the learning rate to about `1e-4`. +* If your scenario is for single-gpu training, due to memory limitations, you can only set batch_size=4 for a single gpu, and the total batch_size=4. It is recommended to adjust the learning rate to about `5e-5`. + +### 2.4 Prediction hyperparameter + +When exporting and inferring the trained model, you can further adjust the predicted image scale to improve the detection effect of small-area text. The following are some hyperparameters during DBNet inference, which can be adjusted appropriately to improve the effect.
+ +| hyperparameter | type | default | meaning | +| :--: | :--: | :--: | :--: | +| det_db_thresh | float | 0.3 | In the probability map output by DB, pixels with a score greater than the threshold will be considered as text pixels | +| det_db_box_thresh | float | 0.6 | When the average score of all pixels within the frame of the detection result is greater than the threshold, the result will be considered as a text area | +| det_db_unclip_ratio | float | 1.5 | The expansion coefficient of `Vatti clipping`, using this method to expand the text area | +| max_batch_size | int | 10 | batch size | +| use_dilation | bool | False | Whether to expand the segmentation results to obtain better detection results | +| det_db_score_mode | str | "fast" | DB's detection result score calculation method supports `fast` and `slow`. `fast` calculates the average score based on all pixels in the polygon’s circumscribed rectangle border, and `slow` calculates the average score based on all pixels in the original polygon. The calculation speed is relatively slower, but more accurate. | + + +For more information on inference methods, please refer to[Paddle Inference doc](././inference_ppocr_en.md)。 + + +## 3. Text recognition model fine-tuning + + +### 3.1 Dataset + +* Dataset:If the dictionary is not changed, it is recommended to prepare at least 5,000 text recognition datasets for model fine-tuning; if the dictionary is changed (not recommended), more quantities are required. + +* Data distribution: It is recommended that the distribution be as consistent as possible with the actual measurement scenario. If the actual scene contains a lot of short text, it is recommended to include more short text in the training data. If the actual scene has high requirements for the recognition effect of spaces, it is recommended to include more text content with spaces in the training data. 
+
+* Data synthesis: In the case of some character recognition errors, it is recommended to obtain a batch of specific character dataset, add it to the original dataset and use a small learning rate for fine-tuning. The ratio of original dataset to new dataset can be 10:1 to 5:1 to avoid overfitting of the model caused by too much data in a single scene. At the same time, try to balance the word frequency of the corpus to ensure that the frequency of common words will not be too low.
+
+  Specific characters can be generated using the TextRenderer tool; for synthesis examples, please refer to [data synthesis](https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.6/applications/%E5%85%89%E5%8A%9F%E7%8E%87%E8%AE%A1%E6%95%B0%E7%A0%81%E7%AE%A1%E5%AD%97%E7%AC%A6%E8%AF%86%E5%88%AB/%E5%85%89%E5%8A%9F%E7%8E%87%E8%AE%A1%E6%95%B0%E7%A0%81%E7%AE%A1%E5%AD%97%E7%AC%A6%E8%AF%86%E5%88%AB.md#31-%E6%95%B0%E6%8D%AE%E5%87%86%E5%A4%87). The synthetic data corpus should come from real usage scenarios as much as possible, and keep the richness of fonts and backgrounds on the basis of being close to the real scene, which will help improve the model effect.
+
+* Common Chinese and English data: During training, common real data can be added to the training set (for example, in the fine-tuning scenario without changing the dictionary, it is recommended to add real data such as LSVT, RCTW, MTWI) to further improve the generalization performance of the model.
+
+### 3.2 Model
+
+It is recommended to choose the PP-OCRv3 model (configuration file: [ch_PP-OCRv3_rec_distillation.yml](../../configs/rec/PP-OCRv3/ch_PP-OCRv3_rec_distillation.yml), pre-trained model: [ch_PP-OCRv3_rec_train.tar](https://paddleocr.bj.bcebos.com/PP-OCRv3/chinese/ch_PP-OCRv3_rec_train.tar)); its accuracy and generalization performance are the best among the pre-training models currently available.
+
+For more PP-OCR series models, please refer to [PP-OCR Series Model Library](./models_list_en.md).
+
+
+### 3.3 Training hyperparameter
+
+Similar to text detection task fine-tuning, when fine-tuning the recognition model, the most important hyperparameters are the pre-trained model path `pretrained_model`, `learning_rate` and `batch_size`, some default configuration files are shown below.
+
+```yaml
+Global:
+  pretrained_model:  # pre-training model path
+Optimizer:
+  lr:
+    name: Piecewise
+    decay_epochs : [700, 800]
+    values : [0.001, 0.0001]  # learning_rate
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    factor: 0
+
+Train:
+  dataset:
+    name: SimpleDataSet
+    data_dir: ./train_data/
+    label_file_list:
+    - ./train_data/train_list.txt
+    ratio_list: [1.0] # Sampling ratio, the default value is [1.0]
+  loader:
+    shuffle: True
+    drop_last: False
+    batch_size_per_card: 128 # single gpu batch size
+    num_workers: 8
+
+```
+
+
+In the above configuration file, you first need to specify the `pretrained_model` field as the `ch_PP-OCRv3_rec_train/best_accuracy.pdparams` file path decompressed in Chapter 3.2.
+
+The configuration file provided by PaddleOCR is for 8-gpu training (equivalent to a total batch size of `8*128=1024`) and no pre-trained model is loaded. Therefore, in your scenario, the learning rate and the total batch size need to be adjusted linearly, for example:
+
+* If your scenario is single-gpu training, single gpu batch_size=128, then the total batch_size=128, in the case of loading the pre-trained model, it is recommended to adjust the learning rate to about `[1e-4, 2e-5]` (For the piecewise learning rate strategy, two values need to be set, the same below).
+* If your scenario is for single-gpu training, due to memory limitations, you can only set batch_size=64 for a single gpu, and the total batch_size=64. When loading the pre-trained model, it is recommended to adjust the learning rate to about `[5e-5, 1e-5]`.
+ + +If there is general real scene data added, it is recommended that in each epoch, the amount of vertical scene data and real scene data should be kept at about 1:1. + +For example: your own vertical scene recognition data volume is 1W, the data label file is `vertical.txt`, the collected general scene recognition data volume is 10W, and the data label file is `general.txt`. + +Then, the `label_file_list` and `ratio_list` parameters can be set as shown below. In each epoch, `vertical.txt` will be fully sampled (sampling ratio is 1.0), including 1W pieces of data; `general.txt` will be sampled according to a sampling ratio of 0.1, including `10W*0.1=1W` pieces of data, the final ratio of the two is `1:1`. + +```yaml +Train: + dataset: + name: SimpleDataSet + data_dir: ./train_data/ + label_file_list: + - vertical.txt + - general.txt + ratio_list: [1.0, 0.1] +``` + +### 3.4 training optimization + +The training process does not happen overnight. After completing a stage of training evaluation, it is recommended to collect and analyze the badcase of the current model in the real scene, adjust the proportion of training data in a targeted manner, or further add synthetic data. Through multiple iterations of training, the model effect is continuously optimized. diff --git a/doc/doc_en/ocr_book_en.md b/doc/doc_en/ocr_book_en.md index b0455fe61a..63162be566 100644 --- a/doc/doc_en/ocr_book_en.md +++ b/doc/doc_en/ocr_book_en.md @@ -1,6 +1,6 @@ # E-book: *Dive Into OCR* -"Dive Into OCR" is a textbook that combines OCR theory and practice, written by the PaddleOCR team, Chen Zhineng, a Pre-tenure Professor at Fudan University, Huang Wenhui, a senior expert in the field of vision at China Mobile Research Institute, and other industry-university-research colleagues, as well as OCR developers. The main features are as follows: +"Dive Into OCR" is a textbook that combines OCR theory and practice, written by the PaddleOCR community. 
The main features are as follows: - OCR full-stack technology covering text detection, recognition and document analysis - Closely integrate theory and practice, cross the code implementation gap, and supporting instructional videos @@ -8,6 +8,10 @@ ## Structure +
+ +
+ - The first part is the preliminary knowledge of the book, including the knowledge index and resource links needed in the process of positioning and using the book content of the book - The second part is chapters 4-8 of the book, which introduce the concepts, applications, and industry practices related to the detection and identification capabilities of the OCR engine. In the "Introduction to OCR Technology", the application scenarios and challenges of OCR, the basic concepts of technology, and the pain points in industrial applications are comprehensively explained. Then, in the two chapters of "Text Detection" and "Text Recognition", the two basic tasks of OCR are introduced. In each chapter, an algorithm is accompanied by a detailed explanation of the code and practical exercises. Chapters 6 and 7 are a detailed introduction to the PP-OCR series model, PP-OCR is a set of OCR systems for industrial applications, on the basis of the basic detection and identification model, after a series of optimization strategies to achieve the general field of industrial SOTA model, while opening up a variety of predictive deployment solutions, enabling enterprises to quickly land OCR applications. 
@@ -16,6 +20,11 @@ ## Address -- [E-book: *Dive Into OCR* (link generating)]() -- [Jupyter notebook](../../notebook/notebook_en/) -- [videos (Chinese only)](https://aistudio.baidu.com/aistudio/education/group/info/25207) +- [E-book: *Dive Into OCR* (PDF)](https://paddleocr.bj.bcebos.com/ebook/Dive_into_OCR.pdf) +- [Notebook (.ipynb)](https://github.com/PaddleOCR-Community/Dive-into-OCR) +- [Videos (Chinese only)](https://aistudio.baidu.com/aistudio/education/group/info/25207) + + +trackgit-views + + diff --git a/doc/doc_en/recognition_en.md b/doc/doc_en/recognition_en.md index 7d31b0ffe2..78917aea90 100644 --- a/doc/doc_en/recognition_en.md +++ b/doc/doc_en/recognition_en.md @@ -15,6 +15,7 @@ * [2.6 Training with knowledge distillation](#kd) * [2.7 Multi-language Training](#Multi_language) * [2.8 Training on other platform(Windows/macOS/Linux DCU)](#28) + * [2.9 Fine-tuning](#29) - [3. Evaluation and Test](#3-evaluation-and-test) * [3.1 Evaluation](#31-evaluation) * [3.2 Test](#32-test) @@ -384,6 +385,11 @@ GPU mode is not supported, you need to set `use_gpu` to False in the configurati - Linux DCU Running on a DCU device requires setting the environment variable `export HIP_VISIBLE_DEVICES=0,1,2,3`, and the rest of the training and evaluation prediction commands are exactly the same as the Linux GPU. + +## 2.9 Fine-tuning + +In actual use, it is recommended to load the official pre-trained model and fine-tune it in your own data set. For the fine-tuning method of the recognition model, please refer to: [Model Fine-tuning Tutorial](./finetune_en.md). + ## 3. Evaluation and Test diff --git a/doc/doc_en/table_recognition_en.md b/doc/doc_en/table_recognition_en.md index cff2933df2..d79d98936e 100644 --- a/doc/doc_en/table_recognition_en.md +++ b/doc/doc_en/table_recognition_en.md @@ -14,6 +14,9 @@ This article provides a full-process guide for the PaddleOCR table recognition m - [2.5. Distributed Training](#25-distributed-training) - [2.6. 
Training on other platform(Windows/macOS/Linux DCU)](#26-training-on-other-platformwindowsmacoslinux-dcu) - [2.7. Fine-tuning](#27-fine-tuning) + - [2.7.1 Dataset](#271-dataset) + - [2.7.2 model selection](#272-model-selection) + - [2.7.3 Training hyperparameter selection](#273-training-hyperparameter-selection) - [3. Evaluation and Test](#3-evaluation-and-test) - [3.1. Evaluation](#31-evaluation) - [3.2. Test table structure recognition effect](#32-test-table-structure-recognition-effect) @@ -226,8 +229,40 @@ Running on a DCU device requires setting the environment variable `export HIP_VI ## 2.7. Fine-tuning -In the actual use process, it is recommended to load the officially provided pre-training model and fine-tune it in your own data set. For the fine-tuning method of the table recognition model, please refer to: [Model fine-tuning tutorial](./finetune.md). +### 2.7.1 Dataset + +Data number: It is recommended to prepare at least 2000 table recognition datasets for model fine-tuning. + +### 2.7.2 model selection + +It is recommended to choose the SLANet model (configuration file: [SLANet_ch.yml](../../configs/table/SLANet_ch.yml), pre-training model: [ch_ppstructure_mobile_v2.0_SLANet_train.tar](https://paddleocr.bj.bcebos .com/ppstructure/models/slanet/ch_ppstructure_mobile_v2.0_SLANet_train.tar)) for fine-tuning, its accuracy and generalization performance is the best Chinese table pre-training model currently available. + +For more table recognition models, please refer to [PP-Structure Series Model Library](../../ppstructure/docs/models_list.md). + +### 2.7.3 Training hyperparameter selection + +When fine-tuning the model, the most important hyperparameters are the pretrained model path `pretrained_model`, the learning rate `learning_rate`, and some configuration files are shown below. 
+ +```yaml +Global: + pretrained_model: ./ch_ppstructure_mobile_v2.0_SLANet_train/best_accuracy.pdparams # Pre-trained model path +Optimizer: + lr: + name: Cosine + learning_rate: 0.001 # + warmup_epoch: 0 + regularizer: + name: 'L2' + factor: 0 +``` + +In the above configuration file, you first need to specify the `pretrained_model` field as the `best_accuracy.pdparams` file path. + +The configuration file provided by PaddleOCR is for 4-card training (equivalent to a total batch size of `4*48=192`) and no pre-trained model is loaded. Therefore, in your scenario, the learning rate is the same as the total The batch size needs to be adjusted linearly, for example + +* If your scenario is single card training, single card batch_size=48, then the total batch_size=48, it is recommended to adjust the learning rate to about `0.00025`. +* If your scenario is for single-card training, due to memory limitations, you can only set batch_size=32 for a single card, then the total batch_size=32, it is recommended to adjust the learning rate to about `0.00017`. # 3. Evaluation and Test diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md index 77e80faa68..5628dc3f45 100644 --- a/doc/doc_en/whl_en.md +++ b/doc/doc_en/whl_en.md @@ -261,7 +261,7 @@ Output will be a list, each item contains classification result and confidence ## 3 Use custom model When the built-in model cannot meet the needs, you need to use your own trained model. 
-First, refer to the first section of [inference_en.md](./inference_en.md) to convert your det and rec model to inference model, and then use it as follows +First, refer to [export](./detection_en.md#4-inference) doc to convert your det and rec model to inference model, and then use it as follows ### 3.1 Use by code diff --git a/paddleocr.py b/paddleocr.py index af0145b48b..96a641bb77 100644 --- a/paddleocr.py +++ b/paddleocr.py @@ -26,6 +26,9 @@ import logging import numpy as np from pathlib import Path +import base64 +from io import BytesIO +from PIL import Image tools = importlib.import_module('.', 'tools') ppocr = importlib.import_module('.', 'ppocr') @@ -431,7 +434,25 @@ def check_img(img): img, flag_gif, flag_pdf = check_and_read(image_file) if not flag_gif and not flag_pdf: with open(image_file, 'rb') as f: - img = img_decode(f.read()) + img_str = f.read() + img = img_decode(img_str) + if img is None: + try: + buf = BytesIO() + image = BytesIO(img_str) + im = Image.open(image) + rgb = im.convert('RGB') + rgb.save(buf, 'jpeg') + buf.seek(0) + image_bytes = buf.read() + data_base64 = str(base64.b64encode(image_bytes), + encoding="utf-8") + image_decode = base64.b64decode(data_base64) + img_array = np.frombuffer(image_decode, np.uint8) + img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) + except: + logger.error("error in loading image:{}".format(image_file)) + return None if img is None: logger.error("error in loading image:{}".format(image_file)) return None diff --git a/ppocr/utils/e2e_metric/Deteval.py b/ppocr/utils/e2e_metric/Deteval.py index 387b3c24e6..c2a4383eed 100755 --- a/ppocr/utils/e2e_metric/Deteval.py +++ b/ppocr/utils/e2e_metric/Deteval.py @@ -17,8 +17,6 @@ import scipy.io as io from ppocr.utils.utility import check_install -check_install("Polygon", "Polygon3") -import Polygon as plg from ppocr.utils.e2e_metric.polygon_fast import iod, area_of_intersection, area @@ -279,6 +277,8 @@ def get_score_C(gt_label, text, pred_bboxes): """ get score for 
CentripetalText (CT) prediction. """ + check_install("Polygon", "Polygon3") + import Polygon as plg def gt_reading_mod(gt_label, text): """This helper reads groundtruths from mat files""" diff --git a/ppstructure/kie/requirements.txt b/ppstructure/kie/requirements.txt index 6cfcba7641..61c230d3ed 100644 --- a/ppstructure/kie/requirements.txt +++ b/ppstructure/kie/requirements.txt @@ -2,6 +2,6 @@ sentencepiece yacs seqeval pypandoc -attrdict +attrdict3 python_docx paddlenlp>=2.4.1 diff --git a/ppstructure/predict_system.py b/ppstructure/predict_system.py index bb061c998f..b32b706299 100644 --- a/ppstructure/predict_system.py +++ b/ppstructure/predict_system.py @@ -229,7 +229,9 @@ def main(args): if args.recovery and args.use_pdf2docx_api and flag_pdf: from pdf2docx.converter import Converter - docx_file = os.path.join(args.output, '{}.docx'.format(img_name)) + os.makedirs(args.output, exist_ok=True) + docx_file = os.path.join(args.output, + '{}_api.docx'.format(img_name)) cv = Converter(image_file) cv.convert(docx_file) cv.close() diff --git a/ppstructure/recovery/recovery_to_doc.py b/ppstructure/recovery/recovery_to_doc.py index 1d8f8d9d4b..0501812082 100644 --- a/ppstructure/recovery/recovery_to_doc.py +++ b/ppstructure/recovery/recovery_to_doc.py @@ -73,7 +73,7 @@ def convert_info_docx(img, res, save_folder, img_name): text_run.font.size = shared.Pt(10) # save to docx - docx_path = os.path.join(save_folder, '{}.docx'.format(img_name)) + docx_path = os.path.join(save_folder, '{}_ocr.docx'.format(img_name)) doc.save(docx_path) logger.info('docx save to {}'.format(docx_path)) diff --git a/ppstructure/recovery/requirements.txt b/ppstructure/recovery/requirements.txt index ec08f9d0a2..761b9d7c3e 100644 --- a/ppstructure/recovery/requirements.txt +++ b/ppstructure/recovery/requirements.txt @@ -1,5 +1,4 @@ python-docx -PyMuPDF==1.19.0 beautifulsoup4 fonttools>=4.24.0 fire>=0.3.0 diff --git a/ppstructure/recovery/table_process.py b/ppstructure/recovery/table_process.py 
index 982e6b760f..77a6ef7659 100644 --- a/ppstructure/recovery/table_process.py +++ b/ppstructure/recovery/table_process.py @@ -278,8 +278,6 @@ def handle_table(self, html, doc): cell_col += colspan cell_row += 1 - doc.save('1.docx') - def handle_data(self, data): if self.skip: return diff --git a/requirements.txt b/requirements.txt index f3d9ce89e3..56f342a4e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ lxml premailer openpyxl attrdict -PyMuPDF==1.19.0 \ No newline at end of file +PyMuPDF<1.21.0 diff --git a/test_tipc/test_serving_infer_cpp.sh b/test_tipc/test_serving_infer_cpp.sh index 10ddecf3fa..6de685682a 100644 --- a/test_tipc/test_serving_infer_cpp.sh +++ b/test_tipc/test_serving_infer_cpp.sh @@ -103,7 +103,9 @@ function func_serving(){ last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" status_check $last_status "${cpp_client_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" - ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9 + #ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9 + ${python_list[0]} ${web_service_py} stop + sleep 5s else server_log_path="${LOG_PATH}/cpp_server_gpu.log" web_service_cpp_cmd="nohup ${python_list[0]} ${web_service_py} --model ${det_server_value} ${rec_server_value} ${op_key} ${op_value} ${port_key} ${port_value} ${gpu_key} ${gpu_id} > ${server_log_path} 2>&1 &" @@ -115,7 +117,8 @@ function func_serving(){ last_status=${PIPESTATUS[0]} eval "cat ${_save_log_path}" status_check $last_status "${cpp_client_cmd}" "${status_log}" "${model_name}" "${_save_log_path}" - ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9 + #ps ux | grep -i ${port_value} | awk '{print $2}' | xargs kill -s 9 + ${python_list[0]} ${web_service_py} stop fi done } diff --git a/tools/infer_kie_token_ser_re.py b/tools/infer_kie_token_ser_re.py index c4fa2c927a..76120a913f 100755 --- a/tools/infer_kie_token_ser_re.py +++ b/tools/infer_kie_token_ser_re.py @@ -81,7 +81,7 @@ def 
make_input(ser_inputs, ser_results): end.append(entity['end']) label.append(entities_labels[res['pred']]) - entities = np.full([max_seq_len + 1, 3], fill_value=-1) + entities = np.full([max_seq_len + 1, 3], fill_value=-1, dtype=np.int64) entities[0, 0] = len(start) entities[1:len(start) + 1, 0] = start entities[0, 1] = len(end) @@ -98,7 +98,7 @@ def make_input(ser_inputs, ser_results): head.append(i) tail.append(j) - relations = np.full([len(head) + 1, 2], fill_value=-1) + relations = np.full([len(head) + 1, 2], fill_value=-1, dtype=np.int64) relations[0, 0] = len(head) relations[1:len(head) + 1, 0] = head relations[0, 1] = len(tail)