Merge branch 'develop' into amp

PaddlePaddle · Apr 25, 2023 · bc365cb · bc365cb
2 parents f3d567f + 7cab869
commit bc365cb
Show file tree

Hide file tree

Showing 816 changed files with 30,798 additions and 32,546 deletions.
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
@@ -27,4 +27,4 @@ git commit -m "xxxxxx, test=doc"
 1. 虽然跳过了 CI，但是还要先排队排到才能跳过，所以非自己方向看到 pending 不要着急 🤣
 2. 在 `git commit --amend` 的时候才加 `test=xxx` 可能不太有效
 3. 一个 pr 多次提交 commit 注意每次都要加 `test=xxx`，因为每个 commit 都会触发 CI
-4. 删除 python 环境中已经安装好的的 paddlespeech，否则可能会影响 import paddlespeech 的顺序</div>
+4. 删除 python 环境中已经安装好的 paddlespeech，否则可能会影响 import paddlespeech 的顺序</div>
diff --git a/.mergify.yml b/.mergify.yml
@@ -136,7 +136,7 @@ pull_request_rules:
  add: ["Docker"]
  - name: "auto add label=Deployment"
  conditions:
- - files~=^speechx/
+ - files~=^runtime/
  actions:
  label:
  add: ["Deployment"]
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,8 +3,12 @@ repos:
  rev: v0.16.0
  hooks:
  - id: yapf
- files: \.py$
- exclude: (?=third_party).*(\.py)$
+ name: yapf
+ language: python
+ entry: yapf
+ args: [-i, -vv]
+ types: [python]
+ exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
 
 - repo: https://github.com/pre-commit/pre-commit-hooks
  rev: a11d9314b22d8f8c7556443875b731ef05965464
@@ -31,7 +35,7 @@ repos:
  - --ignore=E501,E228,E226,E261,E266,E128,E402,W503
  - --builtins=G,request
  - --jobs=1
- exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
+ exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
 
 - repo : https://github.com/Lucas-C/pre-commit-hooks
  rev: v1.0.1
@@ -53,16 +57,16 @@ repos:
  entry: bash .pre-commit-hooks/clang-format.hook -i
  language: system
  files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
- exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party/kaldi-native-fbank/csrc|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
+ exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders|runtime/engine/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
  - id: cpplint
  name: cpplint
  description: Static code analysis of C/C++ files
  language: python
  files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
- exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party/kaldi-native-fbank/csrc|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
+ exclude: (?=runtime/engine/kaldi|runtime/engine/common/matrix|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders|runtime/engine/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$ 
  entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent
 - repo: https://github.com/asottile/reorder_python_imports
  rev: v2.4.0
  hooks:
  - id: reorder-python-imports
- exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h\.hpp|\.py)$
+ exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
diff --git a/README.md b/README.md
@@ -179,6 +179,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
 
 ### Recent Update
 - 👑 2023.04.25: Add [AMP for U2 conformer](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167).
+- 🔥 2023.04.06: Add [subtitle file (.srt format) generation example](./demos/streaming_asr_server).
 - 🔥 2023.03.14: Add SVS(Singing Voice Synthesis) examples with Opencpop dataset, including [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) and [HiFiGAN](./examples/opencpop/voc5), the effect is continuously optimized.
 - 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3).
 - 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo (with C++ Chinese Text Frontend)](./demos/TTSArmLinux).
@@ -193,7 +194,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
 - 👑 2022.11.18: Add [Whisper CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), support multi language recognition and translation.
 - 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](./demos/speech_ssl), Support ASR and Feature Extraction.
 - 🎉 2022.11.17: Add [male voice for TTS](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660).
-- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](./speechx/examples/u2pp_ol/wenetspeech).
+- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/runtime/examples/u2pp_ol/wenetspeech).
 - 👑 2022.11.01: Add [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) for [Chinese English mixed TTS](./examples/zh_en_tts/tts3).
 - 🔥 2022.10.26: Add [Prosody Prediction](./examples/other/rhy) for TTS.
 - 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend.

diff --git a/README_cn.md b/README_cn.md
@@ -184,6 +184,7 @@
 
 ### 近期更新
 - 👑 2023.04.25: 新增 [U2 conformer 的 AMP 训练](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167).
+- 🔥 2023.04.06: 新增 [srt 格式字幕生成功能](./demos/streaming_asr_server)。
 - 🔥 2023.03.14: 新增基于 Opencpop 数据集的 SVS (歌唱合成) 示例，包含 [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) 和 [HiFiGAN](./examples/opencpop/voc5)，效果持续优化中。
 - 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3)。
 - 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例 (包含 C++ 中文文本前端模块)](./demos/TTSArmLinux)。

diff --git a/audio/paddleaudio/backends/soundfile_backend.py b/audio/paddleaudio/backends/soundfile_backend.py
@@ -191,7 +191,7 @@ def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
 
  if sr <= 0:
  raise ParameterError(
- f'Sample rate should be larger than 0, recieved sr = {sr}')
+ f'Sample rate should be larger than 0, received sr = {sr}')
 
  if y.dtype not in ['int16', 'int8']:
  warnings.warn(

diff --git a/dataset/aidatatang_200zh/aidatatang_200zh.py b/dataset/aidatatang_200zh/aidatatang_200zh.py
@@ -18,139 +18,7 @@
 meta data (i.e. audio filepath, transcript and audio duration)
 of each audio file in the data set.
 """
-import argparse
-import codecs
-import json
-import os
-from pathlib import Path
-
-import soundfile
-
-from utils.utility import download
-from utils.utility import unpack
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-
-URL_ROOT = 'https://www.openslr.org/resources/62'
-# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
-DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
-MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- "--target_dir",
- default=DATA_HOME + "/aidatatang_200zh",
- type=str,
- help="Directory to save the dataset. (default: %(default)s)")
-parser.add_argument(
- "--manifest_prefix",
- default="manifest",
- type=str,
- help="Filepath prefix for output manifests. (default: %(default)s)")
-args = parser.parse_args()
-
-
-def create_manifest(data_dir, manifest_path_prefix):
- print("Creating manifest %s ..." % manifest_path_prefix)
- json_lines = []
- transcript_path = os.path.join(data_dir, 'transcript',
- 'aidatatang_200_zh_transcript.txt')
- transcript_dict = {}
- for line in codecs.open(transcript_path, 'r', 'utf-8'):
- line = line.strip()
- if line == '':
- continue
- audio_id, text = line.split(' ', 1)
- # remove withespace, charactor text
- text = ''.join(text.split())
- transcript_dict[audio_id] = text
-
- data_types = ['train', 'dev', 'test']
- for dtype in data_types:
- del json_lines[:]
- total_sec = 0.0
- total_text = 0.0
- total_num = 0
-
- audio_dir = os.path.join(data_dir, 'corpus/', dtype)
- for subfolder, _, filelist in sorted(os.walk(audio_dir)):
- for fname in filelist:
- if not fname.endswith('.wav'):
- continue
-
- audio_path = os.path.abspath(os.path.join(subfolder, fname))
- audio_id = os.path.basename(fname)[:-4]
- utt2spk = Path(audio_path).parent.name
-
- audio_data, samplerate = soundfile.read(audio_path)
- duration = float(len(audio_data) / samplerate)
- text = transcript_dict[audio_id]
- json_lines.append(
- json.dumps(
- {
- 'utt': audio_id,
- 'utt2spk': str(utt2spk),
- 'feat': audio_path,
- 'feat_shape': (duration, ), # second
- 'text': text,
- },
- ensure_ascii=False))
-
- total_sec += duration
- total_text += len(text)
- total_num += 1
-
- manifest_path = manifest_path_prefix + '.' + dtype
- with codecs.open(manifest_path, 'w', 'utf-8') as fout:
- for line in json_lines:
- fout.write(line + '\n')
-
- manifest_dir = os.path.dirname(manifest_path_prefix)
- meta_path = os.path.join(manifest_dir, dtype) + '.meta'
- with open(meta_path, 'w') as f:
- print(f"{dtype}:", file=f)
- print(f"{total_num} utts", file=f)
- print(f"{total_sec / (60*60)} h", file=f)
- print(f"{total_text} text", file=f)
- print(f"{total_text / total_sec} text/sec", file=f)
- print(f"{total_sec / total_num} sec/utt", file=f)
-
-
-def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
- """Download, unpack and create manifest file."""
- data_dir = os.path.join(target_dir, subset)
- if not os.path.exists(data_dir):
- filepath = download(url, md5sum, target_dir)
- unpack(filepath, target_dir)
- # unpack all audio tar files
- audio_dir = os.path.join(data_dir, 'corpus')
- for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
- for sub in dirlist:
- print(f"unpack dir {sub}...")
- for folder, _, filelist in sorted(
- os.walk(os.path.join(subfolder, sub))):
- for ftar in filelist:
- unpack(os.path.join(folder, ftar), folder, True)
- else:
- print("Skip downloading and unpacking. Data already exists in %s." %
- target_dir)
-
- create_manifest(data_dir, manifest_path)
-
-
-def main():
- if args.target_dir.startswith('~'):
- args.target_dir = os.path.expanduser(args.target_dir)
-
- prepare_dataset(
- url=DATA_URL,
- md5sum=MD5_DATA,
- target_dir=args.target_dir,
- manifest_path=args.manifest_prefix,
- subset='aidatatang_200zh')
-
- print("Data download and manifest prepare done!")
-
+from paddlespeech.dataset.aidatatang_200zh import aidatatang_200zh_main
 
 if __name__ == '__main__':
- main()
+ aidatatang_200zh_main()
diff --git a/dataset/aishell/README.md b/dataset/aishell/README.md
diff --git a/dataset/aishell/aishell.py b/dataset/aishell/aishell.py
@@ -18,143 +18,7 @@
 meta data (i.e. audio filepath, transcript and audio duration)
 of each audio file in the data set.
 """
-import argparse
-import codecs
-import json
-import os
-from pathlib import Path
-
-import soundfile
-
-from utils.utility import download
-from utils.utility import unpack
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-
-URL_ROOT = 'https://openslr.elda.org/resources/33'
-# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
-DATA_URL = URL_ROOT + '/data_aishell.tgz'
-MD5_DATA = '2f494334227864a8a8fec932999db9d8'
-RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
-MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- "--target_dir",
- default=DATA_HOME + "/Aishell",
- type=str,
- help="Directory to save the dataset. (default: %(default)s)")
-parser.add_argument(
- "--manifest_prefix",
- default="manifest",
- type=str,
- help="Filepath prefix for output manifests. (default: %(default)s)")
-args = parser.parse_args()
-
-
-def create_manifest(data_dir, manifest_path_prefix):
- print("Creating manifest %s ..." % manifest_path_prefix)
- json_lines = []
- transcript_path = os.path.join(data_dir, 'transcript',
- 'aishell_transcript_v0.8.txt')
- transcript_dict = {}
- for line in codecs.open(transcript_path, 'r', 'utf-8'):
- line = line.strip()
- if line == '':
- continue
- audio_id, text = line.split(' ', 1)
- # remove withespace, charactor text
- text = ''.join(text.split())
- transcript_dict[audio_id] = text
-
- data_types = ['train', 'dev', 'test']
- for dtype in data_types:
- del json_lines[:]
- total_sec = 0.0
- total_text = 0.0
- total_num = 0
-
- audio_dir = os.path.join(data_dir, 'wav', dtype)
- for subfolder, _, filelist in sorted(os.walk(audio_dir)):
- for fname in filelist:
- audio_path = os.path.abspath(os.path.join(subfolder, fname))
- audio_id = os.path.basename(fname)[:-4]
- # if no transcription for audio then skipped
- if audio_id not in transcript_dict:
- continue
-
- utt2spk = Path(audio_path).parent.name
- audio_data, samplerate = soundfile.read(audio_path)
- duration = float(len(audio_data) / samplerate)
- text = transcript_dict[audio_id]
- json_lines.append(
- json.dumps(
- {
- 'utt': audio_id,
- 'utt2spk': str(utt2spk),
- 'feat': audio_path,
- 'feat_shape': (duration, ), # second
- 'text': text
- },
- ensure_ascii=False))
-
- total_sec += duration
- total_text += len(text)
- total_num += 1
-
- manifest_path = manifest_path_prefix + '.' + dtype
- with codecs.open(manifest_path, 'w', 'utf-8') as fout:
- for line in json_lines:
- fout.write(line + '\n')
-
- manifest_dir = os.path.dirname(manifest_path_prefix)
- meta_path = os.path.join(manifest_dir, dtype) + '.meta'
- with open(meta_path, 'w') as f:
- print(f"{dtype}:", file=f)
- print(f"{total_num} utts", file=f)
- print(f"{total_sec / (60*60)} h", file=f)
- print(f"{total_text} text", file=f)
- print(f"{total_text / total_sec} text/sec", file=f)
- print(f"{total_sec / total_num} sec/utt", file=f)
-
-
-def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
- """Download, unpack and create manifest file."""
- data_dir = os.path.join(target_dir, 'data_aishell')
- if not os.path.exists(data_dir):
- filepath = download(url, md5sum, target_dir)
- unpack(filepath, target_dir)
- # unpack all audio tar files
- audio_dir = os.path.join(data_dir, 'wav')
- for subfolder, _, filelist in sorted(os.walk(audio_dir)):
- for ftar in filelist:
- unpack(os.path.join(subfolder, ftar), subfolder, True)
- else:
- print("Skip downloading and unpacking. Data already exists in %s." %
- target_dir)
-
- if manifest_path:
- create_manifest(data_dir, manifest_path)
-
-
-def main():
- if args.target_dir.startswith('~'):
- args.target_dir = os.path.expanduser(args.target_dir)
-
- prepare_dataset(
- url=DATA_URL,
- md5sum=MD5_DATA,
- target_dir=args.target_dir,
- manifest_path=args.manifest_prefix)
-
- prepare_dataset(
- url=RESOURCE_URL,
- md5sum=MD5_RESOURCE,
- target_dir=args.target_dir,
- manifest_path=None)
-
- print("Data download and manifest prepare done!")
-
+from paddlespeech.dataset.aishell import aishell_main
 
 if __name__ == '__main__':
- main()
+ aishell_main()
diff --git a/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py
@@ -28,8 +28,8 @@
 import distutils.util
 import soundfile
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 URL_ROOT = "https://openslr.elda.org/resources/12"
 #URL_ROOT = "https://openslr.magicdatatech.com/resources/12"