[s2t] mv dataset into paddlespeech.dataset (#3183)
* mv dataset into paddlespeech.dataset

* add aidatatang

* fix import
zh794390558 committed Apr 21, 2023
1 parent 3ad55a3 commit 35d874c
Showing 27 changed files with 619 additions and 387 deletions.
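
The recurring change across the file diffs below is an import migration: the shared download/unpack helpers move from the repo-local utils.utility module into the installable paddlespeech.dataset package, so the dataset scripts no longer depend on the repository checkout layout. A minimal sketch of the before/after pattern (module paths taken verbatim from the diffs below):

# Before this commit: helpers imported from a repo-local utils package.
# from utils.utility import download
# from utils.utility import unpack

# After this commit: helpers imported from the installed paddlespeech package.
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack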
136 changes: 2 additions & 134 deletions dataset/aidatatang_200zh/aidatatang_200zh.py
@@ -18,139 +18,7 @@
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path

import soundfile

from utils.utility import download
from utils.utility import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL_ROOT = 'http://www.openslr.org/resources/62'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/aidatatang_200zh",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aidatatang_200_zh_transcript.txt')
    transcript_dict = {}
    for line in codecs.open(transcript_path, 'r', 'utf-8'):
        line = line.strip()
        if line == '':
            continue
        audio_id, text = line.split(' ', 1)
        # remove whitespace (the transcript is character-level Chinese text)
        text = ''.join(text.split())
        transcript_dict[audio_id] = text

    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        del json_lines[:]
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, 'corpus/', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                if not fname.endswith('.wav'):
                    continue

                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]
                utt2spk = Path(audio_path).parent.name

                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text,
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        manifest_dir = os.path.dirname(manifest_path_prefix)
        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
        with open(meta_path, 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            print(f"{total_text / total_sec} text/sec", file=f)
            print(f"{total_sec / total_num} sec/utt", file=f)


def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
    """Download, unpack and create manifest file."""
    data_dir = os.path.join(target_dir, subset)
    if not os.path.exists(data_dir):
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
        # unpack all audio tar files
        audio_dir = os.path.join(data_dir, 'corpus')
        for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
            for sub in dirlist:
                print(f"unpack dir {sub}...")
                for folder, _, filelist in sorted(
                        os.walk(os.path.join(subfolder, sub))):
                    for ftar in filelist:
                        unpack(os.path.join(folder, ftar), folder, True)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)

    create_manifest(data_dir, manifest_path)


def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        url=DATA_URL,
        md5sum=MD5_DATA,
        target_dir=args.target_dir,
        manifest_path=args.manifest_prefix,
        subset='aidatatang_200zh')

    print("Data download and manifest prepare done!")

from paddlespeech.dataset.aidatatang_200zh import aidatatang_200zh_main

if __name__ == '__main__':
    main()
    aidatatang_200zh_main()
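
With this commit the legacy script body above is deleted and the file becomes a thin shim around the relocated paddlespeech.dataset.aidatatang_200zh module. For reference, each line the removed create_manifest wrote is one standalone JSON object per utterance; a sketch of such a line, with keys taken from the code above and purely illustrative placeholder values:

import json

# Keys mirror create_manifest above; every value here is a made-up
# placeholder, not real corpus data.
example_entry = json.dumps(
    {
        'utt': 'T0055G0013S0001',  # filename stem used as utterance id
        'utt2spk': 'G0013',  # speaker id = parent directory name
        'feat': '/data/aidatatang_200zh/corpus/train/G0013/T0055G0013S0001.wav',
        'feat_shape': (3.27, ),  # duration in seconds
        'text': '今天天气怎么样',  # transcript with whitespace removed
    },
    ensure_ascii=False)
print(example_entry)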
3 changes: 0 additions & 3 deletions dataset/aishell/README.md

This file was deleted.

140 changes: 2 additions & 138 deletions dataset/aishell/aishell.py
@@ -18,143 +18,7 @@
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path

import soundfile

from utils.utility import download
from utils.utility import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL_ROOT = 'http://openslr.elda.org/resources/33'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/Aishell",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aishell_transcript_v0.8.txt')
    transcript_dict = {}
    for line in codecs.open(transcript_path, 'r', 'utf-8'):
        line = line.strip()
        if line == '':
            continue
        audio_id, text = line.split(' ', 1)
        # remove whitespace (the transcript is character-level Chinese text)
        text = ''.join(text.split())
        transcript_dict[audio_id] = text

    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        del json_lines[:]
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, 'wav', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]
                # skip audio files that have no transcription
                if audio_id not in transcript_dict:
                    continue

                utt2spk = Path(audio_path).parent.name
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        manifest_dir = os.path.dirname(manifest_path_prefix)
        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
        with open(meta_path, 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            print(f"{total_text / total_sec} text/sec", file=f)
            print(f"{total_sec / total_num} sec/utt", file=f)


def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
    """Download, unpack and create manifest file."""
    data_dir = os.path.join(target_dir, 'data_aishell')
    if not os.path.exists(data_dir):
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
        # unpack all audio tar files
        audio_dir = os.path.join(data_dir, 'wav')
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for ftar in filelist:
                unpack(os.path.join(subfolder, ftar), subfolder, True)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)

    if manifest_path:
        create_manifest(data_dir, manifest_path)


def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        url=DATA_URL,
        md5sum=MD5_DATA,
        target_dir=args.target_dir,
        manifest_path=args.manifest_prefix)

    prepare_dataset(
        url=RESOURCE_URL,
        md5sum=MD5_RESOURCE,
        target_dir=args.target_dir,
        manifest_path=None)

    print("Data download and manifest prepare done!")

from paddlespeech.dataset.aishell import aishell_main

if __name__ == '__main__':
    main()
    aishell_main()
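
The aishell script follows the same pattern: the inlined preparation code above is removed in favor of a call into paddlespeech.dataset.aishell. The per-split .meta files both removed scripts wrote are simple derived statistics; as a sanity check, a sketch that recomputes them from a finished manifest (manifest layout taken from create_manifest above):

import json


def manifest_stats(manifest_path):
    """Recompute the .meta statistics for one manifest file: one JSON
    object per line, with duration in seconds at 'feat_shape'[0]."""
    total_sec, total_text, total_num = 0.0, 0, 0
    with open(manifest_path, encoding='utf-8') as f:
        for line in f:
            entry = json.loads(line)
            total_sec += entry['feat_shape'][0]
            total_text += len(entry['text'])
            total_num += 1
    print(f"{total_num} utts")
    print(f"{total_sec / (60 * 60)} h")
    print(f"{total_text} text")
    print(f"{total_text / total_sec} text/sec")
    print(f"{total_sec / total_num} sec/utt")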
4 changes: 2 additions & 2 deletions dataset/librispeech/librispeech.py
@@ -28,8 +28,8 @@
import distutils.util
import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

URL_ROOT = "http://openslr.elda.org/resources/12"
#URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
4 changes: 2 additions & 2 deletions dataset/mini_librispeech/mini_librispeech.py
@@ -27,8 +27,8 @@

import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

URL_ROOT = "http://openslr.elda.org/resources/31"
URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"
4 changes: 2 additions & 2 deletions dataset/musan/musan.py
@@ -29,8 +29,8 @@

import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

4 changes: 2 additions & 2 deletions dataset/rir_noise/rir_noise.py
@@ -29,8 +29,8 @@

import soundfile

from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

4 changes: 2 additions & 2 deletions dataset/thchs30/thchs30.py
@@ -27,8 +27,8 @@

import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

2 changes: 1 addition & 1 deletion dataset/timit/timit.py
@@ -28,7 +28,7 @@

import soundfile

from utils.utility import unzip
from paddlespeech.dataset.download import unzip

URL_ROOT = ""
MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"
6 changes: 3 additions & 3 deletions dataset/voxceleb/voxceleb1.py
@@ -31,9 +31,9 @@

import soundfile

from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip

# by default, all data will be downloaded into the current data/voxceleb directory
DATA_HOME = os.path.expanduser('.')
6 changes: 3 additions & 3 deletions dataset/voxceleb/voxceleb2.py
@@ -27,9 +27,9 @@

import soundfile

from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip

# by default, all data will be downloaded into the current data/voxceleb directory
DATA_HOME = os.path.expanduser('.')
6 changes: 3 additions & 3 deletions dataset/voxforge/voxforge.py
@@ -28,9 +28,9 @@

import soundfile

from utils.utility import download_multi
from utils.utility import getfile_insensitive
from utils.utility import unpack
from paddlespeech.dataset.download import download_multi
from paddlespeech.dataset.download import getfile_insensitive
from paddlespeech.dataset.download import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

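Taken together, the updated imports show that paddlespeech.dataset.download now consolidates all of the helpers these scripts previously pulled from utils.utility: download, unpack, unzip, check_md5sum, download_multi and getfile_insensitive. A sketch of a downstream preparation script written against the consolidated module; the URL, checksum and target directory are placeholders, and check_md5sum returning a boolean is an assumption (only the download and unpack call signatures appear in the diffs above):

import os

from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

# Placeholder archive location and checksum, for illustration only.
DATA_URL = 'https://example.com/resources/some_corpus.tgz'
MD5_DATA = '00000000000000000000000000000000'
TARGET_DIR = os.path.expanduser('~/.cache/paddle/dataset/speech/some_corpus')

# download(url, md5sum, target_dir) and unpack(filepath, target_dir)
# match the call sites in the scripts above.
filepath = download(DATA_URL, MD5_DATA, TARGET_DIR)
assert check_md5sum(filepath, MD5_DATA)  # assumed to return True/False
unpack(filepath, TARGET_DIR)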
