[s2t] mv dataset into paddlespeech.dataset (#3183)
* mv dataset into paddlespeech.dataset

* add aidatatang

* fix import
zh794390558 committed Apr 21, 2023
1 parent 3ad55a3 commit 35d874c
Showing 27 changed files with 619 additions and 387 deletions.
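
The recurring change across the file diffs below is an import migration: the shared download/unpack helpers move from the repo-local utils.utility module into the installable paddlespeech.dataset package, so the dataset scripts no longer depend on the repository checkout layout. A minimal sketch of the before/after pattern (module paths taken verbatim from the diffs below):

# Before this commit: helpers imported from a repo-local utils package.
# from utils.utility import download
# from utils.utility import unpack

# After this commit: helpers imported from the installed paddlespeech package.
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack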
136 changes: 2 additions & 134 deletions dataset/aidatatang_200zh/aidatatang_200zh.py
@@ -18,139 +18,7 @@
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path

import soundfile

from utils.utility import download
from utils.utility import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL_ROOT = 'http://www.openslr.org/resources/62'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/aidatatang_200zh",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aidatatang_200_zh_transcript.txt')
    transcript_dict = {}
    for line in codecs.open(transcript_path, 'r', 'utf-8'):
        line = line.strip()
        if line == '':
            continue
        audio_id, text = line.split(' ', 1)
        # remove whitespace (the transcript is character-level Chinese text)
        text = ''.join(text.split())
        transcript_dict[audio_id] = text

    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        del json_lines[:]
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, 'corpus/', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                if not fname.endswith('.wav'):
                    continue

                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]
                utt2spk = Path(audio_path).parent.name

                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text,
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        manifest_dir = os.path.dirname(manifest_path_prefix)
        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
        with open(meta_path, 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            print(f"{total_text / total_sec} text/sec", file=f)
            print(f"{total_sec / total_num} sec/utt", file=f)


def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
    """Download, unpack and create manifest file."""
    data_dir = os.path.join(target_dir, subset)
    if not os.path.exists(data_dir):
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
        # unpack all audio tar files
        audio_dir = os.path.join(data_dir, 'corpus')
        for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
            for sub in dirlist:
                print(f"unpack dir {sub}...")
                for folder, _, filelist in sorted(
                        os.walk(os.path.join(subfolder, sub))):
                    for ftar in filelist:
                        unpack(os.path.join(folder, ftar), folder, True)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)

    create_manifest(data_dir, manifest_path)


def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        url=DATA_URL,
        md5sum=MD5_DATA,
        target_dir=args.target_dir,
        manifest_path=args.manifest_prefix,
        subset='aidatatang_200zh')

    print("Data download and manifest prepare done!")

from paddlespeech.dataset.aidatatang_200zh import aidatatang_200zh_main

if __name__ == '__main__':
    main()
    aidatatang_200zh_main()
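
With this commit the legacy script body above is deleted and the file becomes a thin shim around the relocated paddlespeech.dataset.aidatatang_200zh module. For reference, each line the removed create_manifest wrote is one standalone JSON object per utterance; a sketch of such a line, with keys taken from the code above and purely illustrative placeholder values:

import json

# Keys mirror create_manifest above; every value here is a made-up
# placeholder, not real corpus data.
example_entry = json.dumps(
    {
        'utt': 'T0055G0013S0001',  # filename stem used as utterance id
        'utt2spk': 'G0013',  # speaker id = parent directory name
        'feat': '/data/aidatatang_200zh/corpus/train/G0013/T0055G0013S0001.wav',
        'feat_shape': (3.27, ),  # duration in seconds
        'text': '今天天气怎么样',  # transcript with whitespace removed
    },
    ensure_ascii=False)
print(example_entry)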
3 changes: 0 additions & 3 deletions dataset/aishell/README.md

This file was deleted.

140 changes: 2 additions & 138 deletions dataset/aishell/aishell.py
@@ -18,143 +18,7 @@
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path

import soundfile

from utils.utility import download
from utils.utility import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL_ROOT = 'http://openslr.elda.org/resources/33'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/Aishell",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aishell_transcript_v0.8.txt')
    transcript_dict = {}
    for line in codecs.open(transcript_path, 'r', 'utf-8'):
        line = line.strip()
        if line == '':
            continue
        audio_id, text = line.split(' ', 1)
        # remove whitespace (the transcript is character-level Chinese text)
        text = ''.join(text.split())
        transcript_dict[audio_id] = text

    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        del json_lines[:]
        total_sec = 0.0
        total_text = 0.0
        total_num = 0

        audio_dir = os.path.join(data_dir, 'wav', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]
                # skip audio files that have no transcription
                if audio_id not in transcript_dict:
                    continue

                utt2spk = Path(audio_path).parent.name
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        manifest_dir = os.path.dirname(manifest_path_prefix)
        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
        with open(meta_path, 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            print(f"{total_text / total_sec} text/sec", file=f)
            print(f"{total_sec / total_num} sec/utt", file=f)


def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
    """Download, unpack and create manifest file."""
    data_dir = os.path.join(target_dir, 'data_aishell')
    if not os.path.exists(data_dir):
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
        # unpack all audio tar files
        audio_dir = os.path.join(data_dir, 'wav')
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for ftar in filelist:
                unpack(os.path.join(subfolder, ftar), subfolder, True)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)

    if manifest_path:
        create_manifest(data_dir, manifest_path)


def main():
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    prepare_dataset(
        url=DATA_URL,
        md5sum=MD5_DATA,
        target_dir=args.target_dir,
        manifest_path=args.manifest_prefix)

    prepare_dataset(
        url=RESOURCE_URL,
        md5sum=MD5_RESOURCE,
        target_dir=args.target_dir,
        manifest_path=None)

    print("Data download and manifest prepare done!")

from paddlespeech.dataset.aishell import aishell_main

if __name__ == '__main__':
    main()
    aishell_main()
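
The aishell script follows the same pattern: the inlined preparation code above is removed in favor of a call into paddlespeech.dataset.aishell. The per-split .meta files both removed scripts wrote are simple derived statistics; as a sanity check, a sketch that recomputes them from a finished manifest (manifest layout taken from create_manifest above):

import json


def manifest_stats(manifest_path):
    """Recompute the .meta statistics for one manifest file: one JSON
    object per line, with duration in seconds at 'feat_shape'[0]."""
    total_sec, total_text, total_num = 0.0, 0, 0
    with open(manifest_path, encoding='utf-8') as f:
        for line in f:
            entry = json.loads(line)
            total_sec += entry['feat_shape'][0]
            total_text += len(entry['text'])
            total_num += 1
    print(f"{total_num} utts")
    print(f"{total_sec / (60 * 60)} h")
    print(f"{total_text} text")
    print(f"{total_text / total_sec} text/sec")
    print(f"{total_sec / total_num} sec/utt")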
4 changes: 2 additions & 2 deletions dataset/librispeech/librispeech.py
@@ -28,8 +28,8 @@
import distutils.util
import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

URL_ROOT = "http://openslr.elda.org/resources/12"
#URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
4 changes: 2 additions & 2 deletions dataset/mini_librispeech/mini_librispeech.py
@@ -27,8 +27,8 @@

import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

URL_ROOT = "http://openslr.elda.org/resources/31"
URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"
4 changes: 2 additions & 2 deletions dataset/musan/musan.py
@@ -29,8 +29,8 @@

import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

4 changes: 2 additions & 2 deletions dataset/rir_noise/rir_noise.py
@@ -29,8 +29,8 @@

import soundfile

from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

4 changes: 2 additions & 2 deletions dataset/thchs30/thchs30.py
@@ -27,8 +27,8 @@

import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

2 changes: 1 addition & 1 deletion dataset/timit/timit.py
@@ -28,7 +28,7 @@

import soundfile

from utils.utility import unzip
from paddlespeech.dataset.download import unzip

URL_ROOT = ""
MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"
6 changes: 3 additions & 3 deletions dataset/voxceleb/voxceleb1.py
@@ -31,9 +31,9 @@

import soundfile

from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip

# by default, all data will be downloaded into the current data/voxceleb directory
DATA_HOME = os.path.expanduser('.')
6 changes: 3 additions & 3 deletions dataset/voxceleb/voxceleb2.py
@@ -27,9 +27,9 @@

import soundfile

from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip

# by default, all data will be downloaded into the current data/voxceleb directory
DATA_HOME = os.path.expanduser('.')
6 changes: 3 additions & 3 deletions dataset/voxforge/voxforge.py
@@ -28,9 +28,9 @@

import soundfile

from utils.utility import download_multi
from utils.utility import getfile_insensitive
from utils.utility import unpack
from paddlespeech.dataset.download import download_multi
from paddlespeech.dataset.download import getfile_insensitive
from paddlespeech.dataset.download import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

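Taken together, the updated imports show that paddlespeech.dataset.download now consolidates all of the helpers these scripts previously pulled from utils.utility: download, unpack, unzip, check_md5sum, download_multi and getfile_insensitive. A sketch of a downstream preparation script written against the consolidated module; the URL, checksum and target directory are placeholders, and check_md5sum returning a boolean is an assumption (only the download and unpack call signatures appear in the diffs above):

import os

from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

# Placeholder archive location and checksum, for illustration only.
DATA_URL = 'https://example.com/resources/some_corpus.tgz'
MD5_DATA = '00000000000000000000000000000000'
TARGET_DIR = os.path.expanduser('~/.cache/paddle/dataset/speech/some_corpus')

# download(url, md5sum, target_dir) and unpack(filepath, target_dir)
# match the call sites in the scripts above.
filepath = download(DATA_URL, MD5_DATA, TARGET_DIR)
assert check_md5sum(filepath, MD5_DATA)  # assumed to return True/False
unpack(filepath, TARGET_DIR)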
