Skip to content

Commit

Permalink
fix scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Jackwaterveg committed Jul 5, 2022
1 parent 6ec6921 commit 92d1d08
Show file tree
Hide file tree
Showing 16 changed files with 901 additions and 417 deletions.
20 changes: 4 additions & 16 deletions examples/wenetspeech/asr1/conf/conformer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ test_manifest: data/test_meeting/data.list
use_stream_data: True
unit_type: 'char'
vocab_filepath: data/lang_char/vocab.txt
preprocess_config: conf/preprocess.yaml
cmvn_file: data/mean_std.json
spm_model_prefix: ''
feat_dim: 80
Expand All @@ -65,30 +66,17 @@ maxlen_in: 1200 # if input length(number of frames) > maxlen-in, data is automa
minlen_out: 0
maxlen_out: 150 # if output length(number of tokens) > maxlen-out, data is automatically removed
resample_rate: 16000
shuffle_size: 1500
sort_size: 1000
shuffle_size: 1500 # read 'shuffle_size' samples as one chunk and shuffle the samples within that chunk
sort_size: 1000 # read 'sort_size' samples as one chunk and sort the samples within that chunk
num_workers: 8
prefetch_factor: 10
dist_sampler: True
num_encs: 1
augment_conf:
max_w: 80
w_inplace: True
w_mode: "PIL"
max_f: 30
num_f_mask: 2
f_inplace: True
f_replace_with_zero: False
max_t: 40
num_t_mask: 2
t_inplace: True
t_replace_with_zero: False


###########################################
# Training #
###########################################
n_epoch: 30
n_epoch: 32
accum_grad: 32
global_grad_clip: 5.0
log_interval: 100
Expand Down
4 changes: 2 additions & 2 deletions examples/wenetspeech/asr1/local/data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
for x in $dev_set $test_sets ${train_set}; do
dst=$shards_dir/$x
mkdir -p $dst
utils/make_filted_shard_list.py --resample 16000 --num_utts_per_shard 1000 \
--do_filter --num_node 1 --num_gpus_per_node 8 \
utils/make_filted_shard_list.py --num_node 1 --num_gpus_per_node 8 --num_utts_per_shard 1000 \
--do_filter --resample 16000 \
--num_threads 32 --segments data/$x/segments \
data/$x/wav.scp data/$x/text \
$(realpath $dst) data/$x/data.list
Expand Down
17 changes: 9 additions & 8 deletions paddlespeech/audio/streamdata/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2017-2019 NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#
Expand Down Expand Up @@ -26,22 +27,22 @@
pipelinefilter,
rename,
rename_keys,
rsample,
audio_resample,
select,
shuffle,
slice,
to_tuple,
transform_with,
unbatched,
xdecode,
data_filter,
tokenize,
resample,
compute_fbank,
spec_aug,
audio_data_filter,
audio_tokenize,
audio_resample,
audio_compute_fbank,
audio_spec_aug,
sort,
padding,
cmvn,
audio_padding,
audio_cmvn,
placeholder,
)
from .handlers import (
Expand Down
8 changes: 4 additions & 4 deletions paddlespeech/audio/streamdata/autodecode.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,12 +291,12 @@ def torch_video(key, data):


################################################################
# paddleaudio
# paddlespeech.audio
################################################################


def paddle_audio(key, data):
"""Decode audio using the paddleaudio library.
"""Decode audio using the paddlespeech.audio library.
:param key: file name extension
:param data: data to be decoded
Expand All @@ -305,13 +305,13 @@ def paddle_audio(key, data):
if extension not in ["flac", "mp3", "sox", "wav", "m4a", "ogg", "wma"]:
return None

import paddleaudio
import paddlespeech.audio

with tempfile.TemporaryDirectory() as dirname:
fname = os.path.join(dirname, f"file.{extension}")
with open(fname, "wb") as stream:
stream.write(data)
return paddleaudio.load(fname)
return paddlespeech.audio.load(fname)


################################################################
Expand Down
24 changes: 12 additions & 12 deletions paddlespeech/audio/streamdata/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,29 +78,29 @@ def extract_keys(self, *args, **kw):
def xdecode(self, *args, **kw):
    """Build an ``xdecode`` filter and hand it to ``self.compose``.

    All positional and keyword arguments are forwarded unchanged to
    ``filters.xdecode``; the value returned by ``self.compose`` is returned
    to the caller.
    """
    decode_stage = filters.xdecode(*args, **kw)
    return self.compose(decode_stage)

def data_filter(self, *args, **kw):
return self.compose(filters.data_filter(*args, **kw))
def audio_data_filter(self, *args, **kw):
return self.compose(filters.audio_data_filter(*args, **kw))

def tokenize(self, *args, **kw):
return self.compose(filters.tokenize(*args, **kw))
def audio_tokenize(self, *args, **kw):
return self.compose(filters.audio_tokenize(*args, **kw))

def resample(self, *args, **kw):
    """Build the audio resampling filter and hand it to ``self.compose``.

    The filter in ``filters`` was renamed from ``resample`` to
    ``audio_resample`` (``audio_resample = pipelinefilter(_audio_resample)``),
    so this wrapper must reference the new name; calling the old
    ``filters.resample`` attribute would fail at call time. The method name
    itself is kept so existing ``.resample(...)`` callers continue to work.

    All positional and keyword arguments (e.g. ``resample_rate``) are
    forwarded unchanged to ``filters.audio_resample``.
    """
    return self.compose(filters.audio_resample(*args, **kw))

def compute_fbank(self, *args, **kw):
return self.compose(filters.compute_fbank(*args, **kw))
def audio_compute_fbank(self, *args, **kw):
return self.compose(filters.audio_compute_fbank(*args, **kw))

def spec_aug(self, *args, **kw):
return self.compose(filters.spec_aug(*args, **kw))
def audio_spec_aug(self, *args, **kw):
return self.compose(filters.audio_spec_aug(*args, **kw))

def sort(self, size=500):
    """Build a ``sort`` filter and hand it to ``self.compose``.

    ``size`` (default 500) is passed positionally to ``filters.sort``; the
    value returned by ``self.compose`` is returned to the caller.
    """
    sort_stage = filters.sort(size)
    return self.compose(sort_stage)

def padding(self):
return self.compose(filters.padding())
def audio_padding(self):
return self.compose(filters.audio_padding())

def cmvn(self, cmvn_file):
return self.compose(filters.cmvn(cmvn_file))
def audio_cmvn(self, cmvn_file):
return self.compose(filters.audio_cmvn(cmvn_file))

class WebDataset(DataPipeline, FluidInterface):
"""Small fluid-interface wrapper for DataPipeline."""
Expand Down
28 changes: 14 additions & 14 deletions paddlespeech/audio/streamdata/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ def _xdecode(



def _data_filter(source,
def _audio_data_filter(source,
frame_shift=10,
max_length=10240,
min_length=10,
Expand Down Expand Up @@ -629,9 +629,9 @@ def _data_filter(source,
continue
yield sample

data_filter = pipelinefilter(_data_filter)
audio_data_filter = pipelinefilter(_audio_data_filter)

def _tokenize(source,
def _audio_tokenize(source,
symbol_table,
bpe_model=None,
non_lang_syms=None,
Expand Down Expand Up @@ -693,9 +693,9 @@ def _tokenize(source,
sample['label'] = label
yield sample

tokenize = pipelinefilter(_tokenize)
audio_tokenize = pipelinefilter(_audio_tokenize)

def _resample(source, resample_rate=16000):
def _audio_resample(source, resample_rate=16000):
""" Resample data.
Inplace operation.
Expand All @@ -718,9 +718,9 @@ def _resample(source, resample_rate=16000):
))
yield sample

resample = pipelinefilter(_resample)
audio_resample = pipelinefilter(_audio_resample)

def _compute_fbank(source,
def _audio_compute_fbank(source,
num_mel_bins=80,
frame_length=25,
frame_shift=10,
Expand Down Expand Up @@ -756,9 +756,9 @@ def _compute_fbank(source,
yield dict(fname=sample['fname'], label=sample['label'], feat=mat)


compute_fbank = pipelinefilter(_compute_fbank)
audio_compute_fbank = pipelinefilter(_audio_compute_fbank)

def _spec_aug(source,
def _audio_spec_aug(source,
max_w=5,
w_inplace=True,
w_mode="PIL",
Expand Down Expand Up @@ -799,7 +799,7 @@ def _spec_aug(source,
sample['feat'] = paddle.to_tensor(x, dtype=paddle.float32)
yield sample

spec_aug = pipelinefilter(_spec_aug)
audio_spec_aug = pipelinefilter(_audio_spec_aug)


def _sort(source, sort_size=500):
Expand Down Expand Up @@ -881,7 +881,7 @@ def dynamic_batched(source, max_frames_in_batch=12000):
yield buf


def _padding(source):
def _audio_padding(source):
""" Padding the data into training data
Args:
Expand Down Expand Up @@ -914,9 +914,9 @@ def _padding(source):
yield (sorted_keys, padded_feats, feats_lengths, padding_labels,
label_lengths)

padding = pipelinefilter(_padding)
audio_padding = pipelinefilter(_audio_padding)

def _cmvn(source, cmvn_file):
def _audio_cmvn(source, cmvn_file):
global_cmvn = GlobalCMVN(cmvn_file)
for batch in source:
sorted_keys, padded_feats, feats_lengths, padding_labels, label_lengths = batch
Expand All @@ -926,7 +926,7 @@ def _cmvn(source, cmvn_file):
yield (sorted_keys, padded_feats, feats_lengths, padding_labels,
label_lengths)

cmvn = pipelinefilter(_cmvn)
audio_cmvn = pipelinefilter(_audio_cmvn)

def _placeholder(source):
for data in source:
Expand Down
6 changes: 3 additions & 3 deletions paddlespeech/audio/streamdata/tariterators.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
meta_prefix = "__"
meta_suffix = "__"

from ... import audio as paddleaudio
import paddlespeech
import paddle
import numpy as np

Expand Down Expand Up @@ -118,7 +118,7 @@ def tar_file_iterator(
assert pos > 0
prefix, postfix = name[:pos], name[pos + 1:]
if postfix == 'wav':
waveform, sample_rate = paddleaudio.load(stream.extractfile(tarinfo), normal=False)
waveform, sample_rate = paddlespeech.audio.load(stream.extractfile(tarinfo), normal=False)
result = dict(fname=prefix, wav=waveform, sample_rate = sample_rate)
else:
txt = stream.extractfile(tarinfo).read().decode('utf8').strip()
Expand Down Expand Up @@ -167,7 +167,7 @@ def tar_file_and_group_iterator(
if postfix == 'txt':
example['txt'] = file_obj.read().decode('utf8').strip()
elif postfix in AUDIO_FORMAT_SETS:
waveform, sample_rate = paddleaudio.load(file_obj, normal=False)
waveform, sample_rate = paddlespeech.audio.load(file_obj, normal=False)
waveform = paddle.to_tensor(np.expand_dims(np.array(waveform),0), dtype=paddle.float32)

example['wav'] = waveform
Expand Down
Loading

0 comments on commit 92d1d08

Please sign in to comment.