Commit: add training scripts
Jackwaterveg committed Jun 27, 2022
1 parent c7a7b11 commit 0c7abc1
Showing 20 changed files with 1,620 additions and 132 deletions.
28 changes: 13 additions & 15 deletions examples/wenetspeech/asr1/conf/conformer.yaml
@@ -1,7 +1,6 @@
############################################
# Network Architecture #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
@@ -43,9 +42,9 @@ model_conf:
###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
train_manifest: data/train_l/data.list
dev_manifest: data/dev/data.list
test_manifest: data/test_meeting/data.list

###########################################
# Dataloader #
@@ -54,23 +53,22 @@ use_stream_data: True
unit_type: 'char'
vocab_filepath: data/lang_char/vocab.txt
cmvn_file: data/mean_std.json
preprocess_config: conf/preprocess.yaml
spm_model_prefix: ''
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
dither: 0.1
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 64
batch_size: 32
minlen_in: 10
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_in: 1200 # if input length(number of frames) > maxlen-in, data is automatically removed
minlen_out: 0
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
maxlen_out: 150 # if output length(number of tokens) > maxlen-out, data is automatically removed
resample_rate: 16000
shuffle_size: 10000
sort_size: 500
num_workers: 4
prefetch_factor: 100
shuffle_size: 1500
sort_size: 1000
num_workers: 0
prefetch_factor: 10
dist_sampler: True
num_encs: 1
augment_conf:
@@ -90,10 +88,10 @@ augment_conf:
###########################################
# Training #
###########################################
n_epoch: 240
accum_grad: 16
n_epoch: 30
accum_grad: 32
global_grad_clip: 5.0
log_interval: 1
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
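The Dataloader block above filters rather than re-batches: per the inline comments, an utterance is dropped when its input frame count falls outside minlen_in..maxlen_in or its token count falls outside minlen_out..maxlen_out. A minimal Python sketch of that behaviour (not part of this commit; the sample layout and field names are assumed):

# Hypothetical sketch of the length filter described by the conformer.yaml comments.
# Assumes each sample is a dict with a "feat" array of shape (num_frames, feat_dim)
# and a "tokens" list of label ids; the field names are illustrative, not the actual API.
def length_filter(samples,
                  minlen_in=10, maxlen_in=1200,    # input bounds, in frames
                  minlen_out=0, maxlen_out=150):   # output bounds, in tokens
    """Yield only samples whose input/output lengths fall inside the configured bounds."""
    for sample in samples:
        num_frames = len(sample["feat"])
        num_tokens = len(sample["tokens"])
        if minlen_in <= num_frames <= maxlen_in and minlen_out <= num_tokens <= maxlen_out:
            yield sample

On the Training side, batch_size: 32 with accum_grad: 32 means gradients are accumulated over 32 mini-batches before each optimizer step, i.e. an effective batch of 32 × 32 = 1024 utterances per process, scaled further by the number of GPUs when dist_sampler is enabled.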
125 changes: 49 additions & 76 deletions examples/wenetspeech/asr1/local/data.sh
@@ -2,6 +2,8 @@

# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# NPU, ASLP Group (Author: Qijie Shao)
#
# Modified from wenet(https://github.com/wenet-e2e/wenet)

stage=-1
stop_stage=100
@@ -30,7 +32,7 @@ mkdir -p data
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}

if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data
echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data."
exit 0;
@@ -44,86 +46,57 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
data || exit 1;
fi

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# generate manifests
python3 ${TARGET_DIR}/aishell/aishell.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/aishell"

if [ $? -ne 0 ]; then
echo "Prepare Aishell failed. Terminated."
exit 1
fi

for dataset in train dev test; do
mv data/manifest.${dataset} data/manifest.${dataset}.raw
done
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# compute mean and stddev for normalizer
if $cmvn; then
full_size=`cat data/${train_set}/wav.scp | wc -l`
sampling_size=$((full_size / cmvn_sampling_divisor))
shuf -n $sampling_size data/$train_set/wav.scp \
> data/$train_set/wav.scp.sampled
num_workers=$(nproc)

python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=16000 \
--use_dB_normalization=False \
--num_samples=-1 \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"

if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
fi
fi

dict=data/dict/lang_char.txt
dict=data/lang_char/vocab.txt
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download data, generate manifests
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/lang_char/vocab.txt" \
--manifest_paths "data/manifest.train.raw"

if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
echo "Make a dictionary"
echo "dictionary: ${dict}"
mkdir -p $(dirname $dict)
echo "<blank>" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk>" >> ${dict} # <unk> must be 1
echo "" >> ${dict} # ▁ is for space
utils/text2token.py -s 1 -n 1 --space "" data/${train_set}/text \
| cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' \
| grep -v "" \
| awk '{print $0}' >> ${dict} \
|| exit 1;
echo "<eos>" >> $dict
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"
echo "Compute cmvn"
# Here we use all the training data; you can sample some data to save time
# BUG!!! We should use the segmented data for CMVN
if $cmvn; then
full_size=`cat data/${train_set}/wav.scp | wc -l`
sampling_size=$((full_size / cmvn_sampling_divisor))
shuf -n $sampling_size data/$train_set/wav.scp \
> data/$train_set/wav.scp.sampled
python3 utils/compute_cmvn_stats.py \
--num_workers 16 \
--train_config $train_config \
--in_scp data/$train_set/wav.scp.sampled \
--out_cmvn data/$train_set/mean_std.json \
|| exit 1;
fi
fi

if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
exit 1
fi
} &
done
wait
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Making shards, please wait..."
RED='\033[0;31m'
NOCOLOR='\033[0m'
echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space"
echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads"
for x in $dev_set $test_sets ${train_set}; do
dst=$shards_dir/$x
mkdir -p $dst
utils/make_filted_shard_list.py --resample 16000 --num_utts_per_shard 1000 \
--do_filter --num_node 1 --num_gpus_per_node 8 \
--num_threads 32 --segments data/$x/segments \
data/$x/wav.scp data/$x/text \
$(realpath $dst) data/$x/data.list
done
fi

echo "Aishell data preparation done."
echo "Wenetspeech data preparation done."
exit 0
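Stage 1 above writes a character dictionary with <blank> fixed at index 0 (the CTC blank), <unk> at index 1, "▁" standing in for the space character, the sorted training-set characters after that, and <eos> appended last. A minimal sketch of how a transcript maps onto such a vocabulary (illustrative only, not the project's actual tokenizer API):

# Hypothetical sketch: char-level tokenization against the vocab that stage 1 builds.
# Spaces are mapped to "▁" and unseen characters fall back to <unk>, mirroring the
# text2token.py conventions shown above.
def load_vocab(path="data/lang_char/vocab.txt"):
    with open(path, encoding="utf-8") as f:
        return {token.rstrip("\n"): idx for idx, token in enumerate(f)}

def text_to_ids(text, vocab):
    unk_id = vocab["<unk>"]
    chars = [ch if ch != " " else "▁" for ch in text.strip()]
    return [vocab.get(ch, unk_id) for ch in chars]

# e.g. text_to_ids("好 的", vocab) -> ids for ["好", "▁", "的"]; characters not in the
# training set map to <unk>.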
4 changes: 2 additions & 2 deletions examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh
@@ -24,7 +24,7 @@ stage=1
prefix=
train_subset=L

. ./tools/parse_options.sh || exit 1;
. ./utils/parse_options.sh || exit 1;

filter_by_id () {
idlist=$1
@@ -132,4 +132,4 @@ if [ $stage -le 2 ]; then
done
fi

echo "$0: Done"
echo "$0: Done"
@@ -11,7 +11,7 @@
pipe_cleaner,
)
from .compat import WebDataset, WebLoader, FluidWrapper
from webdataset.extradatasets import MockDataset, with_epoch, with_length
from .extradatasets import MockDataset, with_epoch, with_length
from .filters import (
associate,
batched,
@@ -65,5 +65,5 @@
)
from .tariterators import tarfile_samples, tarfile_to_samples
from .utils import PipelineStage, repeatedly
from webdataset.writer import ShardWriter, TarWriter, numpy_dumps
from webdataset.mix import RandomMix, RoundRobin
from .writer import ShardWriter, TarWriter, numpy_dumps
from .mix import RandomMix, RoundRobin
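The last hunks switch the vendored webdataset sources from absolute imports (from webdataset.writer import ...) to package-relative ones (from .writer import ...), so the copy resolves its own modules rather than whatever webdataset happens to be installed on sys.path. A small self-contained illustration of the difference, using a made-up package name:

# Self-contained demo (hypothetical package name): a relative import always resolves
# inside the package that is executing, whereas an absolute import looks the name up
# on sys.path and can bind to a different installation entirely.
import sys
import tempfile
from pathlib import Path

root = Path(tempfile.mkdtemp())
pkg = root / "vendored_wds"                       # stand-in for the vendored copy
pkg.mkdir()
(pkg / "writer.py").write_text("TAG = 'vendored writer'\n")
(pkg / "__init__.py").write_text("from .writer import TAG\n")  # relative: this copy's writer.py

sys.path.insert(0, str(root))
import vendored_wds
print(vendored_wds.TAG)  # -> 'vendored writer', regardless of any other install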