Commit: add training scripts
Jackwaterveg committed Jun 27, 2022
1 parent c7a7b11 commit 0c7abc1
Showing 20 changed files with 1,620 additions and 132 deletions.
28 changes: 13 additions & 15 deletions examples/wenetspeech/asr1/conf/conformer.yaml
@@ -1,7 +1,6 @@
############################################
# Network Architecture #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
@@ -43,9 +42,9 @@ model_conf:
###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
train_manifest: data/train_l/data.list
dev_manifest: data/dev/data.list
test_manifest: data/test_meeting/data.list

###########################################
# Dataloader #
@@ -54,23 +53,22 @@ use_stream_data: True
unit_type: 'char'
vocab_filepath: data/lang_char/vocab.txt
cmvn_file: data/mean_std.json
preprocess_config: conf/preprocess.yaml
spm_model_prefix: ''
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
dither: 0.1
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 64
batch_size: 32
minlen_in: 10
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_in: 1200 # if input length(number of frames) > maxlen-in, data is automatically removed
minlen_out: 0
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
maxlen_out: 150 # if output length(number of tokens) > maxlen-out, data is automatically removed
resample_rate: 16000
shuffle_size: 10000
sort_size: 500
num_workers: 4
prefetch_factor: 100
shuffle_size: 1500
sort_size: 1000
num_workers: 0
prefetch_factor: 10
dist_sampler: True
num_encs: 1
augment_conf:
@@ -90,10 +88,10 @@ augment_conf:
###########################################
# Training #
###########################################
n_epoch: 240
accum_grad: 16
n_epoch: 30
accum_grad: 32
global_grad_clip: 5.0
log_interval: 1
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
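The Dataloader block above filters rather than re-batches: per the inline comments, an utterance is dropped when its input frame count falls outside minlen_in..maxlen_in or its token count falls outside minlen_out..maxlen_out. A minimal Python sketch of that behaviour (not part of this commit; the sample layout and field names are assumed):

# Hypothetical sketch of the length filter described by the conformer.yaml comments.
# Assumes each sample is a dict with a "feat" array of shape (num_frames, feat_dim)
# and a "tokens" list of label ids; the field names are illustrative, not the actual API.
def length_filter(samples,
                  minlen_in=10, maxlen_in=1200,    # input bounds, in frames
                  minlen_out=0, maxlen_out=150):   # output bounds, in tokens
    """Yield only samples whose input/output lengths fall inside the configured bounds."""
    for sample in samples:
        num_frames = len(sample["feat"])
        num_tokens = len(sample["tokens"])
        if minlen_in <= num_frames <= maxlen_in and minlen_out <= num_tokens <= maxlen_out:
            yield sample

On the Training side, batch_size: 32 with accum_grad: 32 means gradients are accumulated over 32 mini-batches before each optimizer step, i.e. an effective batch of 32 × 32 = 1024 utterances per process, scaled further by the number of GPUs when dist_sampler is enabled.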
125 changes: 49 additions & 76 deletions examples/wenetspeech/asr1/local/data.sh
@@ -2,6 +2,8 @@

# Copyright 2021 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# NPU, ASLP Group (Author: Qijie Shao)
#
# Modified from wenet(https://github.com/wenet-e2e/wenet)

stage=-1
stop_stage=100
@@ -30,7 +32,7 @@ mkdir -p data
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}

if [ ${stage} -le -2 ] && [ ${stop_stage} -ge -2 ]; then
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data
echo "Please follow https://github.com/wenet-e2e/WenetSpeech to download the data."
exit 0;
@@ -44,86 +46,57 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
data || exit 1;
fi

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# generate manifests
python3 ${TARGET_DIR}/aishell/aishell.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/aishell"

if [ $? -ne 0 ]; then
echo "Prepare Aishell failed. Terminated."
exit 1
fi

for dataset in train dev test; do
mv data/manifest.${dataset} data/manifest.${dataset}.raw
done
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# compute mean and stddev for normalizer
if $cmvn; then
full_size=`cat data/${train_set}/wav.scp | wc -l`
sampling_size=$((full_size / cmvn_sampling_divisor))
shuf -n $sampling_size data/$train_set/wav.scp \
> data/$train_set/wav.scp.sampled
num_workers=$(nproc)

python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=16000 \
--use_dB_normalization=False \
--num_samples=-1 \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"

if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
fi
fi

dict=data/dict/lang_char.txt
dict=data/lang_char/vocab.txt
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download data, generate manifests
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/lang_char/vocab.txt" \
--manifest_paths "data/manifest.train.raw"

if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
echo "Make a dictionary"
echo "dictionary: ${dict}"
mkdir -p $(dirname $dict)
echo "<blank>" > ${dict} # 0 will be used for "blank" in CTC
echo "<unk>" >> ${dict} # <unk> must be 1
echo "" >> ${dict} # ▁ is for space
utils/text2token.py -s 1 -n 1 --space "" data/${train_set}/text \
| cut -f 2- -d" " | tr " " "\n" \
| sort | uniq | grep -a -v -e '^\s*$' \
| grep -v "" \
| awk '{print $0}' >> ${dict} \
|| exit 1;
echo "<eos>" >> $dict
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"
echo "Compute cmvn"
# Here we use all the training data; you can sample some data to save time
# BUG!!! We should use the segmented data for CMVN
if $cmvn; then
full_size=`cat data/${train_set}/wav.scp | wc -l`
sampling_size=$((full_size / cmvn_sampling_divisor))
shuf -n $sampling_size data/$train_set/wav.scp \
> data/$train_set/wav.scp.sampled
python3 utils/compute_cmvn_stats.py \
--num_workers 16 \
--train_config $train_config \
--in_scp data/$train_set/wav.scp.sampled \
--out_cmvn data/$train_set/mean_std.json \
|| exit 1;
fi
fi

if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
exit 1
fi
} &
done
wait
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "Making shards, please wait..."
RED='\033[0;31m'
NOCOLOR='\033[0m'
echo -e "It requires ${RED}1.2T ${NOCOLOR}space for $shards_dir, please make sure you have enough space"
echo -e "It takes about ${RED}12 ${NOCOLOR}hours with 32 threads"
for x in $dev_set $test_sets ${train_set}; do
dst=$shards_dir/$x
mkdir -p $dst
utils/make_filted_shard_list.py --resample 16000 --num_utts_per_shard 1000 \
--do_filter --num_node 1 --num_gpus_per_node 8 \
--num_threads 32 --segments data/$x/segments \
data/$x/wav.scp data/$x/text \
$(realpath $dst) data/$x/data.list
done
fi

echo "Aishell data preparation done."
echo "Wenetspeech data preparation done."
exit 0
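Stage 1 above writes a character dictionary with <blank> fixed at index 0 (the CTC blank), <unk> at index 1, "▁" standing in for the space character, the sorted training-set characters after that, and <eos> appended last. A minimal sketch of how a transcript maps onto such a vocabulary (illustrative only, not the project's actual tokenizer API):

# Hypothetical sketch: char-level tokenization against the vocab that stage 1 builds.
# Spaces are mapped to "▁" and unseen characters fall back to <unk>, mirroring the
# text2token.py conventions shown above.
def load_vocab(path="data/lang_char/vocab.txt"):
    with open(path, encoding="utf-8") as f:
        return {token.rstrip("\n"): idx for idx, token in enumerate(f)}

def text_to_ids(text, vocab):
    unk_id = vocab["<unk>"]
    chars = [ch if ch != " " else "▁" for ch in text.strip()]
    return [vocab.get(ch, unk_id) for ch in chars]

# e.g. text_to_ids("好 的", vocab) -> ids for ["好", "▁", "的"]; characters not in the
# training set map to <unk>.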
4 changes: 2 additions & 2 deletions examples/wenetspeech/asr1/local/wenetspeech_data_prep.sh
@@ -24,7 +24,7 @@ stage=1
prefix=
train_subset=L

. ./tools/parse_options.sh || exit 1;
. ./utils/parse_options.sh || exit 1;

filter_by_id () {
idlist=$1
@@ -132,4 +132,4 @@ if [ $stage -le 2 ]; then
done
fi

echo "$0: Done"
echo "$0: Done"
@@ -11,7 +11,7 @@
pipe_cleaner,
)
from .compat import WebDataset, WebLoader, FluidWrapper
from webdataset.extradatasets import MockDataset, with_epoch, with_length
from .extradatasets import MockDataset, with_epoch, with_length
from .filters import (
associate,
batched,
@@ -65,5 +65,5 @@
)
from .tariterators import tarfile_samples, tarfile_to_samples
from .utils import PipelineStage, repeatedly
from webdataset.writer import ShardWriter, TarWriter, numpy_dumps
from webdataset.mix import RandomMix, RoundRobin
from .writer import ShardWriter, TarWriter, numpy_dumps
from .mix import RandomMix, RoundRobin
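The last hunks switch the vendored webdataset sources from absolute imports (from webdataset.writer import ...) to package-relative ones (from .writer import ...), so the copy resolves its own modules rather than whatever webdataset happens to be installed on sys.path. A small self-contained illustration of the difference, using a made-up package name:

# Self-contained demo (hypothetical package name): a relative import always resolves
# inside the package that is executing, whereas an absolute import looks the name up
# on sys.path and can bind to a different installation entirely.
import sys
import tempfile
from pathlib import Path

root = Path(tempfile.mkdtemp())
pkg = root / "vendored_wds"                       # stand-in for the vendored copy
pkg.mkdir()
(pkg / "writer.py").write_text("TAG = 'vendored writer'\n")
(pkg / "__init__.py").write_text("from .writer import TAG\n")  # relative: this copy's writer.py

sys.path.insert(0, str(root))
import vendored_wds
print(vendored_wds.TAG)  # -> 'vendored writer', regardless of any other install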