transformer using batch data loader

PaddlePaddle · Nov 5, 2021 · 6905569 · 6905569
1 parent 3f611c7
commit 6905569
Show file tree

Hide file tree

Showing 27 changed files with 328 additions and 172 deletions.
diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh
@@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  for dataset in train dev test; do
  {
  python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.json" \
  --unit_type "char" \
  --vocab_path="data/vocab.txt" \

diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh
@@ -67,7 +67,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  for dataset in train dev test; do
  {
  python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.json" \
  --unit_type "char" \
  --vocab_path="data/vocab.txt" \

diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh
@@ -55,7 +55,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  for dataset in train dev test; do
  {
  python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.json" \
  --unit_type "char" \
  --vocab_path="data/vocab.txt" \

diff --git a/examples/dataset/librispeech/librispeech.py b/examples/dataset/librispeech/librispeech.py
@@ -89,25 +89,28 @@ def create_manifest(data_dir, manifest_path):
  text_filepath = os.path.join(subfolder, text_filelist[0])
  for line in io.open(text_filepath, encoding="utf8"):
  segments = line.strip().split()
+ n_token = len(segments[1:])
  text = ' '.join(segments[1:]).lower()
 
  audio_filepath = os.path.abspath(
  os.path.join(subfolder, segments[0] + '.flac'))
  audio_data, samplerate = soundfile.read(audio_filepath)
  duration = float(len(audio_data)) / samplerate
+
+ utt = os.path.splitext(os.path.basename(audio_filepath))[0]
+ utt2spk = '-'.join(utt.split('-')[:2])
+
  json_lines.append(
  json.dumps({
- 'utt':
- os.path.splitext(os.path.basename(audio_filepath))[0],
- 'feat':
- audio_filepath,
- 'feat_shape': (duration, ), #second
- 'text':
- text
+ 'utt': utt,
+ 'utt2spk': utt2spk,
+ 'feat': audio_filepath,
+ 'feat_shape': (duration, ), # second
+ 'text': text,
  }))
 
  total_sec += duration
- total_text += len(text)
+ total_text += n_token
  total_num += 1
 
  with codecs.open(manifest_path, 'w', 'utf-8') as out_file:

diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh
@@ -81,7 +81,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  for set in train dev test dev-clean dev-other test-clean test-other; do
  {
  python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.json" \
  --unit_type ${unit_type} \
  --vocab_path="data/vocab.txt" \

diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh
@@ -88,7 +88,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  for set in train dev test dev-clean dev-other test-clean test-other; do
  {
  python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.json" \
  --unit_type "spm" \
  --spm_model_prefix ${bpeprefix} \

diff --git a/examples/other/1xt2x/aishell/local/data.sh b/examples/other/1xt2x/aishell/local/data.sh
@@ -50,7 +50,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  for dataset in train dev test; do
  {
  python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.npz" \
  --unit_type "char" \
  --vocab_path="data/vocab.txt" \

diff --git a/examples/other/1xt2x/baidu_en8k/local/data.sh b/examples/other/1xt2x/baidu_en8k/local/data.sh
@@ -65,7 +65,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  for set in train dev test dev-clean dev-other test-clean test-other; do
  {
  python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.npz" \
  --unit_type ${unit_type} \
  --vocab_path="data/vocab.txt" \

diff --git a/examples/other/1xt2x/librispeech/local/data.sh b/examples/other/1xt2x/librispeech/local/data.sh
@@ -63,7 +63,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  for set in train dev test dev-clean dev-other test-clean test-other; do
  {
  python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.npz" \
  --unit_type ${unit_type} \
  --vocab_path="data/vocab.txt" \

diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh
@@ -89,7 +89,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  for set in train dev test; do
  {
  python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.json" \
  --unit_type "spm" \
  --spm_model_prefix ${bpeprefix} \

diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh
@@ -66,7 +66,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  for set in train dev test; do
  {
  python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.json" \
  --unit_type ${unit_type} \
  --vocab_path="data/vocab.txt" \

diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh
@@ -63,7 +63,6 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  # format manifest with tokenids, vocab size
  python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.json" \
  --unit_type ${unit_type} \
  --vocab_path="data/vocab.txt" \

diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml
@@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_200'
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
  batch_size: 4
  raw_wav: True # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank

diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml
@@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_200'
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
  batch_size: 4
  raw_wav: True # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank

diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml
@@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_200'
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
  batch_size: 4
  raw_wav: True # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank

diff --git a/examples/tiny/s1/conf/preprocess.yaml b/examples/tiny/s1/conf/preprocess.yaml
@@ -0,0 +1,27 @@
+process:
+ # extract kaldi fbank from PCM
+ - type: "fbank_kaldi"
+ fs: 16000
+ n_mels: 80
+ n_shift: 160
+ win_length: 400
+ dither: true
+ # these three processes are a.k.a. SpecAugment
+ - type: "time_warp"
+ max_time_warp: 5
+ inplace: true
+ mode: "PIL"
+ - type: "freq_mask"
+ F: 30
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+ - type: "time_mask"
+ T: 40
+ n_mask: 2
+ inplace: true
+ replace_with_zero: false
+
+
+
+
diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml
@@ -15,7 +15,7 @@ collator:
  vocab_filepath: data/vocab.txt 
  unit_type: 'spm'
  spm_model_prefix: 'data/bpe_unigram_200'
- augmentation_config: conf/augmentation.json
+ augmentation_config: conf/preprocess.yaml
  batch_size: 4
  raw_wav: True # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank

diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh
@@ -69,7 +69,6 @@ fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  # format manifest with tokenids, vocab size
  python3 ${MAIN_ROOT}/utils/format_data.py \
- --feat_type "raw" \
  --cmvn_path "data/mean_std.json" \
  --unit_type "spm" \
  --spm_model_prefix ${bpeprefix} \