[st] Distributed sampler and new dataloader with MIMO (#1239)

* update timit result, test=doc_fix
* result update
* fix bug
* add triplet loader
* empty preprocess file
* sync to u2, updating
* sync to u2 config
* fix bugs
* code refine
* update config
* customize decoding batch size
* update optimizer and lr scheduler
* minor
* minor
* minor
* fix bugs of refs
* minor
* distributed sampler
* minor
* refine the loader
PaddlePaddle · Dec 30, 2021 · 420709e
1 parent fbe3c05
commit 420709e
Show file tree

Hide file tree

Showing 11 changed files with 292 additions and 256 deletions.
diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml
@@ -1,6 +1,6 @@
 # https://yaml.org/type/float.html
 data:
- train_manifest: data/manifest.train.tiny
+ train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
  min_input_len: 0.05 # second
@@ -15,8 +15,10 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: data/lang_char/bpe_unigram_8000
  mean_std_filepath: ""
- # augmentation_config: conf/augmentation.json
- batch_size: 10
+ augmentation_config: conf/preprocess.yaml
+ batch_size: 16
+ maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced
+ maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
  raw_wav: True # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
  feat_dim: 80
@@ -78,13 +80,13 @@ training:
  global_grad_clip: 5.0
  optim: adam
  optim_conf:
- lr: 0.004
+ lr: 2.5
  weight_decay: 1e-06
- scheduler: warmuplr  
+ scheduler: noam 
  scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
- log_interval: 5
+ log_interval: 50
  checkpoint:
  kbest_n: 50
  latest_n: 5
@@ -97,6 +99,7 @@ decoding:
  alpha: 2.5
  beta: 0.3
  beam_size: 10
+ word_reward: 0.7
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
@@ -107,3 +110,5 @@ decoding:
  # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False # simulate streaming inference. Defaults to False.
+
+
diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
@@ -15,8 +15,10 @@ collator:
  unit_type: 'spm'
  spm_model_prefix: data/lang_char/bpe_unigram_8000
  mean_std_filepath: ""
- # augmentation_config: conf/augmentation.json
- batch_size: 10
+ augmentation_config: conf/preprocess.yaml
+ batch_size: 16
+ maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced
+ maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
  raw_wav: True # use raw_wav or kaldi feature
  spectrum_type: fbank #linear, mfcc, fbank
  feat_dim: 80

diff --git a/examples/ted_en_zh/st0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh
@@ -13,14 +13,12 @@ ckpt_prefix=$2
 
 for type in fullsentence; do
  echo "decoding ${type}"
- batch_size=32
  python3 -u ${BIN_DIR}/test.py \
  --ngpu ${ngpu} \
  --config ${config_path} \
  --result_file ${ckpt_prefix}.${type}.rsl \
  --checkpoint_path ${ckpt_prefix} \
  --opts decoding.decoding_method ${type} \
- --opts decoding.batch_size ${batch_size}
 
  if [ $? -ne 0 ]; then
  echo "Failed in evaluation!"

diff --git a/examples/ted_en_zh/st1/RESULTS.md b/examples/ted_en_zh/st1/RESULTS.md
@@ -12,5 +12,5 @@
 ## Transformer
 | Model | Params | Config | Val loss | Char-BLEU |
 | --- | --- | --- | --- | --- |
-| FAT + Transformer+ASR MTL | 50.26M | conf/transformer_mtl_noam.yaml | 62.86 | 19.45 |
+| FAT + Transformer+ASR MTL | 50.26M | conf/transformer_mtl_noam.yaml | 69.91 | 20.26 |
 | FAT + Transformer+ASR MTL with word reward | 50.26M | conf/transformer_mtl_noam.yaml | 62.86 | 20.80 |
diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml
@@ -1,39 +1,33 @@
 # https://yaml.org/type/float.html
 data:
- train_manifest: data/manifest.train.tiny
+ train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
- min_input_len: 5.0 # frame
- max_input_len: 3000.0 # frame
- min_output_len: 0.0 # tokens
- max_output_len: 400.0 # tokens
- min_output_input_ratio: 0.01
- max_output_input_ratio: 20.0
 
 collator:
- vocab_filepath: data/lang_char/vocab.txt
+ vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
  unit_type: 'spm'
- spm_model_prefix: data/lang_char/bpe_unigram_8000
+ spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
  mean_std_filepath: ""
  # augmentation_config: conf/augmentation.json
- batch_size: 10
- raw_wav: True # use raw_wav or kaldi feature
- spectrum_type: fbank #linear, mfcc, fbank
+ batch_size: 20
  feat_dim: 83
- delta_delta: False
- dither: 1.0
- target_sample_rate: 16000
- max_freq: None
- n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
- use_dB_normalization: True
- target_dB: -20
- random_seed: 0
- keep_transcription_text: False
- sortagrad: True 
- shuffle_method: batch_shuffle
- num_workers: 2
+ sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+ maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+ maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+ minibatches: 0 # for debug
+ batch_count: auto
+ batch_bins: 0 
+ batch_frames_in: 0
+ batch_frames_out: 0
+ batch_frames_inout: 0
+ augmentation_config:
+ num_workers: 0
+ subsampling_factor: 1
+ num_encs: 1
+
 
 
 # network architecture
@@ -73,18 +67,18 @@ model:
 
 
 training:
- n_epoch: 20
+ n_epoch: 40
  accum_grad: 2
  global_grad_clip: 5.0
  optim: adam
  optim_conf:
- lr: 0.004
- weight_decay: 1e-06
- scheduler: warmuplr  
+ lr: 2.5
+ weight_decay: 0.
+ scheduler: noam 
  scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
- log_interval: 5
+ log_interval: 50
  checkpoint:
  kbest_n: 50
  latest_n: 5
@@ -107,4 +101,4 @@ decoding:
  # >0: for decoding, use fixed chunk size as set.
  # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
- simulate_streaming: False # simulate streaming inference. Defaults to False.
+ simulate_streaming: False # simulate streaming inference. Defaults to False.
diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml
@@ -3,37 +3,31 @@ data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
- min_input_len: 5.0 # frame
- max_input_len: 3000.0 # frame
- min_output_len: 0.0 # tokens
- max_output_len: 400.0 # tokens
- min_output_input_ratio: 0.01
- max_output_input_ratio: 20.0
 
 collator:
  vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt
  unit_type: 'spm'
  spm_model_prefix: data/lang_char/ted_en_zh_bpe8000
  mean_std_filepath: ""
  # augmentation_config: conf/augmentation.json
- batch_size: 10
- raw_wav: True # use raw_wav or kaldi feature
- spectrum_type: fbank #linear, mfcc, fbank
+ batch_size: 20
  feat_dim: 83
- delta_delta: False
- dither: 1.0
- target_sample_rate: 16000
- max_freq: None
- n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
- use_dB_normalization: True
- target_dB: -20
- random_seed: 0
- keep_transcription_text: False
- sortagrad: True 
- shuffle_method: batch_shuffle
- num_workers: 2
+ sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+ maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+ maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+ minibatches: 0 # for debug
+ batch_count: auto
+ batch_bins: 0 
+ batch_frames_in: 0
+ batch_frames_out: 0
+ batch_frames_inout: 0
+ augmentation_config:
+ num_workers: 0
+ subsampling_factor: 1
+ num_encs: 1
+
 
 
 # network architecture
@@ -73,18 +67,18 @@ model:
 
 
 training:
- n_epoch: 20
+ n_epoch: 40
  accum_grad: 2
  global_grad_clip: 5.0
  optim: adam
  optim_conf:
  lr: 2.5
- weight_decay: 1e-06
+ weight_decay: 0.
  scheduler: noam 
  scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
- log_interval: 5
+ log_interval: 50
  checkpoint:
  kbest_n: 50
  latest_n: 5

diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh
@@ -13,14 +13,12 @@ ckpt_prefix=$2
 
 for type in fullsentence; do
  echo "decoding ${type}"
- batch_size=32
  python3 -u ${BIN_DIR}/test.py \
  --ngpu ${ngpu} \
  --config ${config_path} \
  --result_file ${ckpt_prefix}.${type}.rsl \
  --checkpoint_path ${ckpt_prefix} \
  --opts decoding.decoding_method ${type} \
- --opts decoding.batch_size ${batch_size}
 
  if [ $? -ne 0 ]; then
  echo "Failed in evaluation!"