support multi-gpu training with webdataset

PaddlePaddle · Jun 24, 2022 · c7a7b11 · c7a7b11
1 parent 8f5e610
commit c7a7b11
Show file tree

Hide file tree

Showing 21 changed files with 341 additions and 1,930 deletions.
diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml
@@ -50,26 +50,41 @@ test_manifest: data/manifest.test
 ###########################################
 # Dataloader #
 ###########################################
-vocab_filepath: data/lang_char/vocab.txt 
+use_stream_data: True
 unit_type: 'char'
+vocab_filepath: data/lang_char/vocab.txt 
+cmvn_file: data/mean_std.json
 preprocess_config: conf/preprocess.yaml
 spm_model_prefix: ''
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
+dither: 0.1
 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
 batch_size: 64
+minlen_in: 10
 maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+minlen_out: 0
 maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
-minibatches: 0 # for debug
-batch_count: auto
-batch_bins: 0 
-batch_frames_in: 0
-batch_frames_out: 0
-batch_frames_inout: 0
-num_workers: 0
-subsampling_factor: 1
+resample_rate: 16000
+shuffle_size: 10000
+sort_size: 500
+num_workers: 4
+prefetch_factor: 100
+dist_sampler: True
 num_encs: 1
+augment_conf:
+ max_w: 80
+ w_inplace: True
+ w_mode: "PIL"
+ max_f: 30
+ num_f_mask: 2
+ f_inplace: True
+ f_replace_with_zero: False
+ max_t: 40
+ num_t_mask: 2
+ t_inplace: True
+ t_replace_with_zero: False
 
 
 ###########################################
@@ -78,7 +93,7 @@ num_encs: 1
 n_epoch: 240 
 accum_grad: 16
 global_grad_clip: 5.0
-log_interval: 100
+log_interval: 1
 checkpoint:
  kbest_n: 50
  latest_n: 5

diff --git a/paddlespeech/audio/stream_data/__init__.py b/paddlespeech/audio/stream_data/__init__.py
@@ -41,7 +41,8 @@
  spec_aug,
  sort,
  padding,
- cmvn
+ cmvn,
+ placeholder,
 )
 from webdataset.handlers import (
  ignore_and_continue,

diff --git a/paddlespeech/audio/stream_data/filters.py b/paddlespeech/audio/stream_data/filters.py
@@ -758,27 +758,44 @@ def _compute_fbank(source,
 
 compute_fbank = pipelinefilter(_compute_fbank)
 
-def _spec_aug(source, num_t_mask=2, num_f_mask=2, max_t=40, max_f=30, max_w=80):
+def _spec_aug(source,
+ max_w=5, 
+ w_inplace=True, 
+ w_mode="PIL",
+ max_f=30,
+ num_f_mask=2, 
+ f_inplace=True, 
+ f_replace_with_zero=False,
+ max_t=40, 
+ num_t_mask=2, 
+ t_inplace=True, 
+ t_replace_with_zero=False,):
  """ Do spec augmentation
  Inplace operation
 
  Args:
  source: Iterable[{fname, feat, label}]
- num_t_mask: number of time mask to apply
+ max_w: max width of time warp
+ w_inplace: whether to modify the original data in place while time warping
+ w_mode: time warp mode
+ max_f: max width of freq mask
  num_f_mask: number of freq mask to apply
+ f_inplace: whether to modify the original data in place while frequency masking
+ f_replace_with_zero: use zero to mask
  max_t: max width of time mask
- max_f: max width of freq mask
- max_w: max width of time warp
-
+ num_t_mask: number of time mask to apply
+ t_inplace: whether to modify the original data in place while time masking
+ t_replace_with_zero: use zero to mask
+ 
  Returns
  Iterable[{fname, feat, label}]
  """
  for sample in source:
  x = sample['feat']
  x = x.numpy()
- x = time_warp(x, max_time_warp=max_w, inplace = True, mode= "PIL")
- x = freq_mask(x, F = max_f, n_mask = num_f_mask, inplace = True, replace_with_zero = False)
- x = time_mask(x, T = max_t, n_mask = num_t_mask, inplace = True, replace_with_zero = False)
+ x = time_warp(x, max_time_warp=max_w, inplace = w_inplace, mode= w_mode)
+ x = freq_mask(x, F = max_f, n_mask = num_f_mask, inplace = f_inplace, replace_with_zero = f_replace_with_zero)
+ x = time_mask(x, T = max_t, n_mask = num_t_mask, inplace = t_inplace, replace_with_zero = t_replace_with_zero)
  sample['feat'] = paddle.to_tensor(x, dtype=paddle.float32)
  yield sample
 
@@ -910,3 +927,9 @@ def _cmvn(source, cmvn_file):
  label_lengths)
 
 cmvn = pipelinefilter(_cmvn)
+
+def _placeholder(source):
+ for data in source:
+ yield data
+
+placeholder = pipelinefilter(_placeholder)
diff --git a/paddlespeech/audio/stream_data/pipeline.py b/paddlespeech/audio/stream_data/pipeline.py
@@ -89,6 +89,12 @@ def stage(self, i):
  def append(self, f):
  """Append a pipeline stage (modifies the object)."""
  self.pipeline.append(f)
+ return self
+
+ def append_list(self, *args):
+ for arg in args:
+ self.pipeline.append(arg)
+ return self
 
  def compose(self, *args):
  """Append a pipeline stage to a copy of the pipeline and returns the copy."""

diff --git a/paddlespeech/audio/stream_data/shardlists.py b/paddlespeech/audio/stream_data/shardlists.py
@@ -24,6 +24,8 @@
 from .paddle_utils import IterableDataset
 
 
+from ..utils.log import Logger
+logger = Logger(__name__)
 def expand_urls(urls):
  if isinstance(urls, str):
  urllist = urls.split("::")

diff --git a/paddlespeech/audio/utils/log.py b/paddlespeech/audio/utils/log.py
@@ -65,6 +65,7 @@ class Logger(object):
 
  def __init__(self, name: str=None):
  name = 'PaddleAudio' if not name else name
+ self.name = name
  self.logger = logging.getLogger(name)
 
  for key, conf in log_config.items():
@@ -101,7 +102,7 @@ def __call__(self, log_level: str, msg: str):
  if not self.is_enable:
  return
 
- self.logger.log(log_level, msg)
+ self.logger.log(log_level, self.name + " | " + msg)
 
  @contextlib.contextmanager
  def use_terminator(self, terminator: str):

diff --git a/paddlespeech/audio/utils/tensor_utils.py b/paddlespeech/audio/utils/tensor_utils.py
@@ -93,9 +93,6 @@ def pad_sequence(sequences: List[paddle.Tensor],
  for i, tensor in enumerate(sequences):
  length = tensor.shape[0]
  # use index notation to prevent duplicate references to the tensor
- logger.info(
- f"length {length}, out_tensor {out_tensor.shape}, tensor {tensor.shape}"
- )
  if batch_first:
  # TODO (Hui Zhang): set_value op not supprot `end==start`
  # TODO (Hui Zhang): set_value op not support int16