format wav2vec2 demo
Zth9730 committed Oct 10, 2022
1 parent 6e429f0 commit 19180d3
Showing 15 changed files with 558 additions and 485 deletions.
2 changes: 1 addition & 1 deletion .flake8
@@ -33,7 +33,7 @@ filename =
# Specify a list of codes to ignore.
ignore =
W503
E252,E262,E127,E265,E126,E266,E241,E261,E128,E125
E252,E262,E127,E265,E126,E266,E241,E261,E128,E125,E129
W291,W293,W605
E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
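For reference, E129 ("visually indented line with same indent as next logical line") fires on wrapped conditions like the sketch below, a pattern that automatic line wrapping can produce; adding it to the ignore list keeps the linter quiet about that style.

cond_a, cond_b = True, False
if (cond_a and
    cond_b):  # E129: the continuation line shares its indent with the body below
    print("both conditions hold")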
2 changes: 1 addition & 1 deletion examples/librispeech/README.md
@@ -3,7 +3,7 @@
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature

* asr3 - wav2vecASR, ASR model with pre-trained wav2vec2 and CTC

## Data
| Data Subset | Duration in Seconds |
Expand Down
30 changes: 30 additions & 0 deletions paddlespeech/audio/transform/spectrogram.py
@@ -382,6 +382,36 @@ def __call__(self, x, train):
return mat


class WavProcess():
    def __init__(self, dither=0.1):
        """
        Args:
            dither (float): Dithering constant (stored, but not applied in this version).
        """
        self.dither = dither

    def __call__(self, x, train):
        """
        Args:
            x (np.ndarray): raw mono waveform, shape (Ti,)
            train (bool): True in train mode.
        Raises:
            ValueError: if x has shape (Ti, C); multi-channel input is not supported.
        Returns:
            np.ndarray: waveform with a trailing feature axis, shape (T, 1)
        """
        dither = self.dither if train else 0.0  # computed but currently unused
        if x.ndim != 1:
            raise ValueError("Not support x: [Time, Channel]")
        waveform = np.expand_dims(x, -1)
        return waveform


class LogMelSpectrogramKaldi_decay():
def __init__(
self,
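For context, the new WavProcess transform leaves the audio samples untouched and only adds a trailing feature axis, turning a mono waveform of shape (T,) into (T, 1); the stored dither value is not applied in this version. A minimal usage sketch, assuming the module layout in this commit:

import numpy as np

from paddlespeech.audio.transform.spectrogram import WavProcess

x = np.random.randn(16000).astype(np.float32)  # 1 s of mono audio at 16 kHz
wav_process = WavProcess(dither=0.1)
out = wav_process(x, train=True)
print(out.shape)  # (16000, 1): samples unchanged, trailing axis added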
1 change: 1 addition & 0 deletions paddlespeech/audio/transform/transformation.py
@@ -41,6 +41,7 @@
utterance_cmvn="paddlespeech.audio.transform.cmvn:UtteranceCMVN",
fbank="paddlespeech.audio.transform.spectrogram:LogMelSpectrogram",
spectrogram="paddlespeech.audio.transform.spectrogram:Spectrogram",
wav_process="paddlespeech.audio.transform.spectrogram:WavProcess",
stft="paddlespeech.audio.transform.spectrogram:Stft",
istft="paddlespeech.audio.transform.spectrogram:IStft",
stft2fbank="paddlespeech.audio.transform.spectrogram:Stft2LogMelSpectrogram",
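With this entry, a preprocessing config that names wav_process resolves to the WavProcess class above. A rough sketch of how such a pipeline might be declared, assuming the Transformation loader accepts an inline dict with a "process" list (the ESPnet-style schema this module follows); treat the exact keys as an assumption, not something verified against this commit:

import numpy as np

from paddlespeech.audio.transform.transformation import Transformation

# Hypothetical config: a single-stage pipeline that only adds the channel axis.
preprocess_conf = {"process": [{"type": "wav_process", "dither": 0.0}]}

transform = Transformation(preprocess_conf)
feat = transform(np.random.randn(16000).astype(np.float32), train=False)
print(feat.shape)  # expected (16000, 1)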
12 changes: 6 additions & 6 deletions paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py
@@ -27,15 +27,15 @@
from paddlespeech.s2t.utils.utility import UpdateConfig
logger = Log(__name__).getlog()


class Wav2vec2Infer():
def __init__(self, config, args):
self.args = args
self.config = config
self.audio_file = args.audio_file

self.text_feature = TextFeaturizer(
unit_type=config.unit_type,
vocab=config.vocab_filepath)
unit_type=config.unit_type, vocab=config.vocab_filepath)
paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')

# model
@@ -63,10 +63,10 @@ def run(self):
xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
decode_config = self.config.decode
result_transcripts, result_tokenids = self.model.decode(
xs,
text_feature=self.text_feature,
decoding_method=decode_config.decoding_method,
beam_size=decode_config.beam_size)
xs,
text_feature=self.text_feature,
decoding_method=decode_config.decoding_method,
beam_size=decode_config.beam_size)
rsl = result_transcripts[0]
utt = Path(self.audio_file).name
logger.info(f"hyp: {utt} {rsl}")
63 changes: 35 additions & 28 deletions paddlespeech/s2t/exps/wav2vec2/model.py
@@ -18,53 +18,53 @@
from collections import defaultdict
from collections import OrderedDict
from contextlib import nullcontext
from paddlespeech.s2t.utils import mp_tools

import jsonlines
import numpy as np
import paddle
from paddle import distributed as dist

from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.io.dataloader import StreamDataLoader
from paddlespeech.s2t.io.dataloader import DataLoaderFactory
from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
from paddlespeech.s2t.io.dataloader import StreamDataLoader
from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment
from paddlespeech.s2t.utils import error_rate

from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
from paddlespeech.s2t.training.optimizer import OptimizerFactory
from paddlespeech.s2t.training.reporter import ObsScope
from paddlespeech.s2t.training.reporter import report
from paddlespeech.s2t.training.scheduler import LRSchedulerFactory
from paddlespeech.s2t.training.timer import Timer
from paddlespeech.s2t.training.trainer import Trainer
from paddlespeech.s2t.utils.utility import UpdateConfig
from paddlespeech.s2t.utils import error_rate
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils import mp_tools
from paddlespeech.s2t.utils.log import Log


from paddlespeech.s2t.utils.utility import UpdateConfig

logger = Log(__name__).getlog()


class Wav2Vec2ASRTrainer(Trainer):
def __init__(self, config, args):
super().__init__(config, args)
self.avg_train_loss = 0

def train_batch(self, batch_index, batch, msg):
train_conf = self.config
start = time.time()

# forward
utt, wav, wavs_lens, target, target_lens = batch
wavs_lens_rate = wavs_lens / wav.shape[1]
wavs_lens_rate = wavs_lens / wav.shape[1]
target_lens_rate = target_lens / target.shape[1]
wav = wav[:,:,0]
wav = wav[:, :, 0]
wav = self.speech_augmentation(wav, wavs_lens_rate)
loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)
# print(wav, wavs_lens_rate, target, target_lens_rate)
# loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad

losses_np = {'loss': float(loss) * train_conf.accum_grad}

# loss backward
@@ -108,15 +108,16 @@ def train_batch(self, batch_index, batch, msg):
def valid(self):
self.model.eval()
if not self.use_streamdata:
logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
logger.info(
f"Valid Total Examples: {len(self.valid_loader.dataset)}")
valid_losses = defaultdict(list)
num_seen_utts = 1
total_loss = 0.0
for i, batch in enumerate(self.valid_loader):
utt, wav, wavs_lens, target, target_lens = batch
wavs_lens_rate = wavs_lens / wav.shape[1]
wavs_lens_rate = wavs_lens / wav.shape[1]
target_lens_rate = target_lens / target.shape[1]
wav = wav[:,:,0]
wav = wav[:, :, 0]
loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)

if paddle.isfinite(loss):
@@ -134,7 +135,8 @@ def valid(self):
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
if not self.use_streamdata:
msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader))
msg += "batch: {}/{}, ".format(i + 1,
len(self.valid_loader))
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in valid_dump.items())
logger.info(msg)
@@ -155,7 +157,8 @@ def do_train(self):
self.before_train()

if not self.use_streamdata:
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
logger.info(
f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
@@ -223,14 +226,18 @@ def setup_dataloader(self):
config = self.config.clone()
self.use_streamdata = config.get("use_stream_data", False)
if self.train:
self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args)
self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args)
self.train_loader = DataLoaderFactory.get_dataloader(
'train', config, self.args)
self.valid_loader = DataLoaderFactory.get_dataloader(
'valid', config, self.args)
logger.info("Setup train/valid Dataloader!")
else:
decode_batch_size = config.get('decode', dict()).get(
'decode_batch_size', 1)
self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args)
self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args)
self.test_loader = DataLoaderFactory.get_dataloader('test', config,
self.args)
self.align_loader = DataLoaderFactory.get_dataloader(
'align', config, self.args)
logger.info("Setup test/align Dataloader!")

def setup_model(self):
@@ -248,7 +255,7 @@ def setup_model(self):
model = Wav2vec2ASR.from_config(model_conf)

if self.parallel:
model = paddle.DataParallel(model, find_unused_parameters=True)
model = paddle.DataParallel(model, find_unused_parameters=True)

logger.info(f"{model}")
layer_tools.print_params(model, logger.info)
@@ -312,14 +319,14 @@ def __init__(self, config, args):
self.text_featurizer = TextFeaturizer(
unit_type=config.unit_type, vocab=config.vocab_filepath)
self.vocab_list = self.text_featurizer.vocab_list

def id2token(self, texts, texts_len):
""" ord() id to chr() chr """
trans = []
for text, n in zip(texts, texts_len):
n = n.numpy().item()
ids = text[:n]
trans.append(
self.text_featurizer.defeaturize(ids.numpy().tolist()))
trans.append(self.text_featurizer.defeaturize(ids.numpy().tolist()))
return trans

def compute_metrics(self,
@@ -337,10 +344,10 @@ def compute_metrics(self,
start_time = time.time()
target_transcripts = self.id2token(texts, texts_len)
result_transcripts, result_tokenids = self.model.decode(
audio,
text_feature=self.text_featurizer,
decoding_method=decode_cfg.decoding_method,
beam_size=decode_cfg.beam_size)
audio,
text_feature=self.text_featurizer,
decoding_method=decode_cfg.decoding_method,
beam_size=decode_cfg.beam_size)
decode_time = time.time() - start_time

for utt, target, result, rec_tids in zip(
@@ -432,4 +439,4 @@ def test(self):
"decode_method":
self.config.decode.decoding_method,
})
f.write(data + '\n')
f.write(data + '\n')
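A note on the loss /= train_conf.accum_grad line in train_batch above: it implements gradient accumulation, where several micro-batches contribute scaled losses before a single optimizer step. A self-contained sketch of the pattern with a toy model (illustrative only; accum_grad mirrors the config key used by the trainer, the model and data are placeholders):

import paddle

# Toy model and optimizer to illustrate the accumulation pattern.
model = paddle.nn.Linear(8, 1)
optimizer = paddle.optimizer.Adam(parameters=model.parameters())
accum_grad = 4  # mirrors train_conf.accum_grad above

for batch_index in range(16):
    x = paddle.randn([2, 8])
    y = paddle.randn([2, 1])
    loss = paddle.nn.functional.mse_loss(model(x), y)
    loss = loss / accum_grad          # scale so accumulated gradients average out
    loss.backward()                   # gradients accumulate across micro-batches
    if (batch_index + 1) % accum_grad == 0:
        optimizer.step()              # one update per accum_grad micro-batches
        optimizer.clear_grad()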
15 changes: 7 additions & 8 deletions paddlespeech/s2t/models/wav2vec2/modules/VanillaNN.py
@@ -3,6 +3,7 @@
* Elena Rastorgueva 2020
"""
import paddle

from paddlespeech.s2t.models.wav2vec2.modules import containers
from paddlespeech.s2t.models.wav2vec2.modules import linear

@@ -27,19 +28,17 @@ class VanillaNN(containers.Sequential):
"""

def __init__(
self,
input_shape,
activation=paddle.nn.LeakyReLU,
dnn_blocks=2,
dnn_neurons=512,
):
self,
input_shape,
activation=paddle.nn.LeakyReLU,
dnn_blocks=2,
dnn_neurons=512, ):
super().__init__(input_shape=input_shape)

for block_index in range(dnn_blocks):
self.append(
linear.Linear,
n_neurons=dnn_neurons,
bias=True,
layer_name="linear",
)
layer_name="linear", )
self.append(activation(), layer_name="act")
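The constructor signature is unchanged by this reformat, so a VanillaNN block is still built directly from an input shape; a small usage sketch (shapes are illustrative, and the output dimension follows dnn_neurons):

import paddle

from paddlespeech.s2t.models.wav2vec2.modules.VanillaNN import VanillaNN

# Batch of 4 utterances, 100 frames, 1024-dim wav2vec2 features (illustrative).
inputs = paddle.rand([4, 100, 1024])
model = VanillaNN(input_shape=inputs.shape, dnn_blocks=2, dnn_neurons=512)
outputs = model(inputs)
print(outputs.shape)  # expected [4, 100, 512]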
23 changes: 14 additions & 9 deletions paddlespeech/s2t/models/wav2vec2/modules/activations.py
@@ -11,12 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

from packaging import version
from paddle import Tensor, nn

from paddle import nn
from paddle import Tensor

from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
@@ -29,7 +27,9 @@ class NewGELUActivation(nn.Layer):
"""

def forward(self, input: Tensor) -> Tensor:
return 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0))))
return 0.5 * input * (1.0 + paddle.tanh(
math.sqrt(2.0 / math.pi) *
(input + 0.044715 * paddle.pow(input, 3.0))))


class GELUActivation(nn.Layer):
@@ -40,7 +40,7 @@ class GELUActivation(nn.Layer):
Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""

def __init__(self, use_gelu_python: bool = False):
def __init__(self, use_gelu_python: bool=False):
super().__init__()
self.act = nn.functional.gelu

@@ -57,7 +57,9 @@ class FastGELUActivation(nn.Layer):
"""

def forward(self, input: Tensor) -> Tensor:
return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))
return 0.5 * input * (
1.0 + paddle.tanh(input * 0.7978845608 *
(1.0 + 0.044715 * input * input)))


class QuickGELUActivation(nn.Layer):
@@ -84,7 +86,8 @@ class ClippedGELUActivation(nn.Layer):

def __init__(self, min: float, max: float):
if min > max:
raise ValueError(f"min should be < max (got min: {min}, max: {max})")
raise ValueError(
f"min should be < max (got min: {min}, max: {max})")

super().__init__()
self.min = min
@@ -161,7 +164,9 @@ def get_activation(activation_string):
if activation_string in ACT2FN:
return ACT2FN[activation_string]
else:
raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
raise KeyError(
f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}"
)


# For backwards compatibility with: from activations import gelu_python
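The get_activation helper reformatted above is the module's lookup entry point; a brief usage sketch (the "gelu" key is assumed to be present in ACT2FN, which this hunk does not show):

import paddle

from paddlespeech.s2t.models.wav2vec2.modules.activations import get_activation

act = get_activation("gelu")        # assumed key; unknown names raise KeyError
x = paddle.linspace(-3.0, 3.0, 7)
print(act(x))                       # GELU-activated values

try:
    get_activation("not_an_activation")
except KeyError as e:
    print(e)                        # message lists the available ACT2FN keys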
(The remaining 7 of the 15 changed files are not rendered on this page.)
