format wav2vec2 demo
Zth9730 committed Oct 10, 2022
1 parent 6e429f0 commit 19180d3
Showing 15 changed files with 558 additions and 485 deletions.
2 changes: 1 addition & 1 deletion .flake8
@@ -33,7 +33,7 @@ filename =
# Specify a list of codes to ignore.
ignore =
W503
E252,E262,E127,E265,E126,E266,E241,E261,E128,E125
E252,E262,E127,E265,E126,E266,E241,E261,E128,E125,E129
W291,W293,W605
E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
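For reference, E129 ("visually indented line with same indent as next logical line") fires on wrapped conditions like the sketch below, a pattern that automatic line wrapping can produce; adding it to the ignore list keeps the linter quiet about that style.

cond_a, cond_b = True, False
if (cond_a and
    cond_b):  # E129: the continuation line shares its indent with the body below
    print("both conditions hold")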
2 changes: 1 addition & 1 deletion examples/librispeech/README.md
@@ -3,7 +3,7 @@
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature

* asr3 - wav2vecASR, ASR model with pre-trained wav2vec2 and CTC

## Data
| Data Subset | Duration in Seconds |
Expand Down
30 changes: 30 additions & 0 deletions paddlespeech/audio/transform/spectrogram.py
@@ -382,6 +382,36 @@ def __call__(self, x, train):
return mat


class WavProcess():
    def __init__(self, dither=0.1):
        """
        Args:
            dither (float): Dithering constant (stored, but not applied in this version).
        """
        self.dither = dither

    def __call__(self, x, train):
        """
        Args:
            x (np.ndarray): raw mono waveform, shape (Ti,)
            train (bool): True in train mode.
        Raises:
            ValueError: if x has shape (Ti, C); multi-channel input is not supported.
        Returns:
            np.ndarray: waveform with a trailing feature axis, shape (T, 1)
        """
        dither = self.dither if train else 0.0  # computed but currently unused
        if x.ndim != 1:
            raise ValueError("Not support x: [Time, Channel]")
        waveform = np.expand_dims(x, -1)
        return waveform


class LogMelSpectrogramKaldi_decay():
def __init__(
self,
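For context, the new WavProcess transform leaves the audio samples untouched and only adds a trailing feature axis, turning a mono waveform of shape (T,) into (T, 1); the stored dither value is not applied in this version. A minimal usage sketch, assuming the module layout in this commit:

import numpy as np

from paddlespeech.audio.transform.spectrogram import WavProcess

x = np.random.randn(16000).astype(np.float32)  # 1 s of mono audio at 16 kHz
wav_process = WavProcess(dither=0.1)
out = wav_process(x, train=True)
print(out.shape)  # (16000, 1): samples unchanged, trailing axis added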
1 change: 1 addition & 0 deletions paddlespeech/audio/transform/transformation.py
@@ -41,6 +41,7 @@
utterance_cmvn="paddlespeech.audio.transform.cmvn:UtteranceCMVN",
fbank="paddlespeech.audio.transform.spectrogram:LogMelSpectrogram",
spectrogram="paddlespeech.audio.transform.spectrogram:Spectrogram",
wav_process="paddlespeech.audio.transform.spectrogram:WavProcess",
stft="paddlespeech.audio.transform.spectrogram:Stft",
istft="paddlespeech.audio.transform.spectrogram:IStft",
stft2fbank="paddlespeech.audio.transform.spectrogram:Stft2LogMelSpectrogram",
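With this entry, a preprocessing config that names wav_process resolves to the WavProcess class above. A rough sketch of how such a pipeline might be declared, assuming the Transformation loader accepts an inline dict with a "process" list (the ESPnet-style schema this module follows); treat the exact keys as an assumption, not something verified against this commit:

import numpy as np

from paddlespeech.audio.transform.transformation import Transformation

# Hypothetical config: a single-stage pipeline that only adds the channel axis.
preprocess_conf = {"process": [{"type": "wav_process", "dither": 0.0}]}

transform = Transformation(preprocess_conf)
feat = transform(np.random.randn(16000).astype(np.float32), train=False)
print(feat.shape)  # expected (16000, 1)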
12 changes: 6 additions & 6 deletions paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py
@@ -27,15 +27,15 @@
from paddlespeech.s2t.utils.utility import UpdateConfig
logger = Log(__name__).getlog()


class Wav2vec2Infer():
def __init__(self, config, args):
self.args = args
self.config = config
self.audio_file = args.audio_file

self.text_feature = TextFeaturizer(
unit_type=config.unit_type,
vocab=config.vocab_filepath)
unit_type=config.unit_type, vocab=config.vocab_filepath)
paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')

# model
@@ -63,10 +63,10 @@ def run(self):
xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
decode_config = self.config.decode
result_transcripts, result_tokenids = self.model.decode(
xs,
text_feature=self.text_feature,
decoding_method=decode_config.decoding_method,
beam_size=decode_config.beam_size)
xs,
text_feature=self.text_feature,
decoding_method=decode_config.decoding_method,
beam_size=decode_config.beam_size)
rsl = result_transcripts[0]
utt = Path(self.audio_file).name
logger.info(f"hyp: {utt} {rsl}")
63 changes: 35 additions & 28 deletions paddlespeech/s2t/exps/wav2vec2/model.py
@@ -18,53 +18,53 @@
from collections import defaultdict
from collections import OrderedDict
from contextlib import nullcontext
from paddlespeech.s2t.utils import mp_tools

import jsonlines
import numpy as np
import paddle
from paddle import distributed as dist

from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.io.dataloader import StreamDataLoader
from paddlespeech.s2t.io.dataloader import DataLoaderFactory
from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
from paddlespeech.s2t.io.dataloader import StreamDataLoader
from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment
from paddlespeech.s2t.utils import error_rate

from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
from paddlespeech.s2t.training.optimizer import OptimizerFactory
from paddlespeech.s2t.training.reporter import ObsScope
from paddlespeech.s2t.training.reporter import report
from paddlespeech.s2t.training.scheduler import LRSchedulerFactory
from paddlespeech.s2t.training.timer import Timer
from paddlespeech.s2t.training.trainer import Trainer
from paddlespeech.s2t.utils.utility import UpdateConfig
from paddlespeech.s2t.utils import error_rate
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils import mp_tools
from paddlespeech.s2t.utils.log import Log


from paddlespeech.s2t.utils.utility import UpdateConfig

logger = Log(__name__).getlog()


class Wav2Vec2ASRTrainer(Trainer):
def __init__(self, config, args):
super().__init__(config, args)
self.avg_train_loss = 0

def train_batch(self, batch_index, batch, msg):
train_conf = self.config
start = time.time()

# forward
utt, wav, wavs_lens, target, target_lens = batch
wavs_lens_rate = wavs_lens / wav.shape[1]
wavs_lens_rate = wavs_lens / wav.shape[1]
target_lens_rate = target_lens / target.shape[1]
wav = wav[:,:,0]
wav = wav[:, :, 0]
wav = self.speech_augmentation(wav, wavs_lens_rate)
loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)
# print(wav, wavs_lens_rate, target, target_lens_rate)
# loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad

losses_np = {'loss': float(loss) * train_conf.accum_grad}

# loss backward
@@ -108,15 +108,16 @@ def train_batch(self, batch_index, batch, msg):
def valid(self):
self.model.eval()
if not self.use_streamdata:
logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
logger.info(
f"Valid Total Examples: {len(self.valid_loader.dataset)}")
valid_losses = defaultdict(list)
num_seen_utts = 1
total_loss = 0.0
for i, batch in enumerate(self.valid_loader):
utt, wav, wavs_lens, target, target_lens = batch
wavs_lens_rate = wavs_lens / wav.shape[1]
wavs_lens_rate = wavs_lens / wav.shape[1]
target_lens_rate = target_lens / target.shape[1]
wav = wav[:,:,0]
wav = wav[:, :, 0]
loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)

if paddle.isfinite(loss):
@@ -134,7 +135,8 @@ def valid(self):
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
if not self.use_streamdata:
msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader))
msg += "batch: {}/{}, ".format(i + 1,
len(self.valid_loader))
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in valid_dump.items())
logger.info(msg)
@@ -155,7 +157,8 @@ def do_train(self):
self.before_train()

if not self.use_streamdata:
logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
logger.info(
f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
@@ -223,14 +226,18 @@ def setup_dataloader(self):
config = self.config.clone()
self.use_streamdata = config.get("use_stream_data", False)
if self.train:
self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args)
self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args)
self.train_loader = DataLoaderFactory.get_dataloader(
'train', config, self.args)
self.valid_loader = DataLoaderFactory.get_dataloader(
'valid', config, self.args)
logger.info("Setup train/valid Dataloader!")
else:
decode_batch_size = config.get('decode', dict()).get(
'decode_batch_size', 1)
self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args)
self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args)
self.test_loader = DataLoaderFactory.get_dataloader('test', config,
self.args)
self.align_loader = DataLoaderFactory.get_dataloader(
'align', config, self.args)
logger.info("Setup test/align Dataloader!")

def setup_model(self):
@@ -248,7 +255,7 @@ def setup_model(self):
model = Wav2vec2ASR.from_config(model_conf)

if self.parallel:
model = paddle.DataParallel(model, find_unused_parameters=True)
model = paddle.DataParallel(model, find_unused_parameters=True)

logger.info(f"{model}")
layer_tools.print_params(model, logger.info)
@@ -312,14 +319,14 @@ def __init__(self, config, args):
self.text_featurizer = TextFeaturizer(
unit_type=config.unit_type, vocab=config.vocab_filepath)
self.vocab_list = self.text_featurizer.vocab_list

def id2token(self, texts, texts_len):
""" ord() id to chr() chr """
trans = []
for text, n in zip(texts, texts_len):
n = n.numpy().item()
ids = text[:n]
trans.append(
self.text_featurizer.defeaturize(ids.numpy().tolist()))
trans.append(self.text_featurizer.defeaturize(ids.numpy().tolist()))
return trans

def compute_metrics(self,
@@ -337,10 +344,10 @@ def compute_metrics(self,
start_time = time.time()
target_transcripts = self.id2token(texts, texts_len)
result_transcripts, result_tokenids = self.model.decode(
audio,
text_feature=self.text_featurizer,
decoding_method=decode_cfg.decoding_method,
beam_size=decode_cfg.beam_size)
audio,
text_feature=self.text_featurizer,
decoding_method=decode_cfg.decoding_method,
beam_size=decode_cfg.beam_size)
decode_time = time.time() - start_time

for utt, target, result, rec_tids in zip(
@@ -432,4 +439,4 @@ def test(self):
"decode_method":
self.config.decode.decoding_method,
})
f.write(data + '\n')
f.write(data + '\n')
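A note on the loss /= train_conf.accum_grad line in train_batch above: it implements gradient accumulation, where several micro-batches contribute scaled losses before a single optimizer step. A self-contained sketch of the pattern with a toy model (illustrative only; accum_grad mirrors the config key used by the trainer, the model and data are placeholders):

import paddle

# Toy model and optimizer to illustrate the accumulation pattern.
model = paddle.nn.Linear(8, 1)
optimizer = paddle.optimizer.Adam(parameters=model.parameters())
accum_grad = 4  # mirrors train_conf.accum_grad above

for batch_index in range(16):
    x = paddle.randn([2, 8])
    y = paddle.randn([2, 1])
    loss = paddle.nn.functional.mse_loss(model(x), y)
    loss = loss / accum_grad          # scale so accumulated gradients average out
    loss.backward()                   # gradients accumulate across micro-batches
    if (batch_index + 1) % accum_grad == 0:
        optimizer.step()              # one update per accum_grad micro-batches
        optimizer.clear_grad()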
15 changes: 7 additions & 8 deletions paddlespeech/s2t/models/wav2vec2/modules/VanillaNN.py
@@ -3,6 +3,7 @@
* Elena Rastorgueva 2020
"""
import paddle

from paddlespeech.s2t.models.wav2vec2.modules import containers
from paddlespeech.s2t.models.wav2vec2.modules import linear

@@ -27,19 +28,17 @@ class VanillaNN(containers.Sequential):
"""

def __init__(
self,
input_shape,
activation=paddle.nn.LeakyReLU,
dnn_blocks=2,
dnn_neurons=512,
):
self,
input_shape,
activation=paddle.nn.LeakyReLU,
dnn_blocks=2,
dnn_neurons=512, ):
super().__init__(input_shape=input_shape)

for block_index in range(dnn_blocks):
self.append(
linear.Linear,
n_neurons=dnn_neurons,
bias=True,
layer_name="linear",
)
layer_name="linear", )
self.append(activation(), layer_name="act")
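The constructor signature is unchanged by this reformat, so a VanillaNN block is still built directly from an input shape; a small usage sketch (shapes are illustrative, and the output dimension follows dnn_neurons):

import paddle

from paddlespeech.s2t.models.wav2vec2.modules.VanillaNN import VanillaNN

# Batch of 4 utterances, 100 frames, 1024-dim wav2vec2 features (illustrative).
inputs = paddle.rand([4, 100, 1024])
model = VanillaNN(input_shape=inputs.shape, dnn_blocks=2, dnn_neurons=512)
outputs = model(inputs)
print(outputs.shape)  # expected [4, 100, 512]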
23 changes: 14 additions & 9 deletions paddlespeech/s2t/models/wav2vec2/modules/activations.py
@@ -11,12 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

from packaging import version
from paddle import Tensor, nn

from paddle import nn
from paddle import Tensor

from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
@@ -29,7 +27,9 @@ class NewGELUActivation(nn.Layer):
"""

def forward(self, input: Tensor) -> Tensor:
return 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0))))
return 0.5 * input * (1.0 + paddle.tanh(
math.sqrt(2.0 / math.pi) *
(input + 0.044715 * paddle.pow(input, 3.0))))


class GELUActivation(nn.Layer):
@@ -40,7 +40,7 @@ class GELUActivation(nn.Layer):
Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""

def __init__(self, use_gelu_python: bool = False):
def __init__(self, use_gelu_python: bool=False):
super().__init__()
self.act = nn.functional.gelu

@@ -57,7 +57,9 @@ class FastGELUActivation(nn.Layer):
"""

def forward(self, input: Tensor) -> Tensor:
return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))
return 0.5 * input * (
1.0 + paddle.tanh(input * 0.7978845608 *
(1.0 + 0.044715 * input * input)))


class QuickGELUActivation(nn.Layer):
@@ -84,7 +86,8 @@ class ClippedGELUActivation(nn.Layer):

def __init__(self, min: float, max: float):
if min > max:
raise ValueError(f"min should be < max (got min: {min}, max: {max})")
raise ValueError(
f"min should be < max (got min: {min}, max: {max})")

super().__init__()
self.min = min
@@ -161,7 +164,9 @@ def get_activation(activation_string):
if activation_string in ACT2FN:
return ACT2FN[activation_string]
else:
raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
raise KeyError(
f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}"
)


# For backwards compatibility with: from activations import gelu_python
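The get_activation helper reformatted above is the module's lookup entry point; a brief usage sketch (the "gelu" key is assumed to be present in ACT2FN, which this hunk does not show):

import paddle

from paddlespeech.s2t.models.wav2vec2.modules.activations import get_activation

act = get_activation("gelu")        # assumed key; unknown names raise KeyError
x = paddle.linspace(-3.0, 3.0, 7)
print(act(x))                       # GELU-activated values

try:
    get_activation("not_an_activation")
except KeyError as e:
    print(e)                        # message lists the available ACT2FN keys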
(The remaining 7 of the 15 changed files are not rendered on this page.)
