Removed need for torchaudio, added improvements for APEX mixed precision, improved tuning for LMs, updated Docs
sean.narenthiran committed Jul 25, 2019
1 parent 98c987a commit b8e34cc
Showing 11 changed files with 131 additions and 177 deletions.
25 changes: 11 additions & 14 deletions README.md
@@ -35,13 +35,6 @@ export CUDA_HOME="/usr/local/cuda"
cd ../pytorch_binding && python setup.py install
```

Install pytorch audio:
```
sudo apt-get install sox libsox-dev libsox-fmt-all
git clone https://github.com/pytorch/audio.git
cd audio && python setup.py install
```

Install NVIDIA apex:
```
git clone --recursive https://github.com/NVIDIA/apex.git
@@ -180,22 +173,26 @@ python -m multiproc train.py --visdom --cuda # Add your parameters as normal, mu

multiproc will open a log for all processes other than the main process.

You can also specify which GPU IDs to use rather than allowing the script to use all available GPUs:

```
python -m multiproc train.py --visdom --cuda --device-ids 0,1,2,3 # Add your parameters as normal, will only run on 4 GPUs
```

We suggest using the NCCL backend, which falls back to TCP if InfiniBand isn't available.

## Mixed Precision

If you are using NVIDIA Volta cards or above to train your model, it's highly suggested to turn on mixed precision for speed/memory benefits. More information can be found [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). Also suggested is to turn on dynamic loss scaling to handle small grad values:
If you are using NVIDIA Volta cards or above to train your model, it's highly suggested to turn on mixed precision for speed/memory benefits. More information can be found [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html).

```
python train.py --train-manifest data/train_manifest.csv --val-manifest data/val_manifest.csv --mixed-precision --dynamic-loss-scale
```

You can also specify which GPU IDs to use rather than allowing the script to use all available GPUs:
Different optimization levels are available. More information on the NVIDIA Apex amp API can be found [here](https://nvidia.github.io/apex/amp.html#opt-levels).

```
python -m multiproc train.py --visdom --cuda --device-ids 0,1,2,3 # Add your parameters as normal, will only run on 4 GPUs
python train.py --train-manifest data/train_manifest.csv --val-manifest data/val_manifest.csv --opt-level O1 --loss-scale 1.0
```

Training a model in mixed precision means you can run it with either 32-bit float or half precision at inference time. Float is the default; to use half precision (which on V100s gives a speedup and better memory use), add the `--half` flag when testing or transcribing.
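
For example, a half-precision test run might look like the following (a sketch assuming a `--test-manifest` flag in test.py, which is not shown in this diff; paths are illustrative):

```
python test.py --model-path models/deepspeech_final.pth --test-manifest data/val_manifest.csv --cuda --half
```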

### Noise Augmentation/Injection

There is support for two different types of noise; noise augmentation and noise injection.
6 changes: 3 additions & 3 deletions data/data_loader.py
@@ -10,7 +10,7 @@
import numpy as np
import scipy.signal
import torch
import torchaudio
from scipy.io.wavfile import read
import math
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
@@ -20,8 +20,8 @@


def load_audio(path):
sound, _ = torchaudio.load(path, normalization=True)
sound = sound.numpy().T
sample_rate, sound = read(path)
sound = sound.astype('float32') / 32767 # normalize audio
if len(sound.shape) > 1:
if sound.shape[1] == 1:
sound = sound.squeeze()
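
Note the new loader assumes 16-bit PCM WAV input — that is what the `/ 32767` scaling implies — whereas torchaudio previously normalized other encodings too. A minimal sketch of the full loading path under that assumption (the `else` branch for true multi-channel audio is truncated in the diff above; averaging the channels is an assumption here):

```
from scipy.io.wavfile import read

def load_audio(path):
    # scipy returns (sample_rate, ndarray); int16 PCM is scaled to [-1, 1]
    sample_rate, sound = read(path)
    sound = sound.astype('float32') / 32767  # normalize audio
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)  # multiple channels: average (assumed)
    return sound
```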
12 changes: 3 additions & 9 deletions model.py
@@ -143,7 +143,7 @@ def __repr__(self):

class DeepSpeech(nn.Module):
def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layers=5, audio_conf=None,
bidirectional=True, context=20, mixed_precision=False):
bidirectional=True, context=20):
super(DeepSpeech, self).__init__()

# model metadata needed for serialization/deserialization
@@ -156,7 +156,6 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layer
self.audio_conf = audio_conf or {}
self.labels = labels
self.bidirectional = bidirectional
self.mixed_precision = mixed_precision

sample_rate = self.audio_conf.get("sample_rate", 16000)
window_size = self.audio_conf.get("window_size", 0.02)
@@ -201,8 +200,6 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layer
self.inference_softmax = InferenceBatchSoftmax()

def forward(self, x, lengths):
if x.is_cuda and self.mixed_precision:
x = x.half()
lengths = lengths.cpu().int()
output_lengths = self.get_seq_lens(lengths)
x, _ = self.conv(x, output_lengths)
@@ -244,8 +241,7 @@ def load_model(cls, path):
labels=package['labels'],
audio_conf=package['audio_conf'],
rnn_type=supported_rnns[package['rnn_type']],
bidirectional=package.get('bidirectional', True),
mixed_precision=package.get('mixed_precision', False))
bidirectional=package.get('bidirectional', True))
model.load_state_dict(package['state_dict'])
for x in model.rnns:
x.flatten_parameters()
@@ -258,8 +254,7 @@ def load_model_package(cls, package):
labels=package['labels'],
audio_conf=package['audio_conf'],
rnn_type=supported_rnns[package['rnn_type']],
bidirectional=package.get('bidirectional', True),
mixed_precision=package.get('mixed_precision', False))
bidirectional=package.get('bidirectional', True))
model.load_state_dict(package['state_dict'])
return model

@@ -275,7 +270,6 @@ def serialize(model, optimizer=None, epoch=None, iteration=None, loss_results=No
'labels': model.labels,
'state_dict': model.state_dict(),
'bidirectional': model.bidirectional,
'mixed_precision': model.mixed_precision
}
if optimizer is not None:
package['optim_dict'] = optimizer.state_dict()
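
Because the constructor no longer reads a `mixed_precision` entry, checkpoints written by older versions still load cleanly — the stale key in the package dict is simply ignored. A sketch (path illustrative):

```
from model import DeepSpeech

# load_model rebuilds the model from the package dict; any leftover
# 'mixed_precision' entry from an old checkpoint is ignored
model = DeepSpeech.load_model('models/deepspeech_final.pth')
model.eval()
```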
6 changes: 4 additions & 2 deletions noise_inject.py
@@ -1,7 +1,7 @@
import argparse

import torch
import torchaudio
from scipy.io.wavfile import write

from data.data_loader import load_audio, NoiseInjection

@@ -18,5 +18,7 @@
data = load_audio(args.input_path)
mixed_data = noise_injector.inject_noise_sample(data, args.noise_path, args.noise_level)
mixed_data = torch.tensor(mixed_data, dtype=torch.float).unsqueeze(1) # Add channels dim
torchaudio.save(args.output_path, mixed_data, args.sample_rate)
write(filename=args.output_path,
data=mixed_data.numpy(),
rate=args.sample_rate)
print('Saved mixed file to %s' % args.output_path)
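
One consequence of swapping torchaudio for scipy here: `scipy.io.wavfile.write` preserves the array dtype, so the mixed float tensor is written out as a 32-bit float WAV rather than int16 PCM. A usage sketch (assuming the CLI flags mirror the `args` attributes above; paths and values are illustrative):

```
python noise_inject.py --input-path data/sample.wav --noise-path data/noise.wav \
    --noise-level 0.5 --output-path out_mixed.wav --sample-rate 16000
```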
4 changes: 3 additions & 1 deletion opts.py
@@ -17,7 +17,9 @@ def add_decoder_args(parser):


def add_inference_args(parser):
parser.add_argument('--cuda', action="store_true", help='Use cuda to test model')
parser.add_argument('--cuda', action="store_true", help='Use cuda')
parser.add_argument('--half', action="store_true",
help='Use half precision. This is recommended when using mixed-precision at training time')
parser.add_argument('--decoder', default="greedy", choices=["greedy", "beam"], type=str, help="Decoder to use")
parser.add_argument('--model-path', default='models/deepspeech_final.pth',
help='Path to model file created by training')
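
Any script built on this helper picks up the new `--half` flag automatically, since `add_argument` mutates the parser in place. A minimal sketch:

```
import argparse
from opts import add_inference_args

parser = argparse.ArgumentParser(description='DeepSpeech inference')
add_inference_args(parser)  # argparse parsers are mutated in place
args = parser.parse_args(['--cuda', '--half'])
print(args.half)  # True
```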
9 changes: 7 additions & 2 deletions server.py
@@ -32,7 +32,12 @@ def transcribe_file():
with NamedTemporaryFile(suffix=file_extension) as tmp_saved_audio_file:
file.save(tmp_saved_audio_file.name)
logging.info('Transcribing file...')
transcription, _ = transcribe(tmp_saved_audio_file.name, spect_parser, model, decoder, device)
transcription, _ = transcribe(audio_path=tmp_saved_audio_file,
spect_parser=spect_parser,
model=model,
decoder=decoder,
device=device,
use_half=args.half)
logging.info('File transcribed')
res['status'] = "OK"
res['transcription'] = transcription
@@ -53,7 +58,7 @@ def main():
logging.info('Setting up server...')
torch.set_grad_enabled(False)
device = torch.device("cuda" if args.cuda else "cpu")
model = load_model(device, args.model_path, args.cuda)
model = load_model(device, args.model_path, args.half)

if args.decoder == "beam":
from decoder import BeamCTCDecoder
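
With the server up, a request would look roughly like this (a sketch assuming the Flask route accepts a multipart `file` field, which the `file.save(...)` call above suggests; endpoint and port are illustrative):

```
curl -X POST http://localhost:8888/transcribe -F "file=@/path/to/audio.wav"
```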
19 changes: 10 additions & 9 deletions test.py
@@ -16,15 +16,14 @@
parser.add_argument('--batch-size', default=20, type=int, help='Batch size for training')
parser.add_argument('--num-workers', default=4, type=int, help='Number of workers used in dataloading')
parser.add_argument('--verbose', action="store_true", help="print out decoded output and error of each sample")
parser.add_argument('--output-path', default=None, type=str, help="Where to save raw acoustic output")
parser.add_argument('--save-output', default=None, help="Saves output of model from test to this file_path")
parser = add_decoder_args(parser)
parser.add_argument('--save-output', action="store_true", help="Saves output of model from test")
args = parser.parse_args()

if __name__ == '__main__':
torch.set_grad_enabled(False)
device = torch.device("cuda" if args.cuda else "cpu")
model = load_model(device, args.model_path, args.cuda)
model = load_model(device, args.model_path, args.half)

if args.decoder == "beam":
from decoder import BeamCTCDecoder
@@ -47,6 +46,8 @@
inputs, targets, input_percentages, target_sizes = data
input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
inputs = inputs.to(device)
if args.half:
inputs = inputs.half()
# unflatten targets
split_targets = []
offset = 0
@@ -56,12 +57,12 @@

out, output_sizes = model(inputs, input_sizes)

if args.save_output:
# add output to data array, and continue
output_data.append((out.cpu().numpy(), output_sizes.numpy()))

decoded_output, _ = decoder.decode(out, output_sizes)
target_strings = target_decoder.convert_to_strings(split_targets)

if args.save_output is not None:
# add output to data array, and continue
output_data.append((out.cpu().numpy(), output_sizes.numpy(), target_strings))
for x in range(len(target_strings)):
transcript, reference = decoded_output[x][0], target_strings[x][0]
wer_inst = decoder.wer(transcript, reference)
@@ -81,5 +82,5 @@
print('Test Summary \t'
'Average WER {wer:.3f}\t'
'Average CER {cer:.3f}\t'.format(wer=wer * 100, cer=cer * 100))
if args.save_output:
np.save(args.output_path, output_data)
if args.save_output is not None:
np.save(args.save_output, output_data)
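
Since `--save-output` is now the destination path itself, the saved array can be reloaded later for rescoring or error analysis. `np.save` pickles the list of `(output, output_sizes, target_strings)` tuples, so reading it back needs `allow_pickle` on recent NumPy versions — a sketch (path illustrative):

```
import numpy as np

# the saved object is a list of tuples, not a plain numeric array,
# hence allow_pickle
output_data = np.load('test_output.npy', allow_pickle=True)
for out, output_sizes, target_strings in output_data:
    print(out.shape, output_sizes.shape, len(target_strings))
```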
52 changes: 20 additions & 32 deletions train.py
@@ -7,7 +7,7 @@
import numpy as np
import torch.distributed as dist
import torch.utils.data.distributed
from apex.fp16_utils import FP16_Optimizer
from apex import amp
from apex.parallel import DistributedDataParallel
from tqdm import tqdm
from warpctc_pytorch import CTCLoss
@@ -16,7 +16,7 @@
from decoder import GreedyDecoder
from logger import VisdomLogger, TensorBoardLogger
from model import DeepSpeech, supported_rnns
from utils import convert_model_to_half, reduce_tensor, check_loss
from utils import reduce_tensor, check_loss

parser = argparse.ArgumentParser(description='DeepSpeech training')
parser.add_argument('--train-manifest', metavar='DIR',
@@ -77,15 +77,10 @@
parser.add_argument('--gpu-rank', default=None,
help='If using distributed parallel for multi-gpu, sets the GPU for the process')
parser.add_argument('--seed', default=123456, type=int, help='Seed to generators')
parser.add_argument('--mixed-precision', action='store_true',
help='Uses mixed precision to train a model (suggested with volta and above)')
parser.add_argument('--static-loss-scale', type=float, default=1,
help='Static loss scale for mixed precision, ' +
'positive power of 2 values can improve FP16 convergence,' +
'however dynamic loss scaling is preferred.')
parser.add_argument('--dynamic-loss-scale', action='store_true',
help='Use dynamic loss scaling for mixed precision. If supplied, this argument supersedes ' +
'--static_loss_scale. Suggested to turn on for mixed precision')
parser.add_argument('--opt-level', type=str)
parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
parser.add_argument('--loss-scale', type=str, default=None)

torch.manual_seed(123456)
torch.cuda.manual_seed_all(123456)

@@ -123,8 +118,6 @@ def update(self, val, n=1):
random.seed(args.seed)

device = torch.device("cuda" if args.cuda else "cpu")
if args.mixed_precision and not args.cuda:
raise ValueError('If using mixed precision training, CUDA must be enabled!')
args.distributed = args.world_size > 1
main_proc = True
device = torch.device("cuda" if args.cuda else "cpu")
@@ -150,7 +143,7 @@ def update(self, val, n=1):
print("Loading checkpoint model %s" % args.continue_from)
package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
model = DeepSpeech.load_model_package(package)
labels = model.labels
labels = model.labels
audio_conf = model.audio_conf
if not args.finetune: # Don't want to restart training
optim_state = package['optim_dict']
@@ -188,8 +181,7 @@ def update(self, val, n=1):
labels=labels,
rnn_type=supported_rnns[rnn_type],
audio_conf=audio_conf,
bidirectional=args.bidirectional,
mixed_precision=args.mixed_precision)
bidirectional=args.bidirectional)

decoder = GreedyDecoder(labels)
train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels,
@@ -211,19 +203,18 @@
train_sampler.shuffle(start_epoch)

model = model.to(device)
if args.mixed_precision:
model = convert_model_to_half(model)
parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=args.lr,
momentum=args.momentum, nesterov=True, weight_decay=1e-5)
if args.distributed:
model = DistributedDataParallel(model)
if args.mixed_precision:
optimizer = FP16_Optimizer(optimizer,
static_loss_scale=args.static_loss_scale,
dynamic_loss_scale=args.dynamic_loss_scale)
if optim_state is not None:
optimizer.load_state_dict(optim_state)

model, optimizer = amp.initialize(model, optimizer,
opt_level=args.opt_level,
keep_batchnorm_fp32=args.keep_batchnorm_fp32,
loss_scale=args.loss_scale)
if args.distributed:
model = DistributedDataParallel(model)
print(model)
print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

@@ -263,12 +254,10 @@ def update(self, val, n=1):
if valid_loss:
optimizer.zero_grad()
# compute gradient
if args.mixed_precision:
optimizer.backward(loss)
optimizer.clip_master_grads(args.max_norm)
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)

with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
optimizer.step()
else:
print(error)
@@ -364,8 +353,7 @@ def update(self, val, n=1):
wer_results=wer_results, cer_results=cer_results),
file_path)
# anneal lr
param_groups = optimizer.optimizer.param_groups if args.mixed_precision else optimizer.param_groups
for g in param_groups:
for g in optimizer.param_groups:
g['lr'] = g['lr'] / args.learning_anneal
print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))

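
Taken together, the train.py changes follow the standard Apex amp recipe: wrap model and optimizer with `amp.initialize`, then run backward through `amp.scale_loss`. A self-contained sketch of that pattern (dummy model and data; `O1`, the learning rate, and the max norm of 400 are illustrative values, not the project's required settings):

```
import torch
import torch.nn as nn
from apex import amp  # assumes NVIDIA Apex is installed

model = nn.Linear(128, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3,
                            momentum=0.9, nesterov=True, weight_decay=1e-5)
criterion = nn.MSELoss()

# O0 = pure FP32, O1 = conservative mixed precision,
# O2 = "almost FP16", O3 = pure FP16
model, optimizer = amp.initialize(model, optimizer, opt_level='O1',
                                  loss_scale=1.0)

for _ in range(10):
    inputs = torch.randn(32, 128).cuda()
    targets = torch.randn(32, 10).cuda()
    loss = criterion(model(inputs), targets)
    optimizer.zero_grad()
    # scale_loss multiplies the loss so FP16 gradients don't underflow;
    # gradients are unscaled again before the optimizer step
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=400)
    optimizer.step()
```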