Removed need for torchaudio, added improvements for APEX mixed precision, improved tuning for LMs, updated Docs
sean.narenthiran committed Jul 25, 2019
1 parent 98c987a commit b8e34cc
Showing 11 changed files with 131 additions and 177 deletions.
25 changes: 11 additions & 14 deletions README.md
@@ -35,13 +35,6 @@ export CUDA_HOME="/usr/local/cuda"
cd ../pytorch_binding && python setup.py install
```

Install pytorch audio:
```
sudo apt-get install sox libsox-dev libsox-fmt-all
git clone https://github.com/pytorch/audio.git
cd audio && python setup.py install
```

Install NVIDIA apex:
```
git clone --recursive https://github.com/NVIDIA/apex.git
@@ -180,22 +173,26 @@ python -m multiproc train.py --visdom --cuda # Add your parameters as normal, mu

multiproc will open a log for all processes other than the main process.

You can also specify which GPU IDs to use rather than allowing the script to use all available GPUs:

```
python -m multiproc train.py --visdom --cuda --device-ids 0,1,2,3 # Add your parameters as normal, will only run on 4 GPUs
```

We suggest using the NCCL backend, which falls back to TCP if InfiniBand isn't available.

## Mixed Precision

If you are using NVIDIA Volta cards or above to train your model, it's highly suggested to turn on mixed precision for speed/memory benefits. More information can be found [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html). Also suggested is to turn on dynamic loss scaling to handle small grad values:
If you are using NVIDIA Volta cards or above to train your model, it's highly suggested to turn on mixed precision for speed/memory benefits. More information can be found [here](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html).

```
python train.py --train-manifest data/train_manifest.csv --val-manifest data/val_manifest.csv --mixed-precision --dynamic-loss-scale
```

You can also specify which GPU IDs to use rather than allowing the script to use all available GPUs:
Different optimization levels are available. More information on the NVIDIA Apex amp API can be found [here](https://nvidia.github.io/apex/amp.html#opt-levels).

```
python -m multiproc train.py --visdom --cuda --device-ids 0,1,2,3 # Add your parameters as normal, will only run on 4 GPUs
python train.py --train-manifest data/train_manifest.csv --val-manifest data/val_manifest.csv --opt-level O1 --loss-scale 1.0
```

Training a model in mixed precision means you can run it with either 32-bit float or half precision at inference time. Float is the default; to use half precision (which on V100s gives a speedup and better memory use), add the `--half` flag when testing or transcribing.
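
For example, a half-precision test run might look like the following (a sketch assuming a `--test-manifest` flag in test.py, which is not shown in this diff; paths are illustrative):

```
python test.py --model-path models/deepspeech_final.pth --test-manifest data/val_manifest.csv --cuda --half
```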

### Noise Augmentation/Injection

There is support for two different types of noise; noise augmentation and noise injection.
6 changes: 3 additions & 3 deletions data/data_loader.py
@@ -10,7 +10,7 @@
import numpy as np
import scipy.signal
import torch
import torchaudio
from scipy.io.wavfile import read
import math
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
@@ -20,8 +20,8 @@


def load_audio(path):
sound, _ = torchaudio.load(path, normalization=True)
sound = sound.numpy().T
sample_rate, sound = read(path)
sound = sound.astype('float32') / 32767 # normalize audio
if len(sound.shape) > 1:
if sound.shape[1] == 1:
sound = sound.squeeze()
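
Note the new loader assumes 16-bit PCM WAV input — that is what the `/ 32767` scaling implies — whereas torchaudio previously normalized other encodings too. A minimal sketch of the full loading path under that assumption (the `else` branch for true multi-channel audio is truncated in the diff above; averaging the channels is an assumption here):

```
from scipy.io.wavfile import read

def load_audio(path):
    # scipy returns (sample_rate, ndarray); int16 PCM is scaled to [-1, 1]
    sample_rate, sound = read(path)
    sound = sound.astype('float32') / 32767  # normalize audio
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)  # multiple channels: average (assumed)
    return sound
```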
12 changes: 3 additions & 9 deletions model.py
@@ -143,7 +143,7 @@ def __repr__(self):

class DeepSpeech(nn.Module):
def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layers=5, audio_conf=None,
bidirectional=True, context=20, mixed_precision=False):
bidirectional=True, context=20):
super(DeepSpeech, self).__init__()

# model metadata needed for serialization/deserialization
@@ -156,7 +156,6 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layer
self.audio_conf = audio_conf or {}
self.labels = labels
self.bidirectional = bidirectional
self.mixed_precision = mixed_precision

sample_rate = self.audio_conf.get("sample_rate", 16000)
window_size = self.audio_conf.get("window_size", 0.02)
@@ -201,8 +200,6 @@ def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layer
self.inference_softmax = InferenceBatchSoftmax()

def forward(self, x, lengths):
if x.is_cuda and self.mixed_precision:
x = x.half()
lengths = lengths.cpu().int()
output_lengths = self.get_seq_lens(lengths)
x, _ = self.conv(x, output_lengths)
@@ -244,8 +241,7 @@ def load_model(cls, path):
labels=package['labels'],
audio_conf=package['audio_conf'],
rnn_type=supported_rnns[package['rnn_type']],
bidirectional=package.get('bidirectional', True),
mixed_precision=package.get('mixed_precision', False))
bidirectional=package.get('bidirectional', True))
model.load_state_dict(package['state_dict'])
for x in model.rnns:
x.flatten_parameters()
@@ -258,8 +254,7 @@ def load_model_package(cls, package):
labels=package['labels'],
audio_conf=package['audio_conf'],
rnn_type=supported_rnns[package['rnn_type']],
bidirectional=package.get('bidirectional', True),
mixed_precision=package.get('mixed_precision', False))
bidirectional=package.get('bidirectional', True))
model.load_state_dict(package['state_dict'])
return model

@@ -275,7 +270,6 @@ def serialize(model, optimizer=None, epoch=None, iteration=None, loss_results=No
'labels': model.labels,
'state_dict': model.state_dict(),
'bidirectional': model.bidirectional,
'mixed_precision': model.mixed_precision
}
if optimizer is not None:
package['optim_dict'] = optimizer.state_dict()
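
Because the constructor no longer reads a `mixed_precision` entry, checkpoints written by older versions still load cleanly — the stale key in the package dict is simply ignored. A sketch (path illustrative):

```
from model import DeepSpeech

# load_model rebuilds the model from the package dict; any leftover
# 'mixed_precision' entry from an old checkpoint is ignored
model = DeepSpeech.load_model('models/deepspeech_final.pth')
model.eval()
```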
6 changes: 4 additions & 2 deletions noise_inject.py
@@ -1,7 +1,7 @@
import argparse

import torch
import torchaudio
from scipy.io.wavfile import write

from data.data_loader import load_audio, NoiseInjection

@@ -18,5 +18,7 @@
data = load_audio(args.input_path)
mixed_data = noise_injector.inject_noise_sample(data, args.noise_path, args.noise_level)
mixed_data = torch.tensor(mixed_data, dtype=torch.float).unsqueeze(1) # Add channels dim
torchaudio.save(args.output_path, mixed_data, args.sample_rate)
write(filename=args.output_path,
data=mixed_data.numpy(),
rate=args.sample_rate)
print('Saved mixed file to %s' % args.output_path)
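
One consequence of swapping torchaudio for scipy here: `scipy.io.wavfile.write` preserves the array dtype, so the mixed float tensor is written out as a 32-bit float WAV rather than int16 PCM. A usage sketch (assuming the CLI flags mirror the `args` attributes above; paths and values are illustrative):

```
python noise_inject.py --input-path data/sample.wav --noise-path data/noise.wav \
    --noise-level 0.5 --output-path out_mixed.wav --sample-rate 16000
```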
4 changes: 3 additions & 1 deletion opts.py
@@ -17,7 +17,9 @@ def add_decoder_args(parser):


def add_inference_args(parser):
parser.add_argument('--cuda', action="store_true", help='Use cuda to test model')
parser.add_argument('--cuda', action="store_true", help='Use cuda')
parser.add_argument('--half', action="store_true",
help='Use half precision. This is recommended when using mixed-precision at training time')
parser.add_argument('--decoder', default="greedy", choices=["greedy", "beam"], type=str, help="Decoder to use")
parser.add_argument('--model-path', default='models/deepspeech_final.pth',
help='Path to model file created by training')
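
Any script built on this helper picks up the new `--half` flag automatically, since `add_argument` mutates the parser in place. A minimal sketch:

```
import argparse
from opts import add_inference_args

parser = argparse.ArgumentParser(description='DeepSpeech inference')
add_inference_args(parser)  # argparse parsers are mutated in place
args = parser.parse_args(['--cuda', '--half'])
print(args.half)  # True
```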
9 changes: 7 additions & 2 deletions server.py
@@ -32,7 +32,12 @@ def transcribe_file():
with NamedTemporaryFile(suffix=file_extension) as tmp_saved_audio_file:
file.save(tmp_saved_audio_file.name)
logging.info('Transcribing file...')
transcription, _ = transcribe(tmp_saved_audio_file.name, spect_parser, model, decoder, device)
transcription, _ = transcribe(audio_path=tmp_saved_audio_file,
spect_parser=spect_parser,
model=model,
decoder=decoder,
device=device,
use_half=args.half)
logging.info('File transcribed')
res['status'] = "OK"
res['transcription'] = transcription
@@ -53,7 +58,7 @@ def main():
logging.info('Setting up server...')
torch.set_grad_enabled(False)
device = torch.device("cuda" if args.cuda else "cpu")
model = load_model(device, args.model_path, args.cuda)
model = load_model(device, args.model_path, args.half)

if args.decoder == "beam":
from decoder import BeamCTCDecoder
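
With the server up, a request would look roughly like this (a sketch assuming the Flask route accepts a multipart `file` field, which the `file.save(...)` call above suggests; endpoint and port are illustrative):

```
curl -X POST http://localhost:8888/transcribe -F "file=@/path/to/audio.wav"
```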
19 changes: 10 additions & 9 deletions test.py
@@ -16,15 +16,14 @@
parser.add_argument('--batch-size', default=20, type=int, help='Batch size for training')
parser.add_argument('--num-workers', default=4, type=int, help='Number of workers used in dataloading')
parser.add_argument('--verbose', action="store_true", help="print out decoded output and error of each sample")
parser.add_argument('--output-path', default=None, type=str, help="Where to save raw acoustic output")
parser.add_argument('--save-output', default=None, help="Saves output of model from test to this file_path")
parser = add_decoder_args(parser)
parser.add_argument('--save-output', action="store_true", help="Saves output of model from test")
args = parser.parse_args()

if __name__ == '__main__':
torch.set_grad_enabled(False)
device = torch.device("cuda" if args.cuda else "cpu")
model = load_model(device, args.model_path, args.cuda)
model = load_model(device, args.model_path, args.half)

if args.decoder == "beam":
from decoder import BeamCTCDecoder
@@ -47,6 +46,8 @@
inputs, targets, input_percentages, target_sizes = data
input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
inputs = inputs.to(device)
if args.half:
inputs = inputs.half()
# unflatten targets
split_targets = []
offset = 0
@@ -56,12 +57,12 @@

out, output_sizes = model(inputs, input_sizes)

if args.save_output:
# add output to data array, and continue
output_data.append((out.cpu().numpy(), output_sizes.numpy()))

decoded_output, _ = decoder.decode(out, output_sizes)
target_strings = target_decoder.convert_to_strings(split_targets)

if args.save_output is not None:
# add output to data array, and continue
output_data.append((out.cpu().numpy(), output_sizes.numpy(), target_strings))
for x in range(len(target_strings)):
transcript, reference = decoded_output[x][0], target_strings[x][0]
wer_inst = decoder.wer(transcript, reference)
@@ -81,5 +82,5 @@
print('Test Summary \t'
'Average WER {wer:.3f}\t'
'Average CER {cer:.3f}\t'.format(wer=wer * 100, cer=cer * 100))
if args.save_output:
np.save(args.output_path, output_data)
if args.save_output is not None:
np.save(args.save_output, output_data)
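
Since `--save-output` is now the destination path itself, the saved array can be reloaded later for rescoring or error analysis. `np.save` pickles the list of `(output, output_sizes, target_strings)` tuples, so reading it back needs `allow_pickle` on recent NumPy versions — a sketch (path illustrative):

```
import numpy as np

# the saved object is a list of tuples, not a plain numeric array,
# hence allow_pickle
output_data = np.load('test_output.npy', allow_pickle=True)
for out, output_sizes, target_strings in output_data:
    print(out.shape, output_sizes.shape, len(target_strings))
```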
52 changes: 20 additions & 32 deletions train.py
@@ -7,7 +7,7 @@
import numpy as np
import torch.distributed as dist
import torch.utils.data.distributed
from apex.fp16_utils import FP16_Optimizer
from apex import amp
from apex.parallel import DistributedDataParallel
from tqdm import tqdm
from warpctc_pytorch import CTCLoss
@@ -16,7 +16,7 @@
from decoder import GreedyDecoder
from logger import VisdomLogger, TensorBoardLogger
from model import DeepSpeech, supported_rnns
from utils import convert_model_to_half, reduce_tensor, check_loss
from utils import reduce_tensor, check_loss

parser = argparse.ArgumentParser(description='DeepSpeech training')
parser.add_argument('--train-manifest', metavar='DIR',
@@ -77,15 +77,10 @@
parser.add_argument('--gpu-rank', default=None,
help='If using distributed parallel for multi-gpu, sets the GPU for the process')
parser.add_argument('--seed', default=123456, type=int, help='Seed to generators')
parser.add_argument('--mixed-precision', action='store_true',
help='Uses mixed precision to train a model (suggested with volta and above)')
parser.add_argument('--static-loss-scale', type=float, default=1,
help='Static loss scale for mixed precision, ' +
'positive power of 2 values can improve FP16 convergence,' +
'however dynamic loss scaling is preferred.')
parser.add_argument('--dynamic-loss-scale', action='store_true',
help='Use dynamic loss scaling for mixed precision. If supplied, this argument supersedes ' +
'--static_loss_scale. Suggested to turn on for mixed precision')
parser.add_argument('--opt-level', type=str)
parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
parser.add_argument('--loss-scale', type=str, default=None)

torch.manual_seed(123456)
torch.cuda.manual_seed_all(123456)

@@ -123,8 +118,6 @@ def update(self, val, n=1):
random.seed(args.seed)

device = torch.device("cuda" if args.cuda else "cpu")
if args.mixed_precision and not args.cuda:
raise ValueError('If using mixed precision training, CUDA must be enabled!')
args.distributed = args.world_size > 1
main_proc = True
device = torch.device("cuda" if args.cuda else "cpu")
@@ -150,7 +143,7 @@ def update(self, val, n=1):
print("Loading checkpoint model %s" % args.continue_from)
package = torch.load(args.continue_from, map_location=lambda storage, loc: storage)
model = DeepSpeech.load_model_package(package)
labels = model.labels
labels = model.labels
audio_conf = model.audio_conf
if not args.finetune: # Don't want to restart training
optim_state = package['optim_dict']
@@ -188,8 +181,7 @@ def update(self, val, n=1):
labels=labels,
rnn_type=supported_rnns[rnn_type],
audio_conf=audio_conf,
bidirectional=args.bidirectional,
mixed_precision=args.mixed_precision)
bidirectional=args.bidirectional)

decoder = GreedyDecoder(labels)
train_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.train_manifest, labels=labels,
@@ -211,19 +203,18 @@
train_sampler.shuffle(start_epoch)

model = model.to(device)
if args.mixed_precision:
model = convert_model_to_half(model)
parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=args.lr,
momentum=args.momentum, nesterov=True, weight_decay=1e-5)
if args.distributed:
model = DistributedDataParallel(model)
if args.mixed_precision:
optimizer = FP16_Optimizer(optimizer,
static_loss_scale=args.static_loss_scale,
dynamic_loss_scale=args.dynamic_loss_scale)
if optim_state is not None:
optimizer.load_state_dict(optim_state)

model, optimizer = amp.initialize(model, optimizer,
opt_level=args.opt_level,
keep_batchnorm_fp32=args.keep_batchnorm_fp32,
loss_scale=args.loss_scale)
if args.distributed:
model = DistributedDataParallel(model)
print(model)
print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

@@ -263,12 +254,10 @@ def update(self, val, n=1):
if valid_loss:
optimizer.zero_grad()
# compute gradient
if args.mixed_precision:
optimizer.backward(loss)
optimizer.clip_master_grads(args.max_norm)
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)

with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
optimizer.step()
else:
print(error)
@@ -364,8 +353,7 @@ def update(self, val, n=1):
wer_results=wer_results, cer_results=cer_results),
file_path)
# anneal lr
param_groups = optimizer.optimizer.param_groups if args.mixed_precision else optimizer.param_groups
for g in param_groups:
for g in optimizer.param_groups:
g['lr'] = g['lr'] / args.learning_anneal
print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))

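
Taken together, the train.py changes follow the standard Apex amp recipe: wrap model and optimizer with `amp.initialize`, then run backward through `amp.scale_loss`. A self-contained sketch of that pattern (dummy model and data; `O1`, the learning rate, and the max norm of 400 are illustrative values, not the project's required settings):

```
import torch
import torch.nn as nn
from apex import amp  # assumes NVIDIA Apex is installed

model = nn.Linear(128, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3,
                            momentum=0.9, nesterov=True, weight_decay=1e-5)
criterion = nn.MSELoss()

# O0 = pure FP32, O1 = conservative mixed precision,
# O2 = "almost FP16", O3 = pure FP16
model, optimizer = amp.initialize(model, optimizer, opt_level='O1',
                                  loss_scale=1.0)

for _ in range(10):
    inputs = torch.randn(32, 128).cuda()
    targets = torch.randn(32, 10).cuda()
    loss = criterion(model(inputs), targets)
    optimizer.zero_grad()
    # scale_loss multiplies the loss so FP16 gradients don't underflow;
    # gradients are unscaled again before the optimizer step
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=400)
    optimizer.step()
```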