Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into bucketing
Browse files Browse the repository at this point in the history
# Conflicts:
#	train.py
  • Loading branch information
SeanNaren committed Jun 14, 2017
2 parents e51bad0 + 5ccac99 commit b0370d0
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 35 deletions.
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,20 @@ python model.py --model_path models/deepspeech.pth.tar

Also note that the model has no final softmax layer: during training, warp-ctc applies this softmax internally. More complex decoders built on top of the model will have to apply the softmax themselves, so take this into consideration!

## Testing/Inference

To evaluate a trained model on a test set (which must be in the same format as the training set):

```
python test.py --model_path models/deepspeech.pth.tar --test_manifest /path/to/test_manifest.csv --cuda
```

An example script to output a prediction has been provided:

```
python predict.py --model_path models/deepspeech.pth.tar --audio_path /path/to/audio.wav
```

## Acknowledgements

Thanks to [Egor](https://github.com/EgorLakomkin) and [Ryan](https://github.com/ryanleary) for their contributions!
8 changes: 5 additions & 3 deletions benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
labels=labels,
rnn_type=supported_rnns[rnn_type])

print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

parameters = model.parameters()
optimizer = torch.optim.SGD(parameters, lr=3e-4,
momentum=0.9, nesterov=True)
Expand All @@ -56,14 +58,14 @@ def iteration(input_data):
input_percentages = torch.IntTensor(batch_size).fill_(1)

inputs = Variable(input_data, requires_grad=False)
target_sizes = Variable(target_size requires_grad=False)
targets = Variable(target requires_grad=False)
target_sizes = Variable(target_size, requires_grad=False)
targets = Variable(target, requires_grad=False)
start = time.time()
out = model(inputs)
out = out.transpose(0, 1) # TxNxH

seq_length = out.size(0)
sizes = Variable(input_percentages.mul_(int(seq_length)).int() requires_grad=False)
sizes = Variable(input_percentages.mul_(int(seq_length)).int(), requires_grad=False)
loss = criterion(out, targets, sizes, target_sizes)
loss = loss / inputs.size(0) # average the loss by minibatch
# compute gradient
Expand Down
2 changes: 1 addition & 1 deletion logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from io import BytesIO # Python 3.x


class Logger(object):
class TensorBoardLogger(object):

def __init__(self, log_dir):
"""Create a summary writer logging to log_dir."""
Expand Down
34 changes: 24 additions & 10 deletions model.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
'rnn': nn.RNN,
'gru': nn.GRU
}
supported_rnns_inv = dict((v,k) for k,v in supported_rnns.items())
supported_rnns_inv = dict((v, k) for k, v in supported_rnns.items())


class SequenceWise(nn.Module):
Expand Down Expand Up @@ -41,15 +41,14 @@ def __init__(self, input_size, hidden_size, rnn_type=nn.LSTM, bidirectional=Fals
super(BatchRNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.batch_norm_activate = batch_norm
self.bidirectional = bidirectional
self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size))
self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
self.rnn = rnn_type(input_size=input_size, hidden_size=hidden_size,
bidirectional=bidirectional, bias=False)
self.num_directions = 2 if bidirectional else 1

def forward(self, x):
if self.batch_norm_activate:
if self.batch_norm is not None:
x = self.batch_norm(x)
x, _ = self.rnn(x)
if self.bidirectional:
Expand All @@ -58,10 +57,13 @@ def forward(self, x):


class DeepSpeech(nn.Module):
def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layers=5, audio_conf={}, bidirectional=True):
def __init__(self, rnn_type=nn.LSTM, labels="abc", rnn_hidden_size=768, nb_layers=5, audio_conf=None,
bidirectional=True):
super(DeepSpeech, self).__init__()

# model metadata needed for serialization/deserialization
if audio_conf is None:
audio_conf = {}
self._version = '0.0.1'
self._hidden_size = rnn_hidden_size
self._hidden_layers = nb_layers
Expand Down Expand Up @@ -121,7 +123,8 @@ def forward(self, x):
def load_model(cls, path, cuda=False):
package = torch.load(path, map_location=lambda storage, loc: storage)
model = cls(rnn_hidden_size=package['hidden_size'], nb_layers=package['hidden_layers'],
labels=package['labels'], audio_conf=package['audio_conf'], rnn_type=supported_rnns[package['rnn_type']])
labels=package['labels'], audio_conf=package['audio_conf'],
rnn_type=supported_rnns[package['rnn_type']])
model.load_state_dict(package['state_dict'])
if cuda:
model = torch.nn.DataParallel(model).cuda()
Expand Down Expand Up @@ -162,15 +165,26 @@ def get_labels(model):
model_is_cuda = next(model.parameters()).is_cuda
return model.module._labels if model_is_cuda else model._labels

@staticmethod
def get_param_size(model):
    """Return the total number of scalar parameters held by ``model``.

    Args:
        model: Object exposing ``parameters()`` that yields tensors.

    Returns:
        int: Sum of the element counts of every parameter tensor; ``0``
        when the model has no parameters.
    """
    # numel() is the product of a tensor's dimensions (1 for a 0-dim tensor),
    # which is exactly what the manual multiplication over p.size() computed.
    return sum(p.numel() for p in model.parameters())

@staticmethod
def get_audio_conf(model):
    """Return the audio configuration stored on ``model``.

    Accepts either the bare model or a wrapper that exposes the real model
    through a ``module`` attribute (as ``torch.nn.DataParallel`` does).

    Args:
        model: The model, or a wrapper holding it in ``.module``.

    Returns:
        The ``_audio_conf`` attribute of the underlying model.
    """
    # Unwrap by looking for `.module` instead of inferring the wrapper from
    # the device of the first parameter: a CUDA model need not be wrapped, a
    # wrapped model need not be on CUDA, and a model may have no parameters.
    core = getattr(model, 'module', model)
    return core._audio_conf


if __name__ == '__main__':
import os.path
import argparse
import json

parser = argparse.ArgumentParser(description='DeepSpeech model information')
parser.add_argument('--model_path', default='models/deepspeech_final.pth.tar',
help='Path to model file created by training')
Expand Down Expand Up @@ -199,9 +213,9 @@ def get_audio_conf(model):
print("Training Information")
epochs = package['epoch']
print(" Epochs: ", epochs)
print(" Current Loss: {0:.3f}".format(package['loss_results'][epochs-1]))
print(" Current CER: {0:.3f}".format(package['cer_results'][epochs-1]))
print(" Current WER: {0:.3f}".format(package['wer_results'][epochs-1]))
print(" Current Loss: {0:.3f}".format(package['loss_results'][epochs - 1]))
print(" Current CER: {0:.3f}".format(package['cer_results'][epochs - 1]))
print(" Current WER: {0:.3f}".format(package['wer_results'][epochs - 1]))

if package.get('meta', None) is not None:
print("")
Expand Down
8 changes: 4 additions & 4 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
parser.add_argument('--model_path', default='models/deepspeech_final.pth.tar',
help='Path to model file created by training')
parser.add_argument('--cuda', action="store_true", help='Use cuda to test model')
parser.add_argument('--val_manifest', metavar='DIR',
help='path to validation manifest csv', default='data/val_manifest.csv')
parser.add_argument('--test_manifest', metavar='DIR',
help='path to validation manifest csv', default='data/test_manifest.csv')
parser.add_argument('--batch_size', default=20, type=int, help='Batch size for training')
parser.add_argument('--num_workers', default=4, type=int, help='Number of workers used in dataloading')
args = parser.parse_args()
Expand All @@ -26,7 +26,7 @@
audio_conf = DeepSpeech.get_audio_conf(model)
decoder = ArgMaxDecoder(labels)

test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.val_manifest, labels=labels,
test_dataset = SpectrogramDataset(audio_conf=audio_conf, manifest_filepath=args.test_manifest, labels=labels,
normalize=True)
test_loader = AudioDataLoader(test_dataset, batch_size=args.batch_size,
num_workers=args.num_workers)
Expand Down Expand Up @@ -63,6 +63,6 @@
wer = total_wer / len(test_loader.dataset)
cer = total_cer / len(test_loader.dataset)

print('Validation Summary \t'
print('Test Summary \t'
'Average WER {wer:.3f}\t'
'Average CER {cer:.3f}\t'.format(wer=wer * 100, cer=cer * 100))
28 changes: 11 additions & 17 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@
help='Turn off bucketing and sample from dataset based on sequence length (smallest to largest)')
parser.set_defaults(cuda=False, silent=False, checkpoint=False, visdom=False, augment=False, tensorboard=False,
log_params=False, no_bucketing=False)


def to_np(x):
    """Return the data held by ``x`` as a NumPy array."""
    # Take the underlying data, move it to the CPU, then convert.
    host_tensor = x.data.cpu()
    return host_tensor.numpy()

Expand Down Expand Up @@ -86,7 +84,8 @@ def main():
args = parser.parse_args()
save_folder = args.save_folder

loss_results, cer_results, wer_results = None, None, None
loss_results, cer_results, wer_results = torch.Tensor(args.epochs), torch.Tensor(args.epochs), torch.Tensor(
args.epochs)
if args.visdom:
from visdom import Visdom
viz = Visdom()
Expand All @@ -96,11 +95,9 @@ def main():
dict(title='CER', ylabel='CER', xlabel='Epoch')]

viz_windows = [None, None, None]
loss_results, cer_results, wer_results = torch.Tensor(args.epochs), torch.Tensor(args.epochs), torch.Tensor(
args.epochs)
epochs = torch.arange(1, args.epochs + 1)
if args.tensorboard:
from logger import Logger
from logger import TensorBoardLogger
try:
os.makedirs(args.log_dir)
except OSError as e:
Expand All @@ -115,9 +112,7 @@ def main():
raise
else:
raise
loss_results, cer_results, wer_results = torch.Tensor(args.epochs), torch.Tensor(args.epochs), torch.Tensor(
args.epochs)
logger = Logger(args.log_dir)
logger = TensorBoardLogger(args.log_dir)

try:
os.makedirs(save_folder)
Expand Down Expand Up @@ -165,7 +160,7 @@ def main():
package = torch.load(args.continue_from)
model.load_state_dict(package['state_dict'])
optimizer.load_state_dict(package['optim_dict'])
start_epoch = int(package.get('epoch', None) or 1) - 1 # Python index start at 0 for training
start_epoch = int(package.get('epoch', 1)) - 1 # Python index start at 0 for training
start_iter = package.get('iteration', None)
if start_iter is None:
start_epoch += 1 # Assume that we saved a model after an epoch finished, so start at the next epoch.
Expand All @@ -177,8 +172,7 @@ def main():
package['loss_results'] is not None and start_epoch > 0: # Add previous scores to visdom graph
epoch = start_epoch
loss_results[0:epoch], cer_results[0:epoch], wer_results[0:epoch] = package['loss_results'], package[
'cer_results'], package[
'wer_results']
'cer_results'], package['wer_results']
x_axis = epochs[0:epoch]
y_axis = [loss_results[0:epoch], wer_results[0:epoch], cer_results[0:epoch]]
for x in range(len(viz_windows)):
Expand All @@ -187,9 +181,8 @@ def main():
Y=y_axis[x],
opts=opts[x],
)
if args.tensorboard and package[
'loss_results'] is not None and start_epoch > 0: # Add previous scores to tensorboard logs
epoch = start_epoch
if args.tensorboard and \
package['loss_results'] is not None and start_epoch > 0: # Previous scores to tensorboard logs
loss_results, cer_results, wer_results = package['loss_results'], package['cer_results'], package[
'wer_results']
for i in range(len(loss_results)):
Expand All @@ -208,6 +201,8 @@ def main():
model = torch.nn.DataParallel(model).cuda()

print(model)
print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

batch_time = AverageMeter()
data_time = AverageMeter()
losses = AverageMeter()
Expand Down Expand Up @@ -365,8 +360,7 @@ def main():
for tag, value in model.named_parameters():
tag = tag.replace('.', '/')
logger.histo_summary(tag, to_np(value), epoch + 1)
if value.grad is not None: # Condition inserted because batch_norm RNN_0 weights.grad and bias.grad are None. Check why
logger.histo_summary(tag + '/grad', to_np(value.grad), epoch + 1)
logger.histo_summary(tag + '/grad', to_np(value.grad), epoch + 1)
if args.checkpoint:
file_path = '%s/deepspeech_%d.pth.tar' % (save_folder, epoch + 1)
torch.save(DeepSpeech.serialize(model, optimizer=optimizer, epoch=epoch, loss_results=loss_results,
Expand Down

0 comments on commit b0370d0

Please sign in to comment.