Simplify decoder structure

experimenti · Aug 24, 2017 · 7025ec6 · 7025ec6
1 parent d5e86f8
commit 7025ec6
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 40 deletions.
diff --git a/decoder.py b/decoder.py
@@ -17,15 +17,8 @@
 
 import Levenshtein as Lev
 import torch
-from enum import Enum
 from six.moves import xrange
 
-try:
- from pytorch_ctc import CTCBeamDecoder as CTCBD
- from pytorch_ctc import Scorer, KenLMScorer
-except ImportError:
- print("warn: pytorch_ctc unavailable. Only greedy decoding is supported.")
-
 
 class Decoder(object):
  """
@@ -134,17 +127,25 @@ def decode(self, probs, sizes=None):
 
 
 class BeamCTCDecoder(Decoder):
- def __init__(self, labels, scorer, beam_width=20, top_paths=1, blank_index=0, space_index=28):
+ def __init__(self, labels, beam_width=20, top_paths=1, blank_index=0, space_index=28, lm_path=None, trie_path=None,
+ lm_alpha=None, lm_beta1=None, lm_beta2=None):
  super(BeamCTCDecoder, self).__init__(labels, blank_index=blank_index, space_index=space_index)
  self._beam_width = beam_width
  self._top_n = top_paths
+
  try:
- import pytorch_ctc
+ from pytorch_ctc import CTCBeamDecoder, Scorer, KenLMScorer
  except ImportError:
  raise ImportError("BeamCTCDecoder requires pytorch_ctc package.")
-
- self._decoder = CTCBD(scorer, labels, top_paths=top_paths, beam_width=beam_width,
- blank_index=blank_index, space_index=space_index, merge_repeated=False)
+ if lm_path is not None:
+ scorer = KenLMScorer(labels, lm_path, trie_path)
+ scorer.set_lm_weight(lm_alpha)
+ scorer.set_word_weight(lm_beta1)
+ scorer.set_valid_word_weight(lm_beta2)
+ else:
+ scorer = Scorer()
+ self._decoder = CTCBeamDecoder(scorer, labels, top_paths=top_paths, beam_width=beam_width,
+ blank_index=blank_index, space_index=space_index, merge_repeated=False)
 
  def decode(self, probs, sizes=None):
  sizes = sizes.cpu() if sizes is not None else None

diff --git a/predict.py b/predict.py
@@ -2,11 +2,11 @@
 import sys
 import time
 
-import torch
+from decoder import GreedyDecoder, BeamCTCDecoder
+
 from torch.autograd import Variable
 
 from data.data_loader import SpectrogramParser
-from decoder import GreedyDecoder, BeamCTCDecoder, Scorer, KenLMScorer
 from model import DeepSpeech
 
 parser = argparse.ArgumentParser(description='DeepSpeech prediction')
@@ -18,8 +18,10 @@
 parser.add_argument('--decoder', default="greedy", choices=["greedy", "beam"], type=str, help="Decoder to use")
 beam_args = parser.add_argument_group("Beam Decode Options", "Configurations options for the CTC Beam Search decoder")
 beam_args.add_argument('--beam_width', default=10, type=int, help='Beam width to use')
-beam_args.add_argument('--lm_path', default=None, type=str, help='Path to an (optional) kenlm language model for use with beam search (req\'d with trie)')
-beam_args.add_argument('--trie_path', default=None, type=str, help='Path to an (optional) trie dictionary for use with beam search (req\'d with LM)')
+beam_args.add_argument('--lm_path', default=None, type=str,
+ help='Path to an (optional) kenlm language model for use with beam search (req\'d with trie)')
+beam_args.add_argument('--trie_path', default=None, type=str,
+ help='Path to an (optional) trie dictionary for use with beam search (req\'d with LM)')
 beam_args.add_argument('--lm_alpha', default=0.8, type=float, help='Language model weight')
 beam_args.add_argument('--lm_beta1', default=1, type=float, help='Language model word bonus (all words)')
 beam_args.add_argument('--lm_beta2', default=1, type=float, help='Language model word bonus (IV words)')
@@ -33,15 +35,10 @@
  audio_conf = DeepSpeech.get_audio_conf(model)
 
  if args.decoder == "beam":
- scorer = None
- if args.lm_path is not None:
- scorer = KenLMScorer(labels, args.lm_path, args.trie_path)
- scorer.set_lm_weight(args.lm_alpha)
- scorer.set_word_weight(args.lm_beta1)
- scorer.set_valid_word_weight(args.lm_beta2)
- else:
- scorer = Scorer()
- decoder = BeamCTCDecoder(labels, scorer, beam_width=args.beam_width, top_paths=1, space_index=labels.index(' '), blank_index=labels.index('_'))
+ decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, top_paths=1, space_index=labels.index(' '),
+ blank_index=labels.index('_'), lm_path=args.lm_path,
+ trie_path=args.trie_path, lm_alpha=args.lm_alpha, lm_beta1=args.lm_beta1,
+ lm_beta2=args.lm_beta2)
  else:
  decoder = GreedyDecoder(labels, space_index=labels.index(' '), blank_index=labels.index('_'))
 
@@ -56,4 +53,5 @@
  t1 = time.time()
 
  print(decoded_output[0])
- print("Decoded {0:.2f} seconds of audio in {1:.2f} seconds".format(spect.size(3)*audio_conf['window_stride'], t1-t0), file=sys.stderr)
+ print("Decoded {0:.2f} seconds of audio in {1:.2f} seconds".format(spect.size(3) * audio_conf['window_stride'],
+ t1 - t0), file=sys.stderr)
diff --git a/test.py b/test.py
@@ -1,11 +1,9 @@
 import argparse
-import json
 
-import torch
 from torch.autograd import Variable
+from decoder import GreedyDecoder, BeamCTCDecoder
 
 from data.data_loader import SpectrogramDataset, AudioDataLoader
-from decoder import GreedyDecoder, BeamCTCDecoder, Scorer, KenLMScorer
 from model import DeepSpeech
 
 parser = argparse.ArgumentParser(description='DeepSpeech prediction')
@@ -19,8 +17,10 @@
 parser.add_argument('--decoder', default="greedy", choices=["greedy", "beam"], type=str, help="Decoder to use")
 beam_args = parser.add_argument_group("Beam Decode Options", "Configurations options for the CTC Beam Search decoder")
 beam_args.add_argument('--beam_width', default=10, type=int, help='Beam width to use')
-beam_args.add_argument('--lm_path', default=None, type=str, help='Path to an (optional) kenlm language model for use with beam search (req\'d with trie)')
-beam_args.add_argument('--trie_path', default=None, type=str, help='Path to an (optional) trie dictionary for use with beam search (req\'d with LM)')
+beam_args.add_argument('--lm_path', default=None, type=str,
+ help='Path to an (optional) kenlm language model for use with beam search (req\'d with trie)')
+beam_args.add_argument('--trie_path', default=None, type=str,
+ help='Path to an (optional) trie dictionary for use with beam search (req\'d with LM)')
 beam_args.add_argument('--lm_alpha', default=0.8, type=float, help='Language model weight')
 beam_args.add_argument('--lm_beta1', default=1, type=float, help='Language model word bonus (all words)')
 beam_args.add_argument('--lm_beta2', default=1, type=float, help='Language model word bonus (IV words)')
@@ -34,15 +34,10 @@
  audio_conf = DeepSpeech.get_audio_conf(model)
 
  if args.decoder == "beam":
- scorer = None
- if args.lm_path is not None:
- scorer = KenLMScorer(labels, args.lm_path, args.trie_path)
- scorer.set_lm_weight(args.lm_alpha)
- scorer.set_word_weight(args.lm_beta1)
- scorer.set_valid_word_weight(args.lm_beta2)
- else:
- scorer = Scorer()
- decoder = BeamCTCDecoder(labels, scorer, beam_width=args.beam_width, top_paths=1, space_index=labels.index(' '), blank_index=labels.index('_'))
+ decoder = BeamCTCDecoder(labels, beam_width=args.beam_width, top_paths=1, space_index=labels.index(' '),
+ blank_index=labels.index('_'), lm_path=args.lm_path,
+ trie_path=args.trie_path, lm_alpha=args.lm_alpha, lm_beta1=args.lm_beta1,
+ lm_beta2=args.lm_beta2)
  else:
  decoder = GreedyDecoder(labels, space_index=labels.index(' '), blank_index=labels.index('_'))