Skip to content

Commit

Permalink
[egs] madcat arabic: clean scripts, tuning, rescoring, text localization (#2716)
Browse files Browse the repository at this point in the history
  • Loading branch information
aarora8 authored and danpovey committed Oct 9, 2018
1 parent 735e2a5 commit 43ec82e
Show file tree
Hide file tree
Showing 30 changed files with 1,108 additions and 444 deletions.
62 changes: 47 additions & 15 deletions egs/cifar/v1/image/ocr/make_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,13 @@
'and right side of the image.')
parser.add_argument('--num-channels', type=int, default=1,
help='Number of color channels')
parser.add_argument('--vertical-shift', type=int, default=0,
help='total number of padding pixel per column')
parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False,
help="Flip the image left-right for right to left languages")
parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False,
help="performs image augmentation")
parser.add_argument('--augment_type', type=str, default='no_aug',
choices=['no_aug', 'random_scale','random_shift'],
help='Subset of data to process.')
args = parser.parse_args()


Expand All @@ -68,7 +71,6 @@ def write_kaldi_matrix(file_handle, matrix, key):
file_handle.write("\n")
file_handle.write(" ]\n")


def horizontal_pad(im, allowed_lengths = None):
if allowed_lengths is None:
left_padding = right_padding = args.padding
Expand Down Expand Up @@ -112,6 +114,33 @@ def get_scaled_image_aug(im, mode='normal'):
return im_scaled_up
return im

def vertical_shift(im, mode='normal', shift=None):
    """Pad a grayscale image vertically with near-white noise rows.

    A total of `shift` rows of padding (value ~255 minus small Gaussian
    noise) is split between the top and bottom of the image.

    Args:
        im: 2-D numpy array (height x width) holding the image.
        mode: how to split the padding:
            'normal'  - as evenly as possible between top and bottom;
            'top'     - randomly biased so more rows go on top;
            'bottom'  - randomly biased so more rows go on the bottom;
            'notmid'  - pick 'top' or 'bottom' at random (for augmentation).
        shift: total number of padding rows; defaults to
            args.vertical_shift for backward compatibility with existing
            callers.

    Returns:
        The padded image (or `im` unchanged when the shift is 0).

    Raises:
        ValueError: if `mode` is not one of the values listed above.
    """
    if shift is None:
        shift = args.vertical_shift
    if shift == 0:
        return im
    total = shift
    if mode == 'notmid':
        # Randomly choose which side gets the larger share of padding.
        mode = 'top' if random.randint(0, 1) == 0 else 'bottom'
    # NOTE: random.randint requires integer endpoints; the original code
    # used `total / 2`, which is a float in Python 3 and raises an error
    # (always for odd totals). Floor division fixes this.
    if mode == 'normal':
        top = total // 2
        bottom = total - top
    elif mode == 'top':      # more padding on top
        top = random.randint(total // 2, total)
        bottom = total - top
    elif mode == 'bottom':   # more padding on bottom
        top = random.randint(0, total // 2)
        bottom = total - top
    else:
        raise ValueError("Unknown vertical_shift mode: {}".format(mode))

    width = im.shape[1]

    def _noise_rows(num_rows):
        # Rows of near-white pixels: 255 minus small Gaussian noise, so the
        # padding is not perfectly uniform (mild augmentation effect).
        return (255 * np.ones((num_rows, width), dtype=int) -
                np.random.normal(2, 1, (num_rows, width)).astype(int))

    im_pad = np.concatenate((_noise_rows(top), im), axis=0)
    im_pad = np.concatenate((im_pad, _noise_rows(bottom)), axis=0)
    return im_pad

### main ###
random.seed(1)
Expand All @@ -134,7 +163,6 @@ def get_scaled_image_aug(im, mode='normal'):

num_fail = 0
num_ok = 0
aug_setting = ['normal', 'scaled']
with open(data_list_path) as f:
for line in f:
line = line.strip()
Expand All @@ -144,21 +172,25 @@ def get_scaled_image_aug(im, mode='normal'):
im = misc.imread(image_path)
if args.fliplr:
im = np.fliplr(im)
if args.augment:
im_aug = get_scaled_image_aug(im, aug_setting[1])
else:
im_aug = get_scaled_image_aug(im, aug_setting[0])
im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths)
if im_horizontal_padded is None:
if args.augment_type == 'no_aug' or 'random_shift':
im = get_scaled_image_aug(im, 'normal')
elif args.augment_type == 'random_scale':
im = get_scaled_image_aug(im, 'scaled')
im = horizontal_pad(im, allowed_lengths)
if im is None:
num_fail += 1
continue
if args.augment_type == 'no_aug' or 'random_scale':
im = vertical_shift(im, 'normal')
elif args.augment_type == 'random_shift':
im = vertical_shift(im, 'notmid')
if args.num_channels == 1:
data = np.transpose(im_horizontal_padded, (1, 0))
data = np.transpose(im, (1, 0))
elif args.num_channels == 3:
H = im_horizontal_padded.shape[0]
W = im_horizontal_padded.shape[1]
C = im_horizontal_padded.shape[2]
data = np.reshape(np.transpose(im_horizontal_padded, (1, 0, 2)), (W, H * C))
H = im.shape[0]
W = im.shape[1]
C = im.shape[2]
data = np.reshape(np.transpose(im, (1, 0, 2)), (W, H * C))
data = np.divide(data, 255.0)
num_ok += 1
write_kaldi_matrix(out_fh, data, image_id)
Expand Down
14 changes: 14 additions & 0 deletions egs/madcat_ar/v1/local/chain/compare_wer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,27 @@ for x in $*; do
done
echo

echo -n "# WER (rescored) "
for x in $*; do
wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}')
printf "% 10s" $wer
done
echo

echo -n "# CER "
for x in $*; do
cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}')
printf "% 10s" $cer
done
echo

echo -n "# CER (rescored) "
for x in $*; do
cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}')
printf "% 10s" $cer
done
echo

if $used_epochs; then
exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi
Expand Down
30 changes: 13 additions & 17 deletions egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,16 @@ reporting_email=
# chain options
train_stage=-10
xent_regularize=0.1
frame_subsampling_factor=4
# training chunk-options
chunk_width=340,300,200,100
num_leaves=500
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
tdnn_dim=450
# training options
srand=0
remove_egs=false
lang_test=lang_test
lang_decode=data/lang
lang_rescore=data/lang_rescore_6g
# End configuration section.
echo "$0 $@" # Print the command line for logging

Expand Down Expand Up @@ -168,13 +166,13 @@ if [ $stage -le 5 ]; then
--chain.leaky-hmm-coefficient=0.1 \
--chain.l2-regularize=0.00005 \
--chain.apply-deriv-weights=false \
--chain.lm-opts="--num-extra-lm-states=500" \
--chain.frame-subsampling-factor=$frame_subsampling_factor \
--chain.alignment-subsampling-factor=$frame_subsampling_factor \
--chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \
--chain.frame-subsampling-factor=4 \
--chain.alignment-subsampling-factor=4 \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs=4 \
--trainer.frames-per-iter=1000000 \
--trainer.frames-per-iter=2000000 \
--trainer.optimization.num-jobs-initial=3 \
--trainer.optimization.num-jobs-final=16 \
--trainer.optimization.initial-effective-lrate=0.001 \
Expand All @@ -183,10 +181,6 @@ if [ $stage -le 5 ]; then
--trainer.num-chunk-per-minibatch=64,32 \
--trainer.optimization.momentum=0.0 \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.chunk-left-context-initial=0 \
--egs.chunk-right-context-final=0 \
--egs.dir="$common_egs_dir" \
--egs.opts="--frames-overlap-per-eg 0" \
--cleanup.remove-egs=$remove_egs \
Expand All @@ -207,18 +201,20 @@ if [ $stage -le 6 ]; then
# as long as phones.txt was compatible.

utils/mkgraph.sh \
--self-loop-scale 1.0 data/$lang_test \
--self-loop-scale 1.0 $lang_decode \
$dir $dir/graph || exit 1;
fi

if [ $stage -le 7 ]; then
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context $chunk_left_context \
--extra-right-context $chunk_right_context \
--extra-left-context-initial 0 \
--extra-right-context-final 0 \
--frames-per-chunk $frames_per_chunk \
--nj $nj --cmd "$cmd" \
$dir/graph data/test $dir/decode_test || exit 1;

steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
data/test $dir/decode_test{,_rescored} || exit 1
fi

echo "Done. Date: $(date). Results:"
local/chain/compare_wer.sh $dir
29 changes: 12 additions & 17 deletions egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,15 @@ lats_affix=
# chain options
train_stage=-10
xent_regularize=0.1
frame_subsampling_factor=4
# training chunk-options
chunk_width=340,300,200,100
num_leaves=500
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
tdnn_dim=450
# training options
srand=0
remove_egs=false
lang_test=lang_test
lang_decode=data/lang
lang_rescore=data/lang_rescore_6g
# End configuration section.
echo "$0 $@" # Print the command line for logging

Expand Down Expand Up @@ -170,13 +167,13 @@ if [ $stage -le 5 ]; then
--chain.leaky-hmm-coefficient=0.1 \
--chain.l2-regularize=0.00005 \
--chain.apply-deriv-weights=false \
--chain.lm-opts="--num-extra-lm-states=500" \
--chain.frame-subsampling-factor=$frame_subsampling_factor \
--chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \
--chain.frame-subsampling-factor=4 \
--chain.alignment-subsampling-factor=1 \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs=4 \
--trainer.frames-per-iter=1000000 \
--trainer.frames-per-iter=2000000 \
--trainer.optimization.num-jobs-initial=3 \
--trainer.optimization.num-jobs-final=16 \
--trainer.optimization.initial-effective-lrate=0.001 \
Expand All @@ -185,10 +182,6 @@ if [ $stage -le 5 ]; then
--trainer.num-chunk-per-minibatch=64,32 \
--trainer.optimization.momentum=0.0 \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.chunk-left-context-initial=0 \
--egs.chunk-right-context-final=0 \
--egs.dir="$common_egs_dir" \
--egs.opts="--frames-overlap-per-eg 0" \
--cleanup.remove-egs=$remove_egs \
Expand All @@ -209,18 +202,20 @@ if [ $stage -le 6 ]; then
# as long as phones.txt was compatible.

utils/mkgraph.sh \
--self-loop-scale 1.0 data/$lang_test \
--self-loop-scale 1.0 $lang_decode \
$dir $dir/graph || exit 1;
fi

if [ $stage -le 7 ]; then
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context $chunk_left_context \
--extra-right-context $chunk_right_context \
--extra-left-context-initial 0 \
--extra-right-context-final 0 \
--frames-per-chunk $frames_per_chunk \
--nj $nj --cmd "$cmd" \
$dir/graph data/test $dir/decode_test || exit 1;

steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
data/test $dir/decode_test{,_rescored} || exit 1
fi

echo "Done. Date: $(date). Results:"
local/chain/compare_wer.sh $dir
31 changes: 13 additions & 18 deletions egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,14 @@ reporting_email=
train_stage=-10
xent_regularize=0.1
frame_subsampling_factor=4
# training chunk-options
chunk_width=340,300,200,100
num_leaves=500
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
tdnn_dim=450
# training options
srand=0
remove_egs=true
lang_test=lang_test
lang_decode=data/lang
lang_rescore=data/lang_rescore_6g
# End configuration section.
echo "$0 $@" # Print the command line for logging

Expand Down Expand Up @@ -171,28 +168,24 @@ if [ $stage -le 5 ]; then
--chain.leaky-hmm-coefficient=0.1 \
--chain.l2-regularize=0.00005 \
--chain.apply-deriv-weights=false \
--chain.lm-opts="--num-extra-lm-states=500" \
--chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \
--chain.frame-subsampling-factor=$frame_subsampling_factor \
--chain.alignment-subsampling-factor=1 \
--chain.left-tolerance 3 \
--chain.right-tolerance 3 \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs=2 \
--trainer.frames-per-iter=1000000 \
--trainer.num-epochs=4 \
--trainer.frames-per-iter=2000000 \
--trainer.optimization.num-jobs-initial=3 \
--trainer.optimization.num-jobs-final=16 \
--trainer.optimization.initial-effective-lrate=0.001 \
--trainer.optimization.final-effective-lrate=0.0001 \
--trainer.optimization.shrink-value=1.0 \
--trainer.num-chunk-per-minibatch=96,64 \
--trainer.num-chunk-per-minibatch=64,32 \
--trainer.optimization.momentum=0.0 \
--trainer.add-option="--optimization.memory-compression-level=2" \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.chunk-left-context-initial=0 \
--egs.chunk-right-context-final=0 \
--egs.dir="$common_egs_dir" \
--egs.opts="--frames-overlap-per-eg 0" \
--cleanup.remove-egs=$remove_egs \
Expand All @@ -213,18 +206,20 @@ if [ $stage -le 6 ]; then
# as long as phones.txt was compatible.

utils/mkgraph.sh \
--self-loop-scale 1.0 data/$lang_test \
--self-loop-scale 1.0 $lang_decode \
$dir $dir/graph || exit 1;
fi

if [ $stage -le 7 ]; then
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context $chunk_left_context \
--extra-right-context $chunk_right_context \
--extra-left-context-initial 0 \
--extra-right-context-final 0 \
--frames-per-chunk $frames_per_chunk \
--nj $nj --cmd "$cmd" \
$dir/graph data/test $dir/decode_test || exit 1;

steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
data/test $dir/decode_test{,_rescored} || exit 1
fi

echo "Done. Date: $(date). Results:"
local/chain/compare_wer.sh $dir
Loading

0 comments on commit 43ec82e

Please sign in to comment.