kaldi-asr · danpovey · Jan 21, 2019 · Dec 31, 2018 · Dec 31, 2018 · Dec 31, 2018
diff --git a/egs/iam/v1/RESULTS b/egs/iam/v1/RESULTS
@@ -0,0 +1,42 @@
+Run_end2end.sh (WER using lang_test, lang_unk)
+flat_start:
+ • %WER 14.41 [ 2671 / 18542, 262 ins, 561 del, 1848 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_11_1.0
+ • %WER 15.21 [ 2821 / 18542, 375 ins, 500 del, 1946 sub ] exp/chain/e2e_cnn_1a/decode_test/wer_11_1.0
+
+cnn_e2eali_1a: 
+ • %WER 11.94 [ 2214 / 18542, 267 ins, 380 del, 1567 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_1.0
+ • %WER 13.30 [ 2467 / 18542, 441 ins, 330 del, 1696 sub ] exp/chain/cnn_e2eali_1a/decode_test/wer_9_0.5
+
+cnn_e2eali_1b: 
+ • %WER 11.20 [ 2076 / 18542, 260 ins, 335 del, 1481 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_9_1.0
+ • %WER 12.46 [ 2311 / 18542, 371 ins, 326 del, 1614 sub ] exp/chain/cnn_e2eali_1b/decode_test/wer_9_1.0
+
+cnn_e2eali_1c: 
+ • %WER 9.90 [ 1836 / 18542, 257 ins, 227 del, 1352 sub ] exp/chain/cnn_e2eali_1c/decode_test/wer_10_1.0
+ • %WER 12.10 [ 2243 / 18542, 411 ins, 269 del, 1563 sub ] exp/chain/cnn_e2eali_1c/decode_test/wer_12_0.5
+
+
+Run.sh (WER using lang_test, lang_unk)
+cnn_1a:
+ • %WER 15.18 [ 2815 / 18542, 285 ins, 509 del, 2021 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0
+ • %WER 16.88 [ 3130 / 18542, 444 ins, 611 del, 2075 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0
+
+cnn_chainali_1a:
+ • %WER 14.09 [ 2612 / 18542, 245 ins, 505 del, 1862 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_13_0.0
+ • %WER 15.93 [ 2954 / 18542, 454 ins, 470 del, 2030 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_10_0.0
+
+cnn_chainali_1b:
+ • %WER 13.29 [ 2465 / 18542, 221 ins, 499 del, 1745 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.5
+ • %WER 15.09 [ 2798 / 18542, 418 ins, 468 del, 1912 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_10_0.5
+
+cnn_chainali_1c:
+ • %WER 11.59 [ 2149 / 18542, 276 ins, 362 del, 1511 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_9_0.0
+ • %WER 13.75 [ 2550 / 18542, 465 ins, 368 del, 1717 sub ] exp/chain/cnn_chainali_1c/decode_test/wer_8_0.0
+
+cnn_chainali_1d:
+ • %WER 11.07 [ 2053 / 18542, 261 ins, 311 del, 1481 sub ] exp/chain/cnn_chainali_1d/decode_test/wer_9_0.0
+ • %WER 12.95 [ 2402 / 18542, 436 ins, 313 del, 1653 sub ] exp/chain/cnn_chainali_1d/decode_test/wer_8_0.0
+
+cnn_chainali_1e:
+ • %WER 10.03 [ 1859 / 18542, 226 ins, 291 del, 1342 sub ] exp/chain/cnn_chainali_1e/decode_test/wer_11_0.5
+ • %WER 12.15 [ 2253 / 18542, 406 ins, 282 del, 1565 sub ] exp/chain/cnn_chainali_1e/decode_test/wer_10_0.5
diff --git a/egs/yomdle_russian/README.txt b/egs/yomdle_russian/README.txt
@@ -0,0 +1,3 @@
+This directory contains example scripts for OCR on the Yomdle and Slam datasets.
+Training is done on the Yomdle dataset and testing is done on Slam.
+LM rescoring is also done with extra corpus data obtained from various sources.
diff --git a/egs/yomdle_russian/v1/cmd.sh b/egs/yomdle_russian/v1/cmd.sh
@@ -0,0 +1,12 @@
+# You can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in https://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+export cmd="queue.pl" # job wrapper; replace with run.pl or slurm.pl as described above
diff --git a/egs/yomdle_russian/v1/image b/egs/yomdle_russian/v1/image
@@ -0,0 +1 @@
+../../cifar/v1/image/
diff --git a/egs/yomdle_russian/v1/local/chain/compare_wer.sh b/egs/yomdle_russian/v1/local/chain/compare_wer.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# This script is used for comparing decoding results between systems.
+# For every experiment directory given on the command line it prints one
+# column with the WER and CER of decode_test and decode_test_rescored and
+# the final train/valid objective values.
+# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}
+
+# Copyright 2017 Chun Chieh Chang
+# 2017 Ashish Arora
+
+if [ $# -eq 0 ]; then
+ echo "Usage: $0: <dir1> [<dir2> ... ]"
+ echo "e.g.: $0 exp/chain/cnn{1a,1b}"
+ exit 1
+fi
+
+# Prints one table row: the label followed, for each directory, by the second
+# field of <dir>/<relpath>, right-aligned in a 10-character column.
+# A missing file yields an empty cell (awk reports the problem on stderr).
+# Arguments: $1 - row label
+#            $2 - path of the score file relative to each directory
+#            $3... - experiment directories
+print_score_row() {
+ local label=$1 relpath=$2 dir value
+ shift 2
+ printf '%s' "$label"
+ for dir in "$@"; do
+  value=$(awk '{print $2}' "$dir/$relpath")
+  printf '%10s' "$value"
+ done
+ echo
+}
+
+# Prints one table row with the final objective value of each directory: the
+# 8th field, to 4 decimals, of the "Overall" line not containing "xent" in
+# <dir>/log/<logname>.
+# Arguments: $1 - row label
+#            $2 - name of the log file inside <dir>/log
+#            $3... - experiment directories
+print_prob_row() {
+ local label=$1 logname=$2 dir value
+ shift 2
+ printf '%s' "$label"
+ for dir in "$@"; do
+  value=$(awk '/Overall/ && !/xent/ {printf("%.4f", $8)}' "$dir/log/$logname")
+  printf '%10s' "$value"
+ done
+ echo
+}
+
+echo "# $0 $*"
+
+# Header row: directory base names; "$@" keeps paths with spaces intact.
+printf '%s' "# System "
+for x in "$@"; do printf '%10s' " $(basename -- "$x")"; done
+echo
+
+print_score_row "# WER " decode_test/scoring_kaldi/best_wer "$@"
+print_score_row "# WER (rescored) " decode_test_rescored/scoring_kaldi/best_wer "$@"
+print_score_row "# CER " decode_test/scoring_kaldi/best_cer "$@"
+print_score_row "# CER (rescored) " decode_test_rescored/scoring_kaldi/best_cer "$@"
+
+# Final objective values are read from the compute_prob_*.final.log files.
+print_prob_row "# Final train prob " compute_prob_train.final.log "$@"
+print_prob_row "# Final valid prob " compute_prob_valid.final.log "$@"
diff --git a/egs/yomdle_russian/v1/local/chain/run_cnn_e2eali.sh b/egs/yomdle_russian/v1/local/chain/run_cnn_e2eali.sh
@@ -0,0 +1 @@
+tuning/run_cnn_e2eali_1a.sh
diff --git a/egs/yomdle_russian/v1/local/chain/run_e2e_cnn.sh b/egs/yomdle_russian/v1/local/chain/run_e2e_cnn.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+# Copyright 2017 Hossein Hadian
+# This script does end2end chain training (i.e. from scratch)
+# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/
+# System e2e_cnn_1a
+# score_basic rescoring + normalized
+# WER 16.24 11.0
+# WER (rescored) 15.63 10.5
+# CER 5.98 5.6
+# CER (rescored) 5.66 5.3
+# Final train prob 0.1376
+# Final valid prob 0.1913
+# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a
+# exp/chain/e2e_cnn_1a: num-iters=27 nj=5..8 num-params=3.0M dim=40->470 combine=0.091->0.091 (over 1) logprob:train/valid[17,26,final]=(0.135,0.137,0.138/0.191,0.191,0.191)
+
+set -e # stop at the first command that fails
+# configs for 'chain'
+stage=0 # a step guarded by "[ $stage -le N ]" runs only when stage <= N
+nj=30 # passed as --nj to prepare_e2e.sh
+train_stage=-10 # passed as --stage to train_e2e.py
+get_egs_stage=-10 # passed as --egs.stage to train_e2e.py
+affix=1a # suffix of the output directory exp/chain/e2e_cnn_${affix}
+
+# training options
+tdnn_dim=450 # dim= of the tdnn1-3 and prefinal-chain layers
+minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 # passed as --trainer.num-chunk-per-minibatch
+cmvn_opts="--norm-means=false --norm-vars=false" # passed as --feat.cmvn-opts
+train_set=train # training data is read from data/$train_set
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh # provides $cmd, the job wrapper used below
+. ./path.sh
+. ./utils/parse_options.sh # NOTE(review): presumably lets --name value arguments override the variables above; confirm
+
+if ! cuda-compiled; then # print the notice below and exit 1 when cuda-compiled fails
+ cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+lang=data/lang_e2e # copy of data/lang whose topo file is regenerated in stage 0
+treedir=exp/chain/e2e_monotree # it's actually just a trivial tree (no tree building)
+dir=exp/chain/e2e_cnn_${affix} # network configs and training output go here
+
+if [ $stage -le 0 ]; then
+ # Create a version of the lang/ directory that has one state per phone in the
+ # topo file. [note, it really has two states.. the first one is only repeated
+ # once, the second one has zero or more repeats.]
+ rm -rf $lang # discard any previous copy before re-creating it
+ cp -r data/lang $lang
+ silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+ nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+ steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo # replaces the topo copied from data/lang
+fi
+
+if [ $stage -le 1 ]; then # stage 1: run prepare_e2e.sh into $treedir, then estimate the phone LM
+ steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \
+ --shared-phones true \
+ --type mono \
+ data/$train_set $lang $treedir
+ $cmd $treedir/log/make_phone_lm.log \
+ cat data/$train_set/text \| \
+ steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \
+ utils/sym2int.pl -f 2- data/lang/phones.txt \| \
+ chain-est-phone-lm --num-extra-lm-states=500 \
+ ark:- $treedir/phone_lm.fst
+fi # the escaped "\|" above hand the pipe symbols to $cmd as arguments instead of piping in this shell
+
+if [ $stage -le 2 ]; then # stage 2: write network.xconfig and run xconfig_to_configs.py on it
+ echo "$0: creating neural net configs using the xconfig parser";
+ num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') # second field of tree-info's num-pdfs line
+ cnn_opts="l2-regularize=0.075" # included in common1, common2 and common3
+ tdnn_opts="l2-regularize=0.075" # appended to tdnn1-3
+ output_opts="l2-regularize=0.1" # appended to prefinal-chain and output
+ common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" # used by cnn1-2
+ common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" # used by cnn3-5
+ common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" # used by cnn6-7
+
+ mkdir -p $dir/configs
+ cat <<EOF > $dir/configs/network.xconfig
+ input dim=40 name=input
+ conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1
+ conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2
+ conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+ conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+ conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
+ conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3
+ conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3
+ relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+ relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+ relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+ ## adding the layers for chain branch
+ relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts
+ output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts
+EOF
+
+ steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs
+fi
+
+if [ $stage -le 3 ]; then # stage 3: run train_e2e.py on data/$train_set with $treedir, writing to $dir
+ steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \
+ --cmd "$cmd" \
+ --feat.cmvn-opts "$cmvn_opts" \
+ --chain.leaky-hmm-coefficient 0.1 \
+ --chain.apply-deriv-weights true \
+ --egs.stage $get_egs_stage \
+ --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \
+ --chain.frame-subsampling-factor 4 \
+ --chain.alignment-subsampling-factor 4 \
+ --trainer.add-option="--optimization.memory-compression-level=2" \
+ --trainer.num-chunk-per-minibatch $minibatch_size \
+ --trainer.frames-per-iter 1500000 \
+ --trainer.num-epochs 3 \
+ --trainer.optimization.momentum 0 \
+ --trainer.optimization.num-jobs-initial 5 \
+ --trainer.optimization.num-jobs-final 8 \
+ --trainer.optimization.initial-effective-lrate 0.001 \
+ --trainer.optimization.final-effective-lrate 0.0001 \
+ --trainer.optimization.shrink-value 1.0 \
+ --trainer.max-param-change 2.0 \
+ --cleanup.remove-egs true \
+ --feat-dir data/${train_set} \
+ --tree-dir $treedir \
+ --dir $dir || exit 1;
+fi