Adding scripts for yomdle Russian #2953

Merged (5 commits) on Jan 21, 2019
Changes from 3 commits
3 changes: 3 additions & 0 deletions egs/yomdle_russian/README.txt
@@ -0,0 +1,3 @@
This directory contains example scripts for OCR on the Yomdle and Slam datasets.
Training is done on the Yomdle dataset and testing is done on Slam.
LM rescoring is also done with extra corpus data obtained from various sources.
12 changes: 12 additions & 0 deletions egs/yomdle_russian/v1/cmd.sh
@@ -0,0 +1,12 @@
# You can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in https://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
export cmd="queue.pl"
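# For example (illustrative alternatives, not part of this recipe as committed):
#   export cmd="run.pl"    # no queueing system; run jobs locally
#   export cmd="slurm.pl"  # SLURM cluster; configure details in conf/queue.conf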
1 change: 1 addition & 0 deletions egs/yomdle_russian/v1/image
36 changes: 36 additions & 0 deletions egs/yomdle_russian/v1/local/augment_data.sh
@@ -0,0 +1,36 @@
#!/bin/bash
# Copyright 2018 Hossein Hadian
# 2018 Ashish Arora

# Apache 2.0
# This script performs data augmentation.

nj=4
cmd=run.pl
feat_dim=40
vertical_shift=0
echo "$0 $@"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;

srcdir=$1
outdir=$2
datadir=$3

mkdir -p $datadir/augmentations
echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp"
Contributor:
echo statements should usually start with "$0: " so that it's clear which script produced them.

Contributor Author:
Sorry, local/augment_data.sh is not used yet for the yomdle_russian setup. I will remove it.

Contributor Author:
Oh, OK, I got it now... making changes.

for set in aug1; do
  image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \
    $srcdir $datadir/augmentations/$set
  cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt
  local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \
    --vertical-shift $vertical_shift \
    --fliplr false --augment 'random_scale' $datadir/augmentations/$set
done

echo " combine original data and data from different augmentations"
utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1
cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt
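# Hypothetical invocation (argument names follow the positional parameters
# above; the paths are illustrative, not taken from this PR):
#   local/augment_data.sh --nj 30 --cmd "$cmd" --feat-dim 40 \
#     data/train data/train_aug data
# i.e. <srcdir> <outdir> <datadir>: augment data/train, write the combined
# original+augmented data to data/train_aug, and keep working files under
# data/augmentations.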
66 changes: 66 additions & 0 deletions egs/yomdle_russian/v1/local/chain/compare_wer.sh
@@ -0,0 +1,66 @@
#!/bin/bash

# This script is used for comparing decoding results between systems.
# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}

# Copyright 2017 Chun Chieh Chang
# 2017 Ashish Arora

if [ $# == 0 ]; then
  echo "Usage: $0 <dir1> [<dir2> ... ]"
  echo "e.g.: $0 exp/chain/cnn{1a,1b}"
  exit 1
fi

echo "# $0 $*"
used_epochs=false

echo -n "# System "
for x in $*; do printf "% 10s" " $(basename $x)"; done
echo

echo -n "# WER "
for x in $*; do
  wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}')
  printf "% 10s" $wer
done
echo

echo -n "# WER (rescored) "
for x in $*; do
  wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}')
  printf "% 10s" $wer
done
echo

echo -n "# CER "
for x in $*; do
  cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}')
  printf "% 10s" $cer
done
echo

echo -n "# CER (rescored) "
for x in $*; do
  cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}')
  printf "% 10s" $cer
done
echo

if $used_epochs; then
  exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi

echo -n "# Final train prob "
for x in $*; do
  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" $prob
done
echo

echo -n "# Final valid prob "
for x in $*; do
  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" $prob
done
echo
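# Example (matching the usage message above):
#   local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/
# This prints one column per system: WER/CER with and without rescoring, plus
# the final train/valid probabilities; the results block quoted at the top of
# run_e2e_cnn.sh below shows output of this form.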
1 change: 1 addition & 0 deletions egs/yomdle_russian/v1/local/chain/run_cnn_e2eali.sh
131 changes: 131 additions & 0 deletions egs/yomdle_russian/v1/local/chain/run_e2e_cnn.sh
@@ -0,0 +1,131 @@
#!/bin/bash

# Copyright 2017 Hossein Hadian
# This script does end2end chain training (i.e. from scratch).
# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/
# System                    e2e_cnn_1a
#                           score_basic   rescoring + normalized
# WER                       16.24         11.0
# WER (rescored)            15.63         10.5
# CER                       5.98          5.6
# CER (rescored)            5.66          5.3
# Final train prob          0.1376
# Final valid prob          0.1913
# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a
# exp/chain/e2e_cnn_1a: num-iters=27 nj=5..8 num-params=3.0M dim=40->470 combine=0.091->0.091 (over 1) logprob:train/valid[17,26,final]=(0.135,0.137,0.138/0.191,0.191,0.191)
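# Reading the chain_dir_info line above: 27 training iterations, 5..8 parallel
# jobs, 3.0M parameters, 40-dim input features mapped to 470 output pdfs, and
# train/valid log-probs reported at iterations 17, 26 and the final model.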

set -e
# configs for 'chain'
stage=0
nj=30
train_stage=-10
get_egs_stage=-10
affix=1a

# training options
tdnn_dim=450
minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4
cmvn_opts="--norm-means=false --norm-vars=false"
train_set=train
lang_decode=data/lang
decode_e2e=true
# End configuration section.
echo "$0 $@" # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if ! cuda-compiled; then
  cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

lang=data/lang_e2e
treedir=exp/chain/e2e_monotree # it's actually just a trivial tree (no tree building)
dir=exp/chain/e2e_cnn_${affix}

if [ $stage -le 0 ]; then
  # Create a version of the lang/ directory that has one state per phone in the
  # topo file. [note, it really has two states.. the first one is only repeated
  # once, the second one has zero or more repeats.]
  rm -rf $lang
  cp -r data/lang $lang
  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi
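# Schematically (an illustration of the comment above, not the literal
# gen_topo.py output): for each phone,
#   state 0 (pdf-class 0) is visited exactly once, then goes to state 1 or ends;
#   state 1 (pdf-class 1) self-loops, giving zero or more repeats, then ends.
# This one-or-more-frames-per-phone topology is what lets the chain model
# operate at the 4x frame subsampling configured below.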

if [ $stage -le 1 ]; then
  steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \
    --shared-phones true \
    --type mono \
    data/$train_set $lang $treedir
  $cmd $treedir/log/make_phone_lm.log \
    cat data/$train_set/text \| \
    steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \
    utils/sym2int.pl -f 2- data/lang/phones.txt \| \
    chain-est-phone-lm --num-extra-lm-states=500 \
    ark:- $treedir/phone_lm.fst
fi
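# The make_phone_lm pipeline above converts the training transcripts to phone
# sequences, maps them to integer ids with sym2int.pl, and estimates a phone
# language model FST; in chain training this phone LM is used to build the
# denominator graph.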

if [ $stage -le 2 ]; then
  echo "$0: creating neural net configs using the xconfig parser";
  num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
  cnn_opts="l2-regularize=0.075"
  tdnn_opts="l2-regularize=0.075"
  output_opts="l2-regularize=0.1"
  common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36"
  common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70"
  common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70"

  mkdir -p $dir/configs
  cat <<EOF > $dir/configs/network.xconfig
input dim=40 name=input
conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1
conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2
conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3
conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3
relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
## adding the layers for chain branch
relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts
output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts
EOF
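  # In the config above, height-subsample-out=2 in cnn2 and cnn5 halves the
  # feature height (40 -> 20 -> 10), the Append(-4,0,4) splices give the TDNN
  # layers wide temporal context, and num_targets (read from the tree) sets the
  # output dimension: 470 here, matching dim=40->470 in the chain_dir_info line
  # at the top of this script.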

  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs
fi

if [ $stage -le 3 ]; then
  steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \
    --cmd "$cmd" \
    --feat.cmvn-opts "$cmvn_opts" \
    --chain.leaky-hmm-coefficient 0.1 \
    --chain.apply-deriv-weights true \
    --egs.stage $get_egs_stage \
    --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \
    --chain.frame-subsampling-factor 4 \
    --chain.alignment-subsampling-factor 4 \
    --trainer.add-option="--optimization.memory-compression-level=2" \
    --trainer.num-chunk-per-minibatch $minibatch_size \
    --trainer.frames-per-iter 1500000 \
    --trainer.num-epochs 3 \
    --trainer.optimization.momentum 0 \
    --trainer.optimization.num-jobs-initial 5 \
    --trainer.optimization.num-jobs-final 8 \
    --trainer.optimization.initial-effective-lrate 0.001 \
    --trainer.optimization.final-effective-lrate 0.0001 \
    --trainer.optimization.shrink-value 1.0 \
    --trainer.max-param-change 2.0 \
    --cleanup.remove-egs true \
    --feat-dir data/${train_set} \
    --tree-dir $treedir \
    --dir $dir || exit 1;
fi
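# Note on the training options above: in end-to-end (flat-start) training there
# are no prior alignments, so --chain.alignment-subsampling-factor is set equal
# to --chain.frame-subsampling-factor (4), and supervision comes directly from
# the phone-LM training graphs rather than from lattices.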