From a3a18e2527f911d5d3425b7caa0911b868d31691 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 30 Aug 2018 17:29:13 -0400 Subject: [PATCH 01/37] adding changes for language modelling --- egs/iam/v2/cmd.sh | 0 egs/iam/v2/local/chain/run_cnn_e2eali.sh | 2 +- .../local/chain/tuning/run_cnn_e2eali_1c.sh | 2 +- .../local/chain/tuning/run_cnn_e2eali_1d.sh | 246 ++++++++++++++++++ egs/iam/v2/local/process_corpus.py | 30 +++ 5 files changed, 278 insertions(+), 2 deletions(-) mode change 100644 => 100755 egs/iam/v2/cmd.sh create mode 100755 egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh create mode 100755 egs/iam/v2/local/process_corpus.py diff --git a/egs/iam/v2/cmd.sh b/egs/iam/v2/cmd.sh old mode 100644 new mode 100755 diff --git a/egs/iam/v2/local/chain/run_cnn_e2eali.sh b/egs/iam/v2/local/chain/run_cnn_e2eali.sh index ad51803ab0e..da731bcb0b1 120000 --- a/egs/iam/v2/local/chain/run_cnn_e2eali.sh +++ b/egs/iam/v2/local/chain/run_cnn_e2eali.sh @@ -1 +1 @@ -tuning/run_cnn_e2eali_1c.sh \ No newline at end of file +tuning/run_cnn_e2eali_1d.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh index ef851c8ae2f..48e0a76dead 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -26,7 +26,7 @@ stage=0 nj=30 train_set=train nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1b6 #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a common_egs_dir= reporting_email= diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh new file mode 100755 index 00000000000..d8aa2561f17 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# e2eali_1c is the same as e2eali_1b but has fewer CNN layers, smaller +# l2-regularize, more epochs and uses dropout. + + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b exp/chain/cnn_e2eali_1c +# System cnn_e2eali_1b cnn_e2eali_1c +# WER 10.33 10.05 +# WER (rescored) 10.10 9.75 +# CER 5.00 4.76 +# CER (rescored) 4.88 4.68 +# Final train prob -0.0428 -0.0317 +# Final valid prob -0.0666 -0.0630 +# Final train prob (xent) -0.9210 -0.5413 +# Final valid prob (xent) -1.0264 -0.7096 +# Parameters 3.98M 5.12M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c +# exp/chain/cnn_e2eali_1c: num-iters=21 nj=2..4 num-params=5.1M dim=40->392 combine=-0.034->-0.034 (over 1) xent:train/valid[13,20,final]=(-0.953,-0.800,-0.541/-1.03,-0.933,-0.710) logprob:train/valid[13,20,final]=(-0.069,-0.048,-0.032/-0.091,-0.078,-0.063) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. 
+chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +#lang_decode=data.new/lang +#lang_rescore=data.new/lang_rescore_6g +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent 
dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi diff --git a/egs/iam/v2/local/process_corpus.py b/egs/iam/v2/local/process_corpus.py new file mode 100755 index 00000000000..9f8e1d275d3 --- /dev/null +++ b/egs/iam/v2/local/process_corpus.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# Copyright 2018 Ashish Arora +# Apache 2.0 +# This script reads valid phones and removes the lines in the corpus +# which have any other phone. 
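+# Usage sketch (corpus path is illustrative, not part of this patch): the
+# script filters stdin to stdout, keeping only lines whose characters (spaces
+# are always allowed) all occur in data/local/phones.txt, e.g.
+#   cat data/local/corpus.txt | local/process_corpus.py > data/local/corpus_filtered.txt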
+ +import os +import sys, io + +phone_file = os.path.join('data/local/phones.txt') +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +phone_dict = dict() +with open(phone_file, 'r', encoding='utf-8') as phone_fh: + for line in phone_fh: + line = line.strip().split()[0] + phone_dict[line] = line + +phone_dict[' '] = ' ' +corpus_text = list() +for line in infile: + text = line.strip() + skip_text = False + for phone in text: + if phone not in phone_dict.keys(): + skip_text = True + break + if not skip_text: + output.write(text+ '\n') + From 91508b5f322b06f5d214a7fa90bb9b375f359252 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 30 Aug 2018 22:35:10 -0400 Subject: [PATCH 02/37] adding modifications for augmentation, topology, shearing, run.sh --- egs/iam/v2/local/augment_data.sh | 33 ++ egs/iam/v2/local/chain/run_cnn_1a.sh | 241 +++++++++ egs/iam/v2/local/chain/run_cnn_chainali_1c.sh | 247 +++++++++ .../local/chain/tuning/run_cnn_e2eali_1d.sh | 13 +- egs/iam/v2/local/extract_features.sh | 48 ++ egs/iam/v2/local/gen_topo.py | 93 ++++ egs/iam/v2/local/make_features.py | 193 ++++++- egs/iam/v2/local/prepare_data.sh | 5 +- egs/iam/v2/local/prepare_lang.sh | 474 ++++++++++++++++++ egs/iam/v2/local/score.sh | 155 +----- egs/iam/v2/run.sh | 174 +++++++ egs/iam/v2/run_end2end.sh | 53 +- egs/wsj/s5/utils/copy_data_dir.sh | 10 +- 13 files changed, 1537 insertions(+), 202 deletions(-) create mode 100755 egs/iam/v2/local/augment_data.sh create mode 100755 egs/iam/v2/local/chain/run_cnn_1a.sh create mode 100755 egs/iam/v2/local/chain/run_cnn_chainali_1c.sh create mode 100755 egs/iam/v2/local/extract_features.sh create mode 100755 egs/iam/v2/local/gen_topo.py create mode 100755 egs/iam/v2/local/prepare_lang.sh create mode 100755 egs/iam/v2/run.sh diff --git a/egs/iam/v2/local/augment_data.sh b/egs/iam/v2/local/augment_data.sh new file mode 100755 index 00000000000..443a16874f2 --- /dev/null +++ b/egs/iam/v2/local/augment_data.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 + +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/aug1" +utils/copy_data_dir.sh --spk-prefix aug1- --utt-prefix aug1- $srcdir $datadir/augmentations/aug1 + +echo " copying allowed length for training with augmented data..." 
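+# allowed_lengths.txt holds one permitted feature length (image width in
+# frames) per line; reusing the original file keeps the augmented copy padded
+# to the same set of lengths as the source data.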
+cat $srcdir/allowed_lengths.txt > $datadir/augmentations/aug1/allowed_lengths.txt + +echo " Extracting features, creating feats.scp file for augmentated data" +local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim --fliplr false --augment true $datadir/augmentations/aug1 + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1 +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/iam/v2/local/chain/run_cnn_1a.sh b/egs/iam/v2/local/chain/run_cnn_1a.sh new file mode 100755 index 00000000000..41a76920e37 --- /dev/null +++ b/egs/iam/v2/local/chain/run_cnn_1a.sh @@ -0,0 +1,241 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ +# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) + +# local/chain/compare_wer.sh exp/chain/cnn_1a/ +# System cnn_1a +# WER 18.52 +# CER 10.07 +# Final train prob -0.0077 +# Final valid prob -0.0970 +# Final train prob (xent) -0.5484 +# Final valid prob (xent) -0.9643 +# Parameters 4.36M + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
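+ # For example (lang/graph names purely illustrative), a second graph for an
+ # external LM could be built alongside the default one:
+ #   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_bigG $dir $dir/graph_bigG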
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh new file mode 100755 index 00000000000..54c52d913de --- /dev/null +++ b/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh @@ -0,0 +1,247 @@ +#!/bin/bash + +# chainali_1c is as chainali_1b except it uses l2-regularize +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c +# System cnn_chainali_1b cnn_chainali_1c +# WER 14.38 12.72 +# CER 7.14 5.99 +# Final train prob -0.0113 -0.0291 +# Final valid prob -0.0400 -0.0359 +# Final train prob (xent) -0.6043 -0.9781 +# Final valid prob (xent) -0.9030 -1.1544 +# Parameters 3.96M 3.96M + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c +# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
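+ # A rough sanity check of that compatibility (illustrative; ignores the #N
+ # disambiguation symbols, which may legitimately differ):
+ #   diff <(grep -v '^#' data/lang/phones.txt) <(grep -v '^#' data/$lang_test/phones.txt)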
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index d8aa2561f17..6ab74dc2f0d 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -45,8 +45,6 @@ tdnn_dim=550 # training options srand=0 remove_egs=true -#lang_decode=data.new/lang -#lang_rescore=data.new/lang_rescore_6g lang_decode=data/lang lang_rescore=data/lang_rescore_6g @@ -71,7 +69,6 @@ fi ali_dir=exp/chain/e2e_ali_train lat_dir=exp/chain${nnet3_affix}/e2e_${train_set}_lats dir=exp/chain${nnet3_affix}/cnn_e2eali${affix} -#dir=exp/chain/cnn_e2eali_1c train_data_dir=data/${train_set} tree_dir=exp/chain${nnet3_affix}/tree_e2e @@ -164,6 +161,16 @@ if [ $stage -le 4 ]; then ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts EOF diff --git a/egs/iam/v2/local/extract_features.sh b/egs/iam/v2/local/extract_features.sh new file mode 100755 index 00000000000..1741ad3f9b2 --- /dev/null +++ b/egs/iam/v2/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment=false +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + local/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/iam/v2/local/gen_topo.py b/egs/iam/v2/local/gen_topo.py new file mode 100755 index 00000000000..a74c6d4bbae --- /dev/null +++ b/egs/iam/v2/local/gen_topo.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); +parser.add_argument("phone_list", type=str, help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +punctuation_phones = [] +exclude = set(string.punctuation) +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split('_')[0] + if len(phone) == 1 and phone in exclude: + punctuation_phones.append(int(line.split(' ')[1])) + +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in punctuation_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_nonsil_states) + " ") +print("") + +# For nonsilence phones that ar punctuations +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in punctuation_phones])) +print("") +for x in range(0, args.num_punctuation_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_punctuation_states) + " ") +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0 / (args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = state_str + " " + str(x) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " " + str(x) + " " + str(x) + " " + for y in range(1, args.num_sil_states): + state_str = state_str + " " + str(y) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + second_last = args.num_sil_states - 1 + print(" " + str(second_last) + " " + str(second_last) + " " + str(second_last) + " 0.75 " + str(args.num_sil_states) + " 0.25 ") + print(" " + str(args.num_sil_states) + " ") +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" " + str(args.num_sil_states) + " ") +print("") +print("") diff --git a/egs/iam/v2/local/make_features.py b/egs/iam/v2/local/make_features.py index 84e012daedb..3ce501732cf 100755 --- a/egs/iam/v2/local/make_features.py +++ b/egs/iam/v2/local/make_features.py @@ -2,6 +2,7 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2017 Yiwen Shao # 2018 Hossein Hadian """ This script converts images to Kaldi-format feature matrices. The input to @@ -14,20 +15,27 @@ to enforce the images to have the specified length in that file by padding white pixels (the --padding option will be ignored in this case). This relates to end2end chain training. - eg. 
local/make_features.py data/train --feat-dim 40 """ - +import random import argparse import os import sys +import scipy.io as sio import numpy as np from scipy import misc +from scipy.ndimage.interpolation import affine_transform +import math +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and writes them to standard output in text format.""") -parser.add_argument('dir', type=str, - help='Source data directory (containing images.scp)') +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') parser.add_argument('--out-ark', type=str, default='-', help='Where to write the output feature file') parser.add_argument('--feat-dim', type=int, default=40, @@ -35,8 +43,10 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') - - +parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, + help="Flip the image left-right for right to left languages") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() @@ -56,18 +66,12 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") -def get_scaled_image(im, allowed_lengths = None): - scale_size = args.feat_dim - sx = im.shape[1] - sy = im.shape[0] - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) + +def horizontal_pad(im, allowed_lengths = None): if allowed_lengths is None: left_padding = right_padding = args.padding else: # Find an allowed length for the image - imlen = im.shape[1] + imlen = im.shape[1] # width allowed_len = 0 for l in allowed_lengths: if l > imlen: @@ -77,28 +81,153 @@ def get_scaled_image(im, allowed_lengths = None): # No allowed length was found for the image (the image is too long) return None padding = allowed_len - imlen - left_padding = padding // 2 + left_padding = int(padding // 2) right_padding = padding - left_padding - dim_y = im.shape[0] + dim_y = im.shape[0] # height im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), dtype=int), im), axis=1) im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), dtype=int)), axis=1) return im_pad1 -### main ### -data_list_path = os.path.join(args.dir, 'images.scp') +def get_scaled_image_aug(im, mode='normal'): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + scale_size = random.randint(10, 30) + scale = (1.0 * scale_size) / sy + down_nx = int(scale_size) + down_ny = int(scale * sx) + if mode == 'normal': + im = misc.imresize(im, (nx, ny)) + return im + else: + im_scaled_down = misc.imresize(im, (down_nx, down_ny)) + im_scaled_up = misc.imresize(im_scaled_down, (nx, ny)) + return im_scaled_up + return im + +def contrast_normalization(im, low_pct, high_pct): + element_number = im.size + rows = im.shape[0] + cols = im.shape[1] + im_contrast = np.zeros(shape=im.shape) + low_index = int(low_pct * element_number) + high_index = int(high_pct * element_number) + sorted_im = np.sort(im, axis=None) + low_thred = 
sorted_im[low_index] + high_thred = sorted_im[high_index] + for i in range(rows): + for j in range(cols): + if im[i, j] > high_thred: + im_contrast[i, j] = 255 # lightest to white + elif im[i, j] < low_thred: + im_contrast[i, j] = 0 # darkest to black + else: + # linear normalization + im_contrast[i, j] = (im[i, j] - low_thred) * \ + 255 / (high_thred - low_thred) + return im_contrast + + +def geometric_moment(frame, p, q): + m = 0 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + m += (i ** p) * (j ** q) * frame[i][i] + return m + + +def central_moment(frame, p, q): + u = 0 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j] + return u + + +def height_normalization(frame, w, h): + frame_normalized = np.zeros(shape=(h, w)) + alpha = 4 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + sigma_x = (alpha * ((central_moment(frame, 2, 0) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u20/m00) + sigma_y = (alpha * ((central_moment(frame, 0, 2) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u02/m00) + for x in range(w): + for y in range(h): + i = int((x / w - 0.5) * sigma_x + x_bar) + j = int((y / h - 0.5) * sigma_y + y_bar) + frame_normalized[x][y] = frame[i][j] + return frame_normalized + +def find_slant_project(im): + rows = im.shape[0] + cols = im.shape[1] + std_max = 0 + alpha_max = 0 + col_disp = np.zeros(90, int) + proj = np.zeros(shape=(90, cols + 2 * rows), dtype=int) + for r in range(rows): + for alpha in range(-45, 45, 1): + col_disp[alpha] = int(r * math.tan(alpha / 180.0 * math.pi)) + for c in range(cols): + if im[r, c] < 100: + for alpha in range(-45, 45, 1): + proj[alpha + 45, c + col_disp[alpha] + rows] += 1 + for alpha in range(-45, 45, 1): + proj_histogram, bin_array = np.histogram(proj[alpha + 45, :], bins=10) + proj_std = np.std(proj_histogram) + if proj_std > std_max: + std_max = proj_std + alpha_max = alpha + proj_std = np.std(proj, axis=1) + return -alpha_max + + +def horizontal_shear(im, degree): + rad = degree / 180.0 * math.pi + padding_x = int(abs(np.tan(rad)) * im.shape[0]) + padding_y = im.shape[0] + if rad > 0: + im_pad = np.concatenate( + (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) + elif rad < 0: + im_pad = np.concatenate( + (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) + else: + im_pad = im + shear_matrix = np.array([[1, 0], + [np.tan(rad), 1]]) + sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0) + return sheared_im + + +### main ### +random.seed(1) +data_list_path = args.images_scp_path if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') + out_fh = open(args.out_ark,'w') allowed_lengths = None -if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): +allowed_len_handle = args.allowed_len_file_path +if os.path.isfile(allowed_len_handle): print("Found 'allowed_lengths.txt' file...", file=sys.stderr) allowed_lengths = [] - with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + with open(allowed_len_handle) as f: for line in f: allowed_lengths.append(int(line.strip())) print("Read {} allowed lengths and will apply them to the " @@ -106,6 +235,7 @@ def 
get_scaled_image(im, allowed_lengths = None): num_fail = 0 num_ok = 0 +aug_setting = ['normal', 'scaled'] with open(data_list_path) as f: for line in f: line = line.strip() @@ -113,15 +243,24 @@ def get_scaled_image(im, allowed_lengths = None): image_id = line_vect[0] image_path = line_vect[1] im = misc.imread(image_path) - im_scaled = get_scaled_image(im, allowed_lengths) - - if im_scaled is None: + if args.fliplr: + im = np.fliplr(im) + if args.augment: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_contrast = contrast_normalization(im_aug, 0.05, 0.2) + slant_degree = find_slant_project(im_contrast) + im_sheared = horizontal_shear(im_contrast, slant_degree) + im_aug = im_sheared + else: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths) + if im_horizontal_padded is None: num_fail += 1 continue - data = np.transpose(im_scaled, (1, 0)) + data = np.transpose(im_horizontal_padded, (1, 0)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) -print('Generated features for {} images. Failed for {} (iamge too ' +print('Generated features for {} images. Failed for {} (image too ' 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v2/local/prepare_data.sh b/egs/iam/v2/local/prepare_data.sh index 73d711c73f0..9c01ac90f28 100755 --- a/egs/iam/v2/local/prepare_data.sh +++ b/egs/iam/v2/local/prepare_data.sh @@ -165,6 +165,7 @@ if [ $stage -le 0 ]; then local/process_data.py data/local data/test --dataset test || exit 1 local/process_data.py data/local data/val --dataset validation || exit 1 - utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt - utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt + image/fix_data_dir.sh data/train + image/fix_data_dir.sh data/test + image/fix_data_dir.sh data/val fi diff --git a/egs/iam/v2/local/prepare_lang.sh b/egs/iam/v2/local/prepare_lang.sh new file mode 100755 index 00000000000..cc6bc03a432 --- /dev/null +++ b/egs/iam/v2/local/prepare_lang.sh @@ -0,0 +1,474 @@ +#!/bin/bash +# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal +# 2014 Guoguo Chen +# 2015 Hainan Xu +# 2016 FAU Erlangen (Author: Axel Horndasch) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script prepares a directory such as data/lang/, in the standard format, +# given a source directory containing a dictionary lexicon.txt in a form like: +# word phone1 phone2 ... phoneN +# per line (alternate prons would be separate lines), or a dictionary with probabilities +# called lexiconp.txt in a form: +# word pron-prob phone1 phone2 ... phoneN +# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if +# lexicon.txt exists. 
+# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt +# and extra_questions.txt +# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and +# non-silence phones respectively (where silence includes various kinds of +# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the +# "real" phones.) +# In each line of those files is a list of phones, and the phones on each line +# are assumed to correspond to the same "base phone", i.e. they will be +# different stress or tone variations of the same basic phone. +# The file "optional_silence.txt" contains just a single phone (typically SIL) +# which is used for optional silence in the lexicon. +# extra_questions.txt might be empty; typically will consist of lists of phones, +# all members of each list with the same stress or tone; and also possibly a +# list for the silence phones. This will augment the automatically generated +# questions (note: the automatically generated ones will treat all the +# stress/tone versions of a phone the same, so will not "get to ask" about +# stress or tone). +# + +# This script adds word-position-dependent phones and constructs a host of other +# derived files, that go in data/lang/. + +# Begin configuration section. +num_sil_states=5 +num_nonsil_states=3 +position_dependent_phones=true +# position_dependent_phones is false also when position dependent phones and word_boundary.txt +# have been generated by another source +share_silence_phones=false # if true, then share pdfs of different silence + # phones together. +sil_prob=0.5 +unk_fst= # if you want to model the unknown-word () + # with a phone-level LM as created by make_unk_lm.sh, + # provide the text-form FST via this flag, e.g. /unk_fst.txt + # where was the 2nd argument of make_unk_lm.sh. +phone_symbol_table= # if set, use a specified phones.txt file. +extra_word_disambig_syms= # if set, add disambiguation symbols from this file (one per line) + # to phones/disambig.txt, phones/wdisambig.txt and words.txt +num_extra_phone_disambig_syms=1 # Standard one phone disambiguation symbol is used for optional silence. + # Increasing this number does not harm, but is only useful if you later + # want to introduce this labels to L_disambig.fst +# end configuration sections + +echo "$0 $@" # Print the command line for logging + +. utils/parse_options.sh + +if [ $# -ne 4 ]; then + echo "usage: utils/prepare_lang.sh " + echo "e.g.: utils/prepare_lang.sh data/local/dict data/local/lang data/lang" + echo " should contain the following files:" + echo " extra_questions.txt lexicon.txt nonsilence_phones.txt optional_silence.txt silence_phones.txt" + echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info." + echo "options: " + echo " --num-sil-states # default: 5, #states in silence models." + echo " --num-nonsil-states # default: 3, #states in non-silence models." + echo " --position-dependent-phones (true|false) # default: true; if true, use _B, _E, _S & _I" + echo " # markers on phones to indicate word-internal positions. " + echo " --share-silence-phones (true|false) # default: false; if true, share pdfs of " + echo " # all non-silence phones. " + echo " --sil-prob # default: 0.5 [must have 0 <= silprob < 1]" + echo " --phone-symbol-table # default: \"\"; if not empty, use the provided " + echo " # phones.txt as phone symbol table. This is useful " + echo " # if you use a new dictionary for the existing setup." + echo " --unk-fst # default: none. e.g. 
exp/make_unk_lm/unk_fst.txt." + echo " # This is for if you want to model the unknown word" + echo " # via a phone-level LM rather than a special phone" + echo " # (this should be more useful for test-time than train-time)." + echo " --extra-word-disambig-syms # default: \"\"; if not empty, add disambiguation symbols" + echo " # from this file (one per line) to phones/disambig.txt," + echo " # phones/wdisambig.txt and words.txt" + exit 1; +fi + +srcdir=$1 +oov_word=$2 +tmpdir=$3 +dir=$4 +mkdir -p $dir $tmpdir $dir/phones + +silprob=false +[ -f $srcdir/lexiconp_silprob.txt ] && silprob=true + +[ -f path.sh ] && . ./path.sh + +! utils/validate_dict_dir.pl $srcdir && \ + echo "*Error validating directory $srcdir*" && exit 1; + +if [[ ! -f $srcdir/lexicon.txt ]]; then + echo "**Creating $dir/lexicon.txt from $dir/lexiconp.txt" + perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1; +fi +if [[ ! -f $srcdir/lexiconp.txt ]]; then + echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt" + perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1; +fi + +if [ ! -z "$unk_fst" ] && [ ! -f "$unk_fst" ]; then + echo "$0: expected --unk-fst $unk_fst to exist as a file" + exit 1 +fi + +if ! utils/validate_dict_dir.pl $srcdir >&/dev/null; then + utils/validate_dict_dir.pl $srcdir # show the output. + echo "Validation failed (second time)" + exit 1; +fi + +# phones.txt file provided, we will do some sanity check here. +if [[ ! -z $phone_symbol_table ]]; then + # Checks if we have position dependent phones + n1=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sort -u | wc -l` + n2=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l` + $position_dependent_phones && [ $n1 -eq $n2 ] &&\ + echo "$0: Position dependent phones requested, but not in provided phone symbols" && exit 1; + ! $position_dependent_phones && [ $n1 -ne $n2 ] &&\ + echo "$0: Position dependent phones not requested, but appear in the provided phones.txt" && exit 1; + + # Checks if the phone sets match. + cat $srcdir/{,non}silence_phones.txt | awk -v f=$phone_symbol_table ' + BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }} + { for (x = 1; x <= NF; ++x) { if (!($x in phones)) { + print "Phone appears in the lexicon but not in the provided phones.txt: "$x; exit 1; }}}' || exit 1; +fi + +# In case there are extra word-level disambiguation symbols we need +# to make sure that all symbols in the provided file are valid. +if [ ! -z "$extra_word_disambig_syms" ]; then + if ! utils/lang/validate_disambig_sym_file.pl --allow-numeric "false" $extra_word_disambig_syms; then + echo "$0: Validation of disambiguation file \"$extra_word_disambig_syms\" failed." + exit 1; + fi +fi + +if $position_dependent_phones; then + # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or + # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by + # adding the markers _B, _E, _S, _I depending on word position. + # In this recipe, these markers apply to silence also. + # Do this starting from lexiconp.txt only. 
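+ # For example, a lexiconp.txt entry "act 1.0 a c t" becomes
+ # "act 1.0 a_B c_I t_E", and a single-phone entry "a 1.0 a" becomes "a 1.0 a_S".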
+ if "$silprob"; then + perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A; + $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die; + if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } + else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B "; + for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ + < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt + else + perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die; + if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B "; + for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ + < $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1; + fi + + # create $tmpdir/phone_map.txt + # this has the format (on each line) + # ... + # where the versions depend on the position of the phone within a word. + # For instance, we'd have: + # AA AA_B AA_E AA_I AA_S + # for (B)egin, (E)nd, (I)nternal and (S)ingleton + # and in the case of silence + # SIL SIL SIL_B SIL_E SIL_I SIL_S + # [because SIL on its own is one of the variants; this is for when it doesn't + # occur inside a word but as an option in the lexicon.] + + # This phone map expands the phone lists into all the word-position-dependent + # versions of the phone lists. + cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + > $tmpdir/phone_map.txt +else + if "$silprob"; then + cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt + else + cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt + fi + cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \ + awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones + paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt +fi +mkdir -p $dir/phones # various sets of phones... +# Sets of phones for use in clustering, and making monophone systems. +if $share_silence_phones; then + # build a roots file that will force all the silence phones to share the + # same pdf's. [three distinct states, only the transitions will differ.] + # 'shared'/'not-shared' means, do we share the 3 states of the HMM + # in the same tree-root? + # Sharing across models(phones) is achieved by writing several phones + # into one line of roots.txt (shared/not-shared doesn't affect this). + # 'not-shared not-split' means we have separate tree roots for the 3 states, + # but we never split the tree so they remain stumps, + # so all phones in the line correspond to the same model. + cat $srcdir/silence_phones.txt | awk '{printf("%s ", $0); } END{printf("\n");}' | cat - $srcdir/nonsilence_phones.txt | \ + utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt + cat $dir/phones/sets.txt | \ + awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' > $dir/phones/roots.txt +else + # different silence phones will have different GMMs. [note: here, all "shared split" means + # is that we may have one GMM for all the states, or we can split on states. because they're + # context-independent phones, they don't see the context.] 
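+ # e.g. with position-dependent phones, the resulting roots.txt lines look like:
+ #   shared split SIL SIL_B SIL_E SIL_I SIL_S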
+ cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
+ cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt
+fi
+cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
+ awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
+cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
+ awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt
+cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt
+cp $dir/phones/silence.txt $dir/phones/context_indep.txt
+# if extra_questions.txt is empty, it's OK.
+cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_map.txt \
+ >$dir/phones/extra_questions.txt
+# Want extra questions about the word-start/word-end stuff. Make it separate for
+# silence and non-silence. Probably doesn't matter, as silence will rarely
+# be inside a word.
+if $position_dependent_phones; then
+ for suffix in _B _E _I _S; do
+ (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
+ done
+ for suffix in "" _B _E _I _S; do
+ (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
+ done
+fi
+# add_lex_disambig.pl is responsible for adding disambiguation symbols to
+# the lexicon, for telling us how many disambiguation symbols it used,
+# and also for modifying the unknown-word's pronunciation (if the
+# --unk-fst was provided) to the sequence "#1 #2 #3", and reserving those
+# disambig symbols for that purpose.
+# The #2 will later be replaced with the actual unk model. The reason
+# for the #1 and the #3 is for disambiguation and also to keep the
+# FST compact. If we didn't have the #1, we might have a different copy of
+# the unk-model FST, or at least some of its arcs, for each start-state from
+# which an <unk> transition comes (instead of per end-state, which is more compact);
+# and adding the #3 prevents us from potentially having 2 copies of the unk-model
+# FST due to the optional-silence [the last phone of any word gets 2 arcs].
+if [ ! -z "$unk_fst" ]; then # if the --unk-fst option was provided...
+ if "$silprob"; then
+ utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp_silprob.txt "$oov_word" || exit 1
+ else
+ utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp.txt "$oov_word" || exit 1
+ fi
+ unk_opt="--first-allowed-disambig 4"
+else
+ unk_opt=
+fi
+if "$silprob"; then
+ ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs --sil-probs $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt)
+else
+ ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt)
+fi
+ndisambig=$[$ndisambig+$num_extra_phone_disambig_syms]; # add (at least) one disambig symbol for silence in lexicon FST.
+echo $ndisambig > $tmpdir/lex_ndisambig
+# Format of lexiconp_disambig.txt:
+# !SIL 1.0 SIL_S
+# <SPOKEN_NOISE> 1.0 SPN_S #1
+# <UNK> 1.0 SPN_S #2
+# <NOISE> 1.0 NSN_S
+# !EXCLAMATION-POINT 1.0 EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E
+( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt
+# In case there are extra word-level disambiguation symbols they also
+# need to be added to the list of phone-level disambiguation symbols.
+if [ ! -z "$extra_word_disambig_syms" ]; then
+ # We expect a file containing valid word-level disambiguation symbols.
+ cat $extra_word_disambig_syms | awk '{ print $1 }' >> $dir/phones/disambig.txt
+fi
+# Create phone symbol table.
+if [[ ! -z $phone_symbol_table ]]; then
+ start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'`
+ echo "<eps>" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table '
+ BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\
+ cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt
+else
+ echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
+ awk '{n=NR-1; print $1, n;}' > $dir/phones.txt
+fi
+# Create a file that describes the word-boundary information for
+# each phone. 5 categories.
+if $position_dependent_phones; then
+ cat $dir/phones/{silence,nonsilence}.txt | \
+ awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
+ /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
+ {print $1, "nonword";} ' > $dir/phones/word_boundary.txt
+else
+ # word_boundary.txt might have been generated by another source
+ [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary.txt
+fi
+# Create word symbol table.
+# <s> and </s> are only needed due to the need to rescore lattices with
+# ConstArpaLm format language model. They do not normally appear in G.fst or
+# L.fst.
+if "$silprob"; then
+ # remove the silprob
+ cat $tmpdir/lexiconp_silprob.txt |\
+ awk '{
+ for(i=1; i<=NF; i++) {
+ if(i!=3 && i!=4 && i!=5) printf("%s\t", $i); if(i==NF) print "";
+ }
+ }' > $tmpdir/lexiconp.txt
+fi
+cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk '
+ BEGIN {
+ print "<eps> 0";
+ }
+ {
+ if ($1 == "<s>") {
+ print "<s> is in the vocabulary!" | "cat 1>&2"
+ exit 1;
+ }
+ if ($1 == "</s>") {
+ print "</s> is in the vocabulary!" | "cat 1>&2"
+ exit 1;
+ }
+ printf("%s %d\n", $1, NR);
+ }
+ END {
+ printf("#0 %d\n", NR+1);
+ printf("<s> %d\n", NR+2);
+ printf("</s> %d\n", NR+3);
+ }' > $dir/words.txt || exit 1;
+# In case there are extra word-level disambiguation symbols they also
+# need to be added to words.txt
+if [ ! -z "$extra_word_disambig_syms" ]; then
+ # Since words.txt already exists, we need to extract the current word count.
+ word_count=`tail -n 1 $dir/words.txt | awk '{ print $2 }'`
+ # We expect a file containing valid word-level disambiguation symbols.
+ # The list of symbols is attached to the current words.txt (including
+ # a numeric identifier for each symbol).
+ cat $extra_word_disambig_syms | \
+ awk -v WC=$word_count '{ printf("%s %d\n", $1, ++WC); }' >> $dir/words.txt || exit 1;
+fi
+# format of $dir/words.txt:
+# <eps> 0
+#!EXCLAMATION-POINT 1
+#!SIL 2
+#"CLOSE-QUOTE 3
+#...
+silphone=`cat $srcdir/optional_silence.txt` || exit 1;
+[ -z "$silphone" ] && \
+ ( echo "You have no optional-silence phone; it is required in the current scripts"
+ echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
+ exit 1;
+# create $dir/phones/align_lexicon.{txt,int}.
+# This is the method we use for lattice word alignment if we are not
+# using word-position-dependent phones.
+# First remove pron-probs from the lexicon.
+perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt
+# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence,
+# and is not part of a word.
+[ ! -z "$silphone" ] && echo "<eps> $silphone" >> $tmpdir/align_lexicon.txt
+cat $tmpdir/align_lexicon.txt | \
+ perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' | sort | uniq > $dir/phones/align_lexicon.txt
+# create phones/align_lexicon.int
+cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \
+ utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int
+# Create the basic L.fst without disambiguation symbols, for use
+# in training.
+if $silprob; then
+ # Add silence probabilities (models the prob. of silence before and after each
+ # word). On some setups this helps a bit. See utils/dict_dir_add_pronprobs.sh
+ # and where it's called in the example scripts (run.sh).
+ utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob.txt $srcdir/silprob.txt $silphone "" | \
+ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
+ --keep_isymbols=false --keep_osymbols=false | \
+ fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
+else
+ utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp.txt $sil_prob $silphone | \
+ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
+ --keep_isymbols=false --keep_osymbols=false | \
+ fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
+fi
+# The file oov.txt contains a word that we will map any OOVs to during
+# training.
+echo "$oov_word" > $dir/oov.txt || exit 1;
+cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1;
+# integer version of oov symbol, used in some scripts.
+# the file wdisambig.txt contains a (line-by-line) list of the text-form of the
+# disambiguation symbols that are used in the grammar and passed through by the
+# lexicon. At this stage it's hardcoded as '#0', but we're laying the groundwork
+# for more generality (which probably would be added by another script).
+# wdisambig_words.int contains the corresponding list interpreted by the
+# symbol table words.txt, and wdisambig_phones.int contains the corresponding
+# list interpreted by the symbol table phones.txt.
+echo '#0' >$dir/phones/wdisambig.txt
+# In case there are extra word-level disambiguation symbols they need
+# to be added to the existing word-level disambiguation symbols file.
+if [ ! -z "$extra_word_disambig_syms" ]; then
+ # We expect a file containing valid word-level disambiguation symbols.
+ # The regular expression for awk is just a paranoia filter (e.g. for empty lines).
+ cat $extra_word_disambig_syms | awk '{ print $1 }' >> $dir/phones/wdisambig.txt
+fi
+utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int
+utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int
+# Create these lists of phones in colon-separated integer list form too,
+# for purposes of being given to programs as command-line options.
+for f in silence nonsilence optional_silence disambig context_indep; do + utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int + utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \ + awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1; +done +for x in sets extra_questions; do + utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1; +done +utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \ + > $dir/phones/roots.int || exit 1; +if [ -f $dir/phones/word_boundary.txt ]; then + utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \ + > $dir/phones/word_boundary.int || exit 1; +fi +silphonelist=`cat $dir/phones/silence.csl` +nonsilphonelist=`cat $dir/phones/nonsilence.csl` +# Note: it's OK, after generating the 'lang' directory, to overwrite the topo file +# with another one of your choice if the 'topo' file you want can't be generated by +# utils/gen_topo.pl. We do this in the 'chain' recipes. Of course, the 'topo' file +# should cover all the phones. Try running utils/validate_lang.pl to check that +# everything is OK after modifying the topo file. +local/gen_topo.py $num_nonsil_states $num_sil_states 4 $nonsilphonelist $silphonelist $dir/phones.txt >$dir/topo +# Create the lexicon FST with disambiguation symbols, and put it in lang_test. +# There is an extra step where we create a loop to "pass through" the +# disambiguation symbols from G.fst. +if $silprob; then + utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \ + fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ + fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; +else + utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig | \ + fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ + fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; +fi +if [ ! -z "$unk_fst" ]; then + utils/lang/internal/apply_unk_lm.sh $unk_fst $dir || exit 1 + if ! $position_dependent_phones; then + echo "$0: warning: you are using the --unk-lm option and setting --position-dependent-phones false." + echo " ... this will make it impossible to properly work out the word boundaries after" + echo " ... decoding; quite a few scripts will not work as a result, and many scoring scripts" + echo " ... will die." + sleep 4 + fi +fi +echo "$(basename $0): validating output directory" +! utils/validate_lang.pl $dir && echo "$(basename $0): error validating output" && exit 1; +exit 0; diff --git a/egs/iam/v2/local/score.sh b/egs/iam/v2/local/score.sh index b2032909333..1d84815fc69 100755 --- a/egs/iam/v2/local/score.sh +++ b/egs/iam/v2/local/score.sh @@ -1,155 +1,6 @@ -#!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) -# Apache 2.0 - -# This script is like steps/scoring/score_kaldi_wer.sh except it transcribes the 's -# using local/unk_arc_post_to_transcription.py and also it calls -# steps/scoring/score_kaldi_cer.sh at the end. - -[ -f ./path.sh ] && . ./path.sh - -# begin configuration section. 
-cmd=run.pl -stage=0 -decode_mbr=false -stats=true -beam=6 -word_ins_penalty=0.0,0.5,1.0 -min_lmwt=3 -max_lmwt=13 -iter=final -#end configuration section. - -echo "$0 $@" # Print the command line for logging -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --stage (0|1|2) # start scoring script from part-way through." - echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi - -data=$1 -lang_or_graph=$2 -dir=$3 -model_path=`echo $dir |xargs dirname` -symtab=$lang_or_graph/words.txt - -for f in $symtab $dir/lat.1.gz $data/text; do - [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; -done - - -ref_filtering_cmd="cat" -[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" -[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" -hyp_filtering_cmd="cat" -[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" -[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" - - -if $decode_mbr ; then - echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" -else - echo "$0: scoring with word insertion penalty=$word_ins_penalty" -fi - - -mkdir -p $dir/scoring_kaldi -cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; -if [ $stage -le 0 ]; then - - for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - mkdir -p $dir/scoring_kaldi/penalty_$wip/log - - if $decode_mbr ; then - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ - acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ - lattice-prune --beam=$beam ark:- ark:- \| \ - lattice-mbr-decode --word-symbol-table=$symtab \ - ark:- ark,t:- \| \ - utils/int2sym.pl -f 2- $symtab \| \ - $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; - else - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ - lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \ - utils/int2sym.pl -f 2- $symtab \| \ - $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; - fi - - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ - cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ - compute-wer --text --mode=present \ - "ark:cat $dir/scoring_kaldi/test_filt.txt |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; - - done -fi - - - -if [ $stage -le 1 ]; then - - for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - for lmwt in $(seq $min_lmwt $max_lmwt); do - # adding /dev/null to the command list below forces grep to output the filename - grep WER $dir/wer_${lmwt}_${wip} /dev/null - done - done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 - - best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) - best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') - best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') - - if [ -z 
"$best_lmwt" ]; then - echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." - exit 1; - fi - - if $stats; then - mkdir -p $dir/scoring_kaldi/wer_details - echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight - echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty - - $cmd $dir/scoring_kaldi/log/stats1.log \ - cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ - align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ - utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ - utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; - - $cmd $dir/scoring_kaldi/log/stats2.log \ - cat $dir/scoring_kaldi/wer_details/per_utt \| \ - utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ - sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; - - $cmd $dir/scoring_kaldi/log/wer_bootci.log \ - compute-wer-bootci --mode=present \ - ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ - '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; - - fi -fi - -steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt $min_lmwt \ - --max-lmwt $max_lmwt --word-ins-penalty $word_ins_penalty \ - $data $lang_or_graph $dir +#!/bin/bash -# If we got here, the scoring was successful. -# As a small aid to prevent confusion, we remove all wer_{?,??} files; -# these originate from the previous version of the scoring files -# i keep both statement here because it could lead to confusion about -# the capabilities of the script (we don't do cer in the script) -rm $dir/wer_{?,??} 2>/dev/null -rm $dir/cer_{?,??} 2>/dev/null -exit 0; +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh new file mode 100755 index 00000000000..c74397ccc48 --- /dev/null +++ b/egs/iam/v2/run.sh @@ -0,0 +1,174 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian + +set -e +stage=0 +nj=20 +decode_gmm=false +username= +password= +iam_database=/export/corpora5/handwriting_ocr/IAM +wellington_database=/export/corpora5/Wellington/WWC/ + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir "$iam_database" \ + --wellington-dir "$wellington_database" \ + --username "$username" --password "$password" +fi +mkdir -p data/{train,test}/data + +if [ $stage -le 1 ]; then + echo "$(date) stage 1: getting allowed image widths for e2e training..." 
+ image/get_image2num_frames.py --feat-dim 40 data/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in train test; do + echo "$(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; + done + image/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + # getting non-silence phones. + cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/phones.txt + + cut -d' ' -f2- data/train/text > data/local/train_data.txt + cat data/local/phones.txt data/local/train_data.txt | \ + local/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train val train_aug; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh +fi + +if [ $stage -le 4 ]; then + echo "$0: Preparing dictionary and lang..." 
+ local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang + + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 4 ]; then + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train_aug \ + data/lang exp/mono +fi + +if [ $stage -le 5 ] && $decode_gmm; then + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ + exp/mono/decode_test +fi + +if [ $stage -le 6 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train_aug data/lang \ + exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd $cmd 500 20000 data/train_aug data/lang \ + exp/mono_ali exp/tri +fi + +if [ $stage -le 7 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ + exp/tri/decode_test +fi + +if [ $stage -le 8 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train_aug data/lang \ + exp/tri exp/tri_ali + + steps/train_lda_mllt.sh --cmd $cmd \ + --splice-opts "--left-context=3 --right-context=3" 500 20000 \ + data/train data/lang exp/tri_ali exp/tri2 +fi + +if [ $stage -le 9 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \ + data/test exp/tri2/decode_test +fi + +if [ $stage -le 10 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd $cmd 500 20000 \ + data/train_aug data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 11 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph + + steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ + data/test exp/tri3/decode_test +fi + +if [ $stage -le 12 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train_aug data/lang exp/tri3 exp/tri3_ali +fi + +if [ $stage -le 13 ]; then + local/chain/run_cnn_1a.sh --train_set train_aug +fi + +if [ $stage -le 14 ]; then + local/chain/run_cnn_chainali_1c.sh --train_set train_aug \ + --chain-model-dir exp/chain/cnn_1a --stage 2 +fi diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index de5c7086ec2..bd78c011b75 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -35,27 +35,50 @@ fi mkdir -p data/{train,test}/data if [ $stage -le 1 ]; then - image/get_image2num_frames.py data/train # This will be needed for the next command - # The next command creates a "allowed_lengths.txt" file in data/train - # which will be used by local/make_features.py to enforce the images to - # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + echo "$(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - echo "$0: Preparing the test and train feature files..." 
- for dataset in train test; do - local/make_features.py data/$dataset --feat-dim 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp - steps/compute_cmvn_stats.sh data/$dataset + for set in train test; do + echo "$(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; + done + image/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation, it will double training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; done - utils/fix_data_dir.sh data/train fi if [ $stage -le 2 ]; then echo "$0: Preparing BPE..." + # getting non-silence phones. cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/phones.txt + + cut -d' ' -f2- data/train/text > data/local/train_data.txt + cat data/local/phones.txt data/local/train_data.txt | \ local/prepend_words.py | \ utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt - for set in test train val; do + for set in test train val train_aug; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | \ local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ @@ -73,8 +96,6 @@ fi if [ $stage -le 4 ]; then echo "$0: Preparing dictionary and lang..." local/prepare_dict.sh - # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. - # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang @@ -87,7 +108,7 @@ fi if [ $stage -le 5 ]; then echo "$0: Calling the flat-start chain recipe..." - local/chain/run_e2e_cnn.sh + local/chain/run_e2e_cnn.sh --train_set train_aug fi if [ $stage -le 6 ]; then @@ -100,5 +121,5 @@ fi if [ $stage -le 7 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - local/chain/run_cnn_e2eali.sh + local/chain/run_cnn_e2eali.sh --train_set train_aug fi diff --git a/egs/wsj/s5/utils/copy_data_dir.sh b/egs/wsj/s5/utils/copy_data_dir.sh index f3b885c5e79..996cf76ec33 100755 --- a/egs/wsj/s5/utils/copy_data_dir.sh +++ b/egs/wsj/s5/utils/copy_data_dir.sh @@ -86,10 +86,16 @@ fi if [ -f $srcdir/segments ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir + if [ -f $srcdir/wav.scp ]; then + cp $srcdir/wav.scp $destdir + elif [ -f $srcdir/images.scp ]; then + cp $srcdir/images.scp $destdir + fi else # no segments->wav indexed by utt. 
if [ -f $srcdir/wav.scp ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp + elif [ -f $srcdir/images.scp ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/images.scp >$destdir/images.scp fi fi @@ -126,7 +132,7 @@ rm $destdir/spk_map $destdir/utt_map echo "$0: copied data from $srcdir to $destdir" -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel stm glm ctm; do +for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp images.scp reco2file_and_channel stm glm ctm; do if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" echo " ... $destdir/.backup/$f" From 5f273d6c5548f22f4f1f857d13df6b29986d9a1e Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 31 Aug 2018 00:50:46 -0400 Subject: [PATCH 03/37] fixing bugs --- egs/iam/v2/local/chain/run_cnn_1a.sh | 2 +- egs/iam/v2/local/chain/run_cnn_chainali_1c.sh | 2 +- egs/iam/v2/run.sh | 32 +++++++++---------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/egs/iam/v2/local/chain/run_cnn_1a.sh b/egs/iam/v2/local/chain/run_cnn_1a.sh index 41a76920e37..5e7d5bac77b 100755 --- a/egs/iam/v2/local/chain/run_cnn_1a.sh +++ b/egs/iam/v2/local/chain/run_cnn_1a.sh @@ -46,7 +46,7 @@ tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang_unk +lang_test=lang # End configuration section. echo "$0 $@" # Print the command line for logging diff --git a/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh index 54c52d913de..a17d7307fb4 100755 --- a/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh +++ b/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh @@ -43,7 +43,7 @@ tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang_unk +lang_test=lang # End configuration section. 
echo "$0 $@" # Print the command line for logging diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh index c74397ccc48..33a78bf41d0 100755 --- a/egs/iam/v2/run.sh +++ b/egs/iam/v2/run.sh @@ -6,7 +6,7 @@ set -e stage=0 -nj=20 +nj=70 decode_gmm=false username= password= @@ -101,19 +101,19 @@ if [ $stage -le 4 ]; then data/lang data/lang_rescore_6g fi -if [ $stage -le 4 ]; then +if [ $stage -le 5 ]; then steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train_aug \ data/lang exp/mono fi -if [ $stage -le 5 ] && $decode_gmm; then +if [ $stage -le 6 ] && $decode_gmm; then utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ exp/mono/decode_test fi -if [ $stage -le 6 ]; then +if [ $stage -le 7 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train_aug data/lang \ exp/mono exp/mono_ali @@ -121,30 +121,30 @@ if [ $stage -le 6 ]; then exp/mono_ali exp/tri fi -if [ $stage -le 7 ] && $decode_gmm; then - utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph +if [ $stage -le 8 ] && $decode_gmm; then + utils/mkgraph.sh data/lang exp/tri exp/tri/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ exp/tri/decode_test fi -if [ $stage -le 8 ]; then +if [ $stage -le 9 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train_aug data/lang \ exp/tri exp/tri_ali steps/train_lda_mllt.sh --cmd $cmd \ --splice-opts "--left-context=3 --right-context=3" 500 20000 \ - data/train data/lang exp/tri_ali exp/tri2 + data/train_aug data/lang exp/tri_ali exp/tri2 fi -if [ $stage -le 9 ] && $decode_gmm; then - utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph +if [ $stage -le 10 ] && $decode_gmm; then + utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \ data/test exp/tri2/decode_test fi -if [ $stage -le 10 ]; then +if [ $stage -le 11 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ data/train data/lang exp/tri2 exp/tri2_ali @@ -152,23 +152,23 @@ if [ $stage -le 10 ]; then data/train_aug data/lang exp/tri2_ali exp/tri3 fi -if [ $stage -le 11 ] && $decode_gmm; then - utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph +if [ $stage -le 12 ] && $decode_gmm; then + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ data/test exp/tri3/decode_test fi -if [ $stage -le 12 ]; then +if [ $stage -le 13 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ data/train_aug data/lang exp/tri3 exp/tri3_ali fi -if [ $stage -le 13 ]; then +if [ $stage -le 14 ]; then local/chain/run_cnn_1a.sh --train_set train_aug fi -if [ $stage -le 14 ]; then +if [ $stage -le 15 ]; then local/chain/run_cnn_chainali_1c.sh --train_set train_aug \ --chain-model-dir exp/chain/cnn_1a --stage 2 fi From 2645f146491a27ebbf2e127246f5d8ae4a0efc22 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 31 Aug 2018 02:19:00 -0400 Subject: [PATCH 04/37] fixing bug --- egs/iam/v2/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh index 33a78bf41d0..0881df1ff05 100755 --- a/egs/iam/v2/run.sh +++ b/egs/iam/v2/run.sh @@ -146,7 +146,7 @@ fi if [ $stage -le 11 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train data/lang exp/tri2 exp/tri2_ali + data/train_aug data/lang exp/tri2 exp/tri2_ali steps/train_sat.sh --cmd $cmd 500 20000 \ data/train_aug data/lang exp/tri2_ali exp/tri3 From 6ebfdb2213bf9ea76b7884dd31bc51f981f0ea56 Mon Sep 17 
00:00:00 2001 From: aarora8 Date: Fri, 31 Aug 2018 20:37:08 -0400 Subject: [PATCH 05/37] adding parameter tuning --- egs/iam/v2/local/chain/run_cnn_chainali_1c.sh | 7 +- egs/iam/v2/local/chain/run_cnn_chainali_1d.sh | 251 ++++++++++++++++++ egs/iam/v2/run.sh | 2 +- 3 files changed, 258 insertions(+), 2 deletions(-) create mode 100755 egs/iam/v2/local/chain/run_cnn_chainali_1d.sh diff --git a/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh index a17d7307fb4..53039377baf 100755 --- a/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh +++ b/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh @@ -44,6 +44,8 @@ tdnn_dim=450 srand=0 remove_egs=false lang_test=lang +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -230,7 +232,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi @@ -244,4 +246,7 @@ if [ $stage -le 7 ]; then --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi diff --git a/egs/iam/v2/local/chain/run_cnn_chainali_1d.sh b/egs/iam/v2/local/chain/run_cnn_chainali_1d.sh new file mode 100755 index 00000000000..3123ee897d5 --- /dev/null +++ b/egs/iam/v2/local/chain/run_cnn_chainali_1d.sh @@ -0,0 +1,251 @@ +#!/bin/bash + +# chainali_1c is as chainali_1b except it uses l2-regularize +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c +# System cnn_chainali_1b cnn_chainali_1c +# WER 14.38 12.72 +# CER 7.14 5.99 +# Final train prob -0.0113 -0.0291 +# Final valid prob -0.0400 -0.0359 +# Final train prob (xent) -0.6043 -0.9781 +# Final valid prob (xent) -0.9030 -1.1544 +# Parameters 3.96M 3.96M + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c +# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=false +lang_test=lang +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh index 0881df1ff05..209fcb5de2f 100755 --- a/egs/iam/v2/run.sh +++ b/egs/iam/v2/run.sh @@ -169,6 +169,6 @@ if [ $stage -le 14 ]; then fi if [ $stage -le 15 ]; then - local/chain/run_cnn_chainali_1c.sh --train_set train_aug \ + local/chain/run_cnn_chainali_1d.sh --train_set train_aug \ --chain-model-dir exp/chain/cnn_1a --stage 2 fi From b5329781b0e049c653d92eed06a911e450ef79a1 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sat, 1 Sep 2018 04:01:54 -0400 Subject: [PATCH 06/37] cosmetic fixes and updating results --- egs/iam/v2/local/chain/run_cnn.sh | 1 + egs/iam/v2/local/chain/run_cnn_chainali.sh | 1 + .../v2/local/chain/{ => tuning}/run_cnn_1a.sh | 0 .../run_cnn_chainali_1a.sh} | 24 +++++++------- .../run_cnn_chainali_1b.sh} | 32 +++++++++---------- egs/iam/v2/run.sh | 6 ++-- 6 files changed, 32 insertions(+), 32 deletions(-) create mode 120000 egs/iam/v2/local/chain/run_cnn.sh create mode 120000 egs/iam/v2/local/chain/run_cnn_chainali.sh rename egs/iam/v2/local/chain/{ => tuning}/run_cnn_1a.sh (100%) rename egs/iam/v2/local/chain/{run_cnn_chainali_1c.sh => tuning/run_cnn_chainali_1a.sh} (91%) rename egs/iam/v2/local/chain/{run_cnn_chainali_1d.sh => tuning/run_cnn_chainali_1b.sh} (90%) diff --git a/egs/iam/v2/local/chain/run_cnn.sh b/egs/iam/v2/local/chain/run_cnn.sh new file mode 120000 index 00000000000..df6f0a468c1 --- /dev/null +++ b/egs/iam/v2/local/chain/run_cnn.sh @@ -0,0 +1 @@ +tuning/run_cnn_1a.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/run_cnn_chainali.sh b/egs/iam/v2/local/chain/run_cnn_chainali.sh new file mode 120000 index 00000000000..86568421fe1 --- /dev/null +++ b/egs/iam/v2/local/chain/run_cnn_chainali.sh @@ -0,0 +1 @@ +tuning/run_cnn_chainali_1b.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/run_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh similarity index 100% rename from egs/iam/v2/local/chain/run_cnn_1a.sh rename to egs/iam/v2/local/chain/tuning/run_cnn_1a.sh diff --git a/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1a.sh similarity index 91% rename from egs/iam/v2/local/chain/run_cnn_chainali_1c.sh rename to egs/iam/v2/local/chain/tuning/run_cnn_chainali_1a.sh index 53039377baf..aa4d65c0fde 100755 --- a/egs/iam/v2/local/chain/run_cnn_chainali_1c.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1a.sh @@ -1,18 +1,16 @@ #!/bin/bash -# chainali_1c is as chainali_1b except it uses l2-regularize -# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c -# System cnn_chainali_1b cnn_chainali_1c -# WER 14.38 12.72 -# CER 7.14 5.99 -# Final train prob -0.0113 -0.0291 -# Final valid prob -0.0400 -0.0359 -# Final train prob (xent) -0.6043 -0.9781 -# Final valid prob (xent) -0.9030 -1.1544 -# Parameters 3.96M 3.96M - -# steps/info/chain_dir_info.pl 
exp/chain/cnn_chainali_1c -# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) +# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ +# System cnn_chainali_1a +# WER 10.48 +# WER (rescored) 10.23 +# CER 4.82 +# CER (rescored) 4.69 +# Final train prob -0.0444 +# Final valid prob -0.0645 +# Final train prob (xent) -0.4523 +# Final valid prob (xent) -0.5350 +# Parameters 5.65M set -e -o pipefail diff --git a/egs/iam/v2/local/chain/run_cnn_chainali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh similarity index 90% rename from egs/iam/v2/local/chain/run_cnn_chainali_1d.sh rename to egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh index 3123ee897d5..c648f189dca 100755 --- a/egs/iam/v2/local/chain/run_cnn_chainali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh @@ -1,19 +1,19 @@ #!/bin/bash -# chainali_1c is as chainali_1b except it uses l2-regularize -# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c -# System cnn_chainali_1b cnn_chainali_1c -# WER 14.38 12.72 -# CER 7.14 5.99 -# Final train prob -0.0113 -0.0291 -# Final valid prob -0.0400 -0.0359 -# Final train prob (xent) -0.6043 -0.9781 -# Final valid prob (xent) -0.9030 -1.1544 -# Parameters 3.96M 3.96M - -# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c -# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) - +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b +# System cnn_chainali_1b +# WER 9.49 +# WER (rescored) 9.27 +# CER 4.39 +# CER (rescored) 4.32 +# Final train prob -0.0466 +# Final valid prob -0.0692 +# Final train prob (xent) -0.4811 +# Final valid prob (xent) -0.5538 +# Parameters 5.65M + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1b +# exp/chain/cnn_chainali_1d: num-iters=40 nj=2..4 num-params=5.7M dim=40->400 combine=-0.051->-0.051 (over 1) xent:train/valid[25,39,final]=(-0.818,-0.500,-0.481/-0.828,-0.570,-0.554) logprob:train/valid[25,39,final]=(-0.097,-0.050,-0.047/-0.114,-0.073,-0.069) set -e -o pipefail stage=0 @@ -23,7 +23,7 @@ train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
ali=tri3_ali chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= @@ -196,7 +196,7 @@ if [ $stage -le 5 ]; then --chain.right-tolerance 3 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=6 \ + --trainer.num-epochs=5 \ --trainer.frames-per-iter=1500000 \ --trainer.optimization.num-jobs-initial=2 \ --trainer.optimization.num-jobs-final=4 \ diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh index 209fcb5de2f..319741d814d 100755 --- a/egs/iam/v2/run.sh +++ b/egs/iam/v2/run.sh @@ -165,10 +165,10 @@ if [ $stage -le 13 ]; then fi if [ $stage -le 14 ]; then - local/chain/run_cnn_1a.sh --train_set train_aug + local/chain/run_cnn.sh --train_set train_aug fi if [ $stage -le 15 ]; then - local/chain/run_cnn_chainali_1d.sh --train_set train_aug \ - --chain-model-dir exp/chain/cnn_1a --stage 2 + local/chain/run_cnn_chainali.sh --train_set train_aug \ + --chain-model-dir exp/chain/cnn_1a --stage 4 fi From f383334f221f69c03a607579c6fb0564de116032 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sat, 1 Sep 2018 04:27:30 -0400 Subject: [PATCH 07/37] cosmetic fixes --- egs/iam/v2/local/augment_data.sh | 17 ++++++----- egs/iam/v2/local/chain/run_e2e_cnn.sh | 28 ++++++++--------- egs/iam/v2/local/chain/tuning/run_cnn_1a.sh | 20 +++++++------ .../local/chain/tuning/run_cnn_chainali_1a.sh | 2 +- .../local/chain/tuning/run_cnn_chainali_1b.sh | 2 +- egs/iam/v2/local/extract_features.sh | 2 +- egs/iam/v2/local/process_corpus.py | 30 ------------------- egs/wsj/s5/utils/copy_data_dir.sh | 10 ++----- 8 files changed, 39 insertions(+), 72 deletions(-) delete mode 100755 egs/iam/v2/local/process_corpus.py diff --git a/egs/iam/v2/local/augment_data.sh b/egs/iam/v2/local/augment_data.sh index 443a16874f2..82fa5230a43 100755 --- a/egs/iam/v2/local/augment_data.sh +++ b/egs/iam/v2/local/augment_data.sh @@ -19,14 +19,15 @@ outdir=$2 datadir=$3 mkdir -p $datadir/augmentations -echo "copying $srcdir to $datadir/augmentations/aug1" -utils/copy_data_dir.sh --spk-prefix aug1- --utt-prefix aug1- $srcdir $datadir/augmentations/aug1 - -echo " copying allowed length for training with augmented data..." -cat $srcdir/allowed_lengths.txt > $datadir/augmentations/aug1/allowed_lengths.txt - -echo " Extracting features, creating feats.scp file for augmentated data" -local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim --fliplr false --augment true $datadir/augmentations/aug1 +echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp" + +for set in aug1; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr false --augment true $datadir/augmentations/$set +done echo " combine original data and data from different augmentations" utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1 diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh index 15bdf610cd3..bd9f788d702 100755 --- a/egs/iam/v2/local/chain/run_e2e_cnn.sh +++ b/egs/iam/v2/local/chain/run_e2e_cnn.sh @@ -2,20 +2,20 @@ # Copyright 2017 Hossein Hadian # This script does end2end chain training (i.e. 
from scratch) - -# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a -# System cnn_1a cnn_chainali_1c e2e_cnn_1a -# WER 18.52 12.72 12.15 -# CER 10.07 5.99 6.03 -# Final train prob -0.0077 -0.0291 -0.0371 -# Final valid prob -0.0970 -0.0359 -0.0636 -# Final train prob (xent) -0.5484 -0.9781 -# Final valid prob (xent) -0.9643 -1.1544 -# Parameters 4.36M 3.96M 9.13M - -# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a -# exp/chain/e2e_cnn_1a: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.033->-0.033 (over 1) logprob:train/valid[13,20,final]=(-0.058,-0.042,-0.035/-0.070,-0.064,-0.059) - +# ./local/chain/compare_wer.sh exp/chain/cnn_e2eali_1d/ +# System cnn_e2eali_1d +# WER 9.92 +# WER (rescored) 9.50 +# CER 4.53 +# CER (rescored) 4.46 +# Final train prob -0.0472 +# Final valid prob -0.0713 +# Final train prob (xent) -0.4751 +# Final valid prob (xent) -0.5506 +# Parameters 5.64M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1d +# exp/chain/cnn_e2eali_1d/: num-iters=40 nj=2..4 num-params=5.6M dim=40->392 combine=-0.051->-0.051 (over 1) xent:train/valid[25,39,final]=(-0.764,-0.493,-0.475/-0.770,-0.566,-0.551) logprob:train/valid[25,39,final]=(-0.094,-0.051,-0.047/-0.111,-0.075,-0.071) set -e # configs for 'chain' diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh index 5e7d5bac77b..e39b14ac8dc 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh @@ -5,17 +5,19 @@ # 2017 Ashish Arora # steps/info/chain_dir_info.pl exp/chain/cnn_1a/ -# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) +# exp/chain/cnn_1a/: num-iters=42 nj=2..4 num-params=4.4M dim=40->400 combine=-0.039->-0.039 (over 2) xent:train/valid[27,41,final]=(-0.502,-0.380,-0.376/-0.679,-0.626,-0.625) logprob:train/valid[27,41,final]=(-0.038,-0.032,-0.032/-0.063,-0.064,-0.064) -# local/chain/compare_wer.sh exp/chain/cnn_1a/ +# ./local/chain/compare_wer.sh exp/chain/cnn_1a/ # System cnn_1a -# WER 18.52 -# CER 10.07 -# Final train prob -0.0077 -# Final valid prob -0.0970 -# Final train prob (xent) -0.5484 -# Final valid prob (xent) -0.9643 -# Parameters 4.36M +# WER 14.91 +# WER (rescored) -- +# CER 7.92 +# CER (rescored) -- +# Final train prob -0.0320 +# Final valid prob -0.0643 +# Final train prob (xent) -0.3762 +# Final valid prob (xent) -0.6247 +# Parameters 4.39M set -e -o pipefail diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1a.sh index aa4d65c0fde..07bdac88468 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1a.sh @@ -21,7 +21,7 @@ train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
ali=tri3_ali chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh index c648f189dca..36a30b2df29 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh @@ -23,7 +23,7 @@ train_set=train gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it # should have alignments for the specified training data. nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. ali=tri3_ali chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= diff --git a/egs/iam/v2/local/extract_features.sh b/egs/iam/v2/local/extract_features.sh index 1741ad3f9b2..4ed6ba04348 100755 --- a/egs/iam/v2/local/extract_features.sh +++ b/egs/iam/v2/local/extract_features.sh @@ -36,7 +36,7 @@ done utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - local/make_features.py $logdir/images.JOB.scp \ + image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ copy-feats --compress=true --compression-method=7 \ diff --git a/egs/iam/v2/local/process_corpus.py b/egs/iam/v2/local/process_corpus.py deleted file mode 100755 index 9f8e1d275d3..00000000000 --- a/egs/iam/v2/local/process_corpus.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2018 Ashish Arora -# Apache 2.0 -# This script reads valid phones and removes the lines in the corpus -# which have any other phone. - -import os -import sys, io - -phone_file = os.path.join('data/local/phones.txt') -infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') -output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') -phone_dict = dict() -with open(phone_file, 'r', encoding='utf-8') as phone_fh: - for line in phone_fh: - line = line.strip().split()[0] - phone_dict[line] = line - -phone_dict[' '] = ' ' -corpus_text = list() -for line in infile: - text = line.strip() - skip_text = False - for phone in text: - if phone not in phone_dict.keys(): - skip_text = True - break - if not skip_text: - output.write(text+ '\n') - diff --git a/egs/wsj/s5/utils/copy_data_dir.sh b/egs/wsj/s5/utils/copy_data_dir.sh index 996cf76ec33..f3b885c5e79 100755 --- a/egs/wsj/s5/utils/copy_data_dir.sh +++ b/egs/wsj/s5/utils/copy_data_dir.sh @@ -86,16 +86,10 @@ fi if [ -f $srcdir/segments ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - if [ -f $srcdir/wav.scp ]; then - cp $srcdir/wav.scp $destdir - elif [ -f $srcdir/images.scp ]; then - cp $srcdir/images.scp $destdir - fi + cp $srcdir/wav.scp $destdir else # no segments->wav indexed by utt. 
if [ -f $srcdir/wav.scp ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - elif [ -f $srcdir/images.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/images.scp >$destdir/images.scp fi fi @@ -132,7 +126,7 @@ rm $destdir/spk_map $destdir/utt_map echo "$0: copied data from $srcdir to $destdir" -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp images.scp reco2file_and_channel stm glm ctm; do +for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel stm glm ctm; do if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" echo " ... $destdir/.backup/$f" From 44c9e5866f9fb554fef31558a4eb1d28a1577be2 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sat, 1 Sep 2018 04:36:27 -0400 Subject: [PATCH 08/37] adding results --- egs/iam/v2/local/chain/run_e2e_cnn.sh | 28 +++++++-------- .../local/chain/tuning/run_cnn_e2eali_1d.sh | 34 ++++++++----------- egs/iam/v2/local/extract_features.sh | 2 +- 3 files changed, 30 insertions(+), 34 deletions(-) diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh index bd9f788d702..c1e9780876c 100755 --- a/egs/iam/v2/local/chain/run_e2e_cnn.sh +++ b/egs/iam/v2/local/chain/run_e2e_cnn.sh @@ -2,20 +2,20 @@ # Copyright 2017 Hossein Hadian # This script does end2end chain training (i.e. from scratch) -# ./local/chain/compare_wer.sh exp/chain/cnn_e2eali_1d/ -# System cnn_e2eali_1d -# WER 9.92 -# WER (rescored) 9.50 -# CER 4.53 -# CER (rescored) 4.46 -# Final train prob -0.0472 -# Final valid prob -0.0713 -# Final train prob (xent) -0.4751 -# Final valid prob (xent) -0.5506 -# Parameters 5.64M - -# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1d -# exp/chain/cnn_e2eali_1d/: num-iters=40 nj=2..4 num-params=5.6M dim=40->392 combine=-0.051->-0.051 (over 1) xent:train/valid[25,39,final]=(-0.764,-0.493,-0.475/-0.770,-0.566,-0.551) logprob:train/valid[25,39,final]=(-0.094,-0.051,-0.047/-0.111,-0.075,-0.071) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# WER 11.24 +# WER (rescored) 10.80 +# CER 5.32 +# CER (rescored) 5.24 +# Final train prob 0.0568 +# Final valid prob 0.0381 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.13M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.1M dim=40->12640 combine=0.049->0.049 (over 1) logprob:train/valid[27,41,final]=(0.035,0.055,0.057/0.016,0.037,0.038) set -e # configs for 'chain' diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index 6ab74dc2f0d..9cf5fbadcc8 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -1,24 +1,20 @@ #!/bin/bash -# e2eali_1c is the same as e2eali_1b but has fewer CNN layers, smaller -# l2-regularize, more epochs and uses dropout. 
- - -# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b exp/chain/cnn_e2eali_1c -# System cnn_e2eali_1b cnn_e2eali_1c -# WER 10.33 10.05 -# WER (rescored) 10.10 9.75 -# CER 5.00 4.76 -# CER (rescored) 4.88 4.68 -# Final train prob -0.0428 -0.0317 -# Final valid prob -0.0666 -0.0630 -# Final train prob (xent) -0.9210 -0.5413 -# Final valid prob (xent) -1.0264 -0.7096 -# Parameters 3.98M 5.12M - -# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c -# exp/chain/cnn_e2eali_1c: num-iters=21 nj=2..4 num-params=5.1M dim=40->392 combine=-0.034->-0.034 (over 1) xent:train/valid[13,20,final]=(-0.953,-0.800,-0.541/-1.03,-0.933,-0.710) logprob:train/valid[13,20,final]=(-0.069,-0.048,-0.032/-0.091,-0.078,-0.063) - +# This script does end2end chain training (i.e. from scratch) +# ./local/chain/compare_wer.sh exp/chain/cnn_e2eali_1d/ +# System cnn_e2eali_1d +# WER 9.92 +# WER (rescored) 9.50 +# CER 4.53 +# CER (rescored) 4.46 +# Final train prob -0.0472 +# Final valid prob -0.0713 +# Final train prob (xent) -0.4751 +# Final valid prob (xent) -0.5506 +# Parameters 5.64M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1d +# exp/chain/cnn_e2eali_1d/: num-iters=40 nj=2..4 num-params=5.6M dim=40->392 combine=-0.051->-0.051 (over 1) xent:train/valid[25,39,final]=(-0.764,-0.493,-0.475/-0.770,-0.566,-0.551) logprob:train/valid[25,39,final]=(-0.094,-0.051,-0.047/-0.111,-0.075,-0.071) set -e -o pipefail stage=0 diff --git a/egs/iam/v2/local/extract_features.sh b/egs/iam/v2/local/extract_features.sh index 4ed6ba04348..1741ad3f9b2 100755 --- a/egs/iam/v2/local/extract_features.sh +++ b/egs/iam/v2/local/extract_features.sh @@ -36,7 +36,7 @@ done utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - image/ocr/make_features.py $logdir/images.JOB.scp \ + local/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ copy-feats --compress=true --compression-method=7 \ From 2d11672f50aa24d772159712351a2f0248c27861 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sat, 1 Sep 2018 07:21:54 -0400 Subject: [PATCH 09/37] removing local/prepare_lang and adding gen_topo in run.sh --- egs/iam/v2/local/prepare_lang.sh | 474 ------------------------------- egs/iam/v2/run.sh | 3 + 2 files changed, 3 insertions(+), 474 deletions(-) delete mode 100755 egs/iam/v2/local/prepare_lang.sh diff --git a/egs/iam/v2/local/prepare_lang.sh b/egs/iam/v2/local/prepare_lang.sh deleted file mode 100755 index cc6bc03a432..00000000000 --- a/egs/iam/v2/local/prepare_lang.sh +++ /dev/null @@ -1,474 +0,0 @@ -#!/bin/bash -# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal -# 2014 Guoguo Chen -# 2015 Hainan Xu -# 2016 FAU Erlangen (Author: Axel Horndasch) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
- -# This script prepares a directory such as data/lang/, in the standard format, -# given a source directory containing a dictionary lexicon.txt in a form like: -# word phone1 phone2 ... phoneN -# per line (alternate prons would be separate lines), or a dictionary with probabilities -# called lexiconp.txt in a form: -# word pron-prob phone1 phone2 ... phoneN -# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if -# lexicon.txt exists. -# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt -# and extra_questions.txt -# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and -# non-silence phones respectively (where silence includes various kinds of -# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the -# "real" phones.) -# In each line of those files is a list of phones, and the phones on each line -# are assumed to correspond to the same "base phone", i.e. they will be -# different stress or tone variations of the same basic phone. -# The file "optional_silence.txt" contains just a single phone (typically SIL) -# which is used for optional silence in the lexicon. -# extra_questions.txt might be empty; typically will consist of lists of phones, -# all members of each list with the same stress or tone; and also possibly a -# list for the silence phones. This will augment the automatically generated -# questions (note: the automatically generated ones will treat all the -# stress/tone versions of a phone the same, so will not "get to ask" about -# stress or tone). -# - -# This script adds word-position-dependent phones and constructs a host of other -# derived files, that go in data/lang/. - -# Begin configuration section. -num_sil_states=5 -num_nonsil_states=3 -position_dependent_phones=true -# position_dependent_phones is false also when position dependent phones and word_boundary.txt -# have been generated by another source -share_silence_phones=false # if true, then share pdfs of different silence - # phones together. -sil_prob=0.5 -unk_fst= # if you want to model the unknown-word () - # with a phone-level LM as created by make_unk_lm.sh, - # provide the text-form FST via this flag, e.g. /unk_fst.txt - # where was the 2nd argument of make_unk_lm.sh. -phone_symbol_table= # if set, use a specified phones.txt file. -extra_word_disambig_syms= # if set, add disambiguation symbols from this file (one per line) - # to phones/disambig.txt, phones/wdisambig.txt and words.txt -num_extra_phone_disambig_syms=1 # Standard one phone disambiguation symbol is used for optional silence. - # Increasing this number does not harm, but is only useful if you later - # want to introduce this labels to L_disambig.fst -# end configuration sections - -echo "$0 $@" # Print the command line for logging - -. utils/parse_options.sh - -if [ $# -ne 4 ]; then - echo "usage: utils/prepare_lang.sh " - echo "e.g.: utils/prepare_lang.sh data/local/dict data/local/lang data/lang" - echo " should contain the following files:" - echo " extra_questions.txt lexicon.txt nonsilence_phones.txt optional_silence.txt silence_phones.txt" - echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info." - echo "options: " - echo " --num-sil-states # default: 5, #states in silence models." - echo " --num-nonsil-states # default: 3, #states in non-silence models." 
- echo " --position-dependent-phones (true|false) # default: true; if true, use _B, _E, _S & _I" - echo " # markers on phones to indicate word-internal positions. " - echo " --share-silence-phones (true|false) # default: false; if true, share pdfs of " - echo " # all non-silence phones. " - echo " --sil-prob # default: 0.5 [must have 0 <= silprob < 1]" - echo " --phone-symbol-table # default: \"\"; if not empty, use the provided " - echo " # phones.txt as phone symbol table. This is useful " - echo " # if you use a new dictionary for the existing setup." - echo " --unk-fst # default: none. e.g. exp/make_unk_lm/unk_fst.txt." - echo " # This is for if you want to model the unknown word" - echo " # via a phone-level LM rather than a special phone" - echo " # (this should be more useful for test-time than train-time)." - echo " --extra-word-disambig-syms # default: \"\"; if not empty, add disambiguation symbols" - echo " # from this file (one per line) to phones/disambig.txt," - echo " # phones/wdisambig.txt and words.txt" - exit 1; -fi - -srcdir=$1 -oov_word=$2 -tmpdir=$3 -dir=$4 -mkdir -p $dir $tmpdir $dir/phones - -silprob=false -[ -f $srcdir/lexiconp_silprob.txt ] && silprob=true - -[ -f path.sh ] && . ./path.sh - -! utils/validate_dict_dir.pl $srcdir && \ - echo "*Error validating directory $srcdir*" && exit 1; - -if [[ ! -f $srcdir/lexicon.txt ]]; then - echo "**Creating $dir/lexicon.txt from $dir/lexiconp.txt" - perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1; -fi -if [[ ! -f $srcdir/lexiconp.txt ]]; then - echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt" - perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1; -fi - -if [ ! -z "$unk_fst" ] && [ ! -f "$unk_fst" ]; then - echo "$0: expected --unk-fst $unk_fst to exist as a file" - exit 1 -fi - -if ! utils/validate_dict_dir.pl $srcdir >&/dev/null; then - utils/validate_dict_dir.pl $srcdir # show the output. - echo "Validation failed (second time)" - exit 1; -fi - -# phones.txt file provided, we will do some sanity check here. -if [[ ! -z $phone_symbol_table ]]; then - # Checks if we have position dependent phones - n1=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sort -u | wc -l` - n2=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l` - $position_dependent_phones && [ $n1 -eq $n2 ] &&\ - echo "$0: Position dependent phones requested, but not in provided phone symbols" && exit 1; - ! $position_dependent_phones && [ $n1 -ne $n2 ] &&\ - echo "$0: Position dependent phones not requested, but appear in the provided phones.txt" && exit 1; - - # Checks if the phone sets match. - cat $srcdir/{,non}silence_phones.txt | awk -v f=$phone_symbol_table ' - BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }} - { for (x = 1; x <= NF; ++x) { if (!($x in phones)) { - print "Phone appears in the lexicon but not in the provided phones.txt: "$x; exit 1; }}}' || exit 1; -fi - -# In case there are extra word-level disambiguation symbols we need -# to make sure that all symbols in the provided file are valid. -if [ ! -z "$extra_word_disambig_syms" ]; then - if ! utils/lang/validate_disambig_sym_file.pl --allow-numeric "false" $extra_word_disambig_syms; then - echo "$0: Validation of disambiguation file \"$extra_word_disambig_syms\" failed." 
- exit 1; - fi -fi - -if $position_dependent_phones; then - # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or - # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by - # adding the markers _B, _E, _S, _I depending on word position. - # In this recipe, these markers apply to silence also. - # Do this starting from lexiconp.txt only. - if "$silprob"; then - perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A; - $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die; - if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } - else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B "; - for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ - < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt - else - perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die; - if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B "; - for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ - < $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1; - fi - - # create $tmpdir/phone_map.txt - # this has the format (on each line) - # ... - # where the versions depend on the position of the phone within a word. - # For instance, we'd have: - # AA AA_B AA_E AA_I AA_S - # for (B)egin, (E)nd, (I)nternal and (S)ingleton - # and in the case of silence - # SIL SIL SIL_B SIL_E SIL_I SIL_S - # [because SIL on its own is one of the variants; this is for when it doesn't - # occur inside a word but as an option in the lexicon.] - - # This phone map expands the phone lists into all the word-position-dependent - # versions of the phone lists. - cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ - <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ - > $tmpdir/phone_map.txt -else - if "$silprob"; then - cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt - else - cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt - fi - cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \ - awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones - paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt -fi -mkdir -p $dir/phones # various sets of phones... -# Sets of phones for use in clustering, and making monophone systems. -if $share_silence_phones; then - # build a roots file that will force all the silence phones to share the - # same pdf's. [three distinct states, only the transitions will differ.] - # 'shared'/'not-shared' means, do we share the 3 states of the HMM - # in the same tree-root? - # Sharing across models(phones) is achieved by writing several phones - # into one line of roots.txt (shared/not-shared doesn't affect this). - # 'not-shared not-split' means we have separate tree roots for the 3 states, - # but we never split the tree so they remain stumps, - # so all phones in the line correspond to the same model. - cat $srcdir/silence_phones.txt | awk '{printf("%s ", $0); } END{printf("\n");}' | cat - $srcdir/nonsilence_phones.txt | \ - utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt - cat $dir/phones/sets.txt | \ - awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' > $dir/phones/roots.txt -else - # different silence phones will have different GMMs. 
[note: here, all "shared split" means - # is that we may have one GMM for all the states, or we can split on states. because they're - # context-independent phones, they don't see the context.] - cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt - cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt -fi -cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \ - awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt -cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \ - awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt -cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt -cp $dir/phones/silence.txt $dir/phones/context_indep.txt -# if extra_questions.txt is empty, it's OK. -cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_map.txt \ - >$dir/phones/extra_questions.txt -# Want extra questions about the word-start/word-end stuff. Make it separate for -# silence and non-silence. Probably doesn't matter, as silence will rarely -# be inside a word. -if $position_dependent_phones; then - for suffix in _B _E _I _S; do - (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt - done - for suffix in "" _B _E _I _S; do - (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt - done -fi -# add_lex_disambig.pl is responsible for adding disambiguation symbols to -# the lexicon, for telling us how many disambiguation symbols it used, -# and and also for modifying the unknown-word's pronunciation (if the -# --unk-fst was provided) to the sequence "#1 #2 #3", and reserving those -# disambig symbols for that purpose. -# The #2 will later be replaced with the actual unk model. The reason -# for the #1 and the #3 is for disambiguation and also to keep the -# FST compact. If we didn't have the #1, we might have a different copy of -# the unk-model FST, or at least some of its arcs, for each start-state from -# which an transition comes (instead of per end-state, which is more compact); -# and adding the #3 prevents us from potentially having 2 copies of the unk-model -# FST due to the optional-silence [the last phone of any word gets 2 arcs]. -if [ ! -z "$unk_fst" ]; then # if the --unk-fst option was provided... - if "$silprob"; then - utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp_silprob.txt "$oov_word" || exit 1 - else - utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp.txt "$oov_word" || exit 1 - fi - unk_opt="--first-allowed-disambig 4" -else - unk_opt= -fi -if "$silprob"; then - ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs --sil-probs $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt) -else - ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt) -fi -ndisambig=$[$ndisambig+$num_extra_phone_disambig_syms]; # add (at least) one disambig symbol for silence in lexicon FST. 
-echo $ndisambig > $tmpdir/lex_ndisambig -# Format of lexiconp_disambig.txt: -# !SIL 1.0 SIL_S -# 1.0 SPN_S #1 -# 1.0 SPN_S #2 -# 1.0 NSN_S -# !EXCLAMATION-POINT 1.0 EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt -# In case there are extra word-level disambiguation symbols they also -# need to be added to the list of phone-level disambiguation symbols. -if [ ! -z "$extra_word_disambig_syms" ]; then - # We expect a file containing valid word-level disambiguation symbols. - cat $extra_word_disambig_syms | awk '{ print $1 }' >> $dir/phones/disambig.txt -fi -# Create phone symbol table. -if [[ ! -z $phone_symbol_table ]]; then - start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'` - echo "" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table ' - BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\ - cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt -else - echo "" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \ - awk '{n=NR-1; print $1, n;}' > $dir/phones.txt -fi -# Create a file that describes the word-boundary information for -# each phone. 5 categories. -if $position_dependent_phones; then - cat $dir/phones/{silence,nonsilence}.txt | \ - awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; } - /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; } - {print $1, "nonword";} ' > $dir/phones/word_boundary.txt -else - # word_boundary.txt might have been generated by another source - [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary.txt -fi -# Create word symbol table. -# and are only needed due to the need to rescore lattices with -# ConstArpaLm format language model. They do not normally appear in G.fst or -# L.fst. -if "$silprob"; then - # remove the silprob - cat $tmpdir/lexiconp_silprob.txt |\ - awk '{ - for(i=1; i<=NF; i++) { - if(i!=3 && i!=4 && i!=5) printf("%s\t", $i); if(i==NF) print ""; - } - }' > $tmpdir/lexiconp.txt -fi -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print " 0"; - } - { - if ($1 == "") { - print " is in the vocabulary!" | "cat 1>&2" - exit 1; - } - if ($1 == "") { - print " is in the vocabulary!" | "cat 1>&2" - exit 1; - } - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; -# In case there are extra word-level disambiguation symbols they also -# need to be added to words.txt -if [ ! -z "$extra_word_disambig_syms" ]; then - # Since words.txt already exists, we need to extract the current word count. - word_count=`tail -n 1 $dir/words.txt | awk '{ print $2 }'` - # We expect a file containing valid word-level disambiguation symbols. - # The list of symbols is attached to the current words.txt (including - # a numeric identifier for each symbol). - cat $extra_word_disambig_syms | \ - awk -v WC=$word_count '{ printf("%s %d\n", $1, ++WC); }' >> $dir/words.txt || exit 1; -fi -# format of $dir/words.txt: -# 0 -#!EXCLAMATION-POINT 1 -#!SIL 2 -#"CLOSE-QUOTE 3 -#... -silphone=`cat $srcdir/optional_silence.txt` || exit 1; -[ -z "$silphone" ] && \ - ( echo "You have no optional-silence phone; it is required in the current scripts" - echo "but you may use the option --sil-prob 0.0 to stop it being used." 
) && \ - exit 1; -# create $dir/phones/align_lexicon.{txt,int}. -# This is the method we use for lattice word alignment if we are not -# using word-position-dependent phones. -# First remove pron-probs from the lexicon. -perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt -# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence, -# and is not part of a word. -[ ! -z "$silphone" ] && echo " $silphone" >> $tmpdir/align_lexicon.txt -cat $tmpdir/align_lexicon.txt | \ - perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' | sort | uniq > $dir/phones/align_lexicon.txt -# create phones/align_lexicon.int -cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \ - utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int -# Create the basic L.fst without disambiguation symbols, for use -# in training. -if $silprob; then - # Add silence probabilities (modlels the prob. of silence before and after each - # word). On some setups this helps a bit. See utils/dict_dir_add_pronprobs.sh - # and where it's called in the example scripts (run.sh). - utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob.txt $srcdir/silprob.txt $silphone "" | \ - fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; -else - utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp.txt $sil_prob $silphone | \ - fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; -fi -# The file oov.txt contains a word that we will map any OOVs to during -# training. -echo "$oov_word" > $dir/oov.txt || exit 1; -cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1; -# integer version of oov symbol, used in some scripts. -# the file wdisambig.txt contains a (line-by-line) list of the text-form of the -# disambiguation symbols that are used in the grammar and passed through by the -# lexicon. At this stage it's hardcoded as '#0', but we're laying the groundwork -# for more generality (which probably would be added by another script). -# wdisambig_words.int contains the corresponding list interpreted by the -# symbol table words.txt, and wdisambig_phones.int contains the corresponding -# list interpreted by the symbol table phones.txt. -echo '#0' >$dir/phones/wdisambig.txt -# In case there are extra word-level disambiguation symbols they need -# to be added to the existing word-level disambiguation symbols file. -if [ ! -z "$extra_word_disambig_syms" ]; then - # We expect a file containing valid word-level disambiguation symbols. - # The regular expression for awk is just a paranoia filter (e.g. for empty lines). - cat $extra_word_disambig_syms | awk '{ print $1 }' >> $dir/phones/wdisambig.txt -fi -utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int -utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int -# Create these lists of phones in colon-separated integer list form too, -# for purposes of being given to programs as command-line options. 
-for f in silence nonsilence optional_silence disambig context_indep; do - utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int - utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \ - awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1; -done -for x in sets extra_questions; do - utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1; -done -utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \ - > $dir/phones/roots.int || exit 1; -if [ -f $dir/phones/word_boundary.txt ]; then - utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \ - > $dir/phones/word_boundary.int || exit 1; -fi -silphonelist=`cat $dir/phones/silence.csl` -nonsilphonelist=`cat $dir/phones/nonsilence.csl` -# Note: it's OK, after generating the 'lang' directory, to overwrite the topo file -# with another one of your choice if the 'topo' file you want can't be generated by -# utils/gen_topo.pl. We do this in the 'chain' recipes. Of course, the 'topo' file -# should cover all the phones. Try running utils/validate_lang.pl to check that -# everything is OK after modifying the topo file. -local/gen_topo.py $num_nonsil_states $num_sil_states 4 $nonsilphonelist $silphonelist $dir/phones.txt >$dir/topo -# Create the lexicon FST with disambiguation symbols, and put it in lang_test. -# There is an extra step where we create a loop to "pass through" the -# disambiguation symbols from G.fst. -if $silprob; then - utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \ - fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ - fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; -else - utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig | \ - fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ - fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; -fi -if [ ! -z "$unk_fst" ]; then - utils/lang/internal/apply_unk_lm.sh $unk_fst $dir || exit 1 - if ! $position_dependent_phones; then - echo "$0: warning: you are using the --unk-lm option and setting --position-dependent-phones false." - echo " ... this will make it impossible to properly work out the word boundaries after" - echo " ... decoding; quite a few scripts will not work as a result, and many scoring scripts" - echo " ... will die." - sleep 4 - fi -fi -echo "$(basename $0): validating output directory" -! 
utils/validate_lang.pl $dir && echo "$(basename $0): error validating output" && exit 1; -exit 0; diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh index 319741d814d..92061121f6c 100755 --- a/egs/iam/v2/run.sh +++ b/egs/iam/v2/run.sh @@ -93,6 +93,9 @@ if [ $stage -le 4 ]; then local/prepare_dict.sh utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ From 4fc67057878a235d24c51a1939b979fa76595896 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sat, 1 Sep 2018 10:16:21 -0400 Subject: [PATCH 10/37] fixing bugs --- egs/cifar/v1/image/copy_data_dir.sh | 118 ++++++++++++++++++++++++++++ egs/iam/v2/local/gen_topo.py | 5 +- 2 files changed, 120 insertions(+), 3 deletions(-) create mode 100755 egs/cifar/v1/image/copy_data_dir.sh diff --git a/egs/cifar/v1/image/copy_data_dir.sh b/egs/cifar/v1/image/copy_data_dir.sh new file mode 100755 index 00000000000..c923f5cc07a --- /dev/null +++ b/egs/cifar/v1/image/copy_data_dir.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Copyright 2013 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script operates on a directory, such as in data/train/, +# that contains some subset of the following files: +# feats.scp +# images.scp +# vad.scp +# spk2utt +# utt2spk +# text +# +# It copies to another directory, possibly adding a specified prefix or a suffix +# to the utterance and/or speaker names. Note, the recording-ids stay the same. +# + + +# begin configuration section +spk_prefix= +utt_prefix= +spk_suffix= +utt_suffix= +validate_opts= # should rarely be needed. +# end configuration section + +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: " + echo " $0 [options] " + echo "e.g.:" + echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" + echo "Options" + echo " --spk-prefix= # Prefix for speaker ids, default empty" + echo " --utt-prefix= # Prefix for utterance ids, default empty" + echo " --spk-suffix= # Suffix for speaker ids, default empty" + echo " --utt-suffix= # Suffix for utterance ids, default empty" + exit 1; +fi + + +export LC_ALL=C + +srcdir=$1 +destdir=$2 + +if [ ! -f $srcdir/utt2spk ]; then + echo "copy_data_dir.sh: no such file $srcdir/utt2spk" + exit 1; +fi + +if [ "$destdir" == "$srcdir" ]; then + echo "$0: this script requires and to be different." + exit 1 +fi + +set -e; + +mkdir -p $destdir + +cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map +cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map + +if [ ! -f $srcdir/utt2uniq ]; then + if [[ ! -z $utt_prefix || ! 
-z $utt_suffix ]]; then + cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq + fi +else + cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq +fi + +cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ + utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk + +utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt + +if [ -f $srcdir/feats.scp ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp +fi + +if [ -f $srcdir/vad.scp ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp +fi + +if [ -f $srcdir/images.scp ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/images.scp >$destdir/images.scp +fi + +if [ -f $srcdir/text ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text +fi +if [ -f $srcdir/utt2dur ]; then + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur +fi +if [ -f $srcdir/cmvn.scp ]; then + utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp +fi + +rm $destdir/spk_map $destdir/utt_map + +echo "$0: copied data from $srcdir to $destdir" + +for f in feats.scp cmvn.scp vad.scp utt2uniq utt2dur utt2num_frames text images.scp; do + if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then + echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" + echo " ... $destdir/.backup/$f" + mkdir -p $destdir/.backup + mv $destdir/$f $destdir/.backup/ + fi +done + + +[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" +[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" + +utils/validate_data_dir.sh $validate_opts $destdir diff --git a/egs/iam/v2/local/gen_topo.py b/egs/iam/v2/local/gen_topo.py index a74c6d4bbae..540bfbcf270 100755 --- a/egs/iam/v2/local/gen_topo.py +++ b/egs/iam/v2/local/gen_topo.py @@ -32,14 +32,13 @@ all_phones = silence_phones + nonsilence_phones punctuation_phones = [] -exclude = set(string.punctuation) +exclude = set("!(),.?;:'-\"") with open(args.phone_list) as f: for line in f: line = line.strip() - phone = line.split('_')[0] + phone = line.split(' ')[0] if len(phone) == 1 and phone in exclude: punctuation_phones.append(int(line.split(' ')[1])) - # For nonsilence phones that are not punctuations print("") print("") From 8877530fa3e54e100427e2d411fda5fed9e75ac7 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sun, 2 Sep 2018 04:29:26 -0400 Subject: [PATCH 11/37] updating result --- .../local/chain/tuning/run_cnn_e2eali_1d.sh | 24 +++++++++---------- egs/iam/v2/run_end2end.sh | 11 +-------- 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index 9cf5fbadcc8..27988beafdd 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -3,18 +3,18 @@ # This script does end2end chain training (i.e. 
from scratch) # ./local/chain/compare_wer.sh exp/chain/cnn_e2eali_1d/ # System cnn_e2eali_1d -# WER 9.92 -# WER (rescored) 9.50 -# CER 4.53 -# CER (rescored) 4.46 -# Final train prob -0.0472 -# Final valid prob -0.0713 -# Final train prob (xent) -0.4751 -# Final valid prob (xent) -0.5506 -# Parameters 5.64M +# WER 9.52 +# WER (rescored) 9.29 +# CER 4.45 +# CER (rescored) 4.43 +# Final train prob -0.0473 +# Final valid prob -0.0706 +# Final train prob (xent) -0.4623 +# Final valid prob (xent) -0.5371 +# Parameters 5.08M # steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1d -# exp/chain/cnn_e2eali_1d/: num-iters=40 nj=2..4 num-params=5.6M dim=40->392 combine=-0.051->-0.051 (over 1) xent:train/valid[25,39,final]=(-0.764,-0.493,-0.475/-0.770,-0.566,-0.551) logprob:train/valid[25,39,final]=(-0.094,-0.051,-0.047/-0.111,-0.075,-0.071) +# exp/chain/cnn_e2eali_1d/: num-iters=40 nj=2..4 num-params=5.1M dim=40->400 combine=-0.052->-0.052 (over 1) xent:train/valid[25,39,final]=(-0.739,-0.483,-0.462/-0.763,-0.551,-0.537) logprob:train/valid[25,39,final]=(-0.092,-0.052,-0.047/-0.112,-0.076,-0.071) set -e -o pipefail stage=0 @@ -140,7 +140,7 @@ if [ $stage -le 4 ]; then output_opts="l2-regularize=0.04" common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input @@ -194,7 +194,7 @@ if [ $stage -le 5 ]; then --chain.right-tolerance 3 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=8 \ + --trainer.num-epochs=5 \ --trainer.frames-per-iter=1500000 \ --trainer.optimization.num-jobs-initial=2 \ --trainer.optimization.num-jobs-final=4 \ diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index bd78c011b75..cf0d8476e55 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -6,15 +6,7 @@ stage=0 nj=20 username= password= -# iam_database points to the database path on the JHU grid. If you have not -# already downloaded the database you can set it to a local directory -# like "data/download" and follow the instructions -# in "local/prepare_data.sh" to download the database: iam_database=/export/corpora5/handwriting_ocr/IAM -# wellington_database points to the database path on the JHU grid. The Wellington -# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). -# This corpus is of written NZ English that can be purchased here: -# "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. @@ -114,9 +106,8 @@ fi if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model..." 
steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ - --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train + data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi if [ $stage -le 7 ]; then From 59e2c8b19be4013e8d79680748328f36e8ef13a8 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sun, 2 Sep 2018 04:44:18 -0400 Subject: [PATCH 12/37] updating documentation, results and parameter tuning --- egs/iam/v2/local/chain/run_e2e_cnn.sh | 175 +----------------- .../v2/local/chain/tuning/run_e2e_cnn_1a.sh | 174 +++++++++++++++++ .../v2/local/chain/tuning/run_e2e_cnn_1b.sh | 160 ++++++++++++++++ egs/iam/v2/run.sh | 17 +- egs/iam/v2/run_end2end.sh | 16 +- 5 files changed, 365 insertions(+), 177 deletions(-) mode change 100755 => 120000 egs/iam/v2/local/chain/run_e2e_cnn.sh create mode 100755 egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh create mode 100755 egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh deleted file mode 100755 index c1e9780876c..00000000000 --- a/egs/iam/v2/local/chain/run_e2e_cnn.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/bin/bash -# Copyright 2017 Hossein Hadian - -# This script does end2end chain training (i.e. from scratch) -# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ -# System e2e_cnn_1a -# WER 11.24 -# WER (rescored) 10.80 -# CER 5.32 -# CER (rescored) 5.24 -# Final train prob 0.0568 -# Final valid prob 0.0381 -# Final train prob (xent) -# Final valid prob (xent) -# Parameters 9.13M - -# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a -# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.1M dim=40->12640 combine=0.049->0.049 (over 1) logprob:train/valid[27,41,final]=(0.035,0.055,0.057/0.016,0.037,0.038) -set -e - -# configs for 'chain' -stage=0 -train_stage=-10 -get_egs_stage=-10 -affix=1a - -# training options -tdnn_dim=450 -num_epochs=4 -num_jobs_initial=2 -num_jobs_final=4 -minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 -common_egs_dir= -l2_regularize=0.00005 -frames_per_iter=1000000 -cmvn_opts="--norm-means=true --norm-vars=true" -train_set=train -lang_decode=data/lang -lang_rescore=data/lang_rescore_6g - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! 
cuda-compiled; then - cat <$lang/topo -fi - -if [ $stage -le 1 ]; then - steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ - --shared-phones true \ - --type biphone \ - data/$train_set $lang $treedir - $cmd $treedir/log/make_phone_lm.log \ - cat data/$train_set/text \| \ - steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ - utils/sym2int.pl -f 2- data/lang/phones.txt \| \ - chain-est-phone-lm --num-extra-lm-states=500 \ - ark:- $treedir/phone_lm.fst -fi - -if [ $stage -le 2 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - - cnn_opts="l2-regularize=0.075" - tdnn_opts="l2-regularize=0.075" - output_opts="l2-regularize=0.1" - common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=40 name=input - - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts -EOF - - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs -fi - -if [ $stage -le 3 ]; then - # no need to store the egs in a shared storage because we always - # remove them. Anyway, it takes only 5 minutes to generate them. 
- - steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ - --cmd "$cmd" \ - --feat.cmvn-opts "$cmvn_opts" \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ - --chain.apply-deriv-weights false \ - --egs.dir "$common_egs_dir" \ - --egs.stage $get_egs_stage \ - --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ - --chain.frame-subsampling-factor 4 \ - --chain.alignment-subsampling-factor 4 \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.shrink-value 1.0 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir data/${train_set} \ - --tree-dir $treedir \ - --dir $dir || exit 1; -fi - -if [ $stage -le 4 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 30 --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 -fi - -echo "Done. Date: $(date). Results:" -local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..7dca9c30e23 --- /dev/null +++ b/egs/iam/v2/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1b.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..d88e1a38820 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,174 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# WER 11.24 +# WER (rescored) 10.80 +# CER 5.32 +# CER (rescored) 5.24 +# Final train prob 0.0568 +# Final valid prob 0.0381 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.13M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.1M dim=40->12640 combine=0.049->0.049 (over 1) logprob:train/valid[27,41,final]=(0.035,0.055,0.057/0.016,0.037,0.038) + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=true --norm-vars=true" +train_set=train +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need 
to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh new file mode 100755 index 00000000000..a3b0d8c582f --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh @@ -0,0 +1,160 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# WER 11.24 +# WER (rescored) 10.80 +# CER 5.32 +# CER (rescored) 5.24 +# Final train prob 0.0568 +# Final valid prob 0.0381 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.13M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.1M dim=40->12640 combine=0.049->0.049 (over 1) logprob:train/valid[27,41,final]=(0.035,0.055,0.057/0.016,0.037,0.038) +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +train_set=train +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
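  # [Editor's note, not part of the original patch] As a concrete, hypothetical
  # illustration of the comment above: a lang directory built from a different
  # ARPA LM with utils/format_lm.sh, say data/lang_test_small, could be handed
  # to mkgraph.sh in place of $lang_decode, e.g.
  #   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_small $dir $dir/graph_small
  # provided its phones.txt matches the phones.txt the model was trained with.
  # (data/lang_test_small and graph_small are made-up names for this example.)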
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh index 92061121f6c..dcdbb92ed68 100755 --- a/egs/iam/v2/run.sh +++ b/egs/iam/v2/run.sh @@ -10,7 +10,15 @@ nj=70 decode_gmm=false username= password= +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: iam_database=/export/corpora5/handwriting_ocr/IAM +# wellington_database points to the database path on the JHU grid. The Wellington +# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). +# This corpus is of written NZ English that can be purchased here: +# "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. @@ -31,7 +39,10 @@ mkdir -p data/{train,test}/data if [ $stage -le 1 ]; then echo "$(date) stage 1: getting allowed image widths for e2e training..." - image/get_image2num_frames.py --feat-dim 40 data/train + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train for set in train test; do echo "$(date) Extracting features, creating feats.scp file" @@ -91,6 +102,8 @@ fi if [ $stage -le 4 ]; then echo "$0: Preparing dictionary and lang..." local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang silphonelist=`cat data/lang/phones/silence.csl` @@ -173,5 +186,5 @@ fi if [ $stage -le 15 ]; then local/chain/run_cnn_chainali.sh --train_set train_aug \ - --chain-model-dir exp/chain/cnn_1a --stage 4 + --chain-model-dir exp/chain/cnn_1a --stage 2 fi diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index cf0d8476e55..346acbed1d3 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -6,7 +6,15 @@ stage=0 nj=20 username= password= +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: iam_database=/export/corpora5/handwriting_ocr/IAM +# wellington_database points to the database path on the JHU grid. The Wellington +# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). 
+# This corpus is of written NZ English that can be purchased here: +# "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. @@ -28,7 +36,10 @@ mkdir -p data/{train,test}/data if [ $stage -le 1 ]; then echo "$(date) stage 1: getting allowed image widths for e2e training..." - image/get_image2num_frames.py --feat-dim 40 data/train + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train for set in train test; do echo "$(date) Extracting features, creating feats.scp file" @@ -88,6 +99,8 @@ fi if [ $stage -le 4 ]; then echo "$0: Preparing dictionary and lang..." local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang @@ -106,6 +119,7 @@ fi if [ $stage -le 6 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ + --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi From 5fc0d17914dcaf718c55c30455402a86a3bb0525 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sun, 2 Sep 2018 13:17:25 -0400 Subject: [PATCH 13/37] fixing chain scripts --- egs/iam/v2/local/chain/tuning/run_cnn_1a.sh | 2 +- .../local/chain/tuning/run_cnn_chainali_1b.sh | 12 +++++----- .../v2/local/chain/tuning/run_e2e_cnn_1b.sh | 22 +++++++++---------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh index e39b14ac8dc..6583e1725c3 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh @@ -186,7 +186,7 @@ if [ $stage -le 5 ]; then --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ - --chain.apply-deriv-weights=false \ + --chain.apply-deriv-weights=true \ --chain.lm-opts="--num-extra-lm-states=500" \ --chain.frame-subsampling-factor=$frame_subsampling_factor \ --chain.alignment-subsampling-factor=$frame_subsampling_factor \ diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh index 36a30b2df29..41b800c9136 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh @@ -42,10 +42,10 @@ chunk_right_context=0 tdnn_dim=550 # training options srand=0 -remove_egs=false -lang_test=lang +remove_egs=true lang_decode=data/lang lang_rescore=data/lang_rescore_6g + dropout_schedule='0,0@0.20,0.2@0.50,0' # End configuration section. echo "$0 $@" # Print the command line for logging @@ -120,10 +120,11 @@ if [ $stage -le 3 ]; then # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use # those. 
The num-leaves is always somewhat less than the num-leaves from # the GMM baseline. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; fi + steps/nnet3/chain/build_tree.sh \ --frame-subsampling-factor $frame_subsampling_factor \ --context-opts "--context-width=2 --central-position=1" \ @@ -156,6 +157,7 @@ if [ $stage -le 4 ]; then relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh index a3b0d8c582f..0ffc6b78fa7 100755 --- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh @@ -2,20 +2,20 @@ # Copyright 2017 Hossein Hadian # This script does end2end chain training (i.e. from scratch) -# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ -# System e2e_cnn_1a -# WER 11.24 -# WER (rescored) 10.80 -# CER 5.32 -# CER (rescored) 5.24 -# Final train prob 0.0568 -# Final valid prob 0.0381 +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ +# System e2e_cnn_1b +# WER 13.59 +# WER (rescored) 13.27 +# CER 6.92 +# CER (rescored) 6.71 +# Final train prob 0.0345 +# Final valid prob 0.0269 # Final train prob (xent) # Final valid prob (xent) -# Parameters 9.13M +# Parameters 9.52M -# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a -# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.1M dim=40->12640 combine=0.049->0.049 (over 1) logprob:train/valid[27,41,final]=(0.035,0.055,0.057/0.016,0.037,0.038) +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1b +# exp/chain/e2e_cnn_1b: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=0.041->0.041 (over 2) logprob:train/valid[27,41,final]=(0.032,0.035,0.035/0.025,0.026,0.027) set -e # configs for 'chain' From 1138ee31dfae5aa9824a809c86029f49e139f668 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Sun, 2 Sep 2018 14:19:06 -0400 Subject: [PATCH 14/37] updating parameters --- egs/iam/v2/local/chain/tuning/run_cnn_1a.sh | 8 ++++++-- egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh index 6583e1725c3..fe19c16ff13 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh @@ -48,7 +48,8 @@ tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -226,7 +227,7 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi @@ -240,4 +241,7 @@ if [ $stage -le 7 ]; then --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh index 41b800c9136..95c299f36db 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh @@ -144,7 +144,7 @@ if [ $stage -le 4 ]; then output_opts="l2-regularize=0.04" common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=90" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input From b3532ced75ba5c0518873a71181062f8e2766820 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 3 Sep 2018 19:05:09 -0400 Subject: [PATCH 15/37] updating parameters and results --- egs/iam/v2/local/chain/tuning/run_cnn_1a.sh | 21 ++++++----- .../local/chain/tuning/run_cnn_chainali_1b.sh | 34 ++++++++++-------- .../local/chain/tuning/run_cnn_e2eali_1d.sh | 36 ++++++++++--------- .../v2/local/chain/tuning/run_e2e_cnn_1b.sh | 2 +- 4 files changed, 52 insertions(+), 41 deletions(-) diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh index fe19c16ff13..cf4024c9d16 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh @@ -5,18 +5,18 @@ # 2017 Ashish Arora # steps/info/chain_dir_info.pl exp/chain/cnn_1a/ -# exp/chain/cnn_1a/: num-iters=42 nj=2..4 num-params=4.4M dim=40->400 combine=-0.039->-0.039 (over 2) xent:train/valid[27,41,final]=(-0.502,-0.380,-0.376/-0.679,-0.626,-0.625) logprob:train/valid[27,41,final]=(-0.038,-0.032,-0.032/-0.063,-0.064,-0.064) +# exp/chain/cnn_1a/: num-iters=42 nj=2..4 num-params=4.4M dim=40->400 combine=-0.039->-0.039 (over 2) xent:train/valid[27,41,final]=(-0.547,-0.404,-0.401/-0.746,-0.685,-0.684) logprob:train/valid[27,41,final]=(-0.046,-0.036,-0.036/-0.072,-0.071,-0.071) # ./local/chain/compare_wer.sh exp/chain/cnn_1a/ # System cnn_1a -# WER 14.91 -# WER (rescored) -- -# CER 7.92 -# CER (rescored) -- -# Final train prob -0.0320 -# Final valid prob -0.0643 -# Final train prob (xent) -0.3762 -# Final valid prob (xent) -0.6247 +# WER 17.05 +# WER (rescored) 16.70 +# CER 9.75 +# CER (rescored) 9.61 +# Final train prob -0.0358 +# Final valid prob -0.0709 +# Final train prob (xent) -0.4013 +# Final valid prob (xent) -0.6841 # Parameters 4.39M set -e -o pipefail @@ -245,3 +245,6 @@ if [ $stage -le 7 ]; then steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh index 95c299f36db..105b8f50854 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh @@ -1,19 +1,20 @@ #!/bin/bash -# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b -# System cnn_chainali_1b -# WER 9.49 -# WER (rescored) 9.27 -# CER 4.39 -# CER (rescored) 4.32 -# Final train prob -0.0466 -# Final valid prob -0.0692 -# Final train prob (xent) -0.4811 -# Final valid prob (xent) -0.5538 -# Parameters 5.65M +# ./local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b +# System cnn_1a cnn_chainali_1b +# WER 17.05 9.45 +# WER (rescored) 16.70 9.01 +# CER 9.75 4.43 +# CER (rescored) 9.61 4.28 +# Final train prob -0.0358 -0.0522 +# Final valid prob -0.0709 -0.0702 +# Final train prob (xent) -0.4013 -0.4992 +# Final valid prob (xent) -0.6841 -0.5658 +# Parameters 4.39M 5.13M # steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1b -# exp/chain/cnn_chainali_1d: num-iters=40 nj=2..4 num-params=5.7M dim=40->400 combine=-0.051->-0.051 (over 1) xent:train/valid[25,39,final]=(-0.818,-0.500,-0.481/-0.828,-0.570,-0.554) logprob:train/valid[25,39,final]=(-0.097,-0.050,-0.047/-0.114,-0.073,-0.069) +# exp/chain/cnn_chainali_1b/: num-iters=36 nj=3..5 num-params=5.1M dim=40->400 combine=-0.054->-0.054 (over 1) xent:train/valid[23,35,final]=(-0.769,-0.524,-0.499/-0.773,-0.584,-0.566) logprob:train/valid[23,35,final]=(-0.092,-0.056,-0.052/-0.107,-0.076,-0.070) + set -e -o pipefail stage=0 @@ -198,10 +199,10 @@ if [ $stage -le 5 ]; then --chain.right-tolerance 3 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=5 \ + --trainer.num-epochs=6 \ --trainer.frames-per-iter=1500000 \ - --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ --trainer.dropout-schedule $dropout_schedule \ --trainer.optimization.initial-effective-lrate=0.001 \ --trainer.optimization.final-effective-lrate=0.0001 \ @@ -251,3 +252,6 @@ if [ $stage -le 7 ]; then steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index 27988beafdd..e8287cf929d 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -1,20 +1,21 @@ #!/bin/bash # This script does end2end chain training (i.e. 
from scratch) -# ./local/chain/compare_wer.sh exp/chain/cnn_e2eali_1d/ -# System cnn_e2eali_1d -# WER 9.52 -# WER (rescored) 9.29 -# CER 4.45 -# CER (rescored) 4.43 -# Final train prob -0.0473 -# Final valid prob -0.0706 -# Final train prob (xent) -0.4623 -# Final valid prob (xent) -0.5371 -# Parameters 5.08M +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ exp/chain/cnn_e2eali_1d +# System e2e_cnn_1a cnn_e2eali_1d +# WER 13.59 9.45 +# WER (rescored) 13.27 9.28 +# CER 6.92 4.41 +# CER (rescored) 6.71 4.31 +# Final train prob 0.0345 -0.0451 +# Final valid prob 0.0269 -0.0684 +# Final train prob (xent) -0.4241 +# Final valid prob (xent) -0.5068 +# Parameters 9.52M 5.13M # steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1d -# exp/chain/cnn_e2eali_1d/: num-iters=40 nj=2..4 num-params=5.1M dim=40->400 combine=-0.052->-0.052 (over 1) xent:train/valid[25,39,final]=(-0.739,-0.483,-0.462/-0.763,-0.551,-0.537) logprob:train/valid[25,39,final]=(-0.092,-0.052,-0.047/-0.112,-0.076,-0.071) +# exp/chain/cnn_e2eali_1d/: num-iters=36 nj=3..5 num-params=5.1M dim=40->400 combine=-0.047->-0.047 (over 1) xent:train/valid[23,35,final]=(-0.705,-0.446,-0.424/-0.714,-0.523,-0.507) logprob:train/valid[23,35,final]=(-0.095,-0.049,-0.045/-0.110,-0.073,-0.068) + set -e -o pipefail stage=0 @@ -22,7 +23,7 @@ stage=0 nj=30 train_set=train nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +affix=_1df #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a common_egs_dir= reporting_email= @@ -194,10 +195,10 @@ if [ $stage -le 5 ]; then --chain.right-tolerance 3 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=5 \ + --trainer.num-epochs=6 \ --trainer.frames-per-iter=1500000 \ - --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ --trainer.dropout-schedule $dropout_schedule \ --trainer.optimization.initial-effective-lrate=0.001 \ --trainer.optimization.final-effective-lrate=0.0001 \ @@ -247,3 +248,6 @@ if [ $stage -le 7 ]; then steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh index 0ffc6b78fa7..7fb81c97ea7 100755 --- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh @@ -22,7 +22,7 @@ set -e stage=0 train_stage=-10 get_egs_stage=-10 -affix=1a +affix=1b # training options tdnn_dim=450 From 9b67d9d0ad7600c25d5237549497b54ed4b778ff Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 3 Sep 2018 19:26:23 -0400 Subject: [PATCH 16/37] adding overwrite option and punctuation topology --- .../local/chain/tuning/run_cnn_e2eali_1d.sh | 2 +- egs/iam/v2/local/prepare_data.sh | 19 ++++++++++++------- egs/iam/v2/run.sh | 4 +++- egs/iam/v2/run_end2end.sh | 9 +++++++-- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index e8287cf929d..9771245c683 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -23,7 +23,7 @@ stage=0 nj=30 train_set=train nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1df #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a common_egs_dir= reporting_email= diff --git a/egs/iam/v2/local/prepare_data.sh b/egs/iam/v2/local/prepare_data.sh index 9c01ac90f28..32e76143931 100755 --- a/egs/iam/v2/local/prepare_data.sh +++ b/egs/iam/v2/local/prepare_data.sh @@ -18,6 +18,7 @@ stage=0 download_dir=data/download +overwrite=false wellington_dir= username= password= # username and password for downloading the IAM database @@ -161,11 +162,15 @@ cat $test_old > $test_new cat $val1_old $val2_old > $val_new if [ $stage -le 0 ]; then - local/process_data.py data/local data/train --dataset train || exit 1 - local/process_data.py data/local data/test --dataset test || exit 1 - local/process_data.py data/local data/val --dataset validation || exit 1 - - image/fix_data_dir.sh data/train - image/fix_data_dir.sh data/test - image/fix_data_dir.sh data/val + if [ ! -f data/train/text ] || $overwrite; then + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 + + image/fix_data_dir.sh data/train + image/fix_data_dir.sh data/test + image/fix_data_dir.sh data/val + else + echo "Not processing data since it is already processed" + fi fi diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh index dcdbb92ed68..8a56c35e052 100755 --- a/egs/iam/v2/run.sh +++ b/egs/iam/v2/run.sh @@ -10,6 +10,7 @@ nj=70 decode_gmm=false username= password= +overwrite=false # iam_database points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # like "data/download" and follow the instructions @@ -33,7 +34,8 @@ if [ $stage -le 0 ]; then echo "$0: Preparing data..." 
local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ - --username "$username" --password "$password" + --username "$username" --password "$password" \ + --overwrite $overwrite fi mkdir -p data/{train,test}/data diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index 346acbed1d3..dd6aa73de63 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -6,6 +6,7 @@ stage=0 nj=20 username= password= +overwrite=false # iam_database points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # like "data/download" and follow the instructions @@ -30,10 +31,11 @@ if [ $stage -le 0 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ - --username "$username" --password "$password" + --username "$username" --password "$password" \ + --overwrite $overwrite fi -mkdir -p data/{train,test}/data +mkdir -p data/{train,test}/data if [ $stage -le 1 ]; then echo "$(date) stage 1: getting allowed image widths for e2e training..." image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command @@ -103,6 +105,9 @@ if [ $stage -le 4 ]; then # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ From 89c9ec79ff7dce369d1c5e1c030ef225bae053e2 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 3 Sep 2018 20:26:57 -0400 Subject: [PATCH 17/37] adding overwrite option --- egs/iam/v2/run.sh | 44 +++++++++++++++++++++++---------------- egs/iam/v2/run_end2end.sh | 28 ++++++++++++++++--------- 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh index 8a56c35e052..44a85928d63 100755 --- a/egs/iam/v2/run.sh +++ b/egs/iam/v2/run.sh @@ -31,6 +31,14 @@ wellington_database=/export/corpora5/Wellington/WWC/ ./local/check_tools.sh if [ $stage -le 0 ]; then + if [ -f data/train/text ] && ! $overwrite; then + echo "Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi +fi + +if [ $stage -le 1 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ @@ -39,8 +47,8 @@ if [ $stage -le 0 ]; then fi mkdir -p data/{train,test}/data -if [ $stage -le 1 ]; then - echo "$(date) stage 1: getting allowed image widths for e2e training..." +if [ $stage -le 2 ]; then + echo "$(date) stage 2: getting allowed image widths for e2e training..." 
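  # Rough illustration (not part of the recipe) of the length spacing the next two
  # commands aim for: allowed frame counts grow by about 10% and are kept
  # compatible with the frame-subsampling factor of 4, e.g.
  #   awk 'BEGIN{w=100; while (w<1000) {printf "%d ", 4*int(w/4); w*=1.10}; print ""}'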
image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to @@ -54,15 +62,15 @@ if [ $stage -le 1 ]; then image/fix_data_dir.sh data/train fi -if [ $stage -le 2 ]; then +if [ $stage -le 3 ]; then for set in train; do - echo "$(date) stage 2: Performing augmentation, it will double training data" + echo "$(date) stage 3: Performing augmentation, it will double training data" local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; done fi -if [ $stage -le 2 ]; then +if [ $stage -le 4 ]; then echo "$0: Preparing BPE..." # getting non-silence phones. cut -d' ' -f2- data/train/text | \ @@ -96,12 +104,12 @@ END done fi -if [ $stage -le 3 ]; then +if [ $stage -le 5 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh fi -if [ $stage -le 4 ]; then +if [ $stage -le 6 ]; then echo "$0: Preparing dictionary and lang..." local/prepare_dict.sh # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. @@ -119,19 +127,19 @@ if [ $stage -le 4 ]; then data/lang data/lang_rescore_6g fi -if [ $stage -le 5 ]; then +if [ $stage -le 7 ]; then steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train_aug \ data/lang exp/mono fi -if [ $stage -le 6 ] && $decode_gmm; then +if [ $stage -le 8 ] && $decode_gmm; then utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ exp/mono/decode_test fi -if [ $stage -le 7 ]; then +if [ $stage -le 9 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train_aug data/lang \ exp/mono exp/mono_ali @@ -139,14 +147,14 @@ if [ $stage -le 7 ]; then exp/mono_ali exp/tri fi -if [ $stage -le 8 ] && $decode_gmm; then +if [ $stage -le 10 ] && $decode_gmm; then utils/mkgraph.sh data/lang exp/tri exp/tri/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ exp/tri/decode_test fi -if [ $stage -le 9 ]; then +if [ $stage -le 11 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train_aug data/lang \ exp/tri exp/tri_ali @@ -155,14 +163,14 @@ if [ $stage -le 9 ]; then data/train_aug data/lang exp/tri_ali exp/tri2 fi -if [ $stage -le 10 ] && $decode_gmm; then +if [ $stage -le 12 ] && $decode_gmm; then utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \ data/test exp/tri2/decode_test fi -if [ $stage -le 11 ]; then +if [ $stage -le 13 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ data/train_aug data/lang exp/tri2 exp/tri2_ali @@ -170,23 +178,23 @@ if [ $stage -le 11 ]; then data/train_aug data/lang exp/tri2_ali exp/tri3 fi -if [ $stage -le 12 ] && $decode_gmm; then +if [ $stage -le 14 ] && $decode_gmm; then utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ data/test exp/tri3/decode_test fi -if [ $stage -le 13 ]; then +if [ $stage -le 15 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ data/train_aug data/lang exp/tri3 exp/tri3_ali fi -if [ $stage -le 14 ]; then +if [ $stage -le 16 ]; then local/chain/run_cnn.sh --train_set train_aug fi -if [ $stage -le 15 ]; then +if [ $stage -le 17 ]; then local/chain/run_cnn_chainali.sh --train_set train_aug \ --chain-model-dir exp/chain/cnn_1a --stage 2 fi diff --git a/egs/iam/v2/run_end2end.sh 
b/egs/iam/v2/run_end2end.sh index dd6aa73de63..a5e8906e406 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -28,6 +28,14 @@ wellington_database=/export/corpora5/Wellington/WWC/ ./local/check_tools.sh if [ $stage -le 0 ]; then + if [ -f data/train/text ] && ! $overwrite; then + echo "Not processing, probably script have run from wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi +fi + +if [ $stage -le 1 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ @@ -36,8 +44,8 @@ if [ $stage -le 0 ]; then fi mkdir -p data/{train,test}/data -if [ $stage -le 1 ]; then - echo "$(date) stage 1: getting allowed image widths for e2e training..." +if [ $stage -le 2 ]; then + echo "$(date) stage 2: getting allowed image widths for e2e training..." image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to @@ -51,15 +59,15 @@ if [ $stage -le 1 ]; then image/fix_data_dir.sh data/train fi -if [ $stage -le 2 ]; then +if [ $stage -le 3 ]; then for set in train; do - echo "$(date) stage 2: Performing augmentation, it will double training data" + echo "$(date) stage 3: Performing augmentation, it will double training data" local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; done fi -if [ $stage -le 2 ]; then +if [ $stage -le 4 ]; then echo "$0: Preparing BPE..." # getting non-silence phones. cut -d' ' -f2- data/train/text | \ @@ -93,12 +101,12 @@ END done fi -if [ $stage -le 3 ]; then +if [ $stage -le 5 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh fi -if [ $stage -le 4 ]; then +if [ $stage -le 6 ]; then echo "$0: Preparing dictionary and lang..." local/prepare_dict.sh # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. @@ -116,12 +124,12 @@ if [ $stage -le 4 ]; then data/lang data/lang_rescore_6g fi -if [ $stage -le 5 ]; then +if [ $stage -le 7 ]; then echo "$0: Calling the flat-start chain recipe..." local/chain/run_e2e_cnn.sh --train_set train_aug fi -if [ $stage -le 6 ]; then +if [ $stage -le 8 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ @@ -129,7 +137,7 @@ if [ $stage -le 6 ]; then data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi -if [ $stage -le 7 ]; then +if [ $stage -le 9 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments..." 
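  # Invocation sketch (the tuning-script name and model directory are the defaults
  # assumed by this recipe, shown only for illustration): the wrapper call on the
  # next line is roughly equivalent to
  #   local/chain/tuning/run_cnn_e2eali_1d.sh --train_set train_aug \
  #     --e2echain_model_dir exp/chain/e2e_cnn_1a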
local/chain/run_cnn_e2eali.sh --train_set train_aug fi From c05cd4df19953c65f76c09827ffa47513aa6953c Mon Sep 17 00:00:00 2001 From: aarora8 Date: Tue, 4 Sep 2018 00:05:25 -0400 Subject: [PATCH 18/37] adding aachen splits --- egs/iam/v2/local/prepare_data.sh | 18 ++--- egs/iam/v2/local/process_aachen_splits.py | 88 +++++++++++++++++++++++ egs/iam/v2/local/process_data.py | 1 - 3 files changed, 97 insertions(+), 10 deletions(-) create mode 100755 egs/iam/v2/local/process_aachen_splits.py diff --git a/egs/iam/v2/local/prepare_data.sh b/egs/iam/v2/local/prepare_data.sh index 32e76143931..a220c2725b1 100755 --- a/egs/iam/v2/local/prepare_data.sh +++ b/egs/iam/v2/local/prepare_data.sh @@ -161,16 +161,16 @@ cat $train_old > $train_new cat $test_old > $test_new cat $val1_old $val2_old > $val_new -if [ $stage -le 0 ]; then - if [ ! -f data/train/text ] || $overwrite; then +if $process_aachen_split; then + local/process_aachen_splits.py data/local aachen_split data/train --dataset train || exit 1 + local/process_aachen_splits.py data/local aachen_split data/test --dataset test || exit 1 + local/process_aachen_splits.py data/local aachen_split data/val --dataset validation || exit 1 +else local/process_data.py data/local data/train --dataset train || exit 1 local/process_data.py data/local data/test --dataset test || exit 1 local/process_data.py data/local data/val --dataset validation || exit 1 - - image/fix_data_dir.sh data/train - image/fix_data_dir.sh data/test - image/fix_data_dir.sh data/val - else - echo "Not processing data since it is already processed" - fi fi + +image/fix_data_dir.sh data/train +image/fix_data_dir.sh data/test +image/fix_data_dir.sh data/val diff --git a/egs/iam/v2/local/process_aachen_splits.py b/egs/iam/v2/local/process_aachen_splits.py new file mode 100755 index 00000000000..b8c59d0a7c8 --- /dev/null +++ b/egs/iam/v2/local/process_aachen_splits.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_aachen_splits.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. 
Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('split_path', type=str, + help='location of the train/test/val set') +parser.add_argument('out_dir', type=str, + help='location to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train_list', 'dev_list', 'eval_list'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.split_path, + args.dataset + '.txt') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + + +### main ### + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + doc = minidom.parse(xml_path) + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder) + for file in os.listdir(lines_path): + if file.endswith(".png"): + image_file_path = os.path.join(lines_path, file) + base_name = os.path.splitext(os.path.basename(image_file_path))[0] + text = text_dict[base_name] + utt_id = writer_id + '_' + base_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v2/local/process_data.py b/egs/iam/v2/local/process_data.py index fa5eb484707..2adae7bf7be 100755 --- a/egs/iam/v2/local/process_data.py +++ b/egs/iam/v2/local/process_data.py @@ -67,7 +67,6 @@ def process_text_file_for_word_model(): xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') img_num = line[-3:] doc = minidom.parse(xml_path) - form_elements = doc.getElementsByTagName('form')[0] writer_id = form_elements.getAttribute('writer-id') outerfolder = form_elements.getAttribute('id')[0:3] From 5dfe8fcbb5fcd2d85249e3b46a1f1d2f9bf9a07f Mon Sep 17 00:00:00 2001 From: aarora8 Date: Tue, 4 Sep 2018 00:59:40 -0400 Subject: [PATCH 19/37] fixing bugs --- egs/iam/v2/local/prepare_data.sh | 8 ++++---- egs/iam/v2/local/process_aachen_splits.py | 4 ++-- egs/iam/v2/run.sh | 4 ++-- egs/iam/v2/run_end2end.sh | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/egs/iam/v2/local/prepare_data.sh 
b/egs/iam/v2/local/prepare_data.sh index a220c2725b1..8c2505601cc 100755 --- a/egs/iam/v2/local/prepare_data.sh +++ b/egs/iam/v2/local/prepare_data.sh @@ -18,7 +18,7 @@ stage=0 download_dir=data/download -overwrite=false +process_aachen_split=false wellington_dir= username= password= # username and password for downloading the IAM database @@ -162,9 +162,9 @@ cat $test_old > $test_new cat $val1_old $val2_old > $val_new if $process_aachen_split; then - local/process_aachen_splits.py data/local aachen_split data/train --dataset train || exit 1 - local/process_aachen_splits.py data/local aachen_split data/test --dataset test || exit 1 - local/process_aachen_splits.py data/local aachen_split data/val --dataset validation || exit 1 + local/process_aachen_splits.py data/local extra/splits data/train --dataset train || exit 1 + local/process_aachen_splits.py data/local extra/splits data/test --dataset test || exit 1 + local/process_aachen_splits.py data/local extra/splits data/val --dataset validation || exit 1 else local/process_data.py data/local data/train --dataset train || exit 1 local/process_data.py data/local data/test --dataset test || exit 1 diff --git a/egs/iam/v2/local/process_aachen_splits.py b/egs/iam/v2/local/process_aachen_splits.py index b8c59d0a7c8..cb6a6d4f0d8 100755 --- a/egs/iam/v2/local/process_aachen_splits.py +++ b/egs/iam/v2/local/process_aachen_splits.py @@ -27,7 +27,7 @@ parser.add_argument('out_dir', type=str, help='location to write output files.') parser.add_argument('--dataset', type=str, default='train', - choices=['train_list', 'dev_list', 'eval_list'], + choices=['train', 'test','validation'], help='Subset of data to process.') args = parser.parse_args() @@ -41,7 +41,7 @@ image_fh = open(image_file, 'w') dataset_path = os.path.join(args.split_path, - args.dataset + '.txt') + args.dataset + '.uttlist') text_file_path = os.path.join(args.database_path, 'ascii','lines.txt') diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh index 44a85928d63..41c6fdc1aec 100755 --- a/egs/iam/v2/run.sh +++ b/egs/iam/v2/run.sh @@ -10,7 +10,7 @@ nj=70 decode_gmm=false username= password= -overwrite=false +process_aachen_split=false # iam_database points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # like "data/download" and follow the instructions @@ -43,7 +43,7 @@ if [ $stage -le 1 ]; then local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ --username "$username" --password "$password" \ - --overwrite $overwrite + --process_aachen_split $process_aachen_split fi mkdir -p data/{train,test}/data diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index a5e8906e406..6ecca67bb9d 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -6,7 +6,7 @@ stage=0 nj=20 username= password= -overwrite=false +process_aachen_split=false # iam_database points to the database path on the JHU grid. 
If you have not # already downloaded the database you can set it to a local directory # like "data/download" and follow the instructions @@ -40,7 +40,7 @@ if [ $stage -le 1 ]; then local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ --username "$username" --password "$password" \ - --overwrite $overwrite + --process_aachen_split $process_aachen_split fi mkdir -p data/{train,test}/data From d7448dfb20fca39df15270d616a09a141ae66fa9 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 5 Sep 2018 07:43:44 -0400 Subject: [PATCH 20/37] modification from review --- egs/cifar/v1/image/copy_data_dir.sh | 118 -------- egs/iam/v2/local/chain/run_cnn.sh | 1 - egs/iam/v2/local/chain/run_cnn_chainali.sh | 1 - egs/iam/v2/local/chain/tuning/run_cnn_1a.sh | 250 ----------------- .../local/chain/tuning/run_cnn_chainali_1a.sh | 250 ----------------- .../local/chain/tuning/run_cnn_chainali_1b.sh | 257 ------------------ .../local/chain/tuning/run_cnn_e2eali_1d.sh | 2 +- egs/iam/v2/run.sh | 200 -------------- egs/iam/v2/run_end2end.sh | 3 +- 9 files changed, 3 insertions(+), 1079 deletions(-) delete mode 100755 egs/cifar/v1/image/copy_data_dir.sh delete mode 120000 egs/iam/v2/local/chain/run_cnn.sh delete mode 120000 egs/iam/v2/local/chain/run_cnn_chainali.sh delete mode 100755 egs/iam/v2/local/chain/tuning/run_cnn_1a.sh delete mode 100755 egs/iam/v2/local/chain/tuning/run_cnn_chainali_1a.sh delete mode 100755 egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh delete mode 100755 egs/iam/v2/run.sh diff --git a/egs/cifar/v1/image/copy_data_dir.sh b/egs/cifar/v1/image/copy_data_dir.sh deleted file mode 100755 index c923f5cc07a..00000000000 --- a/egs/cifar/v1/image/copy_data_dir.sh +++ /dev/null @@ -1,118 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# feats.scp -# images.scp -# vad.scp -# spk2utt -# utt2spk -# text -# -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance and/or speaker names. Note, the recording-ids stay the same. -# - - -# begin configuration section -spk_prefix= -utt_prefix= -spk_suffix= -utt_suffix= -validate_opts= # should rarely be needed. -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1" - echo "Options" - echo " --spk-prefix= # Prefix for speaker ids, default empty" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --spk-suffix= # Suffix for speaker ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -srcdir=$1 -destdir=$2 - -if [ ! -f $srcdir/utt2spk ]; then - echo "copy_data_dir.sh: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map - -if [ ! -f $srcdir/utt2uniq ]; then - if [[ ! -z $utt_prefix || ! 
-z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/images.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/images.scp >$destdir/images.scp -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2uniq utt2dur utt2num_frames text images.scp; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/egs/iam/v2/local/chain/run_cnn.sh b/egs/iam/v2/local/chain/run_cnn.sh deleted file mode 120000 index df6f0a468c1..00000000000 --- a/egs/iam/v2/local/chain/run_cnn.sh +++ /dev/null @@ -1 +0,0 @@ -tuning/run_cnn_1a.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/run_cnn_chainali.sh b/egs/iam/v2/local/chain/run_cnn_chainali.sh deleted file mode 120000 index 86568421fe1..00000000000 --- a/egs/iam/v2/local/chain/run_cnn_chainali.sh +++ /dev/null @@ -1 +0,0 @@ -tuning/run_cnn_chainali_1b.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh deleted file mode 100755 index cf4024c9d16..00000000000 --- a/egs/iam/v2/local/chain/tuning/run_cnn_1a.sh +++ /dev/null @@ -1,250 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Hossein Hadian -# 2017 Chun Chieh Chang -# 2017 Ashish Arora - -# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ -# exp/chain/cnn_1a/: num-iters=42 nj=2..4 num-params=4.4M dim=40->400 combine=-0.039->-0.039 (over 2) xent:train/valid[27,41,final]=(-0.547,-0.404,-0.401/-0.746,-0.685,-0.684) logprob:train/valid[27,41,final]=(-0.046,-0.036,-0.036/-0.072,-0.071,-0.071) - -# ./local/chain/compare_wer.sh exp/chain/cnn_1a/ -# System cnn_1a -# WER 17.05 -# WER (rescored) 16.70 -# CER 9.75 -# CER (rescored) 9.61 -# Final train prob -0.0358 -# Final valid prob -0.0709 -# Final train prob (xent) -0.4013 -# Final valid prob (xent) -0.6841 -# Parameters 4.39M - -set -e -o pipefail - -stage=0 - -nj=30 -train_set=train -gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. 
-affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -ali=tri3_ali -common_egs_dir= -reporting_email= - -# chain options -train_stage=-10 -xent_regularize=0.1 -frame_subsampling_factor=4 -alignment_subsampling_factor=1 -# training chunk-options -chunk_width=340,300,200,100 -num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 -tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_decode=data/lang -lang_rescore=data/lang_rescore_6g -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <$lang/topo - fi -fi - -if [ $stage -le 2 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 3 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. The num-leaves is always somewhat less than the num-leaves from - # the GMM baseline. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." - exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor $frame_subsampling_factor \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$cmd" $num_leaves ${train_data_dir} \ - $lang $ali_dir $tree_dir -fi - - -if [ $stage -le 4 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=40 name=input - - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim - relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim - - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' mod?els... this - # has the effect of regularizing the hidden parts of the model. 
we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 5 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage=$train_stage \ - --cmd="$cmd" \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient=0.1 \ - --chain.l2-regularize=0.00005 \ - --chain.apply-deriv-weights=true \ - --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$frame_subsampling_factor \ - --trainer.srand=$srand \ - --trainer.max-param-change=2.0 \ - --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ - --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=4 \ - --trainer.optimization.initial-effective-lrate=0.001 \ - --trainer.optimization.final-effective-lrate=0.0001 \ - --trainer.optimization.shrink-value=1.0 \ - --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ - --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ - --egs.dir="$common_egs_dir" \ - --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ - --use-gpu=true \ - --reporting.email="$reporting_email" \ - --feat-dir=$train_data_dir \ - --tree-dir=$tree_dir \ - --lat-dir=$lat_dir \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 6 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 7 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 -fi - -echo "Done. Date: $(date). 
Results:" -local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1a.sh deleted file mode 100755 index 07bdac88468..00000000000 --- a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1a.sh +++ /dev/null @@ -1,250 +0,0 @@ -#!/bin/bash - -# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ -# System cnn_chainali_1a -# WER 10.48 -# WER (rescored) 10.23 -# CER 4.82 -# CER (rescored) 4.69 -# Final train prob -0.0444 -# Final valid prob -0.0645 -# Final train prob (xent) -0.4523 -# Final valid prob (xent) -0.5350 -# Parameters 5.65M - -set -e -o pipefail - -stage=0 - -nj=30 -train_set=train -gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -ali=tri3_ali -chain_model_dir=exp/chain${nnet3_affix}/cnn_1a -common_egs_dir= -reporting_email= - -# chain options -train_stage=-10 -xent_regularize=0.1 -frame_subsampling_factor=4 -# training chunk-options -chunk_width=340,300,200,100 -num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 -tdnn_dim=450 -# training options -srand=0 -remove_egs=false -lang_test=lang -lang_decode=data/lang -lang_rescore=data/lang_rescore_6g -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <$lang/topo - fi -fi - -if [ $stage -le 2 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ - --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ - ${train_data_dir} data/lang $chain_model_dir $lat_dir - cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts -fi - -if [ $stage -le 3 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. The num-leaves is always somewhat less than the num-leaves from - # the GMM baseline. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor $frame_subsampling_factor \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$cmd" $num_leaves ${train_data_dir} \ - $lang $ali_dir $tree_dir -fi - - -if [ $stage -le 4 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - cnn_opts="l2-regularize=0.075" - tdnn_opts="l2-regularize=0.075" - output_opts="l2-regularize=0.1" - common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=40 name=input - - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' mod?els... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 5 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage=$train_stage \ - --cmd="$cmd" \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient=0.1 \ - --chain.l2-regularize=0.00005 \ - --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=1 \ - --chain.left-tolerance 3 \ - --chain.right-tolerance 3 \ - --trainer.srand=$srand \ - --trainer.max-param-change=2.0 \ - --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ - --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=4 \ - --trainer.optimization.initial-effective-lrate=0.001 \ - --trainer.optimization.final-effective-lrate=0.0001 \ - --trainer.optimization.shrink-value=1.0 \ - --trainer.num-chunk-per-minibatch=64,32 \ - --trainer.optimization.momentum=0.0 \ - --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ - --egs.dir="$common_egs_dir" \ - --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ - --use-gpu=true \ - --reporting.email="$reporting_email" \ - --feat-dir=$train_data_dir \ - --tree-dir=$tree_dir \ - --lat-dir=$lat_dir \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 6 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. 
- - utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 7 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 -fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh deleted file mode 100755 index 105b8f50854..00000000000 --- a/egs/iam/v2/local/chain/tuning/run_cnn_chainali_1b.sh +++ /dev/null @@ -1,257 +0,0 @@ -#!/bin/bash - -# ./local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b -# System cnn_1a cnn_chainali_1b -# WER 17.05 9.45 -# WER (rescored) 16.70 9.01 -# CER 9.75 4.43 -# CER (rescored) 9.61 4.28 -# Final train prob -0.0358 -0.0522 -# Final valid prob -0.0709 -0.0702 -# Final train prob (xent) -0.4013 -0.4992 -# Final valid prob (xent) -0.6841 -0.5658 -# Parameters 4.39M 5.13M - -# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1b -# exp/chain/cnn_chainali_1b/: num-iters=36 nj=3..5 num-params=5.1M dim=40->400 combine=-0.054->-0.054 (over 1) xent:train/valid[23,35,final]=(-0.769,-0.524,-0.499/-0.773,-0.584,-0.566) logprob:train/valid[23,35,final]=(-0.092,-0.056,-0.052/-0.107,-0.076,-0.070) - -set -e -o pipefail - -stage=0 - -nj=30 -train_set=train -gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it - # should have alignments for the specified training data. -nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. -affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. -ali=tri3_ali -chain_model_dir=exp/chain${nnet3_affix}/cnn_1a -common_egs_dir= -reporting_email= - -# chain options -train_stage=-10 -xent_regularize=0.1 -frame_subsampling_factor=4 -# training chunk-options -chunk_width=340,300,200,100 -num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 -tdnn_dim=550 -# training options -srand=0 -remove_egs=true -lang_decode=data/lang -lang_rescore=data/lang_rescore_6g - -dropout_schedule='0,0@0.20,0.2@0.50,0' -# End configuration section. -echo "$0 $@" # Print the command line for logging - - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! cuda-compiled; then - cat <$lang/topo - fi -fi - -if [ $stage -le 2 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ - --acoustic-scale 1.0 \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ - ${train_data_dir} data/lang $chain_model_dir $lat_dir - cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts -fi - -if [ $stage -le 3 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. The num-leaves is always somewhat less than the num-leaves from - # the GMM baseline. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor $frame_subsampling_factor \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$cmd" $num_leaves ${train_data_dir} \ - $lang $ali_dir $tree_dir -fi - - -if [ $stage -le 4 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" - tdnn_opts="l2-regularize=0.03" - output_opts="l2-regularize=0.04" - common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=40 name=input - - conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 - relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 - relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 - relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 - - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' mod?els... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 5 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage=$train_stage \ - --cmd="$cmd" \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient=0.1 \ - --chain.l2-regularize=0.00005 \ - --chain.apply-deriv-weights=true \ - --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=1 \ - --chain.left-tolerance 3 \ - --chain.right-tolerance 3 \ - --trainer.srand=$srand \ - --trainer.max-param-change=2.0 \ - --trainer.num-epochs=6 \ - --trainer.frames-per-iter=1500000 \ - --trainer.optimization.num-jobs-initial=3 \ - --trainer.optimization.num-jobs-final=5 \ - --trainer.dropout-schedule $dropout_schedule \ - --trainer.optimization.initial-effective-lrate=0.001 \ - --trainer.optimization.final-effective-lrate=0.0001 \ - --trainer.optimization.shrink-value=1.0 \ - --trainer.num-chunk-per-minibatch=32,16 \ - --trainer.optimization.momentum=0.0 \ - --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ - --egs.dir="$common_egs_dir" \ - --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ - --cleanup.remove-egs=$remove_egs \ - --use-gpu=true \ - --reporting.email="$reporting_email" \ - --feat-dir=$train_data_dir \ - --tree-dir=$tree_dir \ - --lat-dir=$lat_dir \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 6 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 7 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 -fi - -echo "Done. Date: $(date). Results:" -local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index 9771245c683..3ed5dd745e5 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -24,7 +24,7 @@ nj=30 train_set=train nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
-e2echain_model_dir=exp/chain/e2e_cnn_1a +e2echain_model_dir=exp/chain/e2e_cnn_1b common_egs_dir= reporting_email= diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh deleted file mode 100755 index 41c6fdc1aec..00000000000 --- a/egs/iam/v2/run.sh +++ /dev/null @@ -1,200 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora -# 2017 Hossein Hadian - -set -e -stage=0 -nj=70 -decode_gmm=false -username= -password= -process_aachen_split=false -# iam_database points to the database path on the JHU grid. If you have not -# already downloaded the database you can set it to a local directory -# like "data/download" and follow the instructions -# in "local/prepare_data.sh" to download the database: -iam_database=/export/corpora5/handwriting_ocr/IAM -# wellington_database points to the database path on the JHU grid. The Wellington -# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). -# This corpus is of written NZ English that can be purchased here: -# "https://www.victoria.ac.nz/lals/resources/corpora-default" -wellington_database=/export/corpora5/Wellington/WWC/ - -. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. - ## This relates to the queue. -. ./path.sh -. ./utils/parse_options.sh # e.g. this parses the above options - # if supplied. - -./local/check_tools.sh - -if [ $stage -le 0 ]; then - if [ -f data/train/text ] && ! $overwrite; then - echo "Not processing, probably script have run from wrong stage" - echo "Exiting with status 1 to avoid data corruption" - exit 1; - fi -fi - -if [ $stage -le 1 ]; then - echo "$0: Preparing data..." - local/prepare_data.sh --download-dir "$iam_database" \ - --wellington-dir "$wellington_database" \ - --username "$username" --password "$password" \ - --process_aachen_split $process_aachen_split -fi -mkdir -p data/{train,test}/data - -if [ $stage -le 2 ]; then - echo "$(date) stage 2: getting allowed image widths for e2e training..." - image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command - # The next command creates a "allowed_lengths.txt" file in data/train - # which will be used by local/make_features.py to enforce the images to - # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. - image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - for set in train test; do - echo "$(date) Extracting features, creating feats.scp file" - local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} - steps/compute_cmvn_stats.sh data/${set} || exit 1; - done - image/fix_data_dir.sh data/train -fi - -if [ $stage -le 3 ]; then - for set in train; do - echo "$(date) stage 3: Performing augmentation, it will double training data" - local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data - steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; - done -fi - -if [ $stage -le 4 ]; then - echo "$0: Preparing BPE..." - # getting non-silence phones. 
- cut -d' ' -f2- data/train/text | \ -python3 <( -cat << "END" -import os, sys, io; -infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); -output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); -phone_dict = dict(); -for line in infile: - line_vect = line.strip().split(); - for word in line_vect: - for phone in word: - phone_dict[phone] = phone; -for phone in phone_dict.keys(): - output.write(phone+ '\n'); -END - ) > data/local/phones.txt - - cut -d' ' -f2- data/train/text > data/local/train_data.txt - cat data/local/phones.txt data/local/train_data.txt | \ - local/prepend_words.py | \ - utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt - for set in test train val train_aug; do - cut -d' ' -f1 data/$set/text > data/$set/ids - cut -d' ' -f2- data/$set/text | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ - | sed 's/@@//g' > data/$set/bpe_text - mv data/$set/text data/$set/text.old - paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text - done -fi - -if [ $stage -le 5 ]; then - echo "$0: Estimating a language model for decoding..." - local/train_lm.sh -fi - -if [ $stage -le 6 ]; then - echo "$0: Preparing dictionary and lang..." - local/prepare_dict.sh - # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. - # So we set --sil-prob to 0.0 - utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ - data/local/dict "" data/lang/temp data/lang - silphonelist=`cat data/lang/phones/silence.csl` - nonsilphonelist=`cat data/lang/phones/nonsilence.csl` - local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo - utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang - - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ - data/local/dict/lexicon.txt data/lang - utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ - data/lang data/lang_rescore_6g -fi - -if [ $stage -le 7 ]; then - steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train_aug \ - data/lang exp/mono -fi - -if [ $stage -le 8 ] && $decode_gmm; then - utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph - - steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ - exp/mono/decode_test -fi - -if [ $stage -le 9 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train_aug data/lang \ - exp/mono exp/mono_ali - - steps/train_deltas.sh --cmd $cmd 500 20000 data/train_aug data/lang \ - exp/mono_ali exp/tri -fi - -if [ $stage -le 10 ] && $decode_gmm; then - utils/mkgraph.sh data/lang exp/tri exp/tri/graph - - steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ - exp/tri/decode_test -fi - -if [ $stage -le 11 ]; then - steps/align_si.sh --nj $nj --cmd $cmd data/train_aug data/lang \ - exp/tri exp/tri_ali - - steps/train_lda_mllt.sh --cmd $cmd \ - --splice-opts "--left-context=3 --right-context=3" 500 20000 \ - data/train_aug data/lang exp/tri_ali exp/tri2 -fi - -if [ $stage -le 12 ] && $decode_gmm; then - utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph - - steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \ - data/test exp/tri2/decode_test -fi - -if [ $stage -le 13 ]; then - steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train_aug data/lang exp/tri2 exp/tri2_ali - - steps/train_sat.sh --cmd $cmd 500 20000 \ - data/train_aug data/lang exp/tri2_ali exp/tri3 -fi - -if [ $stage -le 14 ] && $decode_gmm; then - 
utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph - - steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ - data/test exp/tri3/decode_test -fi - -if [ $stage -le 15 ]; then - steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ - data/train_aug data/lang exp/tri3 exp/tri3_ali -fi - -if [ $stage -le 16 ]; then - local/chain/run_cnn.sh --train_set train_aug -fi - -if [ $stage -le 17 ]; then - local/chain/run_cnn_chainali.sh --train_set train_aug \ - --chain-model-dir exp/chain/cnn_1a --stage 2 -fi diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index 6ecca67bb9d..cfc4653f24b 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -7,6 +7,7 @@ nj=20 username= password= process_aachen_split=false +overwrite=false # iam_database points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # like "data/download" and follow the instructions @@ -134,7 +135,7 @@ if [ $stage -le 8 ]; then steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train + data/train_aug data/lang exp/chain/e2e_cnn_1b exp/chain/e2e_ali_train fi if [ $stage -le 9 ]; then From d7d5c22b63c6c4fe5a964bfb1768f9dba61f5c0d Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 6 Sep 2018 02:12:24 -0400 Subject: [PATCH 21/37] updating parameter and result --- .../local/chain/tuning/run_cnn_e2eali_1d.sh | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index 3ed5dd745e5..bf70991ec2a 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -1,21 +1,20 @@ #!/bin/bash # This script does end2end chain training (i.e. 
from scratch) -# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ exp/chain/cnn_e2eali_1d -# System e2e_cnn_1a cnn_e2eali_1d -# WER 13.59 9.45 -# WER (rescored) 13.27 9.28 -# CER 6.92 4.41 -# CER (rescored) 6.71 4.31 -# Final train prob 0.0345 -0.0451 -# Final valid prob 0.0269 -0.0684 -# Final train prob (xent) -0.4241 -# Final valid prob (xent) -0.5068 -# Parameters 9.52M 5.13M +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ exp/chain/cnn_e2eali_1d +# System e2e_cnn_1b cnn_e2eali_1d +# WER 13.91 9.59 +# WER (rescored) 13.64 9.09 +# CER 7.08 4.49 +# CER (rescored) 6.82 4.35 +# Final train prob 0.0148 -0.0504 +# Final valid prob 0.0105 -0.0716 +# Final train prob (xent) -0.4695 +# Final valid prob (xent) -0.5347 +# Parameters 9.52M 5.08M # steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1d -# exp/chain/cnn_e2eali_1d/: num-iters=36 nj=3..5 num-params=5.1M dim=40->400 combine=-0.047->-0.047 (over 1) xent:train/valid[23,35,final]=(-0.705,-0.446,-0.424/-0.714,-0.523,-0.507) logprob:train/valid[23,35,final]=(-0.095,-0.049,-0.045/-0.110,-0.073,-0.068) - +# exp/chain/cnn_e2eali_1d: num-iters=24 nj=3..5 num-params=5.1M dim=40->400 combine=-0.054->-0.054 (over 1) xent:train/valid[15,23,final]=(-0.727,-0.497,-0.470/-0.734,-0.557,-0.535) logprob:train/valid[15,23,final]=(-0.093,-0.057,-0.050/-0.110,-0.078,-0.072) set -e -o pipefail stage=0 @@ -196,7 +195,7 @@ if [ $stage -le 5 ]; then --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=6 \ - --trainer.frames-per-iter=1500000 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=5 \ --trainer.dropout-schedule $dropout_schedule \ From 43e9af9601992e37f7985611c4b404115c3a13cf Mon Sep 17 00:00:00 2001 From: aarora8 Date: Thu, 6 Sep 2018 12:13:43 -0400 Subject: [PATCH 22/37] updating parameter and result --- .../local/chain/tuning/run_cnn_e2eali_1d.sh | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index bf70991ec2a..d1930efefd7 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -1,20 +1,20 @@ #!/bin/bash # This script does end2end chain training (i.e. 
from scratch) -# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ exp/chain/cnn_e2eali_1d +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ exp/chain/cnn_e2eali_1d # System e2e_cnn_1b cnn_e2eali_1d -# WER 13.91 9.59 -# WER (rescored) 13.64 9.09 -# CER 7.08 4.49 -# CER (rescored) 6.82 4.35 -# Final train prob 0.0148 -0.0504 -# Final valid prob 0.0105 -0.0716 +# WER 13.91 9.32 +# WER (rescored) 13.64 9.07 +# CER 7.08 4.35 +# CER (rescored) 6.82 4.24 +# Final train prob 0.0148 -0.0524 +# Final valid prob 0.0105 -0.0713 # Final train prob (xent) -0.4695 -# Final valid prob (xent) -0.5347 -# Parameters 9.52M 5.08M +# Final valid prob (xent) -0.5310 +# Parameters 9.52M 4.36M # steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1d -# exp/chain/cnn_e2eali_1d: num-iters=24 nj=3..5 num-params=5.1M dim=40->400 combine=-0.054->-0.054 (over 1) xent:train/valid[15,23,final]=(-0.727,-0.497,-0.470/-0.734,-0.557,-0.535) logprob:train/valid[15,23,final]=(-0.093,-0.057,-0.050/-0.110,-0.078,-0.072) +# exp/chain/cnn_e2eali_1d: num-iters=30 nj=3..5 num-params=4.4M dim=40->400 combine=-0.055->-0.055 (over 1) xent:train/valid[19,29,final]=(-0.683,-0.489,-0.469/-0.703,-0.544,-0.531) logprob:train/valid[19,29,final]=(-0.090,-0.057,-0.052/-0.107,-0.076,-0.071) set -e -o pipefail stage=0 @@ -150,7 +150,8 @@ if [ $stage -le 4 ]; then conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 - relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 @@ -194,8 +195,8 @@ if [ $stage -le 5 ]; then --chain.right-tolerance 3 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=6 \ - --trainer.frames-per-iter=2000000 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=5 \ --trainer.dropout-schedule $dropout_schedule \ From 17c506b68e9c4629b55eba3bd4cea8acbdfeedaf Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 7 Sep 2018 00:58:39 -0400 Subject: [PATCH 23/37] adding data preprocessing in test and val --- egs/iam/v2/local/augment_data.sh | 8 ++++---- egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh | 5 +++-- egs/iam/v2/run_end2end.sh | 11 +++++++---- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/egs/iam/v2/local/augment_data.sh b/egs/iam/v2/local/augment_data.sh index 82fa5230a43..31e4a8217ca 100755 --- a/egs/iam/v2/local/augment_data.sh +++ b/egs/iam/v2/local/augment_data.sh @@ -17,11 +17,11 @@ echo "$0 $@" srcdir=$1 outdir=$2 datadir=$3 - +aug_set=aug1 mkdir -p $datadir/augmentations -echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp" +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" -for set in aug1; do +for set in $aug_set; do 
image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ $srcdir $datadir/augmentations/$set cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt @@ -30,5 +30,5 @@ for set in aug1; do done echo " combine original data and data from different augmentations" -utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1 +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index d1930efefd7..c1e98de68b4 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -21,6 +21,7 @@ stage=0 nj=30 train_set=train +test_dir=data/test nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1b @@ -243,10 +244,10 @@ if [ $stage -le 7 ]; then --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + $dir/graph $test_dir $dir/decode_test || exit 1; steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 + $test_dir $dir/decode_test{,_rescored} || exit 1 fi echo "Done. Date: $(date). Results:" diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index cfc4653f24b..ed6ae935401 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -52,12 +52,15 @@ if [ $stage -le 2 ]; then # which will be used by local/make_features.py to enforce the images to # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - for set in train test; do - echo "$(date) Extracting features, creating feats.scp file" - local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} + echo "$(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} steps/compute_cmvn_stats.sh data/${set} || exit 1; done - image/fix_data_dir.sh data/train + utils/fix_data_dir.sh data/train fi if [ $stage -le 3 ]; then From d6407429bd4006db34efa7b8d032215c791f9f0c Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 7 Sep 2018 01:01:06 -0400 Subject: [PATCH 24/37] updating results --- egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index c1e98de68b4..2251595bec0 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -3,10 +3,10 @@ # This script does end2end chain training (i.e. 
from scratch) # local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ exp/chain/cnn_e2eali_1d # System e2e_cnn_1b cnn_e2eali_1d -# WER 13.91 9.32 -# WER (rescored) 13.64 9.07 -# CER 7.08 4.35 -# CER (rescored) 6.82 4.24 +# WER 13.91 8.80 +# WER (rescored) 13.64 8.52 +# CER 7.08 4.06 +# CER (rescored) 6.82 3.98 # Final train prob 0.0148 -0.0524 # Final valid prob 0.0105 -0.0713 # Final train prob (xent) -0.4695 From 94a80ade3373fd1e17c43011a95048b786258ca0 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 7 Sep 2018 01:17:31 -0400 Subject: [PATCH 25/37] replacing prepend words with common prepend words --- egs/iam/v2/local/prepend_words.py | 13 ------------- egs/iam/v2/local/train_lm.sh | 6 +++--- egs/iam/v2/run_end2end.sh | 4 ++-- 3 files changed, 5 insertions(+), 18 deletions(-) delete mode 100755 egs/iam/v2/local/prepend_words.py diff --git a/egs/iam/v2/local/prepend_words.py b/egs/iam/v2/local/prepend_words.py deleted file mode 100755 index d53eb8974bf..00000000000 --- a/egs/iam/v2/local/prepend_words.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# This script, prepend '|' to every words in the transcript to mark -# the beginning of the words for finding the initial-space of every word -# after decoding. - -import sys, io - -infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') -output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') -for line in infile: - output.write(' '.join(["|" + word for word in line.split()]) + '\n') diff --git a/egs/iam/v2/local/train_lm.sh b/egs/iam/v2/local/train_lm.sh index 35eb56b1341..ff674c4de22 100755 --- a/egs/iam/v2/local/train_lm.sh +++ b/egs/iam/v2/local/train_lm.sh @@ -64,14 +64,14 @@ if [ $stage -le 0 ]; then > data/local/lob-train-only.txt fi cat data/local/lob-train-only.txt | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > ${dir}/data/text/lob.txt cat data/local/browncorpus/brown.txt | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > ${dir}/data/text/brown.txt if [ -d "data/local/wellingtoncorpus" ]; then cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > ${dir}/data/text/wellington.txt fi diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index ed6ae935401..d1de8c5c6c2 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -93,12 +93,12 @@ END cut -d' ' -f2- data/train/text > data/local/train_data.txt cat data/local/phones.txt data/local/train_data.txt | \ - local/prepend_words.py | \ + utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt for set in test train val train_aug; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text From 711c3c9739a5ec2a5c0da1a4843827f9c2bb54b2 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 7 Sep 
2018 03:04:57 -0400 Subject: [PATCH 26/37] updating remove_test_utterances_from_lob for aachen split --- .../local/remove_test_utterances_from_lob.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/egs/iam/v2/local/remove_test_utterances_from_lob.py b/egs/iam/v2/local/remove_test_utterances_from_lob.py index 1b414ef47f6..bf3c72e1e2e 100755 --- a/egs/iam/v2/local/remove_test_utterances_from_lob.py +++ b/egs/iam/v2/local/remove_test_utterances_from_lob.py @@ -89,22 +89,26 @@ def read_utterances(text_file_path): remaining_utterances = dict() for line_id, line_to_find in utterance_dict.items(): found_line = False - for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): - # Combine 3 consecutive lines of the corpus into a single line - prev_words = corpus_text_lowercase_wo_sc[i - 1].strip() - curr_words = corpus_text_lowercase_wo_sc[i].strip() - next_words = corpus_text_lowercase_wo_sc[i + 1].strip() - new_line = prev_words + curr_words + next_words - transcript = ''.join(new_line) - if line_to_find in transcript: - found_line = True - row_to_keep[i-1] = False - row_to_keep[i] = False - row_to_keep[i+1] = False + # avoiding very small utterance, it causes removing + # complete lob text + if len(line_to_find) < 13: + remaining_utterances[line_id] = line_to_find + else: + for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): + # Combine 3 consecutive lines of the corpus into a single line + prev_words = corpus_text_lowercase_wo_sc[i - 1].strip() + curr_words = corpus_text_lowercase_wo_sc[i].strip() + next_words = corpus_text_lowercase_wo_sc[i + 1].strip() + new_line = prev_words + curr_words + next_words + transcript = ''.join(new_line) + if line_to_find in transcript: + found_line = True + row_to_keep[i-1] = False + row_to_keep[i] = False + row_to_keep[i+1] = False if not found_line: remaining_utterances[line_id] = line_to_find - for i in range(len(original_corpus_text)): transcript = original_corpus_text[i].strip() if row_to_keep[i]: From 5f2d96066de84b4edb9161f06578e9fc694a8e2e Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 7 Sep 2018 13:28:21 -0400 Subject: [PATCH 27/37] removing data/val/text from train_lm --- egs/iam/v2/local/train_lm.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/iam/v2/local/train_lm.sh b/egs/iam/v2/local/train_lm.sh index ff674c4de22..cc0119eb748 100755 --- a/egs/iam/v2/local/train_lm.sh +++ b/egs/iam/v2/local/train_lm.sh @@ -68,7 +68,8 @@ if [ $stage -le 0 ]; then | sed 's/@@//g' > ${dir}/data/text/lob.txt cat data/local/browncorpus/brown.txt | \ utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ - | sed 's/@@//g' > ${dir}/data/text/brown.txt + | sed 's/@@//g' > ${dir}/brown.txt + tail -n +5000 ${dir}/brown.txt > ${dir}/data/text/brown.txt if [ -d "data/local/wellingtoncorpus" ]; then cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt | \ utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ @@ -78,8 +79,7 @@ if [ $stage -le 0 ]; then # use the validation data as the dev set. # Note: the name 'dev' is treated specially by pocolm, it automatically # becomes the dev set. - - cat data/val/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + head -5000 ${dir}/brown.txt > ${dir}/data/text/dev.txt # use the training data as an additional data source. # we can later fold the dev data into this. 
From 7f2ad0ba4b4b33c6b9cd43d2a31ce7672b04d5db Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 7 Sep 2018 14:32:34 -0400 Subject: [PATCH 28/37] cosmetic fixes in unk arc decoding --- .../v1/local/unk_arc_post_to_transcription.py | 137 +++++++++-------- .../v1/local/unk_arc_post_to_transcription.py | 141 ++++++++++-------- 2 files changed, 159 insertions(+), 119 deletions(-) diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py index c5ad1235427..f8b69820601 100755 --- a/egs/iam/v1/local/unk_arc_post_to_transcription.py +++ b/egs/iam/v1/local/unk_arc_post_to_transcription.py @@ -1,88 +1,107 @@ #!/usr/bin/env python3 -# Copyright 2017 Ashish Arora +#Copyright 2017 Ashish Arora +""" This module will be used by scripts for open vocabulary setup. + If the hypothesis transcription contains , then it will replace the + with the word predicted by model by concatenating phones decoded + from the unk-model. It is currently supported only for triphone setup. + Args: + phones: File name of a file that contains the phones.txt, (symbol-table for phones). + phone and phoneID, Eg. a 217, phoneID of 'a' is 217. + words: File name of a file that contains the words.txt, (symbol-table for words). + word and wordID. Eg. ACCOUNTANCY 234, wordID of 'ACCOUNTANCY' is 234. + unk: ID of . Eg. 231. + one-best-arc-post: A file in arc-post format, which is a list of timing info and posterior + of arcs along the one-best path from the lattice. + E.g. 506_m01-049-00 8 12 1 7722 282 272 288 231 + [] + [ ...] + output-text: File containing hypothesis transcription with recognized by the + unk-model. + E.g. A move to stop mr. gaitskell. + + Eg. local/unk_arc_post_to_transcription.py lang/phones.txt lang/words.txt + data/lang/oov.int +""" import argparse +import os import sys - parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") -parser.add_argument('phones', type=str, help='phones and phonesID') -parser.add_argument('words', type=str, help='word and wordID') -parser.add_argument('unk', type=str, default='-', help='location of unk file') -parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data') -parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data') +parser.add_argument('phones', type=str, help='File name of a file that contains the' + 'symbol-table for phones. Each line must be: ') +parser.add_argument('words', type=str, help='File name of a file that contains the' + 'symbol-table for words. Each line must be: ') +parser.add_argument('unk', type=str, default='-', help='File name of a file that' + 'contains the ID of . The content must be: , e.g. 
231') +parser.add_argument('--one-best-arc-post', type=str, default='-', help='A file in arc-post' + 'format, which is a list of timing info and posterior of arcs' + 'along the one-best path from the lattice') +parser.add_argument('--output-text', type=str, default='-', help='File containing' + 'hypothesis transcription with recognized by the unk-model') args = parser.parse_args() - ### main ### -phone_fh = open(args.phones, 'r', encoding='latin-1') -word_fh = open(args.words, 'r', encoding='latin-1') -unk_fh = open(args.unk, 'r', encoding='latin-1') -if args.input_ark == '-': - input_fh = sys.stdin +phone_handle = open(args.phones, 'r', encoding='latin-1') # Create file handles +word_handle = open(args.words, 'r', encoding='latin-1') +unk_handle = open(args.unk,'r', encoding='latin-1') +if args.one_best_arc_post == '-': + arc_post_handle = sys.stdin else: - input_fh = open(args.input_ark, 'r', encoding='latin-1') -if args.out_ark == '-': - out_fh = sys.stdout + arc_post_handle = open(args.one_best_arc_post, 'r', encoding='latin-1') +if args.output_text == '-': + output_text_handle = sys.stdout else: - out_fh = open(args.out_ark, 'w', encoding='latin-1') + output_text_handle = open(args.output_text, 'w', encoding='latin-1') -phone_dict = dict() # Stores phoneID and phone mapping -phone_data_vect = phone_fh.read().strip().split("\n") -for key_val in phone_data_vect: +id2phone = dict() # Stores the mapping from phone_id (int) to phone (char) +phones_data = phone_handle.read().strip().split("\n") + +for key_val in phones_data: key_val = key_val.split(" ") - phone_dict[key_val[1]] = key_val[0] + id2phone[key_val[1]] = key_val[0] + word_dict = dict() -word_data_vect = word_fh.read().strip().split("\n") +word_data_vect = word_handle.read().strip().split("\n") + for key_val in word_data_vect: key_val = key_val.split(" ") word_dict[key_val[1]] = key_val[0] -unk_val = unk_fh.read().strip().split(" ")[0] +unk_val = unk_handle.read().strip().split(" ")[0] -utt_word_dict = dict() -utt_phone_dict = dict() # Stores utteranceID and phoneID -unk_word_dict = dict() -count=0 -for line in input_fh: +utt_word_dict = dict() # Dict of list, stores mapping from utteranceID(int) to words(str) +for line in arc_post_handle: line_vect = line.strip().split("\t") - if len(line_vect) < 6: - print("Bad line: '{}' Expecting 6 fields. Skipping...".format(line), + if len(line_vect) < 6: # Check for 1best-arc-post output + print("Error: Bad line: '{}' Expecting 6 fields. Skipping...".format(line), file=sys.stderr) continue - uttID = line_vect[0] + utt_id = line_vect[0] word = line_vect[4] phones = line_vect[5] - if uttID in utt_word_dict.keys(): - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - else: - count = 0 - utt_word_dict[uttID] = dict() - utt_phone_dict[uttID] = dict() - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - if word == unk_val: # Get character sequence for unk - phone_key_vect = phones.split(" ") - phone_val_vect = list() - for pkey in phone_key_vect: - phone_val_vect.append(phone_dict[pkey]) + if utt_id not in list(utt_word_dict.keys()): + utt_word_dict[utt_id] = list() + + if word == unk_val: # Get the 1best phone sequence given by the unk-model + phone_id_seq = phones.split(" ") + phone_seq = list() + for pkey in phone_id_seq: + phone_seq.append(id2phone[pkey]) # Convert the phone-id sequence to a phone sequence. 
phone_2_word = list() - for phone_val in phone_val_vect: - phone_2_word.append(phone_val.split('_')[0]) - phone_2_word = ''.join(phone_2_word) - utt_word_dict[uttID][count] = phone_2_word + for phone_val in phone_seq: + phone_2_word.append(phone_val.split('_')[0]) # Removing the world-position markers(e.g. _B) + phone_2_word = ''.join(phone_2_word) # Concatnate phone sequence + utt_word_dict[utt_id].append(phone_2_word) # Store word from unk-model else: - if word == '0': + if word == '0': # Store space/silence word_val = ' ' else: word_val = word_dict[word] - utt_word_dict[uttID][count] = word_val - count += 1 + utt_word_dict[utt_id].append(word_val) # Store word from 1best-arc-post -transcription = "" -for key in sorted(utt_word_dict.keys()): - transcription = key - for index in sorted(utt_word_dict[key].keys()): - value = utt_word_dict[key][index] - transcription = transcription + " " + value - out_fh.write(transcription + '\n') +transcription = "" # Output transcription +for utt_key in sorted(utt_word_dict.keys()): + transcription = utt_key + for word in utt_word_dict[utt_key]: + transcription = transcription + " " + word + output_text_handle.write(transcription + '\n') diff --git a/egs/uw3/v1/local/unk_arc_post_to_transcription.py b/egs/uw3/v1/local/unk_arc_post_to_transcription.py index c86d35e4b8a..f8b69820601 100755 --- a/egs/uw3/v1/local/unk_arc_post_to_transcription.py +++ b/egs/uw3/v1/local/unk_arc_post_to_transcription.py @@ -1,86 +1,107 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 -# Copyright 2017 Ashish Arora +#Copyright 2017 Ashish Arora +""" This module will be used by scripts for open vocabulary setup. + If the hypothesis transcription contains , then it will replace the + with the word predicted by model by concatenating phones decoded + from the unk-model. It is currently supported only for triphone setup. + Args: + phones: File name of a file that contains the phones.txt, (symbol-table for phones). + phone and phoneID, Eg. a 217, phoneID of 'a' is 217. + words: File name of a file that contains the words.txt, (symbol-table for words). + word and wordID. Eg. ACCOUNTANCY 234, wordID of 'ACCOUNTANCY' is 234. + unk: ID of . Eg. 231. + one-best-arc-post: A file in arc-post format, which is a list of timing info and posterior + of arcs along the one-best path from the lattice. + E.g. 506_m01-049-00 8 12 1 7722 282 272 288 231 + [] + [ ...] + output-text: File containing hypothesis transcription with recognized by the + unk-model. + E.g. A move to stop mr. gaitskell. + + Eg. local/unk_arc_post_to_transcription.py lang/phones.txt lang/words.txt + data/lang/oov.int +""" import argparse +import os import sys - parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") -parser.add_argument('phones', type=str, help='phones and phonesID') -parser.add_argument('words', type=str, help='word and wordID') -parser.add_argument('unk', type=str, default='-', help='location of unk file') -parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data') -parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data') +parser.add_argument('phones', type=str, help='File name of a file that contains the' + 'symbol-table for phones. Each line must be: ') +parser.add_argument('words', type=str, help='File name of a file that contains the' + 'symbol-table for words. Each line must be: ') +parser.add_argument('unk', type=str, default='-', help='File name of a file that' + 'contains the ID of . 
The content must be: , e.g. 231') +parser.add_argument('--one-best-arc-post', type=str, default='-', help='A file in arc-post' + 'format, which is a list of timing info and posterior of arcs' + 'along the one-best path from the lattice') +parser.add_argument('--output-text', type=str, default='-', help='File containing' + 'hypothesis transcription with recognized by the unk-model') args = parser.parse_args() + ### main ### -phone_fh = open(args.phones, 'r') -word_fh = open(args.words, 'r') -unk_fh = open(args.unk,'r') -if args.input_ark == '-': - input_fh = sys.stdin +phone_handle = open(args.phones, 'r', encoding='latin-1') # Create file handles +word_handle = open(args.words, 'r', encoding='latin-1') +unk_handle = open(args.unk,'r', encoding='latin-1') +if args.one_best_arc_post == '-': + arc_post_handle = sys.stdin else: - input_fh = open(args.input_ark,'r') -if args.out_ark == '-': - out_fh = sys.stdout + arc_post_handle = open(args.one_best_arc_post, 'r', encoding='latin-1') +if args.output_text == '-': + output_text_handle = sys.stdout else: - out_fh = open(args.out_ark,'wb') + output_text_handle = open(args.output_text, 'w', encoding='latin-1') -phone_dict = dict()# stores phoneID and phone mapping -phone_data_vect = phone_fh.read().strip().split("\n") -for key_val in phone_data_vect: +id2phone = dict() # Stores the mapping from phone_id (int) to phone (char) +phones_data = phone_handle.read().strip().split("\n") + +for key_val in phones_data: key_val = key_val.split(" ") - phone_dict[key_val[1]] = key_val[0] + id2phone[key_val[1]] = key_val[0] + word_dict = dict() -word_data_vect = word_fh.read().strip().split("\n") +word_data_vect = word_handle.read().strip().split("\n") + for key_val in word_data_vect: key_val = key_val.split(" ") word_dict[key_val[1]] = key_val[0] -unk_val = unk_fh.read().strip().split(" ")[0] +unk_val = unk_handle.read().strip().split(" ")[0] -utt_word_dict = dict() -utt_phone_dict = dict()# stores utteranceID and phoneID -unk_word_dict = dict() -count=0 -for line in input_fh: +utt_word_dict = dict() # Dict of list, stores mapping from utteranceID(int) to words(str) +for line in arc_post_handle: line_vect = line.strip().split("\t") - if len(line_vect) < 6: - print "IndexError" - print line_vect + if len(line_vect) < 6: # Check for 1best-arc-post output + print("Error: Bad line: '{}' Expecting 6 fields. Skipping...".format(line), + file=sys.stderr) continue - uttID = line_vect[0] + utt_id = line_vect[0] word = line_vect[4] phones = line_vect[5] - if uttID in utt_word_dict.keys(): - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - else: - count = 0 - utt_word_dict[uttID] = dict() - utt_phone_dict[uttID] = dict() - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - if word == unk_val: # get character sequence for unk - phone_key_vect = phones.split(" ") - phone_val_vect = list() - for pkey in phone_key_vect: - phone_val_vect.append(phone_dict[pkey]) + if utt_id not in list(utt_word_dict.keys()): + utt_word_dict[utt_id] = list() + + if word == unk_val: # Get the 1best phone sequence given by the unk-model + phone_id_seq = phones.split(" ") + phone_seq = list() + for pkey in phone_id_seq: + phone_seq.append(id2phone[pkey]) # Convert the phone-id sequence to a phone sequence. 
phone_2_word = list() - for phone_val in phone_val_vect: - phone_2_word.append(phone_val.split('_')[0]) - phone_2_word = ''.join(phone_2_word) - utt_word_dict[uttID][count] = phone_2_word + for phone_val in phone_seq: + phone_2_word.append(phone_val.split('_')[0]) # Removing the world-position markers(e.g. _B) + phone_2_word = ''.join(phone_2_word) # Concatnate phone sequence + utt_word_dict[utt_id].append(phone_2_word) # Store word from unk-model else: - if word == '0': + if word == '0': # Store space/silence word_val = ' ' else: word_val = word_dict[word] - utt_word_dict[uttID][count] = word_val - count += 1 + utt_word_dict[utt_id].append(word_val) # Store word from 1best-arc-post -transcription = "" -for key in sorted(utt_word_dict.iterkeys()): - transcription = key - for index in sorted(utt_word_dict[key].iterkeys()): - value = utt_word_dict[key][index] - transcription = transcription + " " + value - out_fh.write(transcription + '\n') +transcription = "" # Output transcription +for utt_key in sorted(utt_word_dict.keys()): + transcription = utt_key + for word in utt_word_dict[utt_key]: + transcription = transcription + " " + word + output_text_handle.write(transcription + '\n') From 8f2ac25d50f7ee677987c5c8cf397435ef6e246a Mon Sep 17 00:00:00 2001 From: aarora8 Date: Fri, 7 Sep 2018 16:59:49 -0400 Subject: [PATCH 29/37] adding val data for decoding --- egs/iam/v2/local/chain/compare_wer.sh | 30 +++++++++++++++++++ .../local/chain/tuning/run_cnn_e2eali_1a.sh | 19 ++++++++++++ .../local/chain/tuning/run_cnn_e2eali_1b.sh | 19 ++++++++++++ .../local/chain/tuning/run_cnn_e2eali_1c.sh | 19 ++++++++++++ .../local/chain/tuning/run_cnn_e2eali_1d.sh | 21 +++++++++++-- .../v2/local/chain/tuning/run_e2e_cnn_1a.sh | 14 ++++++++- .../v2/local/chain/tuning/run_e2e_cnn_1b.sh | 14 ++++++++- 7 files changed, 131 insertions(+), 5 deletions(-) diff --git a/egs/iam/v2/local/chain/compare_wer.sh b/egs/iam/v2/local/chain/compare_wer.sh index d4076457463..1488981a348 100755 --- a/egs/iam/v2/local/chain/compare_wer.sh +++ b/egs/iam/v2/local/chain/compare_wer.sh @@ -50,6 +50,36 @@ for x in $*; do done echo +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) val " +for x in $*; do + wer="--" + [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) val " +for x in $*; do + cer="--" + [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh index ba28f681708..a5672417aff 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -22,6 +22,7 @@ stage=0 nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -243,3 +244,21 @@ if [ $stage -le 7 ]; then --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; fi + +if [ $stage -le 8 ] && $decode_val; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/val $dir/decode_val || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/val $dir/decode_val{,_rescored} || exit 1 +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh index 298e7053086..ea27386164d 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -23,6 +23,7 @@ stage=0 nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -249,3 +250,21 @@ if [ $stage -le 7 ]; then steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ data/test $dir/decode_test{,_rescored} || exit 1 fi + +if [ $stage -le 8 ] && $decode_val; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/val $dir/decode_val || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/val $dir/decode_val{,_rescored} || exit 1 +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh index 48e0a76dead..6411a300a12 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -25,6 +25,7 @@ stage=0 nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -251,3 +252,21 @@ if [ $stage -le 7 ]; then steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ data/test $dir/decode_test{,_rescored} || exit 1 fi + +if [ $stage -le 8 ] && $decode_val; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/val $dir/decode_val || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/val $dir/decode_val{,_rescored} || exit 1 +fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index 2251595bec0..a7d3af59038 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -21,7 +21,7 @@ stage=0 nj=30 train_set=train -test_dir=data/test +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1b @@ -244,10 +244,25 @@ if [ $stage -le 7 ]; then --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ - $dir/graph $test_dir $dir/decode_test || exit 1; + $dir/graph data/test $dir/decode_test || exit 1; steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - $test_dir $dir/decode_test{,_rescored} || exit 1 + data/test $dir/decode_test{,_rescored} || exit 1 +fi + +if [ $stage -le 8 ] && $decode_val; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/val $dir/decode_val || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/val $dir/decode_val{,_rescored} || exit 1 fi echo "Done. Date: $(date). Results:" diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh index d88e1a38820..078e4e2255c 100755 --- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh @@ -25,6 +25,7 @@ stage=0 train_stage=-10 get_egs_stage=-10 affix=1a +nj=30 # training options tdnn_dim=450 @@ -37,6 +38,7 @@ l2_regularize=0.00005 frames_per_iter=1000000 cmvn_opts="--norm-means=true --norm-vars=true" train_set=train +decode_val=true lang_decode=data/lang lang_rescore=data/lang_rescore_6g @@ -163,12 +165,22 @@ fi if [ $stage -le 5 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 30 --cmd "$cmd" \ + --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ data/test $dir/decode_test{,_rescored} || exit 1 fi +if [ $stage -le 6 ] && $decode_val; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/val $dir/decode_val || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/val $dir/decode_val{,_rescored} || exit 1 +fi + echo "Done. Date: $(date). 
Results:" local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh index 7fb81c97ea7..db225263ef5 100755 --- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh @@ -23,12 +23,14 @@ stage=0 train_stage=-10 get_egs_stage=-10 affix=1b +nj=30 # training options tdnn_dim=450 minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 common_egs_dir= train_set=train +decode_val=true lang_decode=data/lang lang_rescore=data/lang_rescore_6g @@ -149,12 +151,22 @@ fi if [ $stage -le 5 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 30 --cmd "$cmd" \ + --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ data/test $dir/decode_test{,_rescored} || exit 1 fi +if [ $stage -le 6 ] && $decode_val; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/val $dir/decode_val || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/val $dir/decode_val{,_rescored} || exit 1 +fi + echo "Done. Date: $(date). Results:" local/chain/compare_wer.sh $dir From b8e71b29e70bd1369705357e1d20afb3a6c02da7 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 10 Sep 2018 02:54:48 -0400 Subject: [PATCH 30/37] modification from the review --- egs/iam/v2/local/chain/compare_wer.sh | 12 +++--- .../local/chain/tuning/run_cnn_e2eali_1a.sh | 42 ++++++++----------- .../local/chain/tuning/run_cnn_e2eali_1b.sh | 40 +++++++----------- .../local/chain/tuning/run_cnn_e2eali_1c.sh | 41 +++++++----------- .../local/chain/tuning/run_cnn_e2eali_1d.sh | 40 +++++++----------- .../v2/local/chain/tuning/run_e2e_cnn_1a.sh | 25 ++++------- .../v2/local/chain/tuning/run_e2e_cnn_1b.sh | 27 ++++-------- 7 files changed, 82 insertions(+), 145 deletions(-) diff --git a/egs/iam/v2/local/chain/compare_wer.sh b/egs/iam/v2/local/chain/compare_wer.sh index 1488981a348..2ce14e13694 100755 --- a/egs/iam/v2/local/chain/compare_wer.sh +++ b/egs/iam/v2/local/chain/compare_wer.sh @@ -50,32 +50,32 @@ for x in $*; do done echo -echo -n "# WER val " +echo -n "# WER val " for x in $*; do wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') printf "% 10s" $wer done echo -echo -n "# WER (rescored) val " +echo -n "# WER (rescored) val " for x in $*; do wer="--" - [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}') printf "% 10s" $wer done echo -echo -n "# CER val " +echo -n "# CER val " for x in $*; do cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') printf "% 10s" $cer done echo -echo -n "# CER (rescored) val " +echo -n "# CER (rescored) val " for x in $*; do cer="--" - [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}') printf "% 10s" $cer done echo diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh index a5672417aff..c39f4bfe9e3 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh +++ 
b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -43,7 +43,9 @@ tdnn_dim=450 # training options srand=0 remove_egs=true -lang_test=lang_unk +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging @@ -229,35 +231,25 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; -fi - -if [ $stage -le 8 ] && $decode_val; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/val $dir/decode_val || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/val $dir/decode_val{,_rescored} || exit 1 + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + done fi echo "Done. Date: $(date). Results:" diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh index ea27386164d..cc4fd84a85a 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -46,6 +46,7 @@ srand=0 remove_egs=true lang_decode=data/lang lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. 
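# A minimal sketch (not part of the patch) of the $maybe_val idiom introduced
# above: when decode_val=false the variable expands to nothing, word splitting
# drops it, and the decoding loop only visits "test"; the echo stands in for
# the real decode/rescore commands.
#
#   decode_val=true
#   if $decode_val; then maybe_val=val; else maybe_val= ; fi
#   for decode_set in test $maybe_val; do
#     echo "would decode data/$decode_set into decode_${decode_set}"
#   done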
echo "$0 $@" # Print the command line for logging @@ -238,32 +239,19 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 -fi - -if [ $stage -le 8 ] && $decode_val; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/val $dir/decode_val || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/val $dir/decode_val{,_rescored} || exit 1 + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + done fi echo "Done. Date: $(date). Results:" diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh index 6411a300a12..752cb76c21b 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -48,7 +48,7 @@ srand=0 remove_egs=true lang_decode=data/lang lang_rescore=data/lang_rescore_6g - +if $decode_val; then maybe_val=val; else maybe_val= ; fi dropout_schedule='0,0@0.20,0.2@0.50,0' # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -240,32 +240,19 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 -fi - -if [ $stage -le 8 ] && $decode_val; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/val $dir/decode_val || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/val $dir/decode_val{,_rescored} || exit 1 + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + done fi echo "Done. Date: $(date). Results:" diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index a7d3af59038..45712b76499 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -44,7 +44,7 @@ srand=0 remove_egs=true lang_decode=data/lang lang_rescore=data/lang_rescore_6g - +if $decode_val; then maybe_val=val; else maybe_val= ; fi dropout_schedule='0,0@0.20,0.2@0.50,0' # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -237,33 +237,21 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + done fi -if [ $stage -le 8 ] && $decode_val; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/val $dir/decode_val || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/val $dir/decode_val{,_rescored} || exit 1 -fi echo "Done. Date: $(date). Results:" local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh index 078e4e2255c..4eb3e4bdff1 100755 --- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh @@ -163,23 +163,14 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 -fi - -if [ $stage -le 6 ] && $decode_val; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/val $dir/decode_val || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/val $dir/decode_val{,_rescored} || exit 1 + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + done fi echo "Done. Date: $(date). 
Results:" diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh index db225263ef5..495d5076cfc 100755 --- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh @@ -33,7 +33,7 @@ train_set=train decode_val=true lang_decode=data/lang lang_rescore=data/lang_rescore_6g - +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging @@ -149,23 +149,14 @@ if [ $stage -le 4 ]; then fi if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 -fi - -if [ $stage -le 6 ] && $decode_val; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/val $dir/decode_val || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/val $dir/decode_val{,_rescored} || exit 1 + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + done fi echo "Done. Date: $(date). Results:" From e9a75f6cfaffdcba4d9f372944ed3ddbf21a2747 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 10 Sep 2018 03:03:38 -0400 Subject: [PATCH 31/37] modification from review --- egs/iam/v2/run_end2end.sh | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index d1de8c5c6c2..c515c85fc72 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -29,14 +29,13 @@ wellington_database=/export/corpora5/Wellington/WWC/ ./local/check_tools.sh if [ $stage -le 0 ]; then + if [ -f data/train/text ] && ! $overwrite; then - echo "Not processing, probably script have run from wrong stage" + echo "$0: Not processing, probably script have run from wrong stage" echo "Exiting with status 1 to avoid data corruption" exit 1; fi -fi -if [ $stage -le 1 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ @@ -45,8 +44,8 @@ if [ $stage -le 1 ]; then fi mkdir -p data/{train,test}/data -if [ $stage -le 2 ]; then - echo "$(date) stage 2: getting allowed image widths for e2e training..." +if [ $stage -le 1 ]; then + echo "$(date) stage 1: getting allowed image widths for e2e training..." 
image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to @@ -63,15 +62,15 @@ if [ $stage -le 2 ]; then utils/fix_data_dir.sh data/train fi -if [ $stage -le 3 ]; then +if [ $stage -le 2 ]; then for set in train; do - echo "$(date) stage 3: Performing augmentation, it will double training data" + echo "$(date) stage 2: Performing augmentation, it will double training data" local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; done fi -if [ $stage -le 4 ]; then +if [ $stage -le 3 ]; then echo "$0: Preparing BPE..." # getting non-silence phones. cut -d' ' -f2- data/train/text | \ @@ -105,12 +104,12 @@ END done fi -if [ $stage -le 5 ]; then +if [ $stage -le 4 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh fi -if [ $stage -le 6 ]; then +if [ $stage -le 5 ]; then echo "$0: Preparing dictionary and lang..." local/prepare_dict.sh # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. @@ -128,12 +127,12 @@ if [ $stage -le 6 ]; then data/lang data/lang_rescore_6g fi -if [ $stage -le 7 ]; then +if [ $stage -le 6 ]; then echo "$0: Calling the flat-start chain recipe..." local/chain/run_e2e_cnn.sh --train_set train_aug fi -if [ $stage -le 8 ]; then +if [ $stage -le 7 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ @@ -141,7 +140,7 @@ if [ $stage -le 8 ]; then data/train_aug data/lang exp/chain/e2e_cnn_1b exp/chain/e2e_ali_train fi -if [ $stage -le 9 ]; then +if [ $stage -le 8 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments..." local/chain/run_cnn_e2eali.sh --train_set train_aug fi From ae674ed137e8e4652368b9875855531eea11b0a7 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 10 Sep 2018 03:14:56 -0400 Subject: [PATCH 32/37] modification from review --- egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index 45712b76499..aeafce2baf4 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -1,5 +1,8 @@ #!/bin/bash +# e2eali_1c is the same as e2eali_1c but has more CNN layers, different filter size +# smaller lm-opts, less epochs, more initial/finaljobs, less minibatch, frams-per-iter. + # This script does end2end chain training (i.e. 
from scratch) # local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ exp/chain/cnn_e2eali_1d # System e2e_cnn_1b cnn_e2eali_1d From 7651f37607287ffa896049f716ce3ef470bc10f7 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 10 Sep 2018 19:00:52 -0400 Subject: [PATCH 33/37] modification for downloading aachen splits --- egs/iam/v2/local/prepare_data.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/egs/iam/v2/local/prepare_data.sh b/egs/iam/v2/local/prepare_data.sh index 8c2505601cc..abfc5327149 100755 --- a/egs/iam/v2/local/prepare_data.sh +++ b/egs/iam/v2/local/prepare_data.sh @@ -54,6 +54,8 @@ ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip wellington_corpus_loc=/export/corpora5/Wellington/WWC/ +aachen_split_url=http://www.openslr.org/resources/56/splits.zip +aachen_splits=data/local/aachensplits mkdir -p $download_dir data/local # download and extact images and transcription @@ -145,6 +147,19 @@ else echo "$0: Wellington Corpus not included because wellington_dir not provided" fi +if [ -d $aachen_splits ]; then + echo "$0: Not downloading the Aachen splits as it is already there." +else + if [ ! -f $aachen_splits/splits.zip ]; then + echo "$0: Downloading Aachen splits ..." + mkdir -p $aachen_splits + wget -P $aachen_splits/ $aachen_split_url || exit 1; + fi + unzip $aachen_splits/splits.zip -d $aachen_splits || exit 1; + echo "$0: Done downloading and extracting Aachen splits" +fi + + mkdir -p data/{train,test,val} file_name=largeWriterIndependentTextLineRecognitionTask From 417d97cef6bc1743485775786cd40be39e7f2cba Mon Sep 17 00:00:00 2001 From: aarora8 Date: Mon, 10 Sep 2018 22:03:30 -0400 Subject: [PATCH 34/37] fixing bug in rescoring --- egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh | 2 +- egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh | 2 +- egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh | 2 +- egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh | 2 +- egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh | 2 +- egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh | 2 +- egs/iam/v2/local/prepare_data.sh | 6 +++--- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh index c39f4bfe9e3..a80bb02290b 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -248,7 +248,7 @@ if [ $stage -le 7 ]; then $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 done fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh index cc4fd84a85a..6615c4669d6 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -250,7 +250,7 @@ if [ $stage -le 7 ]; then $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 done fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh 
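# Why the decode_${decode_set}{,_rescored} fix matters (illustrative sketch,
# not part of the patch): bash performs brace expansion before parameter
# expansion, so the old decode_$decode_set{,_rescored} produced a second word
# that references the unset variable $decode_set_rescored and collapses to
# "$dir/decode_".  The literal decode_test{,_rescored} used before the loop
# refactor was safe only because no variable was involved.
#
#   decode_set=test
#   echo decode_$decode_set{,_rescored}     # -> decode_test decode_
#   echo decode_${decode_set}{,_rescored}   # -> decode_test decode_test_rescored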
index 752cb76c21b..f44c073635e 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -251,7 +251,7 @@ if [ $stage -le 7 ]; then $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 done fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index aeafce2baf4..7395781dd96 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -251,7 +251,7 @@ if [ $stage -le 7 ]; then $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 done fi diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh index 4eb3e4bdff1..cb2bfa0a82d 100755 --- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh @@ -169,7 +169,7 @@ if [ $stage -le 5 ]; then $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 done fi diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh index 495d5076cfc..d5f79602695 100755 --- a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh @@ -155,7 +155,7 @@ if [ $stage -le 5 ]; then $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/$decode_set $dir/decode_$decode_set{,_rescored} || exit 1 + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 done fi diff --git a/egs/iam/v2/local/prepare_data.sh b/egs/iam/v2/local/prepare_data.sh index abfc5327149..cf729d9a939 100755 --- a/egs/iam/v2/local/prepare_data.sh +++ b/egs/iam/v2/local/prepare_data.sh @@ -177,9 +177,9 @@ cat $test_old > $test_new cat $val1_old $val2_old > $val_new if $process_aachen_split; then - local/process_aachen_splits.py data/local extra/splits data/train --dataset train || exit 1 - local/process_aachen_splits.py data/local extra/splits data/test --dataset test || exit 1 - local/process_aachen_splits.py data/local extra/splits data/val --dataset validation || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/train --dataset train || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/test --dataset test || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/val --dataset validation || exit 1 else local/process_data.py data/local data/train --dataset train || exit 1 local/process_data.py data/local data/test --dataset test || exit 1 From 6a865314e863b831e0a20709052b46fd49cf3173 Mon Sep 17 00:00:00 2001 From: aarora8 Date: Tue, 11 Sep 2018 23:43:29 -0400 Subject: [PATCH 35/37] hardcoding for removing only remaining long utterence --- .../local/remove_test_utterances_from_lob.py | 21 ++++++++++++++++++- 1 file changed, 
20 insertions(+), 1 deletion(-) diff --git a/egs/iam/v2/local/remove_test_utterances_from_lob.py b/egs/iam/v2/local/remove_test_utterances_from_lob.py index bf3c72e1e2e..6d8c0780b37 100755 --- a/egs/iam/v2/local/remove_test_utterances_from_lob.py +++ b/egs/iam/v2/local/remove_test_utterances_from_lob.py @@ -91,7 +91,7 @@ def read_utterances(text_file_path): found_line = False # avoiding very small utterance, it causes removing # complete lob text - if len(line_to_find) < 13: + if len(line_to_find) < 10: remaining_utterances[line_id] = line_to_find else: for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): @@ -109,6 +109,25 @@ def read_utterances(text_file_path): if not found_line: remaining_utterances[line_id] = line_to_find +# removing long utterances not found above +row_to_keep[87530] = False; row_to_keep[87531] = False; row_to_keep[87532] = False +row_to_keep[31724] = False; row_to_keep[31725] = False; row_to_keep[31726] = False +row_to_keep[16704] = False; row_to_keep[16705] = False; row_to_keep[16706] = False; +row_to_keep[94181] = False; row_to_keep[94182] = False; row_to_keep[94183] = False; +row_to_keep[20171] = False; row_to_keep[20172] = False; row_to_keep[20173] = False; +row_to_keep[16734] = False; row_to_keep[16733] = False; row_to_keep[16732] = False; +row_to_keep[20576] = False; row_to_keep[20577] = False; row_to_keep[20578] = False; +row_to_keep[31715] = False; row_to_keep[31716] = False; row_to_keep[31717] = False; +row_to_keep[31808] = False; row_to_keep[31809] = False; row_to_keep[31810] = False; +row_to_keep[31822] = False; row_to_keep[31823] = False; row_to_keep[31824] = False; +row_to_keep[88791] = False; row_to_keep[88792] = False; row_to_keep[88793] = False; +row_to_keep[31745] = False; row_to_keep[31746] = False; row_to_keep[31825] = False; +row_to_keep[94256] = False; row_to_keep[94257] = False; row_to_keep[88794] = False; +row_to_keep[88665] = False; row_to_keep[17093] = False; row_to_keep[17094] = False; +row_to_keep[20586] = False; row_to_keep[87228] = False; row_to_keep[87229] = False; +row_to_keep[16744] = False; row_to_keep[87905] = False; row_to_keep[87906] = False; +row_to_keep[16669] = False; row_to_keep[16670] = False; row_to_keep[16719] = False; +row_to_keep[87515] = False; row_to_keep[20090] = False; row_to_keep[31748] = False; for i in range(len(original_corpus_text)): transcript = original_corpus_text[i].strip() if row_to_keep[i]: From ba07ff0171e6894fe7f5daac813f971564eb5b3e Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 12 Sep 2018 02:04:57 -0400 Subject: [PATCH 36/37] fix in hardcoding --- egs/iam/v2/local/remove_test_utterances_from_lob.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/egs/iam/v2/local/remove_test_utterances_from_lob.py b/egs/iam/v2/local/remove_test_utterances_from_lob.py index 6d8c0780b37..5e5dac52818 100755 --- a/egs/iam/v2/local/remove_test_utterances_from_lob.py +++ b/egs/iam/v2/local/remove_test_utterances_from_lob.py @@ -27,6 +27,8 @@ def remove_punctuations(transcript): continue if char == '(' or char == ':' or char == ';' or char == '"': continue + if char == '*': + continue char_list.append(char) return char_list @@ -110,8 +112,8 @@ def read_utterances(text_file_path): remaining_utterances[line_id] = line_to_find # removing long utterances not found above -row_to_keep[87530] = False; row_to_keep[87531] = False; row_to_keep[87532] = False -row_to_keep[31724] = False; row_to_keep[31725] = False; row_to_keep[31726] = False +row_to_keep[87530] = False; row_to_keep[87531] = False; 
row_to_keep[87532] = False; +row_to_keep[31724] = False; row_to_keep[31725] = False; row_to_keep[31726] = False; row_to_keep[16704] = False; row_to_keep[16705] = False; row_to_keep[16706] = False; row_to_keep[94181] = False; row_to_keep[94182] = False; row_to_keep[94183] = False; row_to_keep[20171] = False; row_to_keep[20172] = False; row_to_keep[20173] = False; From 53984120eebedf853fd2c24f640aa7208112a1df Mon Sep 17 00:00:00 2001 From: aarora8 Date: Wed, 12 Sep 2018 17:20:09 -0400 Subject: [PATCH 37/37] modification from review --- egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh index 7395781dd96..e7d9246fb89 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -1,9 +1,8 @@ #!/bin/bash -# e2eali_1c is the same as e2eali_1c but has more CNN layers, different filter size -# smaller lm-opts, less epochs, more initial/finaljobs, less minibatch, frams-per-iter. +# e2eali_1d is the same as e2eali_1c but has more CNN layers, different filter sizes, +# smaller lm-opts, minibatch and frames-per-iter, fewer epochs, and more initial/final jobs. -# This script does end2end chain training (i.e. from scratch) # local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ exp/chain/cnn_e2eali_1d # System e2e_cnn_1b cnn_e2eali_1d # WER 13.91 8.80