diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py
index c5ad1235427..f8b69820601 100755
--- a/egs/iam/v1/local/unk_arc_post_to_transcription.py
+++ b/egs/iam/v1/local/unk_arc_post_to_transcription.py
@@ -1,88 +1,107 @@
 #!/usr/bin/env python3
-# Copyright 2017 Ashish Arora
+#Copyright 2017 Ashish Arora
+""" This module is used by scripts for the open-vocabulary setup.
+ If the hypothesis transcription contains <unk>, it replaces the <unk>
+ with the word predicted by the unk-model, obtained by concatenating the
+ phones decoded from the unk-model. Currently only the triphone setup is supported.
+ Args:
+  phones: File name of the phones.txt symbol table for phones, i.e.
+          phone and phone-id per line. E.g. "a 217": the phone-id of 'a' is 217.
+  words: File name of the words.txt symbol table for words, i.e.
+         word and word-id per line. E.g. "ACCOUNTANCY 234": the word-id of 'ACCOUNTANCY' is 234.
+  unk: ID of <unk>. E.g. 231.
+  one-best-arc-post: A file in arc-post format, i.e. a list of timing info and posteriors
+                     of arcs along the one-best path from the lattice.
+                     E.g. 506_m01-049-00 8 12 1 7722 282 272 288 231
+                     []
+                     [ ...]
+  output-text: File containing the hypothesis transcription, with each <unk> replaced by
+               the word recognized by the unk-model.
+               E.g. A move to stop mr. gaitskell.
+
+ Eg. local/unk_arc_post_to_transcription.py lang/phones.txt lang/words.txt
+     data/lang/oov.int
+"""
 import argparse
+import os
 import sys
-
 parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""")
-parser.add_argument('phones', type=str, help='phones and phonesID')
-parser.add_argument('words', type=str, help='word and wordID')
-parser.add_argument('unk', type=str, default='-', help='location of unk file')
-parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data')
-parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data')
+parser.add_argument('phones', type=str, help='File name of a file that contains the'
+                    ' symbol table for phones. Each line must be: <phone> <phone-id>')
+parser.add_argument('words', type=str, help='File name of a file that contains the'
+                    ' symbol table for words. Each line must be: <word> <word-id>')
+parser.add_argument('unk', type=str, default='-', help='File name of a file that'
+                    ' contains the ID of <unk>. The content must be: <id of unk>, e.g.
231') +parser.add_argument('--one-best-arc-post', type=str, default='-', help='A file in arc-post' + 'format, which is a list of timing info and posterior of arcs' + 'along the one-best path from the lattice') +parser.add_argument('--output-text', type=str, default='-', help='File containing' + 'hypothesis transcription with recognized by the unk-model') args = parser.parse_args() - ### main ### -phone_fh = open(args.phones, 'r', encoding='latin-1') -word_fh = open(args.words, 'r', encoding='latin-1') -unk_fh = open(args.unk, 'r', encoding='latin-1') -if args.input_ark == '-': - input_fh = sys.stdin +phone_handle = open(args.phones, 'r', encoding='latin-1') # Create file handles +word_handle = open(args.words, 'r', encoding='latin-1') +unk_handle = open(args.unk,'r', encoding='latin-1') +if args.one_best_arc_post == '-': + arc_post_handle = sys.stdin else: - input_fh = open(args.input_ark, 'r', encoding='latin-1') -if args.out_ark == '-': - out_fh = sys.stdout + arc_post_handle = open(args.one_best_arc_post, 'r', encoding='latin-1') +if args.output_text == '-': + output_text_handle = sys.stdout else: - out_fh = open(args.out_ark, 'w', encoding='latin-1') + output_text_handle = open(args.output_text, 'w', encoding='latin-1') -phone_dict = dict() # Stores phoneID and phone mapping -phone_data_vect = phone_fh.read().strip().split("\n") -for key_val in phone_data_vect: +id2phone = dict() # Stores the mapping from phone_id (int) to phone (char) +phones_data = phone_handle.read().strip().split("\n") + +for key_val in phones_data: key_val = key_val.split(" ") - phone_dict[key_val[1]] = key_val[0] + id2phone[key_val[1]] = key_val[0] + word_dict = dict() -word_data_vect = word_fh.read().strip().split("\n") +word_data_vect = word_handle.read().strip().split("\n") + for key_val in word_data_vect: key_val = key_val.split(" ") word_dict[key_val[1]] = key_val[0] -unk_val = unk_fh.read().strip().split(" ")[0] +unk_val = unk_handle.read().strip().split(" ")[0] -utt_word_dict = dict() -utt_phone_dict = dict() # Stores utteranceID and phoneID -unk_word_dict = dict() -count=0 -for line in input_fh: +utt_word_dict = dict() # Dict of list, stores mapping from utteranceID(int) to words(str) +for line in arc_post_handle: line_vect = line.strip().split("\t") - if len(line_vect) < 6: - print("Bad line: '{}' Expecting 6 fields. Skipping...".format(line), + if len(line_vect) < 6: # Check for 1best-arc-post output + print("Error: Bad line: '{}' Expecting 6 fields. Skipping...".format(line), file=sys.stderr) continue - uttID = line_vect[0] + utt_id = line_vect[0] word = line_vect[4] phones = line_vect[5] - if uttID in utt_word_dict.keys(): - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - else: - count = 0 - utt_word_dict[uttID] = dict() - utt_phone_dict[uttID] = dict() - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - if word == unk_val: # Get character sequence for unk - phone_key_vect = phones.split(" ") - phone_val_vect = list() - for pkey in phone_key_vect: - phone_val_vect.append(phone_dict[pkey]) + if utt_id not in list(utt_word_dict.keys()): + utt_word_dict[utt_id] = list() + + if word == unk_val: # Get the 1best phone sequence given by the unk-model + phone_id_seq = phones.split(" ") + phone_seq = list() + for pkey in phone_id_seq: + phone_seq.append(id2phone[pkey]) # Convert the phone-id sequence to a phone sequence. 
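For reference, the <unk> branch above can be exercised in isolation. The sketch below mimics it with an invented symbol table and phone-id sequence (it is not part of the patch): phone IDs are looked up in id2phone, the word-position markers (_B/_I/_E/_S) are stripped, and the remaining characters are concatenated into a word.

# Minimal sketch of the <unk> branch above; id2phone and the IDs are invented examples.
id2phone = {"217": "a_B", "218": "c_I", "219": "t_E"}

def unk_phones_to_word(phone_id_seq):
    """Convert a space-separated phone-id string into a word by dropping the
    word-position suffix of each phone and concatenating what remains."""
    phones = [id2phone[pid] for pid in phone_id_seq.split()]
    return ''.join(p.split('_')[0] for p in phones)

print(unk_phones_to_word("217 218 219"))  # -> act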
phone_2_word = list() - for phone_val in phone_val_vect: - phone_2_word.append(phone_val.split('_')[0]) - phone_2_word = ''.join(phone_2_word) - utt_word_dict[uttID][count] = phone_2_word + for phone_val in phone_seq: + phone_2_word.append(phone_val.split('_')[0]) # Removing the world-position markers(e.g. _B) + phone_2_word = ''.join(phone_2_word) # Concatnate phone sequence + utt_word_dict[utt_id].append(phone_2_word) # Store word from unk-model else: - if word == '0': + if word == '0': # Store space/silence word_val = ' ' else: word_val = word_dict[word] - utt_word_dict[uttID][count] = word_val - count += 1 + utt_word_dict[utt_id].append(word_val) # Store word from 1best-arc-post -transcription = "" -for key in sorted(utt_word_dict.keys()): - transcription = key - for index in sorted(utt_word_dict[key].keys()): - value = utt_word_dict[key][index] - transcription = transcription + " " + value - out_fh.write(transcription + '\n') +transcription = "" # Output transcription +for utt_key in sorted(utt_word_dict.keys()): + transcription = utt_key + for word in utt_word_dict[utt_key]: + transcription = transcription + " " + word + output_text_handle.write(transcription + '\n') diff --git a/egs/iam/v2/cmd.sh b/egs/iam/v2/cmd.sh old mode 100644 new mode 100755 diff --git a/egs/iam/v2/local/augment_data.sh b/egs/iam/v2/local/augment_data.sh new file mode 100755 index 00000000000..31e4a8217ca --- /dev/null +++ b/egs/iam/v2/local/augment_data.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --fliplr false --augment true $datadir/augmentations/$set +done + +echo " combine original data and data from different augmentations" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/iam/v2/local/chain/compare_wer.sh b/egs/iam/v2/local/chain/compare_wer.sh index d4076457463..2ce14e13694 100755 --- a/egs/iam/v2/local/chain/compare_wer.sh +++ b/egs/iam/v2/local/chain/compare_wer.sh @@ -50,6 +50,36 @@ for x in $*; do done echo +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) val " +for x in $*; do + wer="--" + [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) val " +for x in $*; do + cer="--" + [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained 
systems. fi diff --git a/egs/iam/v2/local/chain/run_cnn_e2eali.sh b/egs/iam/v2/local/chain/run_cnn_e2eali.sh index ad51803ab0e..da731bcb0b1 120000 --- a/egs/iam/v2/local/chain/run_cnn_e2eali.sh +++ b/egs/iam/v2/local/chain/run_cnn_e2eali.sh @@ -1 +1 @@ -tuning/run_cnn_e2eali_1c.sh \ No newline at end of file +tuning/run_cnn_e2eali_1d.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh deleted file mode 100755 index 15bdf610cd3..00000000000 --- a/egs/iam/v2/local/chain/run_e2e_cnn.sh +++ /dev/null @@ -1,174 +0,0 @@ -#!/bin/bash -# Copyright 2017 Hossein Hadian - -# This script does end2end chain training (i.e. from scratch) - -# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a -# System cnn_1a cnn_chainali_1c e2e_cnn_1a -# WER 18.52 12.72 12.15 -# CER 10.07 5.99 6.03 -# Final train prob -0.0077 -0.0291 -0.0371 -# Final valid prob -0.0970 -0.0359 -0.0636 -# Final train prob (xent) -0.5484 -0.9781 -# Final valid prob (xent) -0.9643 -1.1544 -# Parameters 4.36M 3.96M 9.13M - -# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a -# exp/chain/e2e_cnn_1a: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.033->-0.033 (over 1) logprob:train/valid[13,20,final]=(-0.058,-0.042,-0.035/-0.070,-0.064,-0.059) - -set -e - -# configs for 'chain' -stage=0 -train_stage=-10 -get_egs_stage=-10 -affix=1a - -# training options -tdnn_dim=450 -num_epochs=4 -num_jobs_initial=2 -num_jobs_final=4 -minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 -common_egs_dir= -l2_regularize=0.00005 -frames_per_iter=1000000 -cmvn_opts="--norm-means=true --norm-vars=true" -train_set=train -lang_decode=data/lang -lang_rescore=data/lang_rescore_6g - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! 
cuda-compiled; then - cat <$lang/topo -fi - -if [ $stage -le 1 ]; then - steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ - --shared-phones true \ - --type biphone \ - data/$train_set $lang $treedir - $cmd $treedir/log/make_phone_lm.log \ - cat data/$train_set/text \| \ - steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ - utils/sym2int.pl -f 2- data/lang/phones.txt \| \ - chain-est-phone-lm --num-extra-lm-states=500 \ - ark:- $treedir/phone_lm.fst -fi - -if [ $stage -le 2 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - - cnn_opts="l2-regularize=0.075" - tdnn_opts="l2-regularize=0.075" - output_opts="l2-regularize=0.1" - common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=40 name=input - - conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 - conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 - conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 - conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts - - ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts -EOF - - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs -fi - -if [ $stage -le 3 ]; then - # no need to store the egs in a shared storage because we always - # remove them. Anyway, it takes only 5 minutes to generate them. 
- - steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ - --cmd "$cmd" \ - --feat.cmvn-opts "$cmvn_opts" \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ - --chain.apply-deriv-weights false \ - --egs.dir "$common_egs_dir" \ - --egs.stage $get_egs_stage \ - --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ - --chain.frame-subsampling-factor 4 \ - --chain.alignment-subsampling-factor 4 \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate 0.001 \ - --trainer.optimization.final-effective-lrate 0.0001 \ - --trainer.optimization.shrink-value 1.0 \ - --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ - --feat-dir data/${train_set} \ - --tree-dir $treedir \ - --dir $dir || exit 1; -fi - -if [ $stage -le 4 ]; then - # The reason we are using data/lang here, instead of $lang, is just to - # emphasize that it's not actually important to give mkgraph.sh the - # lang directory with the matched topology (since it gets the - # topology file from the model). So you could give it a different - # lang directory, one that contained a wordlist and LM of your choice, - # as long as phones.txt was compatible. - - utils/mkgraph.sh \ - --self-loop-scale 1.0 $lang_decode \ - $dir $dir/graph || exit 1; -fi - -if [ $stage -le 5 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 30 --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 -fi - -echo "Done. Date: $(date). Results:" -local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..7dca9c30e23 --- /dev/null +++ b/egs/iam/v2/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1b.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh index ba28f681708..a80bb02290b 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -22,6 +22,7 @@ stage=0 nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -42,7 +43,9 @@ tdnn_dim=450 # training options srand=0 remove_egs=true -lang_test=lang_unk +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging @@ -228,18 +231,26 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. 
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh index 298e7053086..6615c4669d6 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -23,6 +23,7 @@ stage=0 nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -45,6 +46,7 @@ srand=0 remove_egs=true lang_decode=data/lang lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi # End configuration section. echo "$0 $@" # Print the command line for logging @@ -237,15 +239,20 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh index 48e0a76dead..f44c073635e 100755 --- a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -25,6 +25,7 @@ stage=0 nj=30 train_set=train +decode_val=true nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. 
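The decode_val option added here produces decode_val directories whose scoring_kaldi/best_wer files compare_wer.sh (extended earlier in this patch) reads with awk '{print $2}'. A rough Python equivalent of that lookup, assuming best_wer holds the usual single "%WER <number> [ ... ]" line that Kaldi scoring writes (the path below is only an example):

# Rough Python counterpart of the WER extraction done in compare_wer.sh.
def read_best_wer(best_wer_path):
    with open(best_wer_path) as f:
        fields = f.readline().split()
    return fields[1] if len(fields) > 1 else "--"

# read_best_wer("exp/chain/cnn_e2eali_1d/decode_val/scoring_kaldi/best_wer")  # e.g. "8.80"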
affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. e2echain_model_dir=exp/chain/e2e_cnn_1a @@ -47,7 +48,7 @@ srand=0 remove_egs=true lang_decode=data/lang lang_rescore=data/lang_rescore_6g - +if $decode_val; then maybe_val=val; else maybe_val= ; fi dropout_schedule='0,0@0.20,0.2@0.50,0' # End configuration section. echo "$0 $@" # Print the command line for logging @@ -239,15 +240,20 @@ fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ - $dir/graph data/test $dir/decode_test || exit 1; - - steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ - data/test $dir/decode_test{,_rescored} || exit 1 + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh new file mode 100755 index 00000000000..e7d9246fb89 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1d.sh @@ -0,0 +1,259 @@ +#!/bin/bash + +# e2eali_1d is the same as e2eali_1c but has more CNN layers, different filter size +# smaller lm-opts, minibatch, frams-per-iter, less epochs and more initial/finaljobs. + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ exp/chain/cnn_e2eali_1d +# System e2e_cnn_1b cnn_e2eali_1d +# WER 13.91 8.80 +# WER (rescored) 13.64 8.52 +# CER 7.08 4.06 +# CER (rescored) 6.82 3.98 +# Final train prob 0.0148 -0.0524 +# Final valid prob 0.0105 -0.0713 +# Final train prob (xent) -0.4695 +# Final valid prob (xent) -0.5310 +# Parameters 9.52M 4.36M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1d +# exp/chain/cnn_e2eali_1d: num-iters=30 nj=3..5 num-params=4.4M dim=40->400 combine=-0.055->-0.055 (over 1) xent:train/valid[19,29,final]=(-0.683,-0.489,-0.469/-0.703,-0.544,-0.531) logprob:train/valid[19,29,final]=(-0.090,-0.057,-0.052/-0.107,-0.076,-0.071) +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1d #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1b +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. 
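The dropout_schedule='0,0@0.20,0.2@0.50,0' used by these e2eali recipes is, as I read Kaldi's nnet3 training scripts, a list of value@data-fraction breakpoints with linear interpolation between them (the first and last entries defaulting to fractions 0.0 and 1.0): dropout stays at 0 for the first 20% of training, ramps up to 0.2 at 50%, then decays back to 0. A small sketch under that assumption:

# Sketch of the assumed dropout-schedule semantics; verify against
# steps/libs/nnet3/train/common.py before relying on it.
def dropout_at(schedule, data_fraction):
    points = []
    for i, piece in enumerate(schedule.split(',')):
        if '@' in piece:
            value, frac = piece.split('@')
        else:
            value, frac = piece, ('0.0' if i == 0 else '1.0')
        points.append((float(frac), float(value)))
    points.sort()
    # Linear interpolation between neighbouring breakpoints.
    for (f0, v0), (f1, v1) in zip(points, points[1:]):
        if f0 <= data_fraction <= f1:
            return v0 + (v1 - v0) * (data_fraction - f0) / max(f1 - f0, 1e-12)
    return points[-1][1]

print(dropout_at('0,0@0.20,0.2@0.50,0', 0.35))  # ~0.1, halfway up the ramp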
+chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This 
block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
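As a quick numeric check of the learning-rate-factor comment in the xconfig block above: with xent_regularize=0.1, the xent output layer gets a factor of 0.5/0.1 = 5, which is what the $(echo "print 0.5/$xent_regularize" | python) line computes.

# Worked example of the xent learning-rate factor described above.
xent_regularize = 0.1
learning_rate_factor = 0.5 / xent_regularize
print(learning_rate_factor)  # 5.0: the xent branch learns at a rate independent of the regularizer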
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..cb2bfa0a82d --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,177 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# WER 11.24 +# WER (rescored) 10.80 +# CER 5.32 +# CER (rescored) 5.24 +# Final train prob 0.0568 +# Final valid prob 0.0381 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.13M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=42 nj=2..4 num-params=9.1M dim=40->12640 combine=0.049->0.049 (over 1) logprob:train/valid[27,41,final]=(0.035,0.055,0.057/0.016,0.037,0.038) + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a +nj=30 + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=true --norm-vars=true" +train_set=train +decode_val=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh new file mode 100755 index 00000000000..d5f79602695 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_e2e_cnn_1b.sh @@ -0,0 +1,163 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. from scratch) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1b/ +# System e2e_cnn_1b +# WER 13.59 +# WER (rescored) 13.27 +# CER 6.92 +# CER (rescored) 6.71 +# Final train prob 0.0345 +# Final valid prob 0.0269 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 9.52M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1b +# exp/chain/e2e_cnn_1b: num-iters=42 nj=2..4 num-params=9.5M dim=40->12640 combine=0.041->0.041 (over 2) logprob:train/valid[27,41,final]=(0.032,0.035,0.035/0.025,0.026,0.027) +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1b +nj=30 + +# training options +tdnn_dim=450 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +train_set=train +decode_val=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/extract_features.sh b/egs/iam/v2/local/extract_features.sh new file mode 100755 index 00000000000..1741ad3f9b2 --- /dev/null +++ b/egs/iam/v2/local/extract_features.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment=false +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + local/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/iam/v2/local/gen_topo.py b/egs/iam/v2/local/gen_topo.py new file mode 100755 index 00000000000..540bfbcf270 --- /dev/null +++ b/egs/iam/v2/local/gen_topo.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); +parser.add_argument("phone_list", type=str, help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +punctuation_phones = [] +exclude = set("!(),.?;:'-\"") +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split(' ')[0] + if len(phone) == 1 and phone in exclude: + punctuation_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in punctuation_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_nonsil_states) + " ") +print("") + +# For nonsilence phones that ar punctuations +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in punctuation_phones])) +print("") +for x in range(0, args.num_punctuation_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_punctuation_states) + " ") +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0 / (args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = state_str + " " + str(x) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " " + str(x) + " " + str(x) + " " + for y in range(1, args.num_sil_states): + state_str = state_str + " " + str(y) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + second_last = args.num_sil_states - 1 + print(" " + str(second_last) + " " + str(second_last) + " " + str(second_last) + " 0.75 " + str(args.num_sil_states) + " 0.25 ") + print(" " + str(args.num_sil_states) + " ") +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" " + str(args.num_sil_states) + " ") +print("") +print("") diff --git a/egs/iam/v2/local/make_features.py b/egs/iam/v2/local/make_features.py index 84e012daedb..3ce501732cf 100755 --- a/egs/iam/v2/local/make_features.py +++ b/egs/iam/v2/local/make_features.py @@ -2,6 +2,7 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2017 Yiwen Shao # 2018 Hossein Hadian """ This script converts images to Kaldi-format feature matrices. The input to @@ -14,20 +15,27 @@ to enforce the images to have the specified length in that file by padding white pixels (the --padding option will be ignored in this case). This relates to end2end chain training. - eg. 
local/make_features.py data/train --feat-dim 40 """ - +import random import argparse import os import sys +import scipy.io as sio import numpy as np from scipy import misc +from scipy.ndimage.interpolation import affine_transform +import math +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and writes them to standard output in text format.""") -parser.add_argument('dir', type=str, - help='Source data directory (containing images.scp)') +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') parser.add_argument('--out-ark', type=str, default='-', help='Where to write the output feature file') parser.add_argument('--feat-dim', type=int, default=40, @@ -35,8 +43,10 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') - - +parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, + help="Flip the image left-right for right to left languages") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() @@ -56,18 +66,12 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") -def get_scaled_image(im, allowed_lengths = None): - scale_size = args.feat_dim - sx = im.shape[1] - sy = im.shape[0] - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) + +def horizontal_pad(im, allowed_lengths = None): if allowed_lengths is None: left_padding = right_padding = args.padding else: # Find an allowed length for the image - imlen = im.shape[1] + imlen = im.shape[1] # width allowed_len = 0 for l in allowed_lengths: if l > imlen: @@ -77,28 +81,153 @@ def get_scaled_image(im, allowed_lengths = None): # No allowed length was found for the image (the image is too long) return None padding = allowed_len - imlen - left_padding = padding // 2 + left_padding = int(padding // 2) right_padding = padding - left_padding - dim_y = im.shape[0] + dim_y = im.shape[0] # height im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), dtype=int), im), axis=1) im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), dtype=int)), axis=1) return im_pad1 -### main ### -data_list_path = os.path.join(args.dir, 'images.scp') +def get_scaled_image_aug(im, mode='normal'): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + scale_size = random.randint(10, 30) + scale = (1.0 * scale_size) / sy + down_nx = int(scale_size) + down_ny = int(scale * sx) + if mode == 'normal': + im = misc.imresize(im, (nx, ny)) + return im + else: + im_scaled_down = misc.imresize(im, (down_nx, down_ny)) + im_scaled_up = misc.imresize(im_scaled_down, (nx, ny)) + return im_scaled_up + return im + +def contrast_normalization(im, low_pct, high_pct): + element_number = im.size + rows = im.shape[0] + cols = im.shape[1] + im_contrast = np.zeros(shape=im.shape) + low_index = int(low_pct * element_number) + high_index = int(high_pct * element_number) + sorted_im = np.sort(im, axis=None) + low_thred = 
sorted_im[low_index] + high_thred = sorted_im[high_index] + for i in range(rows): + for j in range(cols): + if im[i, j] > high_thred: + im_contrast[i, j] = 255 # lightest to white + elif im[i, j] < low_thred: + im_contrast[i, j] = 0 # darkest to black + else: + # linear normalization + im_contrast[i, j] = (im[i, j] - low_thred) * \ + 255 / (high_thred - low_thred) + return im_contrast + + +def geometric_moment(frame, p, q): + m = 0 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + m += (i ** p) * (j ** q) * frame[i][i] + return m + + +def central_moment(frame, p, q): + u = 0 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + for i in range(frame.shape[1]): + for j in range(frame.shape[0]): + u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j] + return u + + +def height_normalization(frame, w, h): + frame_normalized = np.zeros(shape=(h, w)) + alpha = 4 + x_bar = geometric_moment(frame, 1, 0) / \ + geometric_moment(frame, 0, 0) # m10/m00 + y_bar = geometric_moment(frame, 0, 1) / \ + geometric_moment(frame, 0, 0) # m01/m00 + sigma_x = (alpha * ((central_moment(frame, 2, 0) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u20/m00) + sigma_y = (alpha * ((central_moment(frame, 0, 2) / + geometric_moment(frame, 0, 0)) ** .5)) # alpha * sqrt(u02/m00) + for x in range(w): + for y in range(h): + i = int((x / w - 0.5) * sigma_x + x_bar) + j = int((y / h - 0.5) * sigma_y + y_bar) + frame_normalized[x][y] = frame[i][j] + return frame_normalized + +def find_slant_project(im): + rows = im.shape[0] + cols = im.shape[1] + std_max = 0 + alpha_max = 0 + col_disp = np.zeros(90, int) + proj = np.zeros(shape=(90, cols + 2 * rows), dtype=int) + for r in range(rows): + for alpha in range(-45, 45, 1): + col_disp[alpha] = int(r * math.tan(alpha / 180.0 * math.pi)) + for c in range(cols): + if im[r, c] < 100: + for alpha in range(-45, 45, 1): + proj[alpha + 45, c + col_disp[alpha] + rows] += 1 + for alpha in range(-45, 45, 1): + proj_histogram, bin_array = np.histogram(proj[alpha + 45, :], bins=10) + proj_std = np.std(proj_histogram) + if proj_std > std_max: + std_max = proj_std + alpha_max = alpha + proj_std = np.std(proj, axis=1) + return -alpha_max + + +def horizontal_shear(im, degree): + rad = degree / 180.0 * math.pi + padding_x = int(abs(np.tan(rad)) * im.shape[0]) + padding_y = im.shape[0] + if rad > 0: + im_pad = np.concatenate( + (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1) + elif rad < 0: + im_pad = np.concatenate( + (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) + else: + im_pad = im + shear_matrix = np.array([[1, 0], + [np.tan(rad), 1]]) + sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0) + return sheared_im + + +### main ### +random.seed(1) +data_list_path = args.images_scp_path if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') + out_fh = open(args.out_ark,'w') allowed_lengths = None -if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): +allowed_len_handle = args.allowed_len_file_path +if os.path.isfile(allowed_len_handle): print("Found 'allowed_lengths.txt' file...", file=sys.stderr) allowed_lengths = [] - with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + with open(allowed_len_handle) as f: for line in f: allowed_lengths.append(int(line.strip())) print("Read {} allowed lengths and will apply them to the " @@ -106,6 +235,7 @@ def 
get_scaled_image(im, allowed_lengths = None): num_fail = 0 num_ok = 0 +aug_setting = ['normal', 'scaled'] with open(data_list_path) as f: for line in f: line = line.strip() @@ -113,15 +243,24 @@ def get_scaled_image(im, allowed_lengths = None): image_id = line_vect[0] image_path = line_vect[1] im = misc.imread(image_path) - im_scaled = get_scaled_image(im, allowed_lengths) - - if im_scaled is None: + if args.fliplr: + im = np.fliplr(im) + if args.augment: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_contrast = contrast_normalization(im_aug, 0.05, 0.2) + slant_degree = find_slant_project(im_contrast) + im_sheared = horizontal_shear(im_contrast, slant_degree) + im_aug = im_sheared + else: + im_aug = get_scaled_image_aug(im, aug_setting[0]) + im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths) + if im_horizontal_padded is None: num_fail += 1 continue - data = np.transpose(im_scaled, (1, 0)) + data = np.transpose(im_horizontal_padded, (1, 0)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) -print('Generated features for {} images. Failed for {} (iamge too ' +print('Generated features for {} images. Failed for {} (image too ' 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v2/local/prepare_data.sh b/egs/iam/v2/local/prepare_data.sh index 73d711c73f0..cf729d9a939 100755 --- a/egs/iam/v2/local/prepare_data.sh +++ b/egs/iam/v2/local/prepare_data.sh @@ -18,6 +18,7 @@ stage=0 download_dir=data/download +process_aachen_split=false wellington_dir= username= password= # username and password for downloading the IAM database @@ -53,6 +54,8 @@ ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip wellington_corpus_loc=/export/corpora5/Wellington/WWC/ +aachen_split_url=http://www.openslr.org/resources/56/splits.zip +aachen_splits=data/local/aachensplits mkdir -p $download_dir data/local # download and extact images and transcription @@ -144,6 +147,19 @@ else echo "$0: Wellington Corpus not included because wellington_dir not provided" fi +if [ -d $aachen_splits ]; then + echo "$0: Not downloading the Aachen splits as it is already there." +else + if [ ! -f $aachen_splits/splits.zip ]; then + echo "$0: Downloading Aachen splits ..." 
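Returning to local/make_features.py above: the horizontal_pad step picks the first allowed length larger than the image width and splits the white-pixel padding between the two sides. A standalone sketch of that padding rule follows (toy sizes, not part of the patch; allowed_lengths is assumed sorted ascending, as in allowed_lengths.txt):

import numpy as np

# Pad an image to the first allowed length larger than its width, mirroring
# horizontal_pad in local/make_features.py; sizes below are toy values.
def pad_to_allowed_length(im, allowed_lengths):
    width = im.shape[1]
    target = next((l for l in allowed_lengths if l > width), None)
    if target is None:
        return None                      # image too long; the caller skips it
    left = (target - width) // 2
    right = (target - width) - left
    white = lambda n: 255 * np.ones((im.shape[0], n), dtype=im.dtype)
    return np.concatenate((white(left), im, white(right)), axis=1)

toy = 255 * np.ones((40, 93), dtype=np.uint8)
print(pad_to_allowed_length(toy, [80, 100, 120]).shape)  # (40, 100)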
+ mkdir -p $aachen_splits + wget -P $aachen_splits/ $aachen_split_url || exit 1; + fi + unzip $aachen_splits/splits.zip -d $aachen_splits || exit 1; + echo "$0: Done downloading and extracting Aachen splits" +fi + + mkdir -p data/{train,test,val} file_name=largeWriterIndependentTextLineRecognitionTask @@ -160,11 +176,16 @@ cat $train_old > $train_new cat $test_old > $test_new cat $val1_old $val2_old > $val_new -if [ $stage -le 0 ]; then - local/process_data.py data/local data/train --dataset train || exit 1 - local/process_data.py data/local data/test --dataset test || exit 1 - local/process_data.py data/local data/val --dataset validation || exit 1 - - utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt - utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +if $process_aachen_split; then + local/process_aachen_splits.py data/local $aachen_splits/splits data/train --dataset train || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/test --dataset test || exit 1 + local/process_aachen_splits.py data/local $aachen_splits/splits data/val --dataset validation || exit 1 +else + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 fi + +image/fix_data_dir.sh data/train +image/fix_data_dir.sh data/test +image/fix_data_dir.sh data/val diff --git a/egs/iam/v2/local/prepend_words.py b/egs/iam/v2/local/prepend_words.py deleted file mode 100755 index d53eb8974bf..00000000000 --- a/egs/iam/v2/local/prepend_words.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# This script, prepend '|' to every words in the transcript to mark -# the beginning of the words for finding the initial-space of every word -# after decoding. - -import sys, io - -infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') -output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') -for line in infile: - output.write(' '.join(["|" + word for word in line.split()]) + '\n') diff --git a/egs/iam/v2/local/process_aachen_splits.py b/egs/iam/v2/local/process_aachen_splits.py new file mode 100755 index 00000000000..cb6a6d4f0d8 --- /dev/null +++ b/egs/iam/v2/local/process_aachen_splits.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_aachen_splits.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. 
Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('split_path', type=str, + help='location of the train/test/val set') +parser.add_argument('out_dir', type=str, + help='location to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.split_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + + +### main ### + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + doc = minidom.parse(xml_path) + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder) + for file in os.listdir(lines_path): + if file.endswith(".png"): + image_file_path = os.path.join(lines_path, file) + base_name = os.path.splitext(os.path.basename(image_file_path))[0] + text = text_dict[base_name] + utt_id = writer_id + '_' + base_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v2/local/process_data.py b/egs/iam/v2/local/process_data.py index fa5eb484707..2adae7bf7be 100755 --- a/egs/iam/v2/local/process_data.py +++ b/egs/iam/v2/local/process_data.py @@ -67,7 +67,6 @@ def process_text_file_for_word_model(): xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') img_num = line[-3:] doc = minidom.parse(xml_path) - form_elements = doc.getElementsByTagName('form')[0] writer_id = form_elements.getAttribute('writer-id') outerfolder = form_elements.getAttribute('id')[0:3] diff --git a/egs/iam/v2/local/remove_test_utterances_from_lob.py b/egs/iam/v2/local/remove_test_utterances_from_lob.py index 1b414ef47f6..5e5dac52818 100755 --- a/egs/iam/v2/local/remove_test_utterances_from_lob.py +++ b/egs/iam/v2/local/remove_test_utterances_from_lob.py @@ -27,6 +27,8 @@ def remove_punctuations(transcript): continue if char == '(' or char == ':' or char == ';' or char == '"': continue + if char == 
'*': + continue char_list.append(char) return char_list @@ -89,22 +91,45 @@ def read_utterances(text_file_path): remaining_utterances = dict() for line_id, line_to_find in utterance_dict.items(): found_line = False - for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): - # Combine 3 consecutive lines of the corpus into a single line - prev_words = corpus_text_lowercase_wo_sc[i - 1].strip() - curr_words = corpus_text_lowercase_wo_sc[i].strip() - next_words = corpus_text_lowercase_wo_sc[i + 1].strip() - new_line = prev_words + curr_words + next_words - transcript = ''.join(new_line) - if line_to_find in transcript: - found_line = True - row_to_keep[i-1] = False - row_to_keep[i] = False - row_to_keep[i+1] = False + # avoiding very small utterance, it causes removing + # complete lob text + if len(line_to_find) < 10: + remaining_utterances[line_id] = line_to_find + else: + for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): + # Combine 3 consecutive lines of the corpus into a single line + prev_words = corpus_text_lowercase_wo_sc[i - 1].strip() + curr_words = corpus_text_lowercase_wo_sc[i].strip() + next_words = corpus_text_lowercase_wo_sc[i + 1].strip() + new_line = prev_words + curr_words + next_words + transcript = ''.join(new_line) + if line_to_find in transcript: + found_line = True + row_to_keep[i-1] = False + row_to_keep[i] = False + row_to_keep[i+1] = False if not found_line: remaining_utterances[line_id] = line_to_find - +# removing long utterances not found above +row_to_keep[87530] = False; row_to_keep[87531] = False; row_to_keep[87532] = False; +row_to_keep[31724] = False; row_to_keep[31725] = False; row_to_keep[31726] = False; +row_to_keep[16704] = False; row_to_keep[16705] = False; row_to_keep[16706] = False; +row_to_keep[94181] = False; row_to_keep[94182] = False; row_to_keep[94183] = False; +row_to_keep[20171] = False; row_to_keep[20172] = False; row_to_keep[20173] = False; +row_to_keep[16734] = False; row_to_keep[16733] = False; row_to_keep[16732] = False; +row_to_keep[20576] = False; row_to_keep[20577] = False; row_to_keep[20578] = False; +row_to_keep[31715] = False; row_to_keep[31716] = False; row_to_keep[31717] = False; +row_to_keep[31808] = False; row_to_keep[31809] = False; row_to_keep[31810] = False; +row_to_keep[31822] = False; row_to_keep[31823] = False; row_to_keep[31824] = False; +row_to_keep[88791] = False; row_to_keep[88792] = False; row_to_keep[88793] = False; +row_to_keep[31745] = False; row_to_keep[31746] = False; row_to_keep[31825] = False; +row_to_keep[94256] = False; row_to_keep[94257] = False; row_to_keep[88794] = False; +row_to_keep[88665] = False; row_to_keep[17093] = False; row_to_keep[17094] = False; +row_to_keep[20586] = False; row_to_keep[87228] = False; row_to_keep[87229] = False; +row_to_keep[16744] = False; row_to_keep[87905] = False; row_to_keep[87906] = False; +row_to_keep[16669] = False; row_to_keep[16670] = False; row_to_keep[16719] = False; +row_to_keep[87515] = False; row_to_keep[20090] = False; row_to_keep[31748] = False; for i in range(len(original_corpus_text)): transcript = original_corpus_text[i].strip() if row_to_keep[i]: diff --git a/egs/iam/v2/local/score.sh b/egs/iam/v2/local/score.sh index b2032909333..1d84815fc69 100755 --- a/egs/iam/v2/local/score.sh +++ b/egs/iam/v2/local/score.sh @@ -1,155 +1,6 @@ -#!/bin/bash -# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) -# Apache 2.0 - -# This script is like steps/scoring/score_kaldi_wer.sh except it transcribes the 's -# using 
local/unk_arc_post_to_transcription.py and also it calls -# steps/scoring/score_kaldi_cer.sh at the end. - -[ -f ./path.sh ] && . ./path.sh - -# begin configuration section. -cmd=run.pl -stage=0 -decode_mbr=false -stats=true -beam=6 -word_ins_penalty=0.0,0.5,1.0 -min_lmwt=3 -max_lmwt=13 -iter=final -#end configuration section. - -echo "$0 $@" # Print the command line for logging -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --stage (0|1|2) # start scoring script from part-way through." - echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." - echo " --min_lmwt # minumum LM-weight for lattice rescoring " - echo " --max_lmwt # maximum LM-weight for lattice rescoring " - exit 1; -fi - -data=$1 -lang_or_graph=$2 -dir=$3 -model_path=`echo $dir |xargs dirname` -symtab=$lang_or_graph/words.txt - -for f in $symtab $dir/lat.1.gz $data/text; do - [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; -done - - -ref_filtering_cmd="cat" -[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" -[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" -hyp_filtering_cmd="cat" -[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" -[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" - - -if $decode_mbr ; then - echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" -else - echo "$0: scoring with word insertion penalty=$word_ins_penalty" -fi - - -mkdir -p $dir/scoring_kaldi -cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; -if [ $stage -le 0 ]; then - - for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - mkdir -p $dir/scoring_kaldi/penalty_$wip/log - - if $decode_mbr ; then - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ - acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ - lattice-prune --beam=$beam ark:- ark:- \| \ - lattice-mbr-decode --word-symbol-table=$symtab \ - ark:- ark,t:- \| \ - utils/int2sym.pl -f 2- $symtab \| \ - $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; - else - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ - lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ - lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \ - utils/int2sym.pl -f 2- $symtab \| \ - $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; - fi - - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ - cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ - compute-wer --text --mode=present \ - "ark:cat $dir/scoring_kaldi/test_filt.txt |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; - - done -fi - - - -if [ $stage -le 1 ]; then - - for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - for lmwt in $(seq $min_lmwt $max_lmwt); do - # adding /dev/null to the command list below forces grep to output the filename - grep WER $dir/wer_${lmwt}_${wip} /dev/null - done - done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 - - best_wer_file=$(awk '{print $NF}' 
$dir/scoring_kaldi/best_wer) - best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') - best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') - - if [ -z "$best_lmwt" ]; then - echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." - exit 1; - fi - - if $stats; then - mkdir -p $dir/scoring_kaldi/wer_details - echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight - echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty - - $cmd $dir/scoring_kaldi/log/stats1.log \ - cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ - align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ - utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ - utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; - - $cmd $dir/scoring_kaldi/log/stats2.log \ - cat $dir/scoring_kaldi/wer_details/per_utt \| \ - utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ - sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; - - $cmd $dir/scoring_kaldi/log/wer_bootci.log \ - compute-wer-bootci --mode=present \ - ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ - '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; - - fi -fi - -steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt $min_lmwt \ - --max-lmwt $max_lmwt --word-ins-penalty $word_ins_penalty \ - $data $lang_or_graph $dir +#!/bin/bash -# If we got here, the scoring was successful. -# As a small aid to prevent confusion, we remove all wer_{?,??} files; -# these originate from the previous version of the scoring files -# i keep both statement here because it could lead to confusion about -# the capabilities of the script (we don't do cer in the script) -rm $dir/wer_{?,??} 2>/dev/null -rm $dir/cer_{?,??} 2>/dev/null -exit 0; +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/iam/v2/local/train_lm.sh b/egs/iam/v2/local/train_lm.sh index 35eb56b1341..cc0119eb748 100755 --- a/egs/iam/v2/local/train_lm.sh +++ b/egs/iam/v2/local/train_lm.sh @@ -64,22 +64,22 @@ if [ $stage -le 0 ]; then > data/local/lob-train-only.txt fi cat data/local/lob-train-only.txt | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > ${dir}/data/text/lob.txt cat data/local/browncorpus/brown.txt | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ - | sed 's/@@//g' > ${dir}/data/text/brown.txt + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/brown.txt + tail -n +5000 ${dir}/brown.txt > ${dir}/data/text/brown.txt if [ -d "data/local/wellingtoncorpus" ]; then cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > ${dir}/data/text/wellington.txt fi # use the validation data as the dev set. # Note: the name 'dev' is treated specially by pocolm, it automatically # becomes the dev set. 
- - cat data/val/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + head -5000 ${dir}/brown.txt > ${dir}/data/text/dev.txt # use the training data as an additional data source. # we can later fold the dev data into this. diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh index de5c7086ec2..c515c85fc72 100755 --- a/egs/iam/v2/run_end2end.sh +++ b/egs/iam/v2/run_end2end.sh @@ -6,6 +6,8 @@ stage=0 nj=20 username= password= +process_aachen_split=false +overwrite=false # iam_database points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # like "data/download" and follow the instructions @@ -27,56 +29,96 @@ wellington_database=/export/corpora5/Wellington/WWC/ ./local/check_tools.sh if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && ! $overwrite; then + echo "$0: Not processing; the script was probably run from the wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ --wellington-dir "$wellington_database" \ - --username "$username" --password "$password" + --username "$username" --password "$password" \ + --process_aachen_split $process_aachen_split fi -mkdir -p data/{train,test}/data +mkdir -p data/{train,test}/data if [ $stage -le 1 ]; then - image/get_image2num_frames.py data/train # This will be needed for the next command + echo "$(date) stage 1: getting allowed image widths for e2e training..." + image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - echo "$0: Preparing the test and train feature files..." - for dataset in train test; do - local/make_features.py data/$dataset --feat-dim 40 | \ - copy-feats --compress=true --compression-method=7 \ - ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp - steps/compute_cmvn_stats.sh data/$dataset + echo "$(date) Extracting features, creating feats.scp file" + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/train + steps/compute_cmvn_stats.sh data/train || exit 1; + for set in val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --augment true \ + --feat-dim 40 data/${set} + steps/compute_cmvn_stats.sh data/${set} || exit 1; done utils/fix_data_dir.sh data/train fi if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation; this will double the training data" + local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then echo "$0: Preparing BPE..." + # Get the non-silence phones (characters) from the training text.
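Note (illustrative, not part of the patch): the inline python3 heredoc in the hunk below collects the character inventory used as "non-silence phones" in this character/BPE setup. The same idea as a standalone filter, reading text on stdin and writing one character per line on stdout (the script name in the usage line is hypothetical; the recipe keeps this logic inline):

#!/usr/bin/env python3
# Collect the set of characters seen in the input text, one per output line.
import io
import sys

infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
chars = dict()  # dict keys keep first-seen order (Python 3.7+), mirroring the heredoc
for line in infile:
    for word in line.strip().split():
        for char in word:
            chars[char] = True
for char in chars:
    output.write(char + '\n')

Usage would be analogous to the heredoc, e.g. cut -d' ' -f2- data/train/text | local/collect_chars.py > data/local/phones.txt.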
cut -d' ' -f2- data/train/text | \ - local/prepend_words.py | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/phones.txt + + cut -d' ' -f2- data/train/text > data/local/train_data.txt + cat data/local/phones.txt data/local/train_data.txt | \ + utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt - for set in test train val; do + for set in test train val train_aug; do cut -d' ' -f1 data/$set/text > data/$set/ids cut -d' ' -f2- data/$set/text | \ - local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text mv data/$set/text data/$set/text.old paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text done fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then echo "$0: Estimating a language model for decoding..." local/train_lm.sh fi -if [ $stage -le 4 ]; then +if [ $stage -le 5 ]; then echo "$0: Preparing dictionary and lang..." local/prepare_dict.sh # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. # So we set --sil-prob to 0.0 utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ @@ -85,20 +127,20 @@ if [ $stage -le 4 ]; then data/lang data/lang_rescore_6g fi -if [ $stage -le 5 ]; then +if [ $stage -le 6 ]; then echo "$0: Calling the flat-start chain recipe..." - local/chain/run_e2e_cnn.sh + local/chain/run_e2e_cnn.sh --train_set train_aug fi -if [ $stage -le 6 ]; then +if [ $stage -le 7 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ --use-gpu false \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ - data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train + data/train_aug data/lang exp/chain/e2e_cnn_1b exp/chain/e2e_ali_train fi -if [ $stage -le 7 ]; then +if [ $stage -le 8 ]; then echo "$0: Building a tree and training a regular chain model using the e2e alignments..." - local/chain/run_cnn_e2eali.sh + local/chain/run_cnn_e2eali.sh --train_set train_aug fi diff --git a/egs/uw3/v1/local/unk_arc_post_to_transcription.py b/egs/uw3/v1/local/unk_arc_post_to_transcription.py index c86d35e4b8a..f8b69820601 100755 --- a/egs/uw3/v1/local/unk_arc_post_to_transcription.py +++ b/egs/uw3/v1/local/unk_arc_post_to_transcription.py @@ -1,86 +1,107 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 -# Copyright 2017 Ashish Arora +#Copyright 2017 Ashish Arora +""" This module will be used by scripts for open vocabulary setup. 
+ If the hypothesis transcription contains , then it will replace the + with the word predicted by model by concatenating phones decoded + from the unk-model. It is currently supported only for triphone setup. + Args: + phones: File name of a file that contains the phones.txt, (symbol-table for phones). + phone and phoneID, Eg. a 217, phoneID of 'a' is 217. + words: File name of a file that contains the words.txt, (symbol-table for words). + word and wordID. Eg. ACCOUNTANCY 234, wordID of 'ACCOUNTANCY' is 234. + unk: ID of . Eg. 231. + one-best-arc-post: A file in arc-post format, which is a list of timing info and posterior + of arcs along the one-best path from the lattice. + E.g. 506_m01-049-00 8 12 1 7722 282 272 288 231 + [] + [ ...] + output-text: File containing hypothesis transcription with recognized by the + unk-model. + E.g. A move to stop mr. gaitskell. + + Eg. local/unk_arc_post_to_transcription.py lang/phones.txt lang/words.txt + data/lang/oov.int +""" import argparse +import os import sys - parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") -parser.add_argument('phones', type=str, help='phones and phonesID') -parser.add_argument('words', type=str, help='word and wordID') -parser.add_argument('unk', type=str, default='-', help='location of unk file') -parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data') -parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data') +parser.add_argument('phones', type=str, help='File name of a file that contains the' + 'symbol-table for phones. Each line must be: ') +parser.add_argument('words', type=str, help='File name of a file that contains the' + 'symbol-table for words. Each line must be: ') +parser.add_argument('unk', type=str, default='-', help='File name of a file that' + 'contains the ID of . The content must be: , e.g. 
231') +parser.add_argument('--one-best-arc-post', type=str, default='-', help='A file in arc-post' + 'format, which is a list of timing info and posterior of arcs' + 'along the one-best path from the lattice') +parser.add_argument('--output-text', type=str, default='-', help='File containing' + 'hypothesis transcription with recognized by the unk-model') args = parser.parse_args() + ### main ### -phone_fh = open(args.phones, 'r') -word_fh = open(args.words, 'r') -unk_fh = open(args.unk,'r') -if args.input_ark == '-': - input_fh = sys.stdin +phone_handle = open(args.phones, 'r', encoding='latin-1') # Create file handles +word_handle = open(args.words, 'r', encoding='latin-1') +unk_handle = open(args.unk,'r', encoding='latin-1') +if args.one_best_arc_post == '-': + arc_post_handle = sys.stdin else: - input_fh = open(args.input_ark,'r') -if args.out_ark == '-': - out_fh = sys.stdout + arc_post_handle = open(args.one_best_arc_post, 'r', encoding='latin-1') +if args.output_text == '-': + output_text_handle = sys.stdout else: - out_fh = open(args.out_ark,'wb') + output_text_handle = open(args.output_text, 'w', encoding='latin-1') -phone_dict = dict()# stores phoneID and phone mapping -phone_data_vect = phone_fh.read().strip().split("\n") -for key_val in phone_data_vect: +id2phone = dict() # Stores the mapping from phone_id (int) to phone (char) +phones_data = phone_handle.read().strip().split("\n") + +for key_val in phones_data: key_val = key_val.split(" ") - phone_dict[key_val[1]] = key_val[0] + id2phone[key_val[1]] = key_val[0] + word_dict = dict() -word_data_vect = word_fh.read().strip().split("\n") +word_data_vect = word_handle.read().strip().split("\n") + for key_val in word_data_vect: key_val = key_val.split(" ") word_dict[key_val[1]] = key_val[0] -unk_val = unk_fh.read().strip().split(" ")[0] +unk_val = unk_handle.read().strip().split(" ")[0] -utt_word_dict = dict() -utt_phone_dict = dict()# stores utteranceID and phoneID -unk_word_dict = dict() -count=0 -for line in input_fh: +utt_word_dict = dict() # Dict of list, stores mapping from utteranceID(int) to words(str) +for line in arc_post_handle: line_vect = line.strip().split("\t") - if len(line_vect) < 6: - print "IndexError" - print line_vect + if len(line_vect) < 6: # Check for 1best-arc-post output + print("Error: Bad line: '{}' Expecting 6 fields. Skipping...".format(line), + file=sys.stderr) continue - uttID = line_vect[0] + utt_id = line_vect[0] word = line_vect[4] phones = line_vect[5] - if uttID in utt_word_dict.keys(): - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - else: - count = 0 - utt_word_dict[uttID] = dict() - utt_phone_dict[uttID] = dict() - utt_word_dict[uttID][count] = word - utt_phone_dict[uttID][count] = phones - if word == unk_val: # get character sequence for unk - phone_key_vect = phones.split(" ") - phone_val_vect = list() - for pkey in phone_key_vect: - phone_val_vect.append(phone_dict[pkey]) + if utt_id not in list(utt_word_dict.keys()): + utt_word_dict[utt_id] = list() + + if word == unk_val: # Get the 1best phone sequence given by the unk-model + phone_id_seq = phones.split(" ") + phone_seq = list() + for pkey in phone_id_seq: + phone_seq.append(id2phone[pkey]) # Convert the phone-id sequence to a phone sequence. 
phone_2_word = list() - for phone_val in phone_val_vect: - phone_2_word.append(phone_val.split('_')[0]) - phone_2_word = ''.join(phone_2_word) - utt_word_dict[uttID][count] = phone_2_word + for phone_val in phone_seq: + phone_2_word.append(phone_val.split('_')[0]) # Removing the word-position markers (e.g. _B) + phone_2_word = ''.join(phone_2_word) # Concatenate phone sequence + utt_word_dict[utt_id].append(phone_2_word) # Store word from unk-model else: - if word == '0': + if word == '0': # Store space/silence word_val = ' ' else: word_val = word_dict[word] - utt_word_dict[uttID][count] = word_val - count += 1 + utt_word_dict[utt_id].append(word_val) # Store word from 1best-arc-post -transcription = "" -for key in sorted(utt_word_dict.iterkeys()): - transcription = key - for index in sorted(utt_word_dict[key].iterkeys()): - value = utt_word_dict[key][index] - transcription = transcription + " " + value - out_fh.write(transcription + '\n') +transcription = "" # Output transcription +for utt_key in sorted(utt_word_dict.keys()): + transcription = utt_key + for word in utt_word_dict[utt_key]: + transcription = transcription + " " + word + output_text_handle.write(transcription + '\n')
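Note (illustrative, not part of the patch): a toy, self-contained example of the <unk>-replacement logic implemented above; the phone symbol table and IDs are invented, and real tables come from phones.txt / words.txt:

# Reconstruct a word from a phone-id sequence, as unk_arc_post_to_transcription.py does.
id2phone = {'10': 'm_B', '11': 'r_I', '12': '._E'}   # invented ids for illustration
phone_id_seq = '10 11 12'.split(' ')                 # the arc-post phone field, split on spaces
phone_seq = [id2phone[pid] for pid in phone_id_seq]  # -> ['m_B', 'r_I', '._E']
word = ''.join(p.split('_')[0] for p in phone_seq)   # strip word-position markers, concatenate
print(word)                                          # -> mr.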