Iam #2658

Merged — 38 commits, Sep 12, 2018

Commits:
a3a18e2
adding changes for language modelling
aarora8 Aug 30, 2018
91508b5
adding modifications for augmentation, topology, shearing, run.sh
aarora8 Aug 31, 2018
5f273d6
fixing bugs
aarora8 Aug 31, 2018
2645f14
fixing bug
aarora8 Aug 31, 2018
6ebfdb2
adding parameter tuning
aarora8 Sep 1, 2018
b532978
cosmetic fixes and updating results
aarora8 Sep 1, 2018
f383334
cosmetic fixes
aarora8 Sep 1, 2018
44c9e58
adding results
aarora8 Sep 1, 2018
2d11672
removing local/prepare_lang and adding gen_topo in run.sh
aarora8 Sep 1, 2018
4fc6705
fixing bugs
aarora8 Sep 1, 2018
8877530
updating result
aarora8 Sep 2, 2018
59e2c8b
updating documentation, results and parameter tuning
aarora8 Sep 2, 2018
5fc0d17
fixing chain scripts
aarora8 Sep 2, 2018
1138ee3
updating parameters
aarora8 Sep 2, 2018
b3532ce
updating parameters and results
aarora8 Sep 3, 2018
9b67d9d
adding overwrite option and punctuation topology
aarora8 Sep 3, 2018
89c9ec7
adding overwrite option
aarora8 Sep 4, 2018
c05cd4d
adding aachen splits
aarora8 Sep 4, 2018
5dfe8fc
fixing bugs
aarora8 Sep 4, 2018
d7448df
modification from review
aarora8 Sep 5, 2018
d7d5c22
updating parameter and result
aarora8 Sep 6, 2018
43e9af9
updating parameter and result
aarora8 Sep 6, 2018
17c506b
adding data preprocessing in test and val
aarora8 Sep 7, 2018
d640742
updating results
aarora8 Sep 7, 2018
7dfd0b5
Merge branch 'master' of https://github.com/kaldi-asr/kaldi into iam_4
aarora8 Sep 7, 2018
94a80ad
replacing prepend words with common prepend words
aarora8 Sep 7, 2018
711c3c9
updating remove_test_utterances_from_lob for aachen split
aarora8 Sep 7, 2018
5f2d960
removing data/val/text from train_lm
aarora8 Sep 7, 2018
7f2ad0b
cosmetic fixes in unk arc decoding
aarora8 Sep 7, 2018
8f2ac25
adding val data for decoding
aarora8 Sep 7, 2018
b8e71b2
modification from the review
aarora8 Sep 10, 2018
e9a75f6
modification from review
aarora8 Sep 10, 2018
ae674ed
modification from review
aarora8 Sep 10, 2018
7651f37
modification for downloading aachen splits
aarora8 Sep 10, 2018
417d97c
fixing bug in rescoring
aarora8 Sep 11, 2018
6a86531
hardcoding for removing only remaining long utterence
aarora8 Sep 12, 2018
ba07ff0
fix in hardcoding
aarora8 Sep 12, 2018
5398412
modification from review
aarora8 Sep 12, 2018
137 changes: 78 additions & 59 deletions egs/iam/v1/local/unk_arc_post_to_transcription.py
Resulting file:

#!/usr/bin/env python3

# Copyright 2017 Ashish Arora

""" This module will be used by scripts for open vocabulary setup.
If the hypothesis transcription contains <unk>, then it will replace the
<unk> with the word predicted by the <unk> model by concatenating phones decoded
from the unk-model. It is currently supported only for the triphone setup.
Args:
  phones: File name of a file that contains the phones.txt (symbol-table for phones),
          phone and phoneID. E.g. a 217; the phoneID of 'a' is 217.
  words: File name of a file that contains the words.txt (symbol-table for words),
         word and wordID. E.g. ACCOUNTANCY 234; the wordID of 'ACCOUNTANCY' is 234.
  unk: ID of <unk>. E.g. 231.
  one-best-arc-post: A file in arc-post format, which is a list of timing info and posterior
                     of arcs along the one-best path from the lattice.
                     E.g. 506_m01-049-00 8 12 1 7722 282 272 288 231
                     <utterance-id> <start-frame> <num-frames> <posterior> <word> [<ali>]
                     [<phone1> <phone2>...]
  output-text: File containing the hypothesis transcription with <unk> recognized by the
               unk-model.
               E.g. A move to stop mr. gaitskell.

Eg. local/unk_arc_post_to_transcription.py lang/phones.txt lang/words.txt
data/lang/oov.int
"""
import argparse
import os
import sys

parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""")
parser.add_argument('phones', type=str, help='File name of a file that contains the'
                    ' symbol-table for phones. Each line must be: <phone> <phoneID>')
parser.add_argument('words', type=str, help='File name of a file that contains the'
                    ' symbol-table for words. Each line must be: <word> <word-id>')
parser.add_argument('unk', type=str, default='-', help='File name of a file that'
                    ' contains the ID of <unk>. The content must be: <oov-id>, e.g. 231')
parser.add_argument('--one-best-arc-post', type=str, default='-', help='A file in arc-post'
                    ' format, which is a list of timing info and posterior of arcs'
                    ' along the one-best path from the lattice')
parser.add_argument('--output-text', type=str, default='-', help='File containing the'
                    ' hypothesis transcription with <unk> recognized by the unk-model')
args = parser.parse_args()


### main ###
phone_handle = open(args.phones, 'r', encoding='latin-1')  # Create file handles
word_handle = open(args.words, 'r', encoding='latin-1')
unk_handle = open(args.unk, 'r', encoding='latin-1')
if args.one_best_arc_post == '-':
    arc_post_handle = sys.stdin
else:
    arc_post_handle = open(args.one_best_arc_post, 'r', encoding='latin-1')
if args.output_text == '-':
    output_text_handle = sys.stdout
else:
    output_text_handle = open(args.output_text, 'w', encoding='latin-1')

id2phone = dict()  # Stores the mapping from phone-id (str) to phone (str)
phones_data = phone_handle.read().strip().split("\n")

for key_val in phones_data:
    key_val = key_val.split(" ")
    id2phone[key_val[1]] = key_val[0]

word_dict = dict()  # Stores the mapping from word-id (str) to word (str)
word_data_vect = word_handle.read().strip().split("\n")

for key_val in word_data_vect:
    key_val = key_val.split(" ")
    word_dict[key_val[1]] = key_val[0]
unk_val = unk_handle.read().strip().split(" ")[0]

utt_word_dict = dict()  # Dict of lists; maps utterance-id (str) to its list of decoded words
for line in arc_post_handle:
    line_vect = line.strip().split("\t")
    if len(line_vect) < 6:  # Check that the 1best-arc-post line has all expected fields
        print("Error: Bad line: '{}' Expecting 6 fields. Skipping...".format(line),
              file=sys.stderr)
        continue
    utt_id = line_vect[0]
    word = line_vect[4]
    phones = line_vect[5]
    if utt_id not in utt_word_dict:
        utt_word_dict[utt_id] = list()

    if word == unk_val:  # Get the 1-best phone sequence given by the unk-model
        phone_id_seq = phones.split(" ")
        phone_seq = list()
        for pkey in phone_id_seq:
            phone_seq.append(id2phone[pkey])  # Convert the phone-id sequence to a phone sequence
        phone_2_word = list()
        for phone_val in phone_seq:
            phone_2_word.append(phone_val.split('_')[0])  # Remove the word-position markers (e.g. _B)
        phone_2_word = ''.join(phone_2_word)  # Concatenate the phone sequence into a word
        utt_word_dict[utt_id].append(phone_2_word)  # Store the word predicted by the unk-model
    else:
        if word == '0':  # Store space/silence
            word_val = ' '
        else:
            word_val = word_dict[word]
        utt_word_dict[utt_id].append(word_val)  # Store the word from the 1best-arc-post

for utt_key in sorted(utt_word_dict.keys()):  # Write one transcription line per utterance
    transcription = utt_key
    for word in utt_word_dict[utt_key]:
        transcription = transcription + " " + word
    output_text_handle.write(transcription + '\n')
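A minimal, self-contained sketch (not part of the PR) of the core transformation this script performs on an <unk> arc: the phone-id sequence is looked up in the phone symbol table, the word-position markers (e.g. `_B`, `_I`, `_E`) are stripped, and the remaining characters are concatenated into a word. The tiny `id2phone` table below is a made-up illustration, not real IAM data.

```python
# Hypothetical phone symbol table; real tables come from lang/phones.txt.
id2phone = {'282': 'm_B', '272': 'r_I', '288': '._E'}
unk_phone_ids = '282 272 288'  # phone-id field of an <unk> arc-post line

# Map ids to phones, strip the position marker after '_', join into a word.
phones = [id2phone[i] for i in unk_phone_ids.split(' ')]
word = ''.join(p.split('_')[0] for p in phones)
print(word)  # mr.
```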
Empty file modified egs/iam/v2/cmd.sh
100644 → 100755
Empty file.
34 changes: 34 additions & 0 deletions egs/iam/v2/local/augment_data.sh
@@ -0,0 +1,34 @@
#!/bin/bash
# Copyright 2018 Hossein Hadian
# 2018 Ashish Arora

# Apache 2.0
# This script performs data augmentation.

nj=4
cmd=run.pl
feat_dim=40
echo "$0 $@"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;

srcdir=$1
outdir=$2
datadir=$3
aug_set=aug1
mkdir -p $datadir/augmentations
echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp"

for set in $aug_set; do
  image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \
    $srcdir $datadir/augmentations/$set
  cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt
  local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \
    --fliplr false --augment true $datadir/augmentations/$set
done

echo " combine original data and data from different augmentations"
utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set
cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt
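The copy/combine steps above amount to simple bookkeeping: every utterance and speaker id in the augmented copy gets a prefix (so it cannot collide with the original), and the two sets are then merged. A sketch of that idea on an `utt2spk`-style mapping; the ids and the `prefix_utts` helper are illustrative, not Kaldi's API.

```python
def prefix_utts(utt2spk, prefix):
    # Mimics copy_data_dir.sh --utt-prefix/--spk-prefix: rename both the
    # utterance id and the speaker id by prepending the same prefix.
    return {prefix + utt: prefix + spk for utt, spk in utt2spk.items()}

orig = {'writer01-utt1': 'writer01', 'writer02-utt1': 'writer02'}
aug = prefix_utts(orig, 'aug1-')     # the aug1 copy made by the loop above
combined = {**orig, **aug}           # combine_data.sh keeps both copies
print(len(combined))  # 4
```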
30 changes: 30 additions & 0 deletions egs/iam/v2/local/chain/compare_wer.sh
@@ -50,6 +50,36 @@ for x in $*; do
done
echo

echo -n "# WER val "
for x in $*; do
wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}')
printf "% 10s" $wer
done
echo

echo -n "# WER (rescored) val "
for x in $*; do
wer="--"
[ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}')
printf "% 10s" $wer
done
echo

echo -n "# CER val "
for x in $*; do
cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}')
printf "% 10s" $cer
done
echo

echo -n "# CER (rescored) val "
for x in $*; do
cer="--"
[ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}')
printf "% 10s" $cer
done
echo

if $used_epochs; then
exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi
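Each loop above extracts the second whitespace-separated field from a `scoring_kaldi/best_wer` (or `best_cer`) file via `awk '{print $2}'`. The same extraction, sketched in Python; the sample line is a made-up example of the usual `%WER` summary format, not a result from this PR.

```python
# Hypothetical best_wer line in Kaldi's usual summary format.
line = "%WER 13.64 [ 1103 / 8089, 154 ins, 199 del, 750 sub ]"

wer = line.split()[1]  # the field awk '{print $2}' selects
print(wer)  # 13.64
```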
2 changes: 1 addition & 1 deletion egs/iam/v2/local/chain/run_cnn_e2eali.sh