[scripts] Fix an issue in BPE-related scripts (don't assume opt-sil is phone 1) (kaldi-asr#2547)
Usanter · Jul 25, 2018 · de03a75 · de03a75
1 parent e6fe7e8
commit de03a75
Show file tree

Hide file tree

Showing 5 changed files with 24 additions and 18 deletions.
diff --git a/egs/madcat_ar/v1/local/prepare_lexicon.py b/egs/madcat_ar/v1/local/prepare_lexicon.py
@@ -19,8 +19,8 @@
         line_vect = line.strip().split(' ')
         for i in range(1, len(line_vect)):
             characters = list(line_vect[i])
-	    # Put SIL instead of "|". Because every "|" in the beginning of the words is for initial-space of that word
-            characters = " ".join([ 'SIL' if char == '|' else char for char in characters])
+            # Put SIL instead of "|". Because every "|" in the beginning of the words is for initial-space of that word
+            characters = " ".join(['SIL' if char == '|' else char for char in characters])
             lex[line_vect[i]] = characters
             if line_vect[i] == '#':
                 lex[line_vect[i]] = "<HASH>"

diff --git a/egs/madcat_ar/v1/local/prepend_words.py b/egs/madcat_ar/v1/local/prepend_words.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
 # This script, prepend '|' to every words in the transcript to mark
@@ -10,6 +10,4 @@
 infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
 output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
 for line in infile:
-    output.write(' '.join([ "|"+word for word in line.split()]) + '\n')
-
-
+    output.write(' '.join(["|" + word for word in line.split()]) + '\n')
diff --git a/egs/madcat_ar/v1/local/reverse.py b/egs/madcat_ar/v1/local/reverse.py
@@ -2,12 +2,12 @@
 # -*- coding: utf-8 -*-
 
 # This script, reverse all latin and digits sequences
-# (including words like MP3) to put them in the right order in the images. 
+# (including words like MP3) to put them in the right order in the images.
 
 import re, os, sys, io
 
 in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
 out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
 for line in in_stream:
-    out_stream.write( re.sub(r'[a-zA-Z0-9][a-zA-Z0-9\s\.\:]*[a-zA-Z0-9]' , lambda m:m.group(0)[::-1] , line ))
-
+    out_stream.write(re.sub(r'[a-zA-Z0-9][a-zA-Z0-9\s\.\:]*[a-zA-Z0-9]',
+                            lambda m:m.group(0)[::-1], line))
diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh
@@ -80,10 +80,14 @@ fi
 
 if [ $stage -le 5 ]; then
   echo "$0: Preparing dictionary and lang..."
-  cut -d' ' -f2- data/train/text | python3 local/reverse.py | python3 local/prepend_words.py | python3 utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out
-  for set in test train dev ; do
+  cut -d' ' -f2- data/train/text | local/reverse.py | \
+    local/prepend_words.py | \
+    utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out
+  for set in test train dev; do
     cut -d' ' -f1 data/$set/text > data/$set/ids
-    cut -d' ' -f2- data/$set/text | python3 local/reverse.py | python3 local/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c data/train/bpe.out | sed 's/@@//g' > data/$set/bpe_text
+    cut -d' ' -f2- data/$set/text | local/reverse.py | \
+      local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \
+      | sed 's/@@//g' > data/$set/bpe_text
     mv data/$set/text data/$set/text.old
     paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text
   done

diff --git a/egs/wsj/s5/utils/lang/bpe/add_final_optional_silence.sh b/egs/wsj/s5/utils/lang/bpe/add_final_optional_silence.sh
@@ -5,17 +5,17 @@ final_sil_prob=0.5
 
 echo "$0 $@"  # Print the command line for logging
 
-. utils/parse_options.sh
+. ./utils/parse_options.sh
 
 if [ $# -ne 1 ]; then
   echo "Usage: $0  <lang>"
-  echo " Add final optional silence to lexicon FSTs (L.fst and L_disambig.fst) in  "
+  echo " Add final optional silence to lexicon FSTs (L.fst and L_disambig.fst) in"
   echo " lang/ directory <lang>."
   echo " This can be useful in systems with byte-pair encoded (BPE) lexicons, in which"
   echo " the word-initial silence is part of the lexicon, so we turn off the standard"
   echo " optional silence in the lexicon"
   echo "options:"
-  echo "   --final-sil-prob <final silence probability>		# default 0.5"
+  echo "   --final-sil-prob <final silence probability>      # default 0.5"
   exit 1;
 fi
 
@@ -26,6 +26,8 @@ if [ $lang/phones/final_sil_prob -nt $lang/phones/nonsilence.txt ]; then
   exit 1;
 fi
 
+silphone=$(cat $lang/phones/optional_silence.int)
+
 sil_eq_zero=$(echo $(perl -e "if ( $final_sil_prob == 0.0) {print 'true';} else {print 'false';}"))
 sil_eq_one=$(echo $(perl -e "if ( $final_sil_prob == 1.0) {print 'true';} else {print 'false';}"))
 sil_lt_zero=$(echo $(perl -e "if ( $final_sil_prob < 0.0) {print 'true';} else {print 'false';}"))
@@ -39,15 +41,17 @@ else
     echo "$0 final-sil-prob = 0 => Final silence was not added."
     exit 0;
   elif $sil_eq_one; then
-    echo -e "0\t1\t1\t0\n1" | fstcompile > $lang/final_sil.fst
+    ( echo "0 1 $silphone 0";
+      echo "1" ) | fstcompile > $lang/final_sil.fst
   else
     log_silprob=$(echo $(perl -e "print log $final_sil_prob"))
-    echo -e "0\t1\t1\t0\t$log_silprob\n0\t$log_silprob\n1\t0.0" | fstcompile > $lang/final_sil.fst
+    ( echo "0 1 $silphone 0 $log_silprob";
+      echo "0 $log_silprob";
+      echo "1" ) | fstcompile > $lang/final_sil.fst
   fi
   mv $lang/L.fst $lang/L.fst.orig
   mv $lang/L_disambig.fst $lang/L_disambig.fst.orig
   fstconcat $lang/L.fst.orig $lang/final_sil.fst | fstarcsort --sort_type=olabel > $lang/L.fst
   fstconcat $lang/L_disambig.fst.orig $lang/final_sil.fst | fstarcsort --sort_type=olabel > $lang/L_disambig.fst
   echo "$final_sil_prob" > $lang/phones/final_sil_prob
 fi
-