Skip to content

Commit

Permalink
[scripts] Fix an issue in BPE-related scripts (don't assume opt-sil is phone 1) (kaldi-asr#2547)
Browse files Browse the repository at this point in the history
  • Loading branch information
hhadian authored and danpovey committed Jul 25, 2018
1 parent e6fe7e8 commit de03a75
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 18 deletions.
4 changes: 2 additions & 2 deletions egs/madcat_ar/v1/local/prepare_lexicon.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
line_vect = line.strip().split(' ')
for i in range(1, len(line_vect)):
characters = list(line_vect[i])
# Put SIL instead of "|". Because every "|" in the beginning of the words is for initial-space of that word
characters = " ".join([ 'SIL' if char == '|' else char for char in characters])
# Put SIL instead of "|". Because every "|" in the beginning of the words is for initial-space of that word
characters = " ".join(['SIL' if char == '|' else char for char in characters])
lex[line_vect[i]] = characters
if line_vect[i] == '#':
lex[line_vect[i]] = "<HASH>"
Expand Down
6 changes: 2 additions & 4 deletions egs/madcat_ar/v1/local/prepend_words.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This script, prepend '|' to every words in the transcript to mark
Expand All @@ -10,6 +10,4 @@
infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
for line in infile:
output.write(' '.join([ "|"+word for word in line.split()]) + '\n')


output.write(' '.join(["|" + word for word in line.split()]) + '\n')
6 changes: 3 additions & 3 deletions egs/madcat_ar/v1/local/reverse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
# -*- coding: utf-8 -*-

# This script, reverse all latin and digits sequences
# (including words like MP3) to put them in the right order in the images.
# (including words like MP3) to put them in the right order in the images.

import re, os, sys, io

in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
for line in in_stream:
out_stream.write( re.sub(r'[a-zA-Z0-9][a-zA-Z0-9\s\.\:]*[a-zA-Z0-9]' , lambda m:m.group(0)[::-1] , line ))

out_stream.write(re.sub(r'[a-zA-Z0-9][a-zA-Z0-9\s\.\:]*[a-zA-Z0-9]',
lambda m:m.group(0)[::-1], line))
10 changes: 7 additions & 3 deletions egs/madcat_ar/v1/run_end2end.sh
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,14 @@ fi

if [ $stage -le 5 ]; then
echo "$0: Preparing dictionary and lang..."
cut -d' ' -f2- data/train/text | python3 local/reverse.py | python3 local/prepend_words.py | python3 utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out
for set in test train dev ; do
cut -d' ' -f2- data/train/text | local/reverse.py | \
local/prepend_words.py | \
utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out
for set in test train dev; do
cut -d' ' -f1 data/$set/text > data/$set/ids
cut -d' ' -f2- data/$set/text | python3 local/reverse.py | python3 local/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c data/train/bpe.out | sed 's/@@//g' > data/$set/bpe_text
cut -d' ' -f2- data/$set/text | local/reverse.py | \
local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \
| sed 's/@@//g' > data/$set/bpe_text
mv data/$set/text data/$set/text.old
paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text
done
Expand Down
16 changes: 10 additions & 6 deletions egs/wsj/s5/utils/lang/bpe/add_final_optional_silence.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ final_sil_prob=0.5

echo "$0 $@" # Print the command line for logging

. utils/parse_options.sh
. ./utils/parse_options.sh

if [ $# -ne 1 ]; then
echo "Usage: $0 <lang>"
echo " Add final optional silence to lexicon FSTs (L.fst and L_disambig.fst) in "
echo " Add final optional silence to lexicon FSTs (L.fst and L_disambig.fst) in"
echo " lang/ directory <lang>."
echo " This can be useful in systems with byte-pair encoded (BPE) lexicons, in which"
echo " the word-initial silence is part of the lexicon, so we turn off the standard"
echo " optional silence in the lexicon"
echo "options:"
echo " --final-sil-prob <final silence probability> # default 0.5"
echo " --final-sil-prob <final silence probability> # default 0.5"
exit 1;
fi

Expand All @@ -26,6 +26,8 @@ if [ $lang/phones/final_sil_prob -nt $lang/phones/nonsilence.txt ]; then
exit 1;
fi

silphone=$(cat $lang/phones/optional_silence.int)

sil_eq_zero=$(echo $(perl -e "if ( $final_sil_prob == 0.0) {print 'true';} else {print 'false';}"))
sil_eq_one=$(echo $(perl -e "if ( $final_sil_prob == 1.0) {print 'true';} else {print 'false';}"))
sil_lt_zero=$(echo $(perl -e "if ( $final_sil_prob < 0.0) {print 'true';} else {print 'false';}"))
Expand All @@ -39,15 +41,17 @@ else
echo "$0 final-sil-prob = 0 => Final silence was not added."
exit 0;
elif $sil_eq_one; then
echo -e "0\t1\t1\t0\n1" | fstcompile > $lang/final_sil.fst
( echo "0 1 $silphone 0";
echo "1" ) | fstcompile > $lang/final_sil.fst
else
log_silprob=$(echo $(perl -e "print log $final_sil_prob"))
echo -e "0\t1\t1\t0\t$log_silprob\n0\t$log_silprob\n1\t0.0" | fstcompile > $lang/final_sil.fst
( echo "0 1 $silphone 0 $log_silprob";
echo "0 $log_silprob";
echo "1" ) | fstcompile > $lang/final_sil.fst
fi
mv $lang/L.fst $lang/L.fst.orig
mv $lang/L_disambig.fst $lang/L_disambig.fst.orig
fstconcat $lang/L.fst.orig $lang/final_sil.fst | fstarcsort --sort_type=olabel > $lang/L.fst
fstconcat $lang/L_disambig.fst.orig $lang/final_sil.fst | fstarcsort --sort_type=olabel > $lang/L_disambig.fst
echo "$final_sil_prob" > $lang/phones/final_sil_prob
fi

0 comments on commit de03a75

Please sign in to comment.