Skip to content

Commit

Permalink
[egs] madcat arabic: clean scripts, tuning, rescoring, text localization (#2716)
Browse files Browse the repository at this point in the history
  • Loading branch information
aarora8 authored and danpovey committed Oct 9, 2018
1 parent 735e2a5 commit 43ec82e
Show file tree
Hide file tree
Showing 30 changed files with 1,108 additions and 444 deletions.
62 changes: 47 additions & 15 deletions egs/cifar/v1/image/ocr/make_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,13 @@
'and right side of the image.')
parser.add_argument('--num-channels', type=int, default=1,
help='Number of color channels')
parser.add_argument('--vertical-shift', type=int, default=0,
help='total number of padding pixel per column')
parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False,
help="Flip the image left-right for right to left languages")
parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False,
help="performs image augmentation")
parser.add_argument('--augment_type', type=str, default='no_aug',
choices=['no_aug', 'random_scale','random_shift'],
help='Subset of data to process.')
args = parser.parse_args()


Expand All @@ -68,7 +71,6 @@ def write_kaldi_matrix(file_handle, matrix, key):
file_handle.write("\n")
file_handle.write(" ]\n")


def horizontal_pad(im, allowed_lengths = None):
if allowed_lengths is None:
left_padding = right_padding = args.padding
Expand Down Expand Up @@ -112,6 +114,33 @@ def get_scaled_image_aug(im, mode='normal'):
return im_scaled_up
return im

def vertical_shift(im, mode='normal', shift=None):
    """Pad a grayscale image vertically with near-white noise rows.

    A total of `shift` rows of padding (value ~255 minus small Gaussian
    noise) is split between the top and bottom of the image.

    Args:
        im: 2-D numpy array (height x width) holding the image.
        mode: how to split the padding:
            'normal'  - as evenly as possible between top and bottom;
            'top'     - randomly biased so more rows go on top;
            'bottom'  - randomly biased so more rows go on the bottom;
            'notmid'  - pick 'top' or 'bottom' at random (for augmentation).
        shift: total number of padding rows; defaults to
            args.vertical_shift for backward compatibility with existing
            callers.

    Returns:
        The padded image (or `im` unchanged when the shift is 0).

    Raises:
        ValueError: if `mode` is not one of the values listed above.
    """
    if shift is None:
        shift = args.vertical_shift
    if shift == 0:
        return im
    total = shift
    if mode == 'notmid':
        # Randomly choose which side gets the larger share of padding.
        mode = 'top' if random.randint(0, 1) == 0 else 'bottom'
    # NOTE: random.randint requires integer endpoints; the original code
    # used `total / 2`, which is a float in Python 3 and raises an error
    # (always for odd totals). Floor division fixes this.
    if mode == 'normal':
        top = total // 2
        bottom = total - top
    elif mode == 'top':      # more padding on top
        top = random.randint(total // 2, total)
        bottom = total - top
    elif mode == 'bottom':   # more padding on bottom
        top = random.randint(0, total // 2)
        bottom = total - top
    else:
        raise ValueError("Unknown vertical_shift mode: {}".format(mode))

    width = im.shape[1]

    def _noise_rows(num_rows):
        # Rows of near-white pixels: 255 minus small Gaussian noise, so the
        # padding is not perfectly uniform (mild augmentation effect).
        return (255 * np.ones((num_rows, width), dtype=int) -
                np.random.normal(2, 1, (num_rows, width)).astype(int))

    im_pad = np.concatenate((_noise_rows(top), im), axis=0)
    im_pad = np.concatenate((im_pad, _noise_rows(bottom)), axis=0)
    return im_pad

### main ###
random.seed(1)
Expand All @@ -134,7 +163,6 @@ def get_scaled_image_aug(im, mode='normal'):

num_fail = 0
num_ok = 0
aug_setting = ['normal', 'scaled']
with open(data_list_path) as f:
for line in f:
line = line.strip()
Expand All @@ -144,21 +172,25 @@ def get_scaled_image_aug(im, mode='normal'):
im = misc.imread(image_path)
if args.fliplr:
im = np.fliplr(im)
if args.augment:
im_aug = get_scaled_image_aug(im, aug_setting[1])
else:
im_aug = get_scaled_image_aug(im, aug_setting[0])
im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths)
if im_horizontal_padded is None:
if args.augment_type == 'no_aug' or 'random_shift':
im = get_scaled_image_aug(im, 'normal')
elif args.augment_type == 'random_scale':
im = get_scaled_image_aug(im, 'scaled')
im = horizontal_pad(im, allowed_lengths)
if im is None:
num_fail += 1
continue
if args.augment_type == 'no_aug' or 'random_scale':
im = vertical_shift(im, 'normal')
elif args.augment_type == 'random_shift':
im = vertical_shift(im, 'notmid')
if args.num_channels == 1:
data = np.transpose(im_horizontal_padded, (1, 0))
data = np.transpose(im, (1, 0))
elif args.num_channels == 3:
H = im_horizontal_padded.shape[0]
W = im_horizontal_padded.shape[1]
C = im_horizontal_padded.shape[2]
data = np.reshape(np.transpose(im_horizontal_padded, (1, 0, 2)), (W, H * C))
H = im.shape[0]
W = im.shape[1]
C = im.shape[2]
data = np.reshape(np.transpose(im, (1, 0, 2)), (W, H * C))
data = np.divide(data, 255.0)
num_ok += 1
write_kaldi_matrix(out_fh, data, image_id)
Expand Down
14 changes: 14 additions & 0 deletions egs/madcat_ar/v1/local/chain/compare_wer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,27 @@ for x in $*; do
done
echo

echo -n "# WER (rescored) "
for x in $*; do
wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}')
printf "% 10s" $wer
done
echo

echo -n "# CER "
for x in $*; do
cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}')
printf "% 10s" $cer
done
echo

echo -n "# CER (rescored) "
for x in $*; do
cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}')
printf "% 10s" $cer
done
echo

if $used_epochs; then
exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi
Expand Down
30 changes: 13 additions & 17 deletions egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,16 @@ reporting_email=
# chain options
train_stage=-10
xent_regularize=0.1
frame_subsampling_factor=4
# training chunk-options
chunk_width=340,300,200,100
num_leaves=500
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
tdnn_dim=450
# training options
srand=0
remove_egs=false
lang_test=lang_test
lang_decode=data/lang
lang_rescore=data/lang_rescore_6g
# End configuration section.
echo "$0 $@" # Print the command line for logging

Expand Down Expand Up @@ -168,13 +166,13 @@ if [ $stage -le 5 ]; then
--chain.leaky-hmm-coefficient=0.1 \
--chain.l2-regularize=0.00005 \
--chain.apply-deriv-weights=false \
--chain.lm-opts="--num-extra-lm-states=500" \
--chain.frame-subsampling-factor=$frame_subsampling_factor \
--chain.alignment-subsampling-factor=$frame_subsampling_factor \
--chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \
--chain.frame-subsampling-factor=4 \
--chain.alignment-subsampling-factor=4 \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs=4 \
--trainer.frames-per-iter=1000000 \
--trainer.frames-per-iter=2000000 \
--trainer.optimization.num-jobs-initial=3 \
--trainer.optimization.num-jobs-final=16 \
--trainer.optimization.initial-effective-lrate=0.001 \
Expand All @@ -183,10 +181,6 @@ if [ $stage -le 5 ]; then
--trainer.num-chunk-per-minibatch=64,32 \
--trainer.optimization.momentum=0.0 \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.chunk-left-context-initial=0 \
--egs.chunk-right-context-final=0 \
--egs.dir="$common_egs_dir" \
--egs.opts="--frames-overlap-per-eg 0" \
--cleanup.remove-egs=$remove_egs \
Expand All @@ -207,18 +201,20 @@ if [ $stage -le 6 ]; then
# as long as phones.txt was compatible.

utils/mkgraph.sh \
--self-loop-scale 1.0 data/$lang_test \
--self-loop-scale 1.0 $lang_decode \
$dir $dir/graph || exit 1;
fi

if [ $stage -le 7 ]; then
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context $chunk_left_context \
--extra-right-context $chunk_right_context \
--extra-left-context-initial 0 \
--extra-right-context-final 0 \
--frames-per-chunk $frames_per_chunk \
--nj $nj --cmd "$cmd" \
$dir/graph data/test $dir/decode_test || exit 1;

steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
data/test $dir/decode_test{,_rescored} || exit 1
fi

echo "Done. Date: $(date). Results:"
local/chain/compare_wer.sh $dir
29 changes: 12 additions & 17 deletions egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,15 @@ lats_affix=
# chain options
train_stage=-10
xent_regularize=0.1
frame_subsampling_factor=4
# training chunk-options
chunk_width=340,300,200,100
num_leaves=500
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
tdnn_dim=450
# training options
srand=0
remove_egs=false
lang_test=lang_test
lang_decode=data/lang
lang_rescore=data/lang_rescore_6g
# End configuration section.
echo "$0 $@" # Print the command line for logging

Expand Down Expand Up @@ -170,13 +167,13 @@ if [ $stage -le 5 ]; then
--chain.leaky-hmm-coefficient=0.1 \
--chain.l2-regularize=0.00005 \
--chain.apply-deriv-weights=false \
--chain.lm-opts="--num-extra-lm-states=500" \
--chain.frame-subsampling-factor=$frame_subsampling_factor \
--chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \
--chain.frame-subsampling-factor=4 \
--chain.alignment-subsampling-factor=1 \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs=4 \
--trainer.frames-per-iter=1000000 \
--trainer.frames-per-iter=2000000 \
--trainer.optimization.num-jobs-initial=3 \
--trainer.optimization.num-jobs-final=16 \
--trainer.optimization.initial-effective-lrate=0.001 \
Expand All @@ -185,10 +182,6 @@ if [ $stage -le 5 ]; then
--trainer.num-chunk-per-minibatch=64,32 \
--trainer.optimization.momentum=0.0 \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.chunk-left-context-initial=0 \
--egs.chunk-right-context-final=0 \
--egs.dir="$common_egs_dir" \
--egs.opts="--frames-overlap-per-eg 0" \
--cleanup.remove-egs=$remove_egs \
Expand All @@ -209,18 +202,20 @@ if [ $stage -le 6 ]; then
# as long as phones.txt was compatible.

utils/mkgraph.sh \
--self-loop-scale 1.0 data/$lang_test \
--self-loop-scale 1.0 $lang_decode \
$dir $dir/graph || exit 1;
fi

if [ $stage -le 7 ]; then
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context $chunk_left_context \
--extra-right-context $chunk_right_context \
--extra-left-context-initial 0 \
--extra-right-context-final 0 \
--frames-per-chunk $frames_per_chunk \
--nj $nj --cmd "$cmd" \
$dir/graph data/test $dir/decode_test || exit 1;

steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
data/test $dir/decode_test{,_rescored} || exit 1
fi

echo "Done. Date: $(date). Results:"
local/chain/compare_wer.sh $dir
31 changes: 13 additions & 18 deletions egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,14 @@ reporting_email=
train_stage=-10
xent_regularize=0.1
frame_subsampling_factor=4
# training chunk-options
chunk_width=340,300,200,100
num_leaves=500
# we don't need extra left/right context for TDNN systems.
chunk_left_context=0
chunk_right_context=0
tdnn_dim=450
# training options
srand=0
remove_egs=true
lang_test=lang_test
lang_decode=data/lang
lang_rescore=data/lang_rescore_6g
# End configuration section.
echo "$0 $@" # Print the command line for logging

Expand Down Expand Up @@ -171,28 +168,24 @@ if [ $stage -le 5 ]; then
--chain.leaky-hmm-coefficient=0.1 \
--chain.l2-regularize=0.00005 \
--chain.apply-deriv-weights=false \
--chain.lm-opts="--num-extra-lm-states=500" \
--chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \
--chain.frame-subsampling-factor=$frame_subsampling_factor \
--chain.alignment-subsampling-factor=1 \
--chain.left-tolerance 3 \
--chain.right-tolerance 3 \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs=2 \
--trainer.frames-per-iter=1000000 \
--trainer.num-epochs=4 \
--trainer.frames-per-iter=2000000 \
--trainer.optimization.num-jobs-initial=3 \
--trainer.optimization.num-jobs-final=16 \
--trainer.optimization.initial-effective-lrate=0.001 \
--trainer.optimization.final-effective-lrate=0.0001 \
--trainer.optimization.shrink-value=1.0 \
--trainer.num-chunk-per-minibatch=96,64 \
--trainer.num-chunk-per-minibatch=64,32 \
--trainer.optimization.momentum=0.0 \
--trainer.add-option="--optimization.memory-compression-level=2" \
--egs.chunk-width=$chunk_width \
--egs.chunk-left-context=$chunk_left_context \
--egs.chunk-right-context=$chunk_right_context \
--egs.chunk-left-context-initial=0 \
--egs.chunk-right-context-final=0 \
--egs.dir="$common_egs_dir" \
--egs.opts="--frames-overlap-per-eg 0" \
--cleanup.remove-egs=$remove_egs \
Expand All @@ -213,18 +206,20 @@ if [ $stage -le 6 ]; then
# as long as phones.txt was compatible.

utils/mkgraph.sh \
--self-loop-scale 1.0 data/$lang_test \
--self-loop-scale 1.0 $lang_decode \
$dir $dir/graph || exit 1;
fi

if [ $stage -le 7 ]; then
frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--extra-left-context $chunk_left_context \
--extra-right-context $chunk_right_context \
--extra-left-context-initial 0 \
--extra-right-context-final 0 \
--frames-per-chunk $frames_per_chunk \
--nj $nj --cmd "$cmd" \
$dir/graph data/test $dir/decode_test || exit 1;

steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \
data/test $dir/decode_test{,_rescored} || exit 1
fi

echo "Done. Date: $(date). Results:"
local/chain/compare_wer.sh $dir
Loading

0 comments on commit 43ec82e

Please sign in to comment.