adding overwrite option

kaldi-asr · danpovey · Sep 12, 2018 · Aug 30, 2018 · Aug 31, 2018 · Aug 31, 2018
commit 89c9ec79ff7dce369d1c5e1c030ef225bae053e2
diff --git a/egs/iam/v2/run.sh b/egs/iam/v2/run.sh
@@ -31,6 +31,14 @@ wellington_database=/export/corpora5/Wellington/WWC/
 ./local/check_tools.sh
 
 if [ $stage -le 0 ]; then
+ if [ -f data/train/text ] && ! $overwrite; then
+ echo "Not processing, probably script have run from wrong stage"
+ echo "Exiting with status 1 to avoid data corruption"
+ exit 1;
+ fi
+fi
+
+if [ $stage -le 1 ]; then
  echo "$0: Preparing data..."
  local/prepare_data.sh --download-dir "$iam_database" \
  --wellington-dir "$wellington_database" \
@@ -39,8 +47,8 @@ if [ $stage -le 0 ]; then
 fi
 mkdir -p data/{train,test}/data
 
-if [ $stage -le 1 ]; then
- echo "$(date) stage 1: getting allowed image widths for e2e training..."
+if [ $stage -le 2 ]; then
+ echo "$(date) stage 2: getting allowed image widths for e2e training..."
  image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command
  # The next command creates a "allowed_lengths.txt" file in data/train
  # which will be used by local/make_features.py to enforce the images to
@@ -54,15 +62,15 @@ if [ $stage -le 1 ]; then
  image/fix_data_dir.sh data/train
 fi
 
-if [ $stage -le 2 ]; then
+if [ $stage -le 3 ]; then
  for set in train; do
- echo "$(date) stage 2: Performing augmentation, it will double training data"
+ echo "$(date) stage 3: Performing augmentation, it will double training data"
  local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data
  steps/compute_cmvn_stats.sh data/${set}_aug || exit 1;
  done
 fi
 
-if [ $stage -le 2 ]; then
+if [ $stage -le 4 ]; then
  echo "$0: Preparing BPE..."
  # getting non-silence phones.
  cut -d' ' -f2- data/train/text | \
@@ -96,12 +104,12 @@ END
  done
 fi
 
-if [ $stage -le 3 ]; then
+if [ $stage -le 5 ]; then
  echo "$0: Estimating a language model for decoding..."
  local/train_lm.sh
 fi
 
-if [ $stage -le 4 ]; then
+if [ $stage -le 6 ]; then
  echo "$0: Preparing dictionary and lang..."
  local/prepare_dict.sh
  # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations.
@@ -119,34 +127,34 @@ if [ $stage -le 4 ]; then
  data/lang data/lang_rescore_6g
 fi
 
-if [ $stage -le 5 ]; then
+if [ $stage -le 7 ]; then
  steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train_aug \
  data/lang exp/mono
 fi
 
-if [ $stage -le 6 ] && $decode_gmm; then
+if [ $stage -le 8 ] && $decode_gmm; then
  utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph
 
  steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \
  exp/mono/decode_test
 fi
 
-if [ $stage -le 7 ]; then
+if [ $stage -le 9 ]; then
  steps/align_si.sh --nj $nj --cmd $cmd data/train_aug data/lang \
  exp/mono exp/mono_ali
 
  steps/train_deltas.sh --cmd $cmd 500 20000 data/train_aug data/lang \
  exp/mono_ali exp/tri
 fi
 
-if [ $stage -le 8 ] && $decode_gmm; then
+if [ $stage -le 10 ] && $decode_gmm; then
  utils/mkgraph.sh data/lang exp/tri exp/tri/graph
 
  steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \
  exp/tri/decode_test
 fi
 
-if [ $stage -le 9 ]; then
+if [ $stage -le 11 ]; then
  steps/align_si.sh --nj $nj --cmd $cmd data/train_aug data/lang \
  exp/tri exp/tri_ali
 
@@ -155,38 +163,38 @@ if [ $stage -le 9 ]; then
  data/train_aug data/lang exp/tri_ali exp/tri2
 fi
 
-if [ $stage -le 10 ] && $decode_gmm; then
+if [ $stage -le 12 ] && $decode_gmm; then
  utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph
 
  steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \
  data/test exp/tri2/decode_test
 fi
 
-if [ $stage -le 11 ]; then
+if [ $stage -le 13 ]; then
  steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \
  data/train_aug data/lang exp/tri2 exp/tri2_ali
 
  steps/train_sat.sh --cmd $cmd 500 20000 \
  data/train_aug data/lang exp/tri2_ali exp/tri3
 fi
 
-if [ $stage -le 12 ] && $decode_gmm; then
+if [ $stage -le 14 ] && $decode_gmm; then
  utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph
 
  steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \
  data/test exp/tri3/decode_test
 fi
 
-if [ $stage -le 13 ]; then
+if [ $stage -le 15 ]; then
  steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \
  data/train_aug data/lang exp/tri3 exp/tri3_ali
 fi
 
-if [ $stage -le 14 ]; then
+if [ $stage -le 16 ]; then
  local/chain/run_cnn.sh --train_set train_aug
 fi
 
-if [ $stage -le 15 ]; then
+if [ $stage -le 17 ]; then
  local/chain/run_cnn_chainali.sh --train_set train_aug \
  --chain-model-dir exp/chain/cnn_1a --stage 2
 fi
diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh
@@ -28,6 +28,14 @@ wellington_database=/export/corpora5/Wellington/WWC/
 ./local/check_tools.sh
 
 if [ $stage -le 0 ]; then
+ if [ -f data/train/text ] && ! $overwrite; then
+ echo "Not processing, probably script have run from wrong stage"
+ echo "Exiting with status 1 to avoid data corruption"
+ exit 1;
+ fi
+fi
+
+if [ $stage -le 1 ]; then
  echo "$0: Preparing data..."
  local/prepare_data.sh --download-dir "$iam_database" \
  --wellington-dir "$wellington_database" \
@@ -36,8 +44,8 @@ if [ $stage -le 0 ]; then
 fi
 
 mkdir -p data/{train,test}/data
-if [ $stage -le 1 ]; then
- echo "$(date) stage 1: getting allowed image widths for e2e training..."
+if [ $stage -le 2 ]; then
+ echo "$(date) stage 2: getting allowed image widths for e2e training..."
  image/get_image2num_frames.py --feat-dim 40 data/train # This will be needed for the next command
  # The next command creates a "allowed_lengths.txt" file in data/train
  # which will be used by local/make_features.py to enforce the images to
@@ -51,15 +59,15 @@ if [ $stage -le 1 ]; then
  image/fix_data_dir.sh data/train
 fi
 
-if [ $stage -le 2 ]; then
+if [ $stage -le 3 ]; then
  for set in train; do
- echo "$(date) stage 2: Performing augmentation, it will double training data"
+ echo "$(date) stage 3: Performing augmentation, it will double training data"
  local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/${set} data/${set}_aug data
  steps/compute_cmvn_stats.sh data/${set}_aug || exit 1;
  done
 fi
 
-if [ $stage -le 2 ]; then
+if [ $stage -le 4 ]; then
  echo "$0: Preparing BPE..."
  # getting non-silence phones.
  cut -d' ' -f2- data/train/text | \
@@ -93,12 +101,12 @@ END
  done
 fi
 
-if [ $stage -le 3 ]; then
+if [ $stage -le 5 ]; then
  echo "$0: Estimating a language model for decoding..."
  local/train_lm.sh
 fi
 
-if [ $stage -le 4 ]; then
+if [ $stage -le 6 ]; then
  echo "$0: Preparing dictionary and lang..."
  local/prepare_dict.sh
  # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations.
@@ -116,20 +124,20 @@ if [ $stage -le 4 ]; then
  data/lang data/lang_rescore_6g
 fi
 
-if [ $stage -le 5 ]; then
+if [ $stage -le 7 ]; then
  echo "$0: Calling the flat-start chain recipe..."
  local/chain/run_e2e_cnn.sh --train_set train_aug
 fi
 
-if [ $stage -le 6 ]; then
+if [ $stage -le 8 ]; then
  echo "$0: Aligning the training data using the e2e chain model..."
  steps/nnet3/align.sh --nj 50 --cmd "$cmd" \
  --use-gpu false \
  --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \
  data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
 fi
 
-if [ $stage -le 7 ]; then
+if [ $stage -le 9 ]; then
  echo "$0: Building a tree and training a regular chain model using the e2e alignments..."
  local/chain/run_cnn_e2eali.sh --train_set train_aug
 fi