
Iam #2658

Merged: 38 commits, Sep 12, 2018

Changes shown below are from 1 commit.

Commits
a3a18e2  adding changes for language modelling  (aarora8, Aug 30, 2018)
91508b5  adding modifications for augmentation, topology, shearing, run.sh  (aarora8, Aug 31, 2018)
5f273d6  fixing bugs  (aarora8, Aug 31, 2018)
2645f14  fixing bug  (aarora8, Aug 31, 2018)
6ebfdb2  adding parameter tuning  (aarora8, Sep 1, 2018)
b532978  cosmetic fixes and updating results  (aarora8, Sep 1, 2018)
f383334  cosmetic fixes  (aarora8, Sep 1, 2018)
44c9e58  adding results  (aarora8, Sep 1, 2018)
2d11672  removing local/prepare_lang and adding gen_topo in run.sh  (aarora8, Sep 1, 2018)
4fc6705  fixing bugs  (aarora8, Sep 1, 2018)
8877530  updating result  (aarora8, Sep 2, 2018)
59e2c8b  updating documentation, results and parameter tuning  (aarora8, Sep 2, 2018)
5fc0d17  fixing chain scripts  (aarora8, Sep 2, 2018)
1138ee3  updating parameters  (aarora8, Sep 2, 2018)
b3532ce  updating parameters and results  (aarora8, Sep 3, 2018)
9b67d9d  adding overwrite option and punctuation topology  (aarora8, Sep 3, 2018)
89c9ec7  adding overwrite option  (aarora8, Sep 4, 2018)
c05cd4d  adding aachen splits  (aarora8, Sep 4, 2018)
5dfe8fc  fixing bugs  (aarora8, Sep 4, 2018)
d7448df  modification from review  (aarora8, Sep 5, 2018)
d7d5c22  updating parameter and result  (aarora8, Sep 6, 2018)
43e9af9  updating parameter and result  (aarora8, Sep 6, 2018)
17c506b  adding data preprocessing in test and val  (aarora8, Sep 7, 2018)
d640742  updating results  (aarora8, Sep 7, 2018)
7dfd0b5  Merge branch 'master' of https://github.com/kaldi-asr/kaldi into iam_4  (aarora8, Sep 7, 2018)
94a80ad  replacing prepend words with common prepend words  (aarora8, Sep 7, 2018)
711c3c9  updating remove_test_utterances_from_lob for aachen split  (aarora8, Sep 7, 2018)
5f2d960  removing data/val/text from train_lm  (aarora8, Sep 7, 2018)
7f2ad0b  cosmetic fixes in unk arc decoding  (aarora8, Sep 7, 2018)
8f2ac25  adding val data for decoding  (aarora8, Sep 7, 2018)
b8e71b2  modification from the review  (aarora8, Sep 10, 2018)
e9a75f6  modification from review  (aarora8, Sep 10, 2018)
ae674ed  modification from review  (aarora8, Sep 10, 2018)
7651f37  modification for downloading aachen splits  (aarora8, Sep 10, 2018)
417d97c  fixing bug in rescoring  (aarora8, Sep 11, 2018)
6a86531  hardcoding for removing only remaining long utterence  (aarora8, Sep 12, 2018)
ba07ff0  fix in hardcoding  (aarora8, Sep 12, 2018)
5398412  modification from review  (aarora8, Sep 12, 2018)
adding aachen splits
aarora8 committed Sep 4, 2018
commit c05cd4df19953c65f76c09827ffa47513aa6953c
18 changes: 9 additions & 9 deletions egs/iam/v2/local/prepare_data.sh
@@ -161,16 +161,16 @@
 cat $train_old > $train_new
 cat $test_old > $test_new
 cat $val1_old $val2_old > $val_new

-if [ $stage -le 0 ]; then
+if [ ! -f data/train/text ] || $overwrite; then
+  if $process_aachen_split; then
+    local/process_aachen_splits.py data/local aachen_split data/train --dataset train || exit 1
+    local/process_aachen_splits.py data/local aachen_split data/test --dataset test || exit 1
+    local/process_aachen_splits.py data/local aachen_split data/val --dataset validation || exit 1
+  else
     local/process_data.py data/local data/train --dataset train || exit 1
     local/process_data.py data/local data/test --dataset test || exit 1
     local/process_data.py data/local data/val --dataset validation || exit 1
+  fi
-  image/fix_data_dir.sh data/train
-  image/fix_data_dir.sh data/test
-  image/fix_data_dir.sh data/val
+else
+  echo "Not processing data since it is already processed"
 fi
+
+image/fix_data_dir.sh data/train
+image/fix_data_dir.sh data/test
+image/fix_data_dir.sh data/val
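The guard added in this hunk (`if [ ! -f data/train/text ] || $overwrite`) is a general idempotence pattern: skip regeneration when the output already exists unless the caller forces it. A minimal Python sketch of the same idea (function and file names here are illustrative, not part of the PR):

```python
import os

def maybe_process(out_text, process_fn, overwrite=False):
    # Run process_fn only when the output file is absent or overwrite is
    # requested, mirroring `if [ ! -f data/train/text ] || $overwrite`.
    if not os.path.exists(out_text) or overwrite:
        process_fn()
        return True
    print("Not processing data since it is already processed")
    return False
```

This keeps reruns of the recipe cheap while `--overwrite true` still allows a clean rebuild.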
88 changes: 88 additions & 0 deletions egs/iam/v2/local/process_aachen_splits.py
@@ -0,0 +1,88 @@
#!/usr/bin/env python3

# Copyright  2017  Chun Chieh Chang
#            2017  Ashish Arora

""" This script reads the extracted IAM database files and creates
    the following files (for the data subset selected via --dataset):
    text, utt2spk, images.scp.

  Eg. local/process_aachen_splits.py data/local aachen_split data/train --dataset train
  Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from
      utt2spk file: 000_a01-000u-00 000
      images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png
"""

import argparse
import os
import sys
import xml.dom.minidom as minidom

parser = argparse.ArgumentParser(description="""Creates text, utt2spk
                                 and images.scp files.""")
parser.add_argument('database_path', type=str,
                    help='Path to the downloaded (and extracted) IAM data')
parser.add_argument('split_path', type=str,
                    help='location of the train/test/val set')
parser.add_argument('out_dir', type=str,
                    help='location to write output files.')
parser.add_argument('--dataset', type=str, default='train',
                    choices=['train', 'test', 'validation'],
                    help='Subset of data to process.')
args = parser.parse_args()

text_file = os.path.join(args.out_dir, 'text')
text_fh = open(text_file, 'w')

utt2spk_file = os.path.join(args.out_dir, 'utt2spk')
utt2spk_fh = open(utt2spk_file, 'w')

image_file = os.path.join(args.out_dir, 'images.scp')
image_fh = open(image_file, 'w')

dataset_path = os.path.join(args.split_path,
                            args.dataset + '.txt')

text_file_path = os.path.join(args.database_path,
                              'ascii', 'lines.txt')
text_dict = {}


def process_text_file_for_word_model():
    with open(text_file_path, 'rt') as in_file:
        for line in in_file:
            if line[0] == '#':
                continue
            line = line.strip()
            utt_id = line.split(' ')[0]
            text_vect = line.split(' ')[8:]
            text = "".join(text_vect)
            text = text.replace("|", " ")
            text_dict[utt_id] = text


### main ###

print("Processing '{}' data...".format(args.dataset))
process_text_file_for_word_model()

with open(dataset_path) as f:
    for line in f:
        line = line.strip()
        line_vect = line.split('-')
        xml_file = line_vect[0] + '-' + line_vect[1]
        xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml')
        doc = minidom.parse(xml_path)
        form_elements = doc.getElementsByTagName('form')[0]
        writer_id = form_elements.getAttribute('writer-id')
        outerfolder = form_elements.getAttribute('id')[0:3]
        innerfolder = form_elements.getAttribute('id')
        lines_path = os.path.join(args.database_path, 'lines',
                                  outerfolder, innerfolder)
        for file in os.listdir(lines_path):
            if file.endswith(".png"):
                image_file_path = os.path.join(lines_path, file)
                base_name = os.path.splitext(os.path.basename(image_file_path))[0]
                text = text_dict[base_name]
                utt_id = writer_id + '_' + base_name
                text_fh.write(utt_id + ' ' + text + '\n')
                utt2spk_fh.write(utt_id + ' ' + writer_id + '\n')
                image_fh.write(utt_id + ' ' + image_file_path + '\n')
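The `process_text_file_for_word_model` helper above reduces to a small parsing rule for one `ascii/lines.txt` record: field 0 is the utterance id, the transcript starts at field 9, and `|` separates words. A standalone sketch of that rule (the sample line in the test is illustrative, built from the docstring's example transcript):

```python
def parse_lines_entry(line):
    """Parse one non-comment record of IAM ascii/lines.txt into
    (utt_id, text). Fields are space separated; the transcript is
    everything from field 9 onward and uses '|' as the word separator,
    exactly as process_text_file_for_word_model assumes."""
    parts = line.strip().split(' ')
    utt_id = parts[0]
    text = ''.join(parts[8:]).replace('|', ' ')
    return utt_id, text
```

Joining with `''` before replacing `|` is safe because IAM uses `|`, not spaces, between words inside the transcript field.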
1 change: 0 additions & 1 deletion egs/iam/v2/local/process_data.py
@@ -67,7 +67,6 @@ def process_text_file_for_word_model():
         xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml')
         img_num = line[-3:]
         doc = minidom.parse(xml_path)
-
         form_elements = doc.getElementsByTagName('form')[0]
         writer_id = form_elements.getAttribute('writer-id')
         outerfolder = form_elements.getAttribute('id')[0:3]
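Both `process_data.py` and `process_aachen_splits.py` recover the writer id and the image folder names from the form's XML attributes via `minidom`. A minimal self-contained sketch of that lookup, run on a made-up inline form element shaped like the IAM xml files:

```python
import xml.dom.minidom as minidom

# Made-up form element; real ones live in <database_path>/xml/*.xml.
xml_text = '<form id="a01-000u" writer-id="000"></form>'
doc = minidom.parseString(xml_text)
form = doc.getElementsByTagName('form')[0]
writer_id = form.getAttribute('writer-id')    # becomes the speaker id
outerfolder = form.getAttribute('id')[0:3]    # top-level lines/ folder, e.g. 'a01'
innerfolder = form.getAttribute('id')         # per-form folder, e.g. 'a01-000u'
```

The scripts use `minidom.parse(path)` on the real files; `parseString` is used here only to keep the sketch self-contained.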