Added merging manifests script

experimenti · Apr 29, 2017 · 5fd9a22 · 5fd9a22
1 parent de00671
commit 5fd9a22
Show file tree

Hide file tree

Showing 2 changed files with 74 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -33,7 +33,7 @@ pip install -r requirements.txt
 
 ## Dataset
 
-Currently supports AN4, TEDLIUM, Voxforge and LibriSpeech.
+Currently supports AN4, TEDLIUM, Voxforge and LibriSpeech. Scripts will set up the dataset and create the manifest files used in data loading.
 
 ### AN4
 
@@ -105,6 +105,17 @@ To create a custom dataset you must create a CSV file containing the locations o
 
 The first path is to the audio file, and the second path is to a text file containing the transcript on one line. This can then be used as stated below.
 
+
+### Merging multiple manifest files
+
+To create bigger manifest files (to train/test on multiple datasets at once), you can merge all the manifest files found in one directory,
+as shown below. You can also prune short and long clips out of the new manifest.
+
+```
+cd data/
+python merge_manifests.py --output_path merged_manifest.csv --merge_dir all_manifests/ --min_duration 1 --max_duration 15 # durations in seconds
+```
+
 ## Training
 
 ```

diff --git a/data/merge_manifests.py b/data/merge_manifests.py
@@ -0,0 +1,62 @@
+from __future__ import print_function
+
+import argparse
+import io
+import os
+
+import subprocess
+
+from utils import update_progress
+
+# Command-line interface. Both duration bounds default to -1, which the help
+# text documents as "off".
+parser = argparse.ArgumentParser(description='Merges all manifest CSV files in specified folder.')
+parser.add_argument('--merge_dir', default='manifests/', help='Path to all manifest files you want to merge')
+# type=float: without a declared type argparse returns the raw string for a
+# value given on the command line, and a string cannot be compared with the
+# numeric clip durations or formatted as a number.
+parser.add_argument('--min_duration', default=-1, type=float,
+                    help='Optionally prunes any samples shorter than the min duration (given in seconds, default off)')
+parser.add_argument('--max_duration', default=-1, type=float,
+                    help='Optionally prunes any samples longer than the max duration (given in seconds, default off)')
+parser.add_argument('--output_path', default='merged_manifest.csv', help='Output path to merged manifest')
+
+args = parser.parse_args()
+
+# Gather the raw lines of every *.csv manifest found directly inside
+# merge_dir; each line is kept verbatim (including its newline).
+files = []
+for manifest_name in os.listdir(args.merge_dir):
+    if not manifest_name.endswith(".csv"):
+        continue
+    manifest_path = os.path.join(args.merge_dir, manifest_name)
+    with open(manifest_path, 'r') as manifest:
+        files.extend(manifest.readlines())
+
+# argparse returns command-line values as strings unless the option declares
+# a type, so convert to numbers before comparing against clip durations.
+min_duration = float(args.min_duration)
+max_duration = float(args.max_duration)
+
+# A negative bound is disabled. Each bound is applied on its own, so passing
+# only --min_duration or only --max_duration still prunes.
+prune_short = min_duration >= 0
+prune_long = max_duration >= 0
+prune_files = prune_short or prune_long
+if prune_files:
+    print("Pruning files with minimum duration %g, maximum duration of %g" % (min_duration, max_duration))
+
+# (manifest line, duration) pairs for every sample that is kept.
+new_files = []
+size = len(files)
+for x, manifest_line in enumerate(files):
+    # The first CSV column is the path to the audio file.
+    file_path = manifest_line.split(',')[0].strip()
+    if file_path:  # blank manifest lines carry no sample to measure
+        # Argument list without a shell: paths containing spaces or shell
+        # metacharacters reach soxi untouched and cannot inject commands.
+        output = subprocess.check_output(['soxi', '-D', file_path])
+        duration = float(output)
+        long_enough = not prune_short or duration >= min_duration
+        short_enough = not prune_long or duration <= max_duration
+        if long_enough and short_enough:
+            new_files.append((manifest_line, duration))
+    update_progress(x / float(size))
+
+print("\nSorting files by length...")
+
+
+def func(element):
+    """Sort key: return the item at index 1 of ``element``.
+
+    The entries being sorted are ``(manifest_line, duration)`` pairs, so the
+    value returned is the clip duration.
+    """
+    duration = element[1]
+    return duration
+
+
+# Order the kept samples by duration, shortest first.
+new_files.sort(key=func)
+
+print("Saving new manifest...")
+
+# Buffered binary writer: io.FileIO is a raw, unbuffered file and would issue
+# one write call to the OS per manifest line.
+with io.open(args.output_path, 'wb') as f:
+    for manifest_line, _duration in new_files:
+        sample = manifest_line.strip() + '\n'
+        # Under Python 2 the manifest lines are already byte strings; calling
+        # .encode() on them would first decode as ASCII and fail on non-ASCII
+        # paths, so only text is encoded.
+        if not isinstance(sample, bytes):
+            sample = sample.encode('utf-8')
+        f.write(sample)