Refactored voxforge, removed need for labels due to check now done in dataloader, better support for 2.7
DevKiHyun · Apr 24, 2017 · 63c8b88 · 63c8b88
1 parent 598d2b5
commit 63c8b88
Showing 1 changed file with 53 additions and 58 deletions.
diff --git a/data/voxforge.py b/data/voxforge.py
@@ -1,39 +1,35 @@
 import os
-import urllib.request
+from six.moves import urllib
 import argparse
 import re
 import tempfile
 import shutil
 import subprocess
-import json
+import tarfile
 
-from utils import create_manifest
+from utils import create_manifest, _update_progress
 
 VOXFORGE_URL_16kHz = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/'
 
-
 parser = argparse.ArgumentParser(description='Processes and downloads VoxForge dataset.')
-parser.add_argument( "--target_dir", type = str,
- default = "voxforge", help = "Directory to store the dataset." )
+parser.add_argument("--target_dir", default='voxforge_dataset/', type=str, help="Directory to store the dataset.")
 parser.add_argument('--sample_rate', default=16000,
  type=int, help='Sample rate')
-parser.add_argument('--labels_path', default='./../labels.json', help='Contains all characters for prediction')
 
 args = parser.parse_args()
 
-def _process_transcript(transcript, labels):
- return "".join( [ c for c in transcript.strip().upper() if c in labels ] )
 
-def _get_recordings_dir( sample_dir, recording_name ):
- wav_dir =  os.path.join( sample_dir, recording_name, "wav" )
+def _get_recordings_dir(sample_dir, recording_name):
+ wav_dir = os.path.join(sample_dir, recording_name, "wav")
  if os.path.exists(wav_dir):
  return "wav", wav_dir
- flac_dir = os.path.join( sample_dir, recording_name, "flac" )
- if os.path.exists( flac_dir ):
+ flac_dir = os.path.join(sample_dir, recording_name, "flac")
+ if os.path.exists(flac_dir):
  return "flac", flac_dir
- raise Exception("wav or flac directory was not found for recording name: {}".format( recotding_name ))
+ raise Exception("wav or flac directory was not found for recording name: {}".format(recording_name))
+
 
-def prepare_sample(recording_name, url, target_folder, labels):
+def prepare_sample(recording_name, url, target_folder):
  """
  Downloads and extracts a sample from VoxForge and puts the wav and txt files into :target_folder.
  """
@@ -45,55 +41,54 @@ def prepare_sample(recording_name, url, target_folder, labels):
  os.makedirs(txt_dir)
 
  request = urllib.request.Request(url)
- with urllib.request.urlopen(request) as response:
- content = response.read()
- with tempfile.NamedTemporaryFile( suffix = ".tgz" ) as target_tgz:
- target_tgz.write( content )
- dirpath = tempfile.mkdtemp()
- subprocess.call(["tar zxvf {} -C {}".format(target_tgz.name, dirpath)], shell=True)
-
- recordings_type, recordings_dir = _get_recordings_dir( dirpath, recording_name)
- tgz_prompt_file = os.path.join( dirpath, recording_name, "etc", "PROMPTS" )
-
- if os.path.exists( recordings_dir ) and os.path.exists( tgz_prompt_file ):
- transcriptions = open(tgz_prompt_file).read().strip().split("\n")
- transcriptions = {t.split()[0] : " ".join(t.split()[1:]) for t in transcriptions}
- for wav_file in os.listdir( recordings_dir ):
- recording_id = wav_file.split('.{}'.format(recordings_type))[0]
- transcription_key = recording_name + "/mfc/" + recording_id 
- if transcription_key not in transcriptions:
- continue
- utterance = transcriptions[ transcription_key ]
- utterance = _process_transcript(utterance, labels)
-
- target_wav_file = os.path.join( wav_dir, "{}_{}.wav".format( recording_name, recording_id ) )
- target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))
- with open(target_txt_file, "w") as f:
- f.write( utterance )
- original_wav_file = os.path.join(recordings_dir, wav_file)
- #shutil.copyfile( original_wav_file , target_wav_file )
- subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format( original_wav_file, str(args.sample_rate),
- target_wav_file )], shell = True)
-
- shutil.rmtree(dirpath)
+ response = urllib.request.urlopen(request)
+ content = response.read()
+ response.close()
+ with tempfile.NamedTemporaryFile(suffix=".tgz", mode='w') as target_tgz:
+ target_tgz.write(content)
+ target_tgz.flush()
+ dirpath = tempfile.mkdtemp()
+
+ tar = tarfile.open(target_tgz.name)
+ tar.extractall(dirpath)
+ tar.close()
+
+ recordings_type, recordings_dir = _get_recordings_dir(dirpath, recording_name)
+ tgz_prompt_file = os.path.join(dirpath, recording_name, "etc", "PROMPTS")
+
+ if os.path.exists(recordings_dir) and os.path.exists(tgz_prompt_file):
+ transcriptions = open(tgz_prompt_file).read().strip().split("\n")
+ transcriptions = {t.split()[0]: " ".join(t.split()[1:]) for t in transcriptions}
+ for wav_file in os.listdir(recordings_dir):
+ recording_id = wav_file.split('.{}'.format(recordings_type))[0]
+ transcription_key = recording_name + "/mfc/" + recording_id
+ if transcription_key not in transcriptions:
+ continue
+ utterance = transcriptions[transcription_key]
+
+ target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id))
+ target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))
+ with open(target_txt_file, "w") as f:
+ f.write(utterance.encode('utf-8'))
+ original_wav_file = os.path.join(recordings_dir, wav_file)
+ subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate),
+ target_wav_file)], shell=True)
+
+ shutil.rmtree(dirpath)
+
 
 if __name__ == '__main__':
  target_dir = args.target_dir
  sample_rate = args.sample_rate
- labels_path = args.labels_path
-
- with open(args.labels_path) as label_file:
- labels = str(''.join(json.load(label_file)))
 
  if not os.path.isdir(target_dir):
  os.makedirs(target_dir)
-
  request = urllib.request.Request(VOXFORGE_URL_16kHz)
- with urllib.request.urlopen(request) as response:
-  content = response.read()
-  all_files = re.findall( "href\=\"(.*\.tgz)\"", content.decode("utf-8"))
-  for f_idx,f in enumerate(all_files):
-  print('Downloading {} / {} files'.format(f_idx, len(all_files)))
-  prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + '/' + f, target_dir, labels)
+ response = urllib.request.urlopen(request)
+ content = response.read()
+ all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8"))
+ for f_idx, f in enumerate(all_files):
+ prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + '/' + f, target_dir)
+ _update_progress(f_idx / float(len(all_files)))
  print('Creating manifests...')
- create_manifest(target_dir, 'train')
+ create_manifest(target_dir, 'voxforge_train')