handle flac samples, filter unused characters

DevKiHyun · Apr 22, 2017 · c853b96 · c853b96
1 parent fa56161
commit c853b96
Showing 1 changed file with 42 additions and 20 deletions.
diff --git a/data/voxforge.py b/data/voxforge.py
@@ -5,22 +5,35 @@
 import tempfile
 import shutil
 import subprocess
+import json
 
 from utils import create_manifest
 
-VOXFORGE_URL_48kHz = 'http:https://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/48kHz_16bit'
-VOXFORGE_URL_16kHz = 'http:https://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit'
+VOXFORGE_URL_16kHz = 'http:https://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/'
 
 
 parser = argparse.ArgumentParser(description='Processes and downloads VoxForge dataset.')
 parser.add_argument( "--target_dir", type = str,
  default = "voxforge", help = "Directory to store the dataset." )
 parser.add_argument('--sample_rate', default=16000,
  type=int, help='Sample rate')
+parser.add_argument('--labels_path', default='./../labels.json', help='Contains all characters for prediction')
+
 args = parser.parse_args()
 
+def _process_transcript(transcript, labels):
+ return "".join( [ c for c in transcript.strip().upper() if c in labels ] )
+
+def _get_recordings_dir( sample_dir, recording_name ):
+ wav_dir = os.path.join( sample_dir, recording_name, "wav" )
+ if os.path.exists(wav_dir):
+ return "wav", wav_dir
+ flac_dir = os.path.join( sample_dir, recording_name, "flac" )
+ if os.path.exists( flac_dir ):
+ return "flac", flac_dir
+ raise Exception("wav or flac directory was not found for recording name: {}".format( recotding_name ))
 
-def prepare_sample(recording_name, url, target_folder):
+def prepare_sample(recording_name, url, target_folder, labels):
  """
  Downloads and extracts a sample from VoxForge and puts the wav and txt files into :target_folder.
  """
@@ -39,29 +52,38 @@ def prepare_sample(recording_name, url, target_folder):
  dirpath = tempfile.mkdtemp()
  subprocess.call(["tar zxvf {} -C {}".format(target_tgz.name, dirpath)], shell=True)
 
- tgz_wav_dir = os.path.join( dirpath, recording_name, "wav" )
- tgz_prompt_dir = os.path.join( dirpath, recording_name, "etc", "PROMPTS" )
+ recordings_type, recordings_dir = _get_recordings_dir( dirpath, recording_name)
+ tgz_prompt_file = os.path.join( dirpath, recording_name, "etc", "PROMPTS" )
 
- transcriptions = open(tgz_prompt_dir).read().strip().split("\n")
- transcriptions = {t.split()[0] : " ".join(t.split()[1:]) for t in transcriptions if len(t) > 0}
- assert os.path.exists(tgz_wav_dir) and os.path.exists( tgz_prompt_dir ), \
- "wav or PROMPTS dir is not found in the archive "
- for wav_file in os.listdir( tgz_wav_dir ):
- recording_id = wav_file.split('.wav')[0]
+ if os.path.exists( recordings_dir ) and os.path.exists( tgz_prompt_file ):
+ transcriptions = open(tgz_prompt_file).read().strip().split("\n")
+ transcriptions = {t.split()[0] : " ".join(t.split()[1:]) for t in transcriptions}
+ for wav_file in os.listdir( recordings_dir ):
+ recording_id = wav_file.split('.{}'.format(recordings_type))[0]
+ transcription_key = recording_name + "/mfc/" + recording_id 
+ if transcription_key not in transcriptions:
+ continue
+ utterance = transcriptions[ transcription_key ]
+ utterance = _process_transcript(utterance, labels)
 
- utterance = transcriptions[ recording_name + "/mfc/" + recording_id ]
-
- target_wav_file = os.path.join( wav_dir, "{}_{}.wav".format( recording_name, recording_id ) )
- target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))
- with open(target_txt_file, "w") as f:
- f.write( utterance )
- shutil.copyfile( os.path.join(tgz_wav_dir, wav_file), target_wav_file )
+ target_wav_file = os.path.join( wav_dir, "{}_{}.wav".format( recording_name, recording_id ) )
+ target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))
+ with open(target_txt_file, "w") as f:
+ f.write( utterance )
+ original_wav_file = os.path.join(recordings_dir, wav_file)
+ #shutil.copyfile( original_wav_file , target_wav_file )
+ subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format( original_wav_file, str(args.sample_rate),
+ target_wav_file )], shell = True)
 
  shutil.rmtree(dirpath)
 
 if __name__ == '__main__':
  target_dir = args.target_dir
  sample_rate = args.sample_rate
+ labels_path = args.labels_path
+
+ with open(args.labels_path) as label_file:
+ labels = str(''.join(json.load(label_file)))
 
  if not os.path.isdir(target_dir):
  os.makedirs(target_dir)
@@ -72,6 +94,6 @@ def prepare_sample(recording_name, url, target_folder):
  all_files = re.findall( "href\=\"(.*\.tgz)\"", content.decode("utf-8"))
  for f_idx,f in enumerate(all_files):
  print('Downloading {} / {} files'.format(f_idx, len(all_files)))
- prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + '/' + f, target_dir)
+ prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + '/' + f, target_dir, labels)
  print('Creating manifests...')
- create_manifest(target_dir, 'train')
+ create_manifest(target_dir, 'train')