Skip to content

Commit

Permalink
Refactored voxforge, removed need for labels due to check now done in dataloader, better support for 2.7
Browse files: browse the repository at this point in the history
  • Loading branch information
SeanNaren committed Apr 24, 2017
1 parent 598d2b5 commit 63c8b88
Showing 1 changed file with 53 additions and 58 deletions.
111 changes: 53 additions & 58 deletions data/voxforge.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,35 @@
import os
import urllib.request
from six.moves import urllib
import argparse
import re
import tempfile
import shutil
import subprocess
import json
import tarfile

from utils import create_manifest
from utils import create_manifest, _update_progress

# Index page that links to every 16 kHz / 16-bit VoxForge ``.tgz`` archive.
VOXFORGE_URL_16kHz = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/'


# Command-line options. They are parsed at module level because
# prepare_sample() reads ``args.sample_rate`` from module scope.
parser = argparse.ArgumentParser(description='Processes and downloads VoxForge dataset.')
# Each option must be registered exactly once: argparse raises on a duplicate
# option string.
parser.add_argument("--target_dir", default='voxforge_dataset/', type=str, help="Directory to store the dataset.")
parser.add_argument('--sample_rate', default=16000, type=int, help='Sample rate')
# Still accepted so existing command lines that pass --labels_path keep working.
parser.add_argument('--labels_path', default='./../labels.json', help='Contains all characters for prediction')

args = parser.parse_args()

def _process_transcript(transcript, labels):
    """
    Upper-cases :transcript and drops every character that is not in :labels.

    Leading and trailing whitespace is stripped before filtering.

    :param transcript: raw prompt text.
    :param labels: container of allowed characters (tested with ``in``).
    :return: the filtered, upper-cased transcript as one string.
    """
    return "".join(c for c in transcript.strip().upper() if c in labels)

def _get_recordings_dir(sample_dir, recording_name):
    """
    Finds the audio directory of an extracted recording.

    Looks for ``<sample_dir>/<recording_name>/wav`` first and falls back to
    ``<sample_dir>/<recording_name>/flac``.

    :param sample_dir: directory the archive was extracted into.
    :param recording_name: name of the recording's top-level directory.
    :return: tuple ``(audio_type, directory)`` where audio_type is "wav" or "flac".
    :raises Exception: if neither directory exists.
    """
    # Order matters: "wav" wins when both directories are present.
    for audio_type in ("wav", "flac"):
        audio_dir = os.path.join(sample_dir, recording_name, audio_type)
        if os.path.exists(audio_dir):
            return audio_type, audio_dir
    raise Exception("wav or flac directory was not found for recording name: {}".format(recording_name))


def _read_prompts(prompt_file):
    """
    Parses a PROMPTS file into a dict mapping the first token of each line to the rest of the line.

    Blank lines are skipped; the remaining tokens are re-joined with single spaces.
    """
    with open(prompt_file) as prompts:
        token_rows = (line.split() for line in prompts)
        return {tokens[0]: " ".join(tokens[1:]) for tokens in token_rows if tokens}


def prepare_sample(recording_name, url, target_folder, labels=None):
    """
    Downloads and extracts a sample from VoxForge and puts the wav and txt files into :target_folder.

    :param recording_name: archive name without the ".tgz" suffix; also the
        top-level directory expected inside the archive.
    :param url: URL of the .tgz archive to download.
    :param target_folder: output root; audio is written under "wav" and
        transcripts under "txt" inside it.
    :param labels: optional and kept only for backward compatibility. When
        given, each transcript is filtered with _process_transcript before it
        is written; when omitted the transcript is written unmodified.
    :raises Exception: propagated from _get_recordings_dir when the archive
        contains neither a wav nor a flac directory.
    """
    # NOTE(review): the lines that define wav_dir/txt_dir were collapsed in the
    # view this block was reviewed from; they are reconstructed here as
    # <target_folder>/wav and <target_folder>/txt -- confirm against the full file.
    wav_dir = os.path.join(target_folder, "wav")
    txt_dir = os.path.join(target_folder, "txt")
    for out_dir in (wav_dir, txt_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    try:
        content = response.read()
    finally:
        # try/finally rather than ``with``: the Python 2 response object is
        # not a context manager.
        response.close()

    dirpath = tempfile.mkdtemp()
    try:
        # The payload is bytes, so the temporary archive must be opened in
        # binary mode (text mode rejects bytes on Python 3).
        with tempfile.NamedTemporaryFile(suffix=".tgz", mode='wb') as target_tgz:
            target_tgz.write(content)
            # Flush so tarfile sees the full archive when it reopens the path.
            target_tgz.flush()
            # NOTE: extractall() trusts the member paths stored in the archive;
            # only use this with archives from a trusted server.
            with tarfile.open(target_tgz.name) as tar:
                tar.extractall(dirpath)

        recordings_type, recordings_dir = _get_recordings_dir(dirpath, recording_name)
        tgz_prompt_file = os.path.join(dirpath, recording_name, "etc", "PROMPTS")
        if not (os.path.exists(recordings_dir) and os.path.exists(tgz_prompt_file)):
            return

        transcriptions = _read_prompts(tgz_prompt_file)
        for wav_file in os.listdir(recordings_dir):
            recording_id = wav_file.split('.{}'.format(recordings_type))[0]
            transcription_key = recording_name + "/mfc/" + recording_id
            if transcription_key not in transcriptions:
                # Audio without a matching prompt line is skipped.
                continue
            utterance = transcriptions[transcription_key]
            if labels is not None:
                utterance = _process_transcript(utterance, labels)

            target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id))
            target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))
            # The transcript is encoded to bytes, so the file is opened in binary mode.
            with open(target_txt_file, "wb") as f:
                f.write(utterance.encode('utf-8'))
            original_wav_file = os.path.join(recordings_dir, wav_file)
            # Argument list instead of a shell string, so paths containing
            # spaces or shell metacharacters are passed through intact.
            subprocess.call(["sox", original_wav_file, "-r", str(args.sample_rate),
                             "-b", "16", "-c", "1", target_wav_file])
    finally:
        # Always remove the extraction directory, even when processing fails.
        shutil.rmtree(dirpath)


if __name__ == '__main__':
    # Script entry point: fetch the VoxForge index page, download and convert
    # every listed archive into target_dir, then build the manifest.
    target_dir = args.target_dir

    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    request = urllib.request.Request(VOXFORGE_URL_16kHz)
    response = urllib.request.urlopen(request)
    try:
        content = response.read()
    finally:
        response.close()

    # Raw string avoids invalid escape sequences; [^"]* keeps each match inside
    # a single href attribute even when several links share one line.
    all_files = re.findall(r'href="([^"]*\.tgz)"', content.decode("utf-8"))
    # The base URL already ends with '/', so strip it before joining to avoid '//'.
    base_url = VOXFORGE_URL_16kHz.rstrip('/')
    total_files = float(len(all_files))
    for f_idx, f in enumerate(all_files):
        # The regex guarantees the ".tgz" suffix, so slice it off the end only.
        prepare_sample(f[:-len(".tgz")], base_url + '/' + f, target_dir)
        # Report the fraction of archives completed so far (reaches 1.0 on the last one).
        _update_progress((f_idx + 1) / total_files)
    print('Creating manifests...')
    create_manifest(target_dir, 'voxforge_train')

0 comments on commit 63c8b88

Please sign in to comment.