
Commit

Change utils func name, formatted script and changed name of param in librispeech script
SeanNaren committed Apr 28, 2017
1 parent 608bdb1 commit d7a416f
Showing 4 changed files with 38 additions and 35 deletions.
59 changes: 31 additions & 28 deletions data/librispeech.py
@@ -3,44 +3,46 @@
import tarfile
import argparse
import subprocess
import unicodedata
from utils import create_manifest, _update_progress
from utils import create_manifest
import shutil

parser = argparse.ArgumentParser(description='Processes and downloads LibriSpeech dataset.')
parser.add_argument("--target_dir", default='LibriSpeech_dataset/', type=str, help="Directory to store the dataset.")
parser.add_argument('--sample_rate', default=16000, type=int, help='Sample rate')
parser.add_argument('--files_to_dl', default="train-clean-100.tar.gz,"
"train-clean-360.tar.gz,train-other-500.tar.gz,"
"dev-clean.tar.gz,dev-other.tar.gz,"
"test-clean.tar.gz,test-other.tar.gz", type=str, help='list of file names to download')
parser.add_argument('--files_to_use', default="train-clean-100.tar.gz,"
"train-clean-360.tar.gz,train-other-500.tar.gz,"
"dev-clean.tar.gz,dev-other.tar.gz,"
"test-clean.tar.gz,test-other.tar.gz", type=str,
help='list of file names to download')
args = parser.parse_args()

LIBRI_SPEECH_URLS = {
"train" : ["http:https://www.openslr.org/resources/12/train-clean-100.tar.gz",
"http:https://www.openslr.org/resources/12/train-clean-360.tar.gz",
"http:https://www.openslr.org/resources/12/train-other-500.tar.gz"],
"train": ["http:https://www.openslr.org/resources/12/train-clean-100.tar.gz",
"http:https://www.openslr.org/resources/12/train-clean-360.tar.gz",
"http:https://www.openslr.org/resources/12/train-other-500.tar.gz"],

"val" : ["http:https://www.openslr.org/resources/12/dev-clean.tar.gz",
"http:https://www.openslr.org/resources/12/dev-other.tar.gz"],
"val": ["http:https://www.openslr.org/resources/12/dev-clean.tar.gz",
"http:https://www.openslr.org/resources/12/dev-other.tar.gz"],

"test": ["http:https://www.openslr.org/resources/12/test-clean.tar.gz",
"http:https://www.openslr.org/resources/12/test-other.tar.gz"]
}

"test" : ["http:https://www.openslr.org/resources/12/test-clean.tar.gz",
"http:https://www.openslr.org/resources/12/test-other.tar.gz"]
}

def _preprocess_transcript(phrase):
return phrase.strip().upper()


def _process_file(wav_dir, txt_dir, base_filename, root_dir):
full_recording_path = os.path.join(root_dir, base_filename)
assert os.path.exists(full_recording_path) and os.path.exists(root_dir)
wav_recording_path = os.path.join( wav_dir, base_filename.replace(".flac", ".wav"))
wav_recording_path = os.path.join(wav_dir, base_filename.replace(".flac", ".wav"))
subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(full_recording_path, str(args.sample_rate),
wav_recording_path)], shell=True)
#process transcript
# process transcript
txt_transcript_path = os.path.join(txt_dir, base_filename.replace(".flac", ".txt"))
transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1])+".trans.txt")
assert os.path.exists(transcript_file), "Transcript file {} does not exist.".format( transcript_file )
transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt")
assert os.path.exists(transcript_file), "Transcript file {} does not exist.".format(transcript_file)
transcriptions = open(transcript_file).read().strip().split("\n")
transcriptions = {t.split()[0].split("-")[-1]: " ".join(t.split()[1:]) for t in transcriptions}
with open(txt_transcript_path, "w") as f:
@@ -49,11 +51,12 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir):
f.write(_preprocess_transcript(transcriptions[key]))
f.flush()


def main():
target_dl_dir = args.target_dir
if not os.path.exists(target_dl_dir):
os.makedirs(target_dl_dir)
files_to_dl = args.files_to_dl.strip().split(',')
files_to_dl = args.files_to_use.strip().split(',')
for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items():
split_dir = os.path.join(target_dl_dir, split_type)
if not os.path.exists(split_dir):
@@ -66,21 +69,21 @@ def main():
os.makedirs(split_txt_dir)
extracted_dir = os.path.join(split_dir, "LibriSpeech")
if os.path.exists(extracted_dir):
shutil.rmtree( extracted_dir )
shutil.rmtree(extracted_dir)
for url in lst_libri_urls:
#check if we want to dl this file
# check if we want to dl this file
dl_flag = False
for f in files_to_dl:
if url.find(f) != -1:
dl_flag = True
if not dl_flag:
print("Skipping url: {}".format( url ))
print("Skipping url: {}".format(url))
continue
filename = url.split("/")[-1]
target_filename = os.path.join(split_dir, filename)
if not os.path.exists( target_filename ):
target_filename = os.path.join(split_dir, filename)
if not os.path.exists(target_filename):
wget.download(url, split_dir)
print("Unpacking {}...".format( filename ))
print("Unpacking {}...".format(filename))
tar = tarfile.open(target_filename)
tar.extractall(split_dir)
tar.close()
@@ -91,12 +94,12 @@ def main():
for f in files:
if f.find(".flac") != -1:
_process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir,
base_filename=f, root_dir=root )
base_filename=f, root_dir=root)

print("Finished {}".format( url ))
print("Finished {}".format(url))
shutil.rmtree(extracted_dir)
create_manifest(split_dir, 'libri_' + split_type)


if __name__ == "__main__":
main()
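
As a quick illustration of the renamed flag, here is a minimal, self-contained sketch (not part of this commit) of how --files_to_use is parsed and used to filter downloads; the argument name and the membership test mirror the script above, while the default list and the example URL are chosen only for demonstration:

import argparse

# Hypothetical stand-alone illustration; the real script defines more arguments.
parser = argparse.ArgumentParser(description='Illustrates the renamed --files_to_use flag.')
parser.add_argument('--files_to_use', default="train-clean-100.tar.gz,dev-clean.tar.gz",
                    type=str, help='comma-separated list of archive names to download')
args = parser.parse_args()

files_to_use = args.files_to_use.strip().split(',')
url = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
# Same check as the dl_flag loop in main(): keep the URL only if it contains
# one of the requested archive names.
should_download = any(url.find(f) != -1 for f in files_to_use)
print(should_download)  # True, because dev-clean.tar.gz is in the list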

4 changes: 2 additions & 2 deletions data/ted.py
@@ -5,7 +5,7 @@
import subprocess
import unicodedata

from utils import create_manifest, _update_progress
from utils import create_manifest, update_progress

parser = argparse.ArgumentParser(description='Processes and downloads TED-LIUMv2 dataset.')
parser.add_argument("--target_dir", default='TEDLIUM_dataset/', type=str, help="Directory to store the dataset.")
@@ -83,7 +83,7 @@ def prepare_dir(ted_dir):
with open(target_txt_file, "w") as f:
f.write(_preprocess_transcript(utterance["transcript"]).encode('utf-8'))
counter += 1
_update_progress(counter / float(len(entries)))
update_progress(counter / float(len(entries)))


def main():
6 changes: 3 additions & 3 deletions data/utils.py
@@ -7,7 +7,7 @@
import subprocess


def _update_progress(progress):
def update_progress(progress):
print("\rProgress: [{0:50s}] {1:.1f}%".format('#' * int(progress * 50),
progress * 100), end="")

@@ -23,7 +23,7 @@ def create_manifest(data_path, tag, ordered=True):
for file_path in wav_files:
file_paths.append(file_path.strip())
counter += 1
_update_progress(counter / float(size))
update_progress(counter / float(size))
print('\n')
if ordered:
_order_files(file_paths)
@@ -34,7 +34,7 @@ def create_manifest(data_path, tag, ordered=True):
sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n'
file.write(sample.encode('utf-8'))
counter += 1
_update_progress(counter / float(size))
update_progress(counter / float(size))
print('\n')


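For reference, a minimal runnable sketch of the renamed progress helper together with a typical call site; the function body matches update_progress as shown in data/utils.py above, while the driver loop at the bottom is purely illustrative:

from __future__ import print_function


def update_progress(progress):
    # Draw a 50-character progress bar on a single line; progress is a float in [0, 1].
    print("\rProgress: [{0:50s}] {1:.1f}%".format('#' * int(progress * 50),
                                                  progress * 100), end="")


if __name__ == "__main__":
    items = list(range(200))  # stand-in for a list of wav files
    for counter, _ in enumerate(items, start=1):
        update_progress(counter / float(len(items)))
    print('\n')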
4 changes: 2 additions & 2 deletions data/voxforge.py
@@ -7,7 +7,7 @@
import subprocess
import tarfile

from utils import create_manifest, _update_progress
from utils import create_manifest, update_progress

VOXFORGE_URL_16kHz = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/'

@@ -89,6 +89,6 @@ def prepare_sample(recording_name, url, target_folder):
all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8"))
for f_idx, f in enumerate(all_files):
prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + '/' + f, target_dir)
_update_progress(f_idx / float(len(all_files)))
update_progress(f_idx / float(len(all_files)))
print('Creating manifests...')
create_manifest(target_dir, 'voxforge_train')
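
And, assuming a made-up HTML fragment in place of the real VoxForge index page, a small self-contained sketch of the href-scraping pattern used in voxforge.py above:

import re

# Hypothetical two-line fragment standing in for the 16kHz_16bit index page.
content = b'<a href="sample1.tgz">sample1.tgz</a>\n' \
          b'<a href="sample2.tgz">sample2.tgz</a>'
# Same pattern as the script: capture every linked .tgz archive name.
all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8"))
print(all_files)  # ['sample1.tgz', 'sample2.tgz']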
