
Commit

Change utils func name, formatted script and changed name of param in librispeech script
SeanNaren committed Apr 28, 2017
1 parent 608bdb1 commit d7a416f
Showing 4 changed files with 38 additions and 35 deletions.
59 changes: 31 additions & 28 deletions data/librispeech.py
@@ -3,44 +3,46 @@
import tarfile
import argparse
import subprocess
import unicodedata
from utils import create_manifest, _update_progress
from utils import create_manifest
import shutil

parser = argparse.ArgumentParser(description='Processes and downloads LibriSpeech dataset.')
parser.add_argument("--target_dir", default='LibriSpeech_dataset/', type=str, help="Directory to store the dataset.")
parser.add_argument('--sample_rate', default=16000, type=int, help='Sample rate')
parser.add_argument('--files_to_dl', default="train-clean-100.tar.gz,"
"train-clean-360.tar.gz,train-other-500.tar.gz,"
"dev-clean.tar.gz,dev-other.tar.gz,"
"test-clean.tar.gz,test-other.tar.gz", type=str, help='list of file names to download')
parser.add_argument('--files_to_use', default="train-clean-100.tar.gz,"
"train-clean-360.tar.gz,train-other-500.tar.gz,"
"dev-clean.tar.gz,dev-other.tar.gz,"
"test-clean.tar.gz,test-other.tar.gz", type=str,
help='list of file names to download')
args = parser.parse_args()

LIBRI_SPEECH_URLS = {
"train" : ["http:https://www.openslr.org/resources/12/train-clean-100.tar.gz",
"http:https://www.openslr.org/resources/12/train-clean-360.tar.gz",
"http:https://www.openslr.org/resources/12/train-other-500.tar.gz"],
"train": ["http:https://www.openslr.org/resources/12/train-clean-100.tar.gz",
"http:https://www.openslr.org/resources/12/train-clean-360.tar.gz",
"http:https://www.openslr.org/resources/12/train-other-500.tar.gz"],

"val" : ["http:https://www.openslr.org/resources/12/dev-clean.tar.gz",
"http:https://www.openslr.org/resources/12/dev-other.tar.gz"],
"val": ["http:https://www.openslr.org/resources/12/dev-clean.tar.gz",
"http:https://www.openslr.org/resources/12/dev-other.tar.gz"],

"test": ["http:https://www.openslr.org/resources/12/test-clean.tar.gz",
"http:https://www.openslr.org/resources/12/test-other.tar.gz"]
}

"test" : ["http:https://www.openslr.org/resources/12/test-clean.tar.gz",
"http:https://www.openslr.org/resources/12/test-other.tar.gz"]
}

def _preprocess_transcript(phrase):
return phrase.strip().upper()


def _process_file(wav_dir, txt_dir, base_filename, root_dir):
full_recording_path = os.path.join(root_dir, base_filename)
assert os.path.exists(full_recording_path) and os.path.exists(root_dir)
wav_recording_path = os.path.join( wav_dir, base_filename.replace(".flac", ".wav"))
wav_recording_path = os.path.join(wav_dir, base_filename.replace(".flac", ".wav"))
subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(full_recording_path, str(args.sample_rate),
wav_recording_path)], shell=True)
#process transcript
# process transcript
txt_transcript_path = os.path.join(txt_dir, base_filename.replace(".flac", ".txt"))
transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1])+".trans.txt")
assert os.path.exists(transcript_file), "Transcript file {} does not exist.".format( transcript_file )
transcript_file = os.path.join(root_dir, "-".join(base_filename.split('-')[:-1]) + ".trans.txt")
assert os.path.exists(transcript_file), "Transcript file {} does not exist.".format(transcript_file)
transcriptions = open(transcript_file).read().strip().split("\n")
transcriptions = {t.split()[0].split("-")[-1]: " ".join(t.split()[1:]) for t in transcriptions}
with open(txt_transcript_path, "w") as f:
@@ -49,11 +51,12 @@ def _process_file(wav_dir, txt_dir, base_filename, root_dir):
f.write(_preprocess_transcript(transcriptions[key]))
f.flush()


def main():
target_dl_dir = args.target_dir
if not os.path.exists(target_dl_dir):
os.makedirs(target_dl_dir)
files_to_dl = args.files_to_dl.strip().split(',')
files_to_dl = args.files_to_use.strip().split(',')
for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items():
split_dir = os.path.join(target_dl_dir, split_type)
if not os.path.exists(split_dir):
@@ -66,21 +69,21 @@ def main():
os.makedirs(split_txt_dir)
extracted_dir = os.path.join(split_dir, "LibriSpeech")
if os.path.exists(extracted_dir):
shutil.rmtree( extracted_dir )
shutil.rmtree(extracted_dir)
for url in lst_libri_urls:
#check if we want to dl this file
# check if we want to dl this file
dl_flag = False
for f in files_to_dl:
if url.find(f) != -1:
dl_flag = True
if not dl_flag:
print("Skipping url: {}".format( url ))
print("Skipping url: {}".format(url))
continue
filename = url.split("/")[-1]
target_filename = os.path.join(split_dir, filename)
if not os.path.exists( target_filename ):
target_filename = os.path.join(split_dir, filename)
if not os.path.exists(target_filename):
wget.download(url, split_dir)
print("Unpacking {}...".format( filename ))
print("Unpacking {}...".format(filename))
tar = tarfile.open(target_filename)
tar.extractall(split_dir)
tar.close()
@@ -91,12 +94,12 @@ def main():
for f in files:
if f.find(".flac") != -1:
_process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir,
base_filename=f, root_dir=root )
base_filename=f, root_dir=root)

print("Finished {}".format( url ))
print("Finished {}".format(url))
shutil.rmtree(extracted_dir)
create_manifest(split_dir, 'libri_' + split_type)


if __name__ == "__main__":
main()
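
As a quick illustration of the renamed flag, here is a minimal, self-contained sketch (not part of this commit) of how --files_to_use is parsed and used to filter downloads; the argument name and the membership test mirror the script above, while the default list and the example URL are chosen only for demonstration:

import argparse

# Hypothetical stand-alone illustration; the real script defines more arguments.
parser = argparse.ArgumentParser(description='Illustrates the renamed --files_to_use flag.')
parser.add_argument('--files_to_use', default="train-clean-100.tar.gz,dev-clean.tar.gz",
                    type=str, help='comma-separated list of archive names to download')
args = parser.parse_args()

files_to_use = args.files_to_use.strip().split(',')
url = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
# Same check as the dl_flag loop in main(): keep the URL only if it contains
# one of the requested archive names.
should_download = any(url.find(f) != -1 for f in files_to_use)
print(should_download)  # True, because dev-clean.tar.gz is in the list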

4 changes: 2 additions & 2 deletions data/ted.py
@@ -5,7 +5,7 @@
import subprocess
import unicodedata

from utils import create_manifest, _update_progress
from utils import create_manifest, update_progress

parser = argparse.ArgumentParser(description='Processes and downloads TED-LIUMv2 dataset.')
parser.add_argument("--target_dir", default='TEDLIUM_dataset/', type=str, help="Directory to store the dataset.")
@@ -83,7 +83,7 @@ def prepare_dir(ted_dir):
with open(target_txt_file, "w") as f:
f.write(_preprocess_transcript(utterance["transcript"]).encode('utf-8'))
counter += 1
_update_progress(counter / float(len(entries)))
update_progress(counter / float(len(entries)))


def main():
6 changes: 3 additions & 3 deletions data/utils.py
@@ -7,7 +7,7 @@
import subprocess


def _update_progress(progress):
def update_progress(progress):
print("\rProgress: [{0:50s}] {1:.1f}%".format('#' * int(progress * 50),
progress * 100), end="")

@@ -23,7 +23,7 @@ def create_manifest(data_path, tag, ordered=True):
for file_path in wav_files:
file_paths.append(file_path.strip())
counter += 1
_update_progress(counter / float(size))
update_progress(counter / float(size))
print('\n')
if ordered:
_order_files(file_paths)
@@ -34,7 +34,7 @@ def create_manifest(data_path, tag, ordered=True):
sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n'
file.write(sample.encode('utf-8'))
counter += 1
_update_progress(counter / float(size))
update_progress(counter / float(size))
print('\n')


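For reference, a minimal runnable sketch of the renamed progress helper together with a typical call site; the function body matches update_progress as shown in data/utils.py above, while the driver loop at the bottom is purely illustrative:

from __future__ import print_function


def update_progress(progress):
    # Draw a 50-character progress bar on a single line; progress is a float in [0, 1].
    print("\rProgress: [{0:50s}] {1:.1f}%".format('#' * int(progress * 50),
                                                  progress * 100), end="")


if __name__ == "__main__":
    items = list(range(200))  # stand-in for a list of wav files
    for counter, _ in enumerate(items, start=1):
        update_progress(counter / float(len(items)))
    print('\n')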
4 changes: 2 additions & 2 deletions data/voxforge.py
@@ -7,7 +7,7 @@
import subprocess
import tarfile

from utils import create_manifest, _update_progress
from utils import create_manifest, update_progress

VOXFORGE_URL_16kHz = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/Audio/Main/16kHz_16bit/'

@@ -89,6 +89,6 @@ def prepare_sample(recording_name, url, target_folder):
all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8"))
for f_idx, f in enumerate(all_files):
prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + '/' + f, target_dir)
_update_progress(f_idx / float(len(all_files)))
update_progress(f_idx / float(len(all_files)))
print('Creating manifests...')
create_manifest(target_dir, 'voxforge_train')
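
And, assuming a made-up HTML fragment in place of the real VoxForge index page, a small self-contained sketch of the href-scraping pattern used in voxforge.py above:

import re

# Hypothetical two-line fragment standing in for the 16kHz_16bit index page.
content = b'<a href="sample1.tgz">sample1.tgz</a>\n' \
          b'<a href="sample2.tgz">sample2.tgz</a>'
# Same pattern as the script: capture every linked .tgz archive name.
all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8"))
print(all_files)  # ['sample1.tgz', 'sample2.tgz']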
