Adding the first batch of preprocessing scripts

Natalia-T · Sep 27, 2021 · 61a3796 · 61a3796
1 parent 66b2728
commit 61a3796
Show file tree

Hide file tree

Showing 50 changed files with 1,883 additions and 0 deletions.
diff --git a/data_preprocessing/African_Accented_French_proc/Preproc.py b/data_preprocessing/African_Accented_French_proc/Preproc.py
@@ -0,0 +1,24 @@
+'''
+
+Sina ALISAMIR
+2020-2021
+'''
+
+import os, glob
+from funcs import get_files_in_path, printProgressBar
+
def main():
    """Convert every WAV found under ``../speech`` into ``../wavs`` with sox.

    Each input file is resampled to 16000 Hz, one channel, 16-bit
    signed-integer samples (the ``-r/-c/-b/-e`` sox options below). The
    directory layout below ``/speech/`` is mirrored below ``/wavs/``.

    Raises:
        FileNotFoundError: if the ``sox`` executable cannot be found.
    """
    # Local import: the module header only imports os and glob.
    import subprocess

    path = "../speech/**/"
    theFiles = get_files_in_path(path)
    failed = []

    for i, filePath in enumerate(theFiles):
        fileNewPath = filePath.replace("/speech/", "/wavs/")
        directory = os.path.dirname(fileNewPath)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)
        # Argument list instead of a shell string, so that paths containing
        # spaces or shell metacharacters reach sox unchanged.
        result = subprocess.run([
            'sox', filePath,
            '-r', '16000', '-c', '1', '-b', '16', '-e', 'signed-integer',
            fileNewPath,
        ])
        if result.returncode != 0:
            failed.append(filePath)
        printProgressBar(i + 1, len(theFiles), prefix = 'Transforming Files:', suffix = 'Complete')

    # Report conversions that sox rejected instead of silently dropping them.
    if failed:
        print('sox failed for %d file(s):' % len(failed))
        for filePath in failed:
            print('  ' + filePath)
+
+if __name__== "__main__":
+ main()
diff --git a/data_preprocessing/African_Accented_French_proc/WriteJson.py b/data_preprocessing/African_Accented_French_proc/WriteJson.py
@@ -0,0 +1,105 @@
+'''
+
+Sina ALISAMIR
+2020-2021
+'''
+
+import os, glob
+from funcs import get_files_in_path, printProgressBar
+from pydub import AudioSegment
+import json
+import csv
+
def main():
    """Write ``../data.json`` describing every usable WAV under ``../wavs``.

    Files shorter than 1 second or longer than 30 seconds are left out.
    The JSON maps each file name (without extension) to the dictionary
    returned by ``makeDict``.
    """
    wavsPath = "../wavs/**/"
    theFiles = get_files_in_path(wavsPath)

    allFilesInfo = {}
    for i, filePath in enumerate(theFiles):
        audio_file = AudioSegment.from_wav(filePath)
        if 1 <= audio_file.duration_seconds <= 30:
            fileName, myDict = makeDict(filePath)
            allFilesInfo[fileName] = myDict
        # Updated for skipped files too, so the bar always reaches 100%.
        printProgressBar(i + 1, len(theFiles), prefix = 'Processing Files:', suffix = 'Complete')
    # ensure_ascii=False emits accented characters verbatim, so the file
    # encoding is pinned instead of depending on the locale default.
    with open('../data.json', 'w', encoding='utf-8') as fp:
        json.dump(allFilesInfo, fp, indent=4, ensure_ascii=False)
+
def makeDict(filePath):
    """Build the metadata entry for one WAV file.

    Returns a ``(name, info)`` pair: ``name`` is the file's base name with
    its last four characters removed, ``info`` holds the path, transcript,
    duration in seconds, an empty speaker id and the speaker gender.
    """
    stem = os.path.basename(filePath)[:-4]
    text = getTranscript(filePath)

    # Gender is read from the name of the directory holding the file:
    # "_f_" wins over "_m_", and "U" is used when neither marker is present.
    parent_dir = filePath.split(os.path.sep)[-2]
    if "_f_" in parent_dir:
        sex = "F"
    elif "_m_" in parent_dir:
        sex = "M"
    else:
        sex = "U"

    seconds = AudioSegment.from_wav(filePath).duration_seconds
    info = {
        "path" : filePath[1:],  # drops the first character of the path
        "trans" : text,
        "duration" : seconds,
        "spk_id" : "",
        "spk_gender" : sex
    }
    return stem, info
+
# Parsed transcript files, keyed by absolute path, so that each file is read
# once per run instead of once per WAV.
_TRANSCRIPT_ROWS = {}


def _read_transcript_rows(transPath):
    """Return the non-empty space-separated rows of a transcript file (cached)."""
    key = os.path.abspath(transPath)
    rows = _TRANSCRIPT_ROWS.get(key)
    if rows is None:
        with open(transPath, newline='\n') as csvfile:
            # Blank lines parse to [] and carry no transcript; drop them.
            rows = [row for row in csv.reader(csvfile, delimiter=' ') if row]
        _TRANSCRIPT_ROWS[key] = rows
    return rows


def _strip_always(name):
    """Remove the last four characters (the extension) unconditionally."""
    return name[:-4]


def _strip_never(name):
    """Return the name unchanged."""
    return name


def _strip_if_ext(name):
    """Remove a trailing three-letter extension only when one is present."""
    if len(name) >= 4 and name[-4] == '.':
        return name[:-4]
    return name


def getTranscript(wavPath):
    """Look up the transcript of a WAV file in the corpus transcript files.

    The third path component (``wavPath.split(os.path.sep)[2]``) selects the
    subset: ``dev``, ``devtest``, ``test`` or ``train``. For ``train`` the
    fourth component selects the sub-corpus, and for ``yaounde`` the fifth
    component must also appear in the transcript's file reference.

    Args:
        wavPath: path of the WAV file, e.g. ``../wavs/dev/<dir>/<name>.wav``.

    Returns:
        The transcript text, or ``""`` when the subset is unknown or no row
        matches. When several rows match, the last one is used.

    Raises:
        IndexError: if ``wavPath`` has too few components for the lookups
            described above.
    """
    parts = wavPath.split(os.path.sep)
    fileName = os.path.basename(wavPath)[:-4]
    subset = parts[2]
    required_dir = None  # only set for train/yaounde

    if subset == "dev":
        transPath = "../transcripts/dev/niger_west_african_fr/transcripts.txt"
        normalize = _strip_always
    elif subset == "devtest":
        transPath = "../transcripts/devtest/ca16_read/conditioned.txt"
        normalize = _strip_never
    elif subset == "test":
        transPath = "../transcripts/test/ca16/prompts.txt"
        normalize = _strip_never
    elif subset == "train":
        subFolder = parts[3]
        if subFolder == "ca16":
            if "conv" in fileName:
                transPath = "../transcripts/train/ca16_conv/transcripts.txt"
            else:
                transPath = "../transcripts/train/ca16_read/conditioned.txt"
        else:
            transPath = "../transcripts/train/yaounde/fn_text.txt"
            if subFolder == "yaounde":
                required_dir = parts[4]
        normalize = _strip_if_ext
    else:
        return ""

    transcript = ""
    for row in _read_transcript_rows(transPath):
        if required_dir is not None and required_dir not in row[0]:
            continue
        if normalize(os.path.basename(row[0])) == fileName:
            # No break: the last matching row wins.
            transcript = ' '.join(row[1:])
    return transcript
+
+if __name__== "__main__":
+ main()
diff --git a/data_preprocessing/African_Accented_French_proc/funcs.py b/data_preprocessing/African_Accented_French_proc/funcs.py
@@ -0,0 +1,35 @@
+'''
+
+Sina ALISAMIR
+2020-2021
+'''
+
def get_files_in_path(path, ext="wav"):
    """
    Return the files matching ``<path>/*.<ext>``, sorted by path.

    The pattern is expanded with ``glob`` in recursive mode, so a ``**``
    component in ``path`` also matches nested directories.
    example : files = get_files_in_path("./audioFiles/**")

    path - Required : directory (or glob pattern) to search (Str)
    ext  - Optional : file extension without the dot (Str)
    """
    import glob
    import os
    pattern = os.path.join(path, "*." + ext)
    # glob returns matches in arbitrary order; sorting keeps runs reproducible.
    return sorted(glob.glob(pattern, recursive=True))
+
def printProgressBar(iteration, total, prefix = '', suffix = '', decimals = 1, length = 50, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)

    A total of 0 is drawn as a full bar at 100% instead of dividing by zero.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to do counts as finished.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # Leading and trailing carriage returns redraw the bar on the same line.
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    # Print New Line on Complete
    if iteration == total:
        print()
diff --git a/data_preprocessing/Att-HACK_SLR88_proc/Preproc.py b/data_preprocessing/Att-HACK_SLR88_proc/Preproc.py
@@ -0,0 +1,24 @@
+'''
+
+Sina ALISAMIR
+2020-2021
+'''
+
+import os, glob
+from funcs import get_files_in_path, printProgressBar
+
def main():
    """Convert every WAV found under the Open_SLR ``wav`` tree into ``../wavs`` with sox.

    Each input file is resampled to 16000 Hz, one channel, 16-bit
    signed-integer samples (the ``-r/-c/-b/-e`` sox options below). The
    directory layout below the source tree is mirrored below ``/wavs/``.

    Raises:
        FileNotFoundError: if the ``sox`` executable cannot be found.
    """
    # Local import: the module header only imports os and glob.
    import subprocess

    path = "../Volumes/CLEM_HDD/IRCAM/Open_SLR/wav/**/"
    theFiles = get_files_in_path(path)
    failed = []

    for i, filePath in enumerate(theFiles):
        fileNewPath = filePath.replace("/Volumes/CLEM_HDD/IRCAM/Open_SLR/wav/", "/wavs/")
        directory = os.path.dirname(fileNewPath)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)
        # Argument list instead of a shell string, so that paths containing
        # spaces or shell metacharacters reach sox unchanged.
        result = subprocess.run([
            'sox', filePath,
            '-r', '16000', '-c', '1', '-b', '16', '-e', 'signed-integer',
            fileNewPath,
        ])
        if result.returncode != 0:
            failed.append(filePath)
        printProgressBar(i + 1, len(theFiles), prefix = 'Transforming Files:', suffix = 'Complete')

    # Report conversions that sox rejected instead of silently dropping them.
    if failed:
        print('sox failed for %d file(s):' % len(failed))
        for filePath in failed:
            print('  ' + filePath)
+
+if __name__== "__main__":
+ main()
diff --git a/data_preprocessing/Att-HACK_SLR88_proc/WriteJson.py b/data_preprocessing/Att-HACK_SLR88_proc/WriteJson.py
@@ -0,0 +1,46 @@
+'''
+
+Sina ALISAMIR
+2020-2021
+'''
+
+import os, glob
+from funcs import get_files_in_path, printProgressBar
+from pydub import AudioSegment
+import json
+
def main():
    """Write ``../data.json`` describing every usable WAV under ``../wavs``.

    Files shorter than 1 second are left out. The JSON maps each file name
    (without extension) to the dictionary returned by ``makeDict``, which
    reads the transcript from the matching ``.txt`` file under ``txtPath``.
    """
    wavsPath = "../wavs/**/"
    txtPath = "../Volumes/CLEM_HDD/IRCAM/Open_SLR/txt/"
    theFiles = get_files_in_path(wavsPath)

    allFilesInfo = {}
    for i, filePath in enumerate(theFiles):
        audio_file = AudioSegment.from_wav(filePath)
        if audio_file.duration_seconds >= 1:
            fileName, myDict = makeDict(filePath, txtPath)
            allFilesInfo[fileName] = myDict
        # Updated for skipped files too, so the bar always reaches 100%.
        printProgressBar(i + 1, len(theFiles), prefix = 'Processing Files:', suffix = 'Complete')
    # ensure_ascii=False emits accented characters verbatim, so the file
    # encoding is pinned instead of depending on the locale default.
    with open('../data.json', 'w', encoding='utf-8') as fp:
        json.dump(allFilesInfo, fp, indent=4, ensure_ascii=False)
+
def makeDict(filePath, txtPath):
    """Build the metadata entry for one WAV file.

    Args:
        filePath: path of the WAV file.
        txtPath: directory holding one ``<name>.txt`` transcript per WAV.

    Returns:
        ``(name, info)`` where ``name`` is the file's base name with its last
        four characters removed and ``info`` holds the path, transcript,
        duration in seconds, speaker id (first three characters of the name)
        and speaker gender (first character of the name).
    """
    fileName = os.path.basename(filePath)[:-4]
    fileTrs = os.path.join(txtPath, fileName+".txt")
    with open(fileTrs) as f:
        # Only the first line is the transcript. readline() returns "" for an
        # empty file (no IndexError), and the line terminator is removed so it
        # does not end up inside the JSON value.
        trs = f.readline().rstrip("\r\n")
    spk_id = fileName[0:3]
    gender = fileName[0]
    audio_file = AudioSegment.from_wav(filePath)
    duration = audio_file.duration_seconds
    myDict = {
        "path" : filePath[1:],  # drops the first character of the path
        "trans" : trs,
        "duration" : duration,
        "spk_id" : spk_id,
        "spk_gender" : gender
    }
    return fileName, myDict
+
+if __name__== "__main__":
+ main()
diff --git a/data_preprocessing/Att-HACK_SLR88_proc/funcs.py b/data_preprocessing/Att-HACK_SLR88_proc/funcs.py
@@ -0,0 +1,35 @@
+'''
+
+Sina ALISAMIR
+2020-2021
+'''
+
def get_files_in_path(path, ext="wav"):
    """
    Return the files matching ``<path>/*.<ext>``, sorted by path.

    The pattern is expanded with ``glob`` in recursive mode, so a ``**``
    component in ``path`` also matches nested directories.
    example : files = get_files_in_path("./audioFiles/**")

    path - Required : directory (or glob pattern) to search (Str)
    ext  - Optional : file extension without the dot (Str)
    """
    import glob
    import os
    pattern = os.path.join(path, "*." + ext)
    # glob returns matches in arbitrary order; sorting keeps runs reproducible.
    return sorted(glob.glob(pattern, recursive=True))
+
def printProgressBar(iteration, total, prefix = '', suffix = '', decimals = 1, length = 50, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)

    A total of 0 is drawn as a full bar at 100% instead of dividing by zero.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to do counts as finished.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # Leading and trailing carriage returns redraw the bar on the same line.
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    # Print New Line on Complete
    if iteration == total:
        print()
diff --git a/data_preprocessing/CaFE_proc/Preproc.py b/data_preprocessing/CaFE_proc/Preproc.py
@@ -0,0 +1,25 @@
+'''
+
+Sina ALISAMIR
+2020-2021
+'''
+
+
+import os, glob
+from funcs import get_files_in_path, printProgressBar
+
def main():
    """Convert every WAV found under ``../CaFE_48k`` into ``../wavs`` with sox.

    Each input file is resampled to 16000 Hz, one channel, 16-bit
    signed-integer samples (the ``-r/-c/-b/-e`` sox options below). The
    directory layout below ``/CaFE_48k/`` is mirrored below ``/wavs/``.

    Raises:
        FileNotFoundError: if the ``sox`` executable cannot be found.
    """
    # Local import: the module header only imports os and glob.
    import subprocess

    path = "../CaFE_48k/**/"
    theFiles = get_files_in_path(path)
    failed = []

    for i, filePath in enumerate(theFiles):
        fileNewPath = filePath.replace("/CaFE_48k/", "/wavs/")
        directory = os.path.dirname(fileNewPath)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)
        # Argument list instead of a shell string, so that paths containing
        # spaces or shell metacharacters reach sox unchanged.
        result = subprocess.run([
            'sox', filePath,
            '-r', '16000', '-c', '1', '-b', '16', '-e', 'signed-integer',
            fileNewPath,
        ])
        if result.returncode != 0:
            failed.append(filePath)
        printProgressBar(i + 1, len(theFiles), prefix = 'Transforming Files:', suffix = 'Complete')

    # Report conversions that sox rejected instead of silently dropping them.
    if failed:
        print('sox failed for %d file(s):' % len(failed))
        for filePath in failed:
            print('  ' + filePath)
+
+if __name__== "__main__":
+ main()
diff --git a/data_preprocessing/CaFE_proc/WriteJson.py b/data_preprocessing/CaFE_proc/WriteJson.py
@@ -0,0 +1,55 @@
+'''
+
+Sina ALISAMIR
+2020-2021
+'''
+
+
+import os, glob
+from funcs import get_files_in_path, printProgressBar
+from pydub import AudioSegment
+import json
+
def main():
    """Write ``../data.json`` describing every usable WAV under ``../wavs``.

    Files shorter than 1 second are left out. The JSON maps each file name
    (without extension) to the dictionary returned by ``makeDict``.
    """
    wavsPath = "../wavs/**/"
    theFiles = get_files_in_path(wavsPath)

    allFilesInfo = {}
    for i, filePath in enumerate(theFiles):
        audio_file = AudioSegment.from_wav(filePath)
        if audio_file.duration_seconds >= 1:
            fileName, myDict = makeDict(filePath)
            allFilesInfo[fileName] = myDict
        # Progress is reported with the helper this script already imports.
        printProgressBar(i + 1, len(theFiles), prefix = 'Processing Files:', suffix = 'Complete')
    # ensure_ascii=False emits accented characters verbatim, so the file
    # encoding is pinned instead of depending on the locale default.
    with open('../data.json', 'w', encoding='utf-8') as fp:
        json.dump(allFilesInfo, fp, indent=4, ensure_ascii=False)
+
def makeDict(filePath):
    """Build the metadata entry for one WAV file.

    Returns a ``(name, info)`` pair: ``name`` is the file's base name with
    its last four characters removed, ``info`` holds the path, transcript,
    duration in seconds, speaker id and speaker gender.
    """
    stem = os.path.basename(filePath)[:-4]
    # The first two characters of the name are the numeric speaker id;
    # even ids are labelled "F", odd ids "M".
    speaker = stem[:2]
    sex = "F" if int(speaker) % 2 == 0 else "M"
    text = getTranscript(stem)
    seconds = AudioSegment.from_wav(filePath).duration_seconds
    info = {
        "path" : filePath[1:],  # drops the first character of the path
        "trans" : text,
        "duration" : seconds,
        "spk_id" : speaker,
        "spk_gender" : sex
    }
    return stem, info
+
def getTranscript(fileName):
    """Return the sentence spoken in a recording, chosen by the name's last character.

    Args:
        fileName: recording name without extension; its final character
            ("1" to "6") selects one of the six fixed sentences.

    Returns:
        The sentence text, or ``""`` when the name is empty or its last
        character is not "1" to "6".
    """
    sentences = {
        "1": "Un cheval fou dans mon jardin",
        "2": "Deux ânes aigris au pelage brun",
        "3": "Trois cygnes aveugles au bord du lac",
        "4": "Quatre vieilles truies éléphantesques",
        "5": "Cinq pumas fiers et passionnés",
        "6": "Six ours aimants domestiqués",
    }
    # Slicing (rather than indexing) yields "" for an empty name instead of
    # raising IndexError.
    return sentences.get(fileName[-1:], "")
+
+if __name__== "__main__":
+ main()
diff --git a/data_preprocessing/CaFE_proc/funcs.py b/data_preprocessing/CaFE_proc/funcs.py
@@ -0,0 +1,35 @@
+'''
+
+Sina ALISAMIR
+2020-2021
+'''
+
def get_files_in_path(path, ext="wav"):
    """
    Return the files matching ``<path>/*.<ext>``, sorted by path.

    The pattern is expanded with ``glob`` in recursive mode, so a ``**``
    component in ``path`` also matches nested directories.
    example : files = get_files_in_path("./audioFiles/**")

    path - Required : directory (or glob pattern) to search (Str)
    ext  - Optional : file extension without the dot (Str)
    """
    import glob
    import os
    pattern = os.path.join(path, "*." + ext)
    # glob returns matches in arbitrary order; sorting keeps runs reproducible.
    return sorted(glob.glob(pattern, recursive=True))
+
def printProgressBar(iteration, total, prefix = '', suffix = '', decimals = 1, length = 50, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)

    A total of 0 is drawn as a full bar at 100% instead of dividing by zero.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to do counts as finished.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # Leading and trailing carriage returns redraw the bar on the same line.
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    # Print New Line on Complete
    if iteration == total:
        print()