Skip to content

Commit

Permalink
Adding the first batch of preprocessing scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
mzboito committed Sep 27, 2021
1 parent 66b2728 commit 61a3796
Show file tree
Hide file tree
Showing 50 changed files with 1,883 additions and 0 deletions.
24 changes: 24 additions & 0 deletions data_preprocessing/African_Accented_French_proc/Preproc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
'''
Sina ALISAMIR
2020-2021
'''

import os, glob
from funcs import get_files_in_path, printProgressBar

def main():
    """
    Convert every wav file found under ../speech/ to 16 kHz, mono, 16-bit
    signed-integer audio with sox, writing each result to the same relative
    location under ../wavs/

    Raises FileNotFoundError when the sox executable cannot be found
    """
    import subprocess

    path = "../speech/**/"
    theFiles = get_files_in_path(path)

    for i, filePath in enumerate(theFiles):
        # Only the leading corpus root is swapped: a deeper folder that also
        # happens to be called "speech" keeps its name
        fileNewPath = filePath.replace("/speech/", "/wavs/", 1)
        directory = os.path.dirname(fileNewPath)
        os.makedirs(directory, exist_ok=True)
        # Arguments go to sox as a list (no shell involved), so paths with
        # spaces or shell metacharacters are passed through untouched.
        # A failing conversion does not stop the batch; sox reports it itself
        subprocess.run(['sox', filePath, '-r', '16000', '-c', '1', '-b', '16',
                        '-e', 'signed-integer', fileNewPath])
        printProgressBar(i + 1, len(theFiles), prefix = 'Transforming Files:', suffix = 'Complete')

# Start the conversion only when this file is executed as a script
if __name__ == "__main__":
    main()
105 changes: 105 additions & 0 deletions data_preprocessing/African_Accented_French_proc/WriteJson.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
'''
Sina ALISAMIR
2020-2021
'''

import os, glob
from funcs import get_files_in_path, printProgressBar
from pydub import AudioSegment
import json
import csv

def main():
    """
    Write ../data.json with one entry per wav file under ../wavs/ whose
    duration lies between 1 and 30 seconds (both included)
    """
    wavsPath = "../wavs/**/"
    theFiles = get_files_in_path(wavsPath)

    allFilesInfo = {}
    for i, filePath in enumerate(theFiles):
        audio_file = AudioSegment.from_wav(filePath)
        if 1 <= audio_file.duration_seconds <= 30:
            fileName, myDict = makeDict(filePath)
            allFilesInfo[fileName] = myDict
        # Rejected files advance the bar too, otherwise it can stop short of 100%
        printProgressBar(i + 1, len(theFiles), prefix = 'Processing Files:', suffix = 'Complete')
    # ensure_ascii=False writes accented characters as they are, so the file
    # encoding is pinned to UTF-8 rather than left to the platform default
    with open('../data.json', 'w', encoding='utf-8') as fp:
        json.dump(allFilesInfo, fp, indent=4, ensure_ascii=False)

def makeDict(filePath):
    """
    Describe one wav file for data.json

    Returns (fileName, info) where fileName is the base name of filePath
    without its last 4 characters and info holds the path, transcript,
    duration in seconds, speaker id (always empty here) and speaker gender
    """
    fileName = os.path.basename(filePath)[:-4]
    trans = getTranscript(filePath)

    # The gender marker sits in the name of the folder holding the file;
    # "U" is kept when neither marker is present
    parentFolder = filePath.split(os.path.sep)[-2]
    gender = "U"
    for marker, label in (("_m_", "M"), ("_f_", "F")):
        if marker in parentFolder:
            gender = label

    duration = AudioSegment.from_wav(filePath).duration_seconds
    info = {
        "path" : filePath[1:],  # drops the first character of the path ("../wavs" becomes "./wavs")
        "trans" : trans,
        "duration" : duration,
        "spk_id" : "",
        "spk_gender" : gender
    }
    return fileName, info

# Parsed transcript rows keyed by absolute file path, so that each transcript
# file is read from disk once instead of once per wav file
_transcriptRowsCache = {}


def _readTranscriptRows(transPath):
    """
    Return the non-empty rows of a space-delimited transcript file (parsed once, then cached)
    """
    cacheKey = os.path.abspath(transPath)
    if cacheKey not in _transcriptRowsCache:
        # NOTE(review): the platform default text encoding is used, as before - confirm the corpus encoding
        with open(transPath, newline='\n') as csvfile:
            # csv.reader yields an empty list for a blank line; those carry no entry
            _transcriptRowsCache[cacheKey] = [row for row in csv.reader(csvfile, delimiter=' ') if row]
    return _transcriptRowsCache[cacheKey]


def _transcriptKey(entry, stripExt):
    """
    Reduce the first column of a transcript row to the name that is compared with the wav name
    """
    name = os.path.basename(entry)
    if stripExt == "always":
        return name[:-4]
    # The length check keeps names shorter than 4 characters from raising IndexError
    if stripExt == "dotted" and len(name) >= 4 and name[-4] == '.':
        return name[:-4]
    return name


def getTranscript(wavPath):
    """
    Look up the transcript of a wav file
    @params:
        wavPath - Required : path of the form ../wavs/<subset>/..., where the third
                             path component selects the transcript file (Str)
    @returns:
        the words of the matching transcript row joined by single spaces, or ""
        when the subset is unknown or no row matches (Str)
    """
    parts = wavPath.split(os.path.sep)
    fileName = os.path.basename(wavPath)[:-4]
    subset = parts[2]

    stripExt = "never"   # how the first column is trimmed before the comparison
    mustContain = None   # folder name the first column has to contain (yaounde only)
    if subset == "dev":
        transPath = "../transcripts/dev/niger_west_african_fr/transcripts.txt"
        stripExt = "always"
    elif subset == "devtest":
        transPath = "../transcripts/devtest/ca16_read/conditioned.txt"
    elif subset == "test":
        transPath = "../transcripts/test/ca16/prompts.txt"
    elif subset == "train":
        subFolder = parts[3]
        stripExt = "dotted"
        if subFolder == "ca16":
            if "conv" in fileName:
                transPath = "../transcripts/train/ca16_conv/transcripts.txt"
            else:
                transPath = "../transcripts/train/ca16_read/conditioned.txt"
        else:
            transPath = "../transcripts/train/yaounde/fn_text.txt"
        if subFolder == "yaounde":
            mustContain = parts[4]
    else:
        return ""

    transcript = ""
    for row in _readTranscriptRows(transPath):
        if mustContain is not None and mustContain not in row[0]:
            continue
        if _transcriptKey(row[0], stripExt) == fileName:
            # Keep scanning: when a name is listed more than once the last row wins
            transcript = ' '.join(row[1:])
    return transcript

# Build data.json only when this file is executed as a script
if __name__ == "__main__":
    main()
35 changes: 35 additions & 0 deletions data_preprocessing/African_Accented_French_proc/funcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
'''
Sina ALISAMIR
2020-2021
'''

def get_files_in_path(path, ext="wav"):
    """
    Get the files with a given extension in a path, sorted by name
    example : files = get_files_in_path("./audioFiles")
    A "**" component in path matches any depth of sub-folders,
    example : files = get_files_in_path("./audioFiles/**/")
    """
    import glob
    import os
    pattern = os.path.join(path, "*." + ext)
    # glob returns matches in arbitrary order; sorting keeps every run reproducible
    return sorted(glob.glob(pattern, recursive=True))

def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 50, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations, 0 is shown as complete (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to process: draw a full bar instead of dividing by zero
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # The carriage returns make each call redraw the same terminal line
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    # Print New Line on Complete
    if iteration == total:
        print()
24 changes: 24 additions & 0 deletions data_preprocessing/Att-HACK_SLR88_proc/Preproc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
'''
Sina ALISAMIR
2020-2021
'''

import os, glob
from funcs import get_files_in_path, printProgressBar

def main():
    """
    Convert every wav file found under ../Volumes/CLEM_HDD/IRCAM/Open_SLR/wav/
    to 16 kHz, mono, 16-bit signed-integer audio with sox, writing each result
    to the same relative location under ../wavs/

    Raises FileNotFoundError when the sox executable cannot be found
    """
    import subprocess

    path = "../Volumes/CLEM_HDD/IRCAM/Open_SLR/wav/**/"
    theFiles = get_files_in_path(path)

    for i, filePath in enumerate(theFiles):
        # Only the leading corpus root is swapped for the output root
        fileNewPath = filePath.replace("/Volumes/CLEM_HDD/IRCAM/Open_SLR/wav/", "/wavs/", 1)
        directory = os.path.dirname(fileNewPath)
        os.makedirs(directory, exist_ok=True)
        # Arguments go to sox as a list (no shell involved), so paths with
        # spaces or shell metacharacters are passed through untouched.
        # A failing conversion does not stop the batch; sox reports it itself
        subprocess.run(['sox', filePath, '-r', '16000', '-c', '1', '-b', '16',
                        '-e', 'signed-integer', fileNewPath])
        printProgressBar(i + 1, len(theFiles), prefix = 'Transforming Files:', suffix = 'Complete')

# Start the conversion only when this file is executed as a script
if __name__ == "__main__":
    main()
46 changes: 46 additions & 0 deletions data_preprocessing/Att-HACK_SLR88_proc/WriteJson.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
'''
Sina ALISAMIR
2020-2021
'''

import os, glob
from funcs import get_files_in_path, printProgressBar
from pydub import AudioSegment
import json

def main():
    """
    Write ../data.json with one entry per wav file under ../wavs/ that lasts
    at least 1 second; transcripts are read from the txt folder of the corpus
    """
    wavsPath = "../wavs/**/"
    txtPath  = "../Volumes/CLEM_HDD/IRCAM/Open_SLR/txt/"
    theFiles = get_files_in_path(wavsPath)

    allFilesInfo = {}
    for i, filePath in enumerate(theFiles):
        audio_file = AudioSegment.from_wav(filePath)
        if audio_file.duration_seconds >= 1:
            fileName, myDict = makeDict(filePath, txtPath)
            allFilesInfo[fileName] = myDict
        # Rejected files advance the bar too, otherwise it can stop short of 100%
        printProgressBar(i + 1, len(theFiles), prefix = 'Processing Files:', suffix = 'Complete')
    # ensure_ascii=False writes accented characters as they are, so the file
    # encoding is pinned to UTF-8 rather than left to the platform default
    with open('../data.json', 'w', encoding='utf-8') as fp:
        json.dump(allFilesInfo, fp, indent=4, ensure_ascii=False)

def makeDict(filePath, txtPath):
    """
    Describe one wav file for data.json
    @params:
        filePath - Required : path of the wav file (Str)
        txtPath  - Required : folder holding one <name>.txt transcript per wav file (Str)
    @returns:
        (fileName, info) where fileName is the base name of filePath without its
        last 4 characters and info holds the path, transcript, duration in
        seconds, speaker id and speaker gender
    """
    fileName = os.path.basename(filePath)[:-4]
    fileTrs = os.path.join(txtPath, fileName+".txt")
    # NOTE(review): the platform default text encoding is used, as before - confirm the corpus encoding
    with open(fileTrs) as f:
        # Only the first line is the transcript; its line terminator is not part
        # of the text, and an empty file gives "" instead of raising IndexError
        trs = f.readline().rstrip("\r\n")
    spk_id = fileName[0:3]   # first three characters of the file name
    gender = fileName[0]     # first character of the file name
    audio_file = AudioSegment.from_wav(filePath)
    duration = audio_file.duration_seconds
    myDict = {
        "path" : filePath[1:],  # drops the first character of the path ("../wavs" becomes "./wavs")
        "trans" : trs,
        "duration" : duration,
        "spk_id" : spk_id,
        "spk_gender" : gender
    }
    return fileName, myDict

# Build data.json only when this file is executed as a script
if __name__ == "__main__":
    main()
35 changes: 35 additions & 0 deletions data_preprocessing/Att-HACK_SLR88_proc/funcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
'''
Sina ALISAMIR
2020-2021
'''

def get_files_in_path(path, ext="wav"):
    """
    Get the files with a given extension in a path, sorted by name
    example : files = get_files_in_path("./audioFiles")
    A "**" component in path matches any depth of sub-folders,
    example : files = get_files_in_path("./audioFiles/**/")
    """
    import glob
    import os
    pattern = os.path.join(path, "*." + ext)
    # glob returns matches in arbitrary order; sorting keeps every run reproducible
    return sorted(glob.glob(pattern, recursive=True))

def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 50, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations, 0 is shown as complete (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to process: draw a full bar instead of dividing by zero
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # The carriage returns make each call redraw the same terminal line
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    # Print New Line on Complete
    if iteration == total:
        print()
25 changes: 25 additions & 0 deletions data_preprocessing/CaFE_proc/Preproc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
'''
Sina ALISAMIR
2020-2021
'''


import os, glob
from funcs import get_files_in_path, printProgressBar

def main():
    """
    Convert every wav file found under ../CaFE_48k/ to 16 kHz, mono, 16-bit
    signed-integer audio with sox, writing each result to the same relative
    location under ../wavs/

    Raises FileNotFoundError when the sox executable cannot be found
    """
    import subprocess

    path = "../CaFE_48k/**/"
    theFiles = get_files_in_path(path)

    for i, filePath in enumerate(theFiles):
        # Only the leading corpus root is swapped for the output root
        fileNewPath = filePath.replace("/CaFE_48k/", "/wavs/", 1)
        directory = os.path.dirname(fileNewPath)
        os.makedirs(directory, exist_ok=True)
        # Arguments go to sox as a list (no shell involved), so paths with
        # spaces or shell metacharacters are passed through untouched.
        # A failing conversion does not stop the batch; sox reports it itself
        subprocess.run(['sox', filePath, '-r', '16000', '-c', '1', '-b', '16',
                        '-e', 'signed-integer', fileNewPath])
        printProgressBar(i + 1, len(theFiles), prefix = 'Transforming Files:', suffix = 'Complete')

# Start the conversion only when this file is executed as a script
if __name__ == "__main__":
    main()
55 changes: 55 additions & 0 deletions data_preprocessing/CaFE_proc/WriteJson.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
'''
Sina ALISAMIR
2020-2021
'''


import os, glob
from funcs import get_files_in_path, printProgressBar
from pydub import AudioSegment
import json

def main():
    """
    Write ../data.json with one entry per wav file under ../wavs/ that lasts
    at least 1 second
    """
    wavsPath = "../wavs/**/"
    theFiles = get_files_in_path(wavsPath)

    allFilesInfo = {}
    for i, filePath in enumerate(theFiles):
        audio_file = AudioSegment.from_wav(filePath)
        if audio_file.duration_seconds >= 1:
            fileName, myDict = makeDict(filePath)
            allFilesInfo[fileName] = myDict
        # Decoding every file takes a while, so progress is reported for each
        # file, including the ones that are rejected
        printProgressBar(i + 1, len(theFiles), prefix = 'Processing Files:', suffix = 'Complete')
    # ensure_ascii=False writes accented characters as they are, so the file
    # encoding is pinned to UTF-8 rather than left to the platform default
    with open('../data.json', 'w', encoding='utf-8') as fp:
        json.dump(allFilesInfo, fp, indent=4, ensure_ascii=False)

def makeDict(filePath):
    """
    Describe one wav file for data.json

    Returns (fileName, info) where fileName is the base name of filePath
    without its last 4 characters and info holds the path, transcript,
    duration in seconds, speaker id and speaker gender
    """
    fileName = os.path.basename(filePath)[:-4]
    # The speaker id is the first two characters of the name, read as a number:
    # even ids are labelled "F", odd ids "M"
    spk_id = fileName[0:2]
    gender = "F" if int(spk_id) % 2 == 0 else "M"
    trans = getTranscript(fileName)
    duration = AudioSegment.from_wav(filePath).duration_seconds
    info = {
        "path" : filePath[1:],  # drops the first character of the path ("../wavs" becomes "./wavs")
        "trans" : trans,
        "duration" : duration,
        "spk_id" : spk_id,
        "spk_gender" : gender
    }
    return fileName, info

# Sentence of each recording, keyed by the last character of the file name
_CAFE_SENTENCES = {
    "1": "Un cheval fou dans mon jardin",
    "2": "Deux ânes aigris au pelage brun",
    "3": "Trois cygnes aveugles au bord du lac",
    "4": "Quatre vieilles truies éléphantesques",
    "5": "Cinq pumas fiers et passionnés",
    "6": "Six ours aimants domestiqués",
}


def getTranscript(fileName):
    """
    Get the sentence spoken in a recording
    @params:
        fileName - Required : file name without extension, its last character selects the sentence (Str)
    @returns:
        the sentence, or "" when the last character is not "1" to "6" (Str)
    """
    # fileName[-1:] is "" for an empty name, which finds no sentence instead of raising IndexError
    return _CAFE_SENTENCES.get(fileName[-1:], "")

# Build data.json only when this file is executed as a script
if __name__ == "__main__":
    main()
35 changes: 35 additions & 0 deletions data_preprocessing/CaFE_proc/funcs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
'''
Sina ALISAMIR
2020-2021
'''

def get_files_in_path(path, ext="wav"):
    """
    Get the files with a given extension in a path, sorted by name
    example : files = get_files_in_path("./audioFiles")
    A "**" component in path matches any depth of sub-folders,
    example : files = get_files_in_path("./audioFiles/**/")
    """
    import glob
    import os
    pattern = os.path.join(path, "*." + ext)
    # glob returns matches in arbitrary order; sorting keeps every run reproducible
    return sorted(glob.glob(pattern, recursive=True))

def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 50, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations, 0 is shown as complete (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to process: draw a full bar instead of dividing by zero
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # The carriage returns make each call redraw the same terminal line
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    # Print New Line on Complete
    if iteration == total:
        print()
Loading

0 comments on commit 61a3796

Please sign in to comment.