forked from LeBenchmark/NeurIPS2021
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding the first batch of preprocessing scripts
- Loading branch information
Showing
50 changed files
with
1,883 additions
and
0 deletions.
There are no files selected for viewing
24 changes: 24 additions & 0 deletions
24
data_preprocessing/African_Accented_French_proc/Preproc.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
''' | ||
Sina ALISAMIR | ||
2020-2021 | ||
''' | ||
|
||
import os, glob | ||
from funcs import get_files_in_path, printProgressBar | ||
|
||
def main():
    """
    Convert every wav file found under ../speech/ with sox and write the
    result to the mirrored location under ../wavs/.

    Each file is resampled to 16000 Hz, one channel, 16-bit signed-integer
    samples (the sox options -r 16000 -c 1 -b 16 -e signed-integer).
    The sox exit status is not checked, so one failing file does not stop
    the run; a missing sox executable raises FileNotFoundError.
    """
    import subprocess  # local import: only this entry point needs it

    path = "../speech/**/"
    theFiles = get_files_in_path(path)

    for i, filePath in enumerate(theFiles):
        fileNewPath = filePath.replace("/speech/", "/wavs/")
        directory = os.path.dirname(fileNewPath)
        # exist_ok removes the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact instead of letting the shell re-split them.
        subprocess.run(['sox', filePath, '-r', '16000', '-c', '1', '-b', '16',
                        '-e', 'signed-integer', fileNewPath])
        printProgressBar(i + 1, len(theFiles), prefix = 'Transforming Files:', suffix = 'Complete')
|
||
# Run the conversion only when this file is executed as a script, not on import.
if __name__== "__main__":
    main()
105 changes: 105 additions & 0 deletions
105
data_preprocessing/African_Accented_French_proc/WriteJson.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
''' | ||
Sina ALISAMIR | ||
2020-2021 | ||
''' | ||
|
||
import os, glob | ||
from funcs import get_files_in_path, printProgressBar | ||
from pydub import AudioSegment | ||
import json | ||
import csv | ||
|
||
def main():
    """
    Build ../data.json describing every wav file found under ../wavs/.

    Files shorter than 1 second or longer than 30 seconds are left out.
    The JSON maps each file name (without extension) to the dictionary
    returned by makeDict.
    """
    wavsPath = "../wavs/**/"
    theFiles = get_files_in_path(wavsPath)

    allFilesInfo = {}
    for i, filePath in enumerate(theFiles):
        audio_file = AudioSegment.from_wav(filePath)
        if 1 <= audio_file.duration_seconds <= 30:
            fileName, myDict = makeDict(filePath)
            allFilesInfo[fileName] = myDict
        # Advance the bar for skipped files as well, otherwise it never
        # reaches 100% when the last file is filtered out.
        printProgressBar(i + 1, len(theFiles), prefix = 'Processing Files:', suffix = 'Complete')
    # ensure_ascii=False writes accented characters verbatim, so the file
    # encoding must not depend on the platform default.
    with open('../data.json', 'w', encoding='utf-8') as fp:
        json.dump(allFilesInfo, fp, indent=4, ensure_ascii=False)
|
||
def makeDict(filePath):
    """
    Describe one wav file.

    Returns a tuple (name, info): name is the file's base name without its
    last four characters (the ".wav" suffix), info holds the path (first
    character of filePath dropped), the transcript, the duration in seconds,
    an empty speaker id and the speaker gender.
    """
    baseName = os.path.basename(filePath)
    transcript = getTranscript(filePath)

    # Gender is encoded in the name of the directory holding the file:
    # "_f_" -> F, "_m_" -> M, anything else -> U (F wins if both appear).
    parentDir = filePath.split(os.path.sep)[-2]
    if "_f_" in parentDir:
        speakerGender = "F"
    elif "_m_" in parentDir:
        speakerGender = "M"
    else:
        speakerGender = "U"

    seconds = AudioSegment.from_wav(filePath).duration_seconds
    info = {
        "path" : filePath[1:],
        "trans" : transcript,
        "duration" : seconds,
        "spk_id" : "",
        "spk_gender" : speakerGender
    }
    return baseName[:-4], info
|
||
def _findTranscript(transPath, fileName, stripExt, folderFilter=None):
    """
    Scan one space-delimited transcript file and return the text for fileName.

    Each row is "<utterance id or path> <word> <word> ...". The first field
    is reduced to its base name and compared with fileName; the remaining
    fields joined by single spaces form the transcript.

    stripExt     - "always": drop the last four characters of the base name
                   "auto"  : drop them only when they look like an extension
                   "never" : compare the base name as it is
    folderFilter - when given, rows whose first field does not contain this
                   string are ignored
    Returns "" when no row matches; when several rows match, the last wins.
    """
    transcript = ""
    # Explicit UTF-8 so accented characters do not depend on the platform default.
    with open(transPath, newline='\n', encoding='utf-8') as csvfile:
        for row in csv.reader(csvfile, delimiter=' '):
            if not row:
                continue  # blank line: nothing to index
            if folderFilter is not None and folderFilter not in row[0]:
                continue
            transWav = os.path.basename(row[0])
            if stripExt == "always":
                transWav = transWav[:-4]
            elif stripExt == "auto" and len(transWav) >= 4 and transWav[-4] == '.':
                transWav = transWav[:-4]
            if transWav == fileName:
                transcript = ' '.join(row[1:])
    return transcript


def getTranscript(wavPath):
    """
    Return the transcript of the wav file at wavPath, or "" if none is found.

    The third path component selects the transcript file ("dev", "devtest",
    "test" or "train"); for "train" the fourth component selects the
    sub-corpus ("ca16", otherwise the yaounde transcript file is used).
    Any other third component, or a path too short to have one, yields "".
    """
    fileName = os.path.basename(wavPath)[:-4]
    parts = wavPath.split(os.path.sep)
    split = parts[2] if len(parts) > 2 else ""

    if split == "dev":
        return _findTranscript("../transcripts/dev/niger_west_african_fr/transcripts.txt",
                               fileName, "always")
    if split == "devtest":
        return _findTranscript("../transcripts/devtest/ca16_read/conditioned.txt",
                               fileName, "never")
    if split == "test":
        return _findTranscript("../transcripts/test/ca16/prompts.txt",
                               fileName, "never")
    if split == "train":
        subFolder = parts[3]
        if subFolder == "ca16":
            if "conv" in fileName:
                transPath = "../transcripts/train/ca16_conv/transcripts.txt"
            else:
                transPath = "../transcripts/train/ca16_read/conditioned.txt"
        else:
            transPath = "../transcripts/train/yaounde/fn_text.txt"
        # Only yaounde rows are narrowed down by the wav file's sub-sub-folder.
        folderFilter = parts[4] if subFolder == "yaounde" else None
        return _findTranscript(transPath, fileName, "auto", folderFilter)
    return ""
|
||
# Build the JSON only when this file is executed as a script, not on import.
if __name__== "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
''' | ||
Sina ALISAMIR | ||
2020-2021 | ||
''' | ||
|
||
def get_files_in_path(path, ext="wav"):
    """
    Get the files with a given extension in a path, sorted by name
    example : files = get_files_in_path("./audioFiles")
    @params:
        path - Required : directory (or glob pattern, "**" is expanded recursively) to search (Str)
        ext  - Optional : file extension, with or without the leading dot (Str)
    @returns:
        sorted list of matching paths, empty when nothing matches
    """
    import os
    import glob
    pattern = os.path.join(path, "*." + ext.lstrip("."))
    # glob returns matches in arbitrary order; sorting keeps runs reproducible.
    return sorted(glob.glob(pattern, recursive=True))
|
||
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 50, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    A total of 0 is drawn as a full bar instead of raising ZeroDivisionError.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to process: show the bar as complete rather than dividing by zero.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # The carriage returns make successive calls redraw the same terminal line.
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    # Print New Line on Complete
    if iteration == total:
        print()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
''' | ||
Sina ALISAMIR | ||
2020-2021 | ||
''' | ||
|
||
import os, glob | ||
from funcs import get_files_in_path, printProgressBar | ||
|
||
def main():
    """
    Convert every wav file found under ../Volumes/CLEM_HDD/IRCAM/Open_SLR/wav/
    with sox and write the result to the mirrored location under ../wavs/.

    Each file is resampled to 16000 Hz, one channel, 16-bit signed-integer
    samples (the sox options -r 16000 -c 1 -b 16 -e signed-integer).
    The sox exit status is not checked, so one failing file does not stop
    the run; a missing sox executable raises FileNotFoundError.
    """
    import subprocess  # local import: only this entry point needs it

    path = "../Volumes/CLEM_HDD/IRCAM/Open_SLR/wav/**/"
    theFiles = get_files_in_path(path)

    for i, filePath in enumerate(theFiles):
        fileNewPath = filePath.replace("/Volumes/CLEM_HDD/IRCAM/Open_SLR/wav/", "/wavs/")
        directory = os.path.dirname(fileNewPath)
        # exist_ok removes the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact instead of letting the shell re-split them.
        subprocess.run(['sox', filePath, '-r', '16000', '-c', '1', '-b', '16',
                        '-e', 'signed-integer', fileNewPath])
        printProgressBar(i + 1, len(theFiles), prefix = 'Transforming Files:', suffix = 'Complete')
|
||
# Run the conversion only when this file is executed as a script, not on import.
if __name__== "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
''' | ||
Sina ALISAMIR | ||
2020-2021 | ||
''' | ||
|
||
import os, glob | ||
from funcs import get_files_in_path, printProgressBar | ||
from pydub import AudioSegment | ||
import json | ||
|
||
def main():
    """
    Build ../data.json describing every wav file found under ../wavs/.

    Files shorter than 1 second are left out. The JSON maps each file name
    (without extension) to the dictionary returned by makeDict, which reads
    the matching transcript from the txt directory.
    """
    wavsPath = "../wavs/**/"
    txtPath = "../Volumes/CLEM_HDD/IRCAM/Open_SLR/txt/"
    theFiles = get_files_in_path(wavsPath)

    allFilesInfo = {}
    for i, filePath in enumerate(theFiles):
        audio_file = AudioSegment.from_wav(filePath)
        if audio_file.duration_seconds >= 1:
            fileName, myDict = makeDict(filePath, txtPath)
            allFilesInfo[fileName] = myDict
        # Advance the bar for skipped files as well, otherwise it never
        # reaches 100% when the last file is filtered out.
        printProgressBar(i + 1, len(theFiles), prefix = 'Processing Files:', suffix = 'Complete')
    # ensure_ascii=False writes accented characters verbatim, so the file
    # encoding must not depend on the platform default.
    with open('../data.json', 'w', encoding='utf-8') as fp:
        json.dump(allFilesInfo, fp, indent=4, ensure_ascii=False)
|
||
def makeDict(filePath, txtPath):
    """
    Describe one wav file.

    filePath - path of the wav file
    txtPath  - directory holding "<file name>.txt" transcript files

    Returns a tuple (fileName, myDict): fileName is the wav base name without
    its last four characters (the ".wav" suffix); myDict holds the path
    (first character of filePath dropped), the first line of the transcript
    file without its line terminator, the duration in seconds, the speaker id
    (first three characters of the name) and the gender (first character).
    Raises OSError when the transcript file cannot be opened.
    """
    fileName = os.path.basename(filePath)[:-4]
    fileTrs = os.path.join(txtPath, fileName+".txt")
    # Explicit UTF-8 so accented characters do not depend on the platform default.
    with open(fileTrs, encoding='utf-8') as f:
        # readline() yields "" for an empty file instead of raising IndexError,
        # and the line terminator is not part of the transcript.
        trs = f.readline().rstrip('\r\n')
    spk_id = fileName[0:3]
    gender = fileName[:1]
    audio_file = AudioSegment.from_wav(filePath)
    duration = audio_file.duration_seconds
    myDict = {
        "path" : filePath[1:],
        "trans" : trs,
        "duration" : duration,
        "spk_id" : spk_id,
        "spk_gender" : gender
    }
    return fileName, myDict
|
||
# Build the JSON only when this file is executed as a script, not on import.
if __name__== "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
''' | ||
Sina ALISAMIR | ||
2020-2021 | ||
''' | ||
|
||
def get_files_in_path(path, ext="wav"):
    """
    Get the files with a given extension in a path, sorted by name
    example : files = get_files_in_path("./audioFiles")
    @params:
        path - Required : directory (or glob pattern, "**" is expanded recursively) to search (Str)
        ext  - Optional : file extension, with or without the leading dot (Str)
    @returns:
        sorted list of matching paths, empty when nothing matches
    """
    import os
    import glob
    pattern = os.path.join(path, "*." + ext.lstrip("."))
    # glob returns matches in arbitrary order; sorting keeps runs reproducible.
    return sorted(glob.glob(pattern, recursive=True))
|
||
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 50, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    A total of 0 is drawn as a full bar instead of raising ZeroDivisionError.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to process: show the bar as complete rather than dividing by zero.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # The carriage returns make successive calls redraw the same terminal line.
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    # Print New Line on Complete
    if iteration == total:
        print()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
''' | ||
Sina ALISAMIR | ||
2020-2021 | ||
''' | ||
|
||
|
||
import os, glob | ||
from funcs import get_files_in_path, printProgressBar | ||
|
||
def main():
    """
    Convert every wav file found under ../CaFE_48k/ with sox and write the
    result to the mirrored location under ../wavs/.

    Each file is resampled to 16000 Hz, one channel, 16-bit signed-integer
    samples (the sox options -r 16000 -c 1 -b 16 -e signed-integer).
    The sox exit status is not checked, so one failing file does not stop
    the run; a missing sox executable raises FileNotFoundError.
    """
    import subprocess  # local import: only this entry point needs it

    path = "../CaFE_48k/**/"
    theFiles = get_files_in_path(path)

    for i, filePath in enumerate(theFiles):
        fileNewPath = filePath.replace("/CaFE_48k/", "/wavs/")
        directory = os.path.dirname(fileNewPath)
        # exist_ok removes the check-then-create race of exists() + makedirs().
        os.makedirs(directory, exist_ok=True)
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact instead of letting the shell re-split them.
        subprocess.run(['sox', filePath, '-r', '16000', '-c', '1', '-b', '16',
                        '-e', 'signed-integer', fileNewPath])
        printProgressBar(i + 1, len(theFiles), prefix = 'Transforming Files:', suffix = 'Complete')
|
||
# Run the conversion only when this file is executed as a script, not on import.
if __name__== "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
''' | ||
Sina ALISAMIR | ||
2020-2021 | ||
''' | ||
|
||
|
||
import os, glob | ||
from funcs import get_files_in_path, printProgressBar | ||
from pydub import AudioSegment | ||
import json | ||
|
||
def main():
    """
    Build ../data.json describing every wav file found under ../wavs/.

    Files shorter than 1 second are left out. The JSON maps each file name
    (without extension) to the dictionary returned by makeDict.
    """
    wavsPath = "../wavs/**/"
    theFiles = get_files_in_path(wavsPath)

    allFilesInfo = {}
    for filePath in theFiles:
        audio_file = AudioSegment.from_wav(filePath)
        if audio_file.duration_seconds < 1:
            continue
        fileName, myDict = makeDict(filePath)
        allFilesInfo[fileName] = myDict
    # ensure_ascii=False writes accented characters verbatim, so the file
    # encoding must not depend on the platform default.
    with open('../data.json', 'w', encoding='utf-8') as fp:
        json.dump(allFilesInfo, fp, indent=4, ensure_ascii=False)
|
||
def makeDict(filePath):
    """
    Describe one wav file.

    Returns a tuple (name, info): name is the file's base name without its
    last four characters (the ".wav" suffix), info holds the path (first
    character of filePath dropped), the transcript, the duration in seconds,
    the speaker id (first two characters of the name) and the gender.
    Raises ValueError when the speaker id is not an integer.
    """
    stem = os.path.basename(filePath)[:-4]
    speaker = stem[0:2]
    # Even speaker numbers are labelled "F", odd ones "M".
    speakerGender = "F" if int(speaker) % 2 == 0 else "M"
    sentence = getTranscript(stem)
    seconds = AudioSegment.from_wav(filePath).duration_seconds
    info = {
        "path" : filePath[1:],
        "trans" : sentence,
        "duration" : seconds,
        "spk_id" : speaker,
        "spk_gender" : speakerGender
    }
    return stem, info
|
||
def getTranscript(fileName):
    """
    Return the sentence spoken in a recording.

    The last character of fileName ("1" to "6") selects one of six fixed
    sentences. Any other last character, or an empty fileName, yields "".
    """
    sentences = {
        "1": "Un cheval fou dans mon jardin",
        "2": "Deux ânes aigris au pelage brun",
        "3": "Trois cygnes aveugles au bord du lac",
        "4": "Quatre vieilles truies éléphantesques",
        "5": "Cinq pumas fiers et passionnés",
        "6": "Six ours aimants domestiqués",
    }
    # fileName[-1:] is "" for an empty name, where fileName[-1] would raise IndexError.
    return sentences.get(fileName[-1:], "")
|
||
# Build the JSON only when this file is executed as a script, not on import.
if __name__== "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
''' | ||
Sina ALISAMIR | ||
2020-2021 | ||
''' | ||
|
||
def get_files_in_path(path, ext="wav"):
    """
    Get the files with a given extension in a path, sorted by name
    example : files = get_files_in_path("./audioFiles")
    @params:
        path - Required : directory (or glob pattern, "**" is expanded recursively) to search (Str)
        ext  - Optional : file extension, with or without the leading dot (Str)
    @returns:
        sorted list of matching paths, empty when nothing matches
    """
    import os
    import glob
    pattern = os.path.join(path, "*." + ext.lstrip("."))
    # glob returns matches in arbitrary order; sorting keeps runs reproducible.
    return sorted(glob.glob(pattern, recursive=True))
|
||
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 50, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    A total of 0 is drawn as a full bar instead of raising ZeroDivisionError.
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to process: show the bar as complete rather than dividing by zero.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    # The carriage returns make successive calls redraw the same terminal line.
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
    # Print New Line on Complete
    if iteration == total:
        print()
Oops, something went wrong.