Build binary lm from existing arpa file
diegomarq committed Jan 8, 2022
1 parent e87642e commit a4f0c3d
Showing 4 changed files with 62 additions and 47 deletions.
2 changes: 1 addition & 1 deletion train/Dockerfile
@@ -10,7 +10,7 @@ RUN apt update -q \
&& apt install -y -qq tzdata bash build-essential git curl wget software-properties-common \
vim ca-certificates libffi-dev libssl-dev libsndfile1 libbz2-dev liblzma-dev locales \
libboost-all-dev libboost-tools-dev libboost-thread-dev cmake \
python3 python3-setuptools python3-pip cython
python3 python3-setuptools python3-pip cython ffmpeg

RUN python3 -m pip install --upgrade pip

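The Dockerfile change adds ffmpeg to the image, which librosa's audioread fallback can use to decode compressed audio formats such as mp3. A minimal sketch (not part of the commit) of a start-up check that the binary is actually on PATH; the helper name is hypothetical:

import shutil
import subprocess

def assert_ffmpeg_available():
    # shutil.which returns None when ffmpeg is not on PATH inside the container
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg not found; audio decoding will fail")
    # log the installed version so it shows up in the training output
    subprocess.run(["ffmpeg", "-version"], check=True)
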
22 changes: 12 additions & 10 deletions train/python/run.py
@@ -13,32 +13,34 @@

if __name__ == "__main__":

perform_training_wav2vec2 = True
perform_training_wav2vec2 = False
perform_training_kenlm = False
perform_optimize_kenlm = False
perform_optimize_kenlm = True

#organisation = "pt001"
models_root_dir = "/models"
models_root_dir = "/root"
wav2vec2_model_name = "wav2vec2-xlsr-s1-portuguese"
#language="cy"
kenlm_model_name = "kenlm"

wav2vec2_model_dir = os.path.join(Path.home(), wav2vec2_model_name)
lm_model_dir = os.path.join(Path.home(), kenlm_model_name)
wav2vec2_model_dir = os.path.join(models_root_dir, wav2vec2_model_name)
lm_model_dir = os.path.join(models_root_dir, kenlm_model_name)

print ("\nTraining acoustic model...")
if perform_training_wav2vec2: wav2vec2_model_dir = train_wav2vec2.train(wav2vec2_model_dir)

print ("\n\nTraining KenLM language model...")
if perform_training_kenlm: lm_model_dir = train_kenlm.train(lm_model_dir, "unshuffled_deduplicated_cy")
if perform_training_kenlm: lm_model_dir = train_kenlm.train(lm_model_dir, "pt_sample_dataset.py")

print ("\n\nOptimizing KenLM language model...")
print (lm_model_dir)
if perform_optimize_kenlm: train_kenlm.optimize(lm_model_dir, wav2vec2_model_dir)
if perform_optimize_kenlm: train_kenlm.optimize(lm_model_dir, wav2vec2_model_dir, "pt_sample_dataset.py")

print ("Packaging for publishing...")
publish_dir = os.path.join(models_root_dir, "published", wav2vec2_model_name)
kenlm_archive_file_path = publish.make_model_tarfile(kenlm_model_name, lm_model_dir, publish_dir)
wav2vec2_published_file_path = publish.copy_for_evaluation_or_publishing(wav2vec2_model_dir, publish_dir)

if perform_optimize_kenlm: kenlm_archive_file_path = publish.make_model_tarfile(kenlm_model_name, lm_model_dir, publish_dir)

#wav2vec2_published_file_path = publish.copy_for_evaluation_or_publishing(wav2vec2_model_dir, publish_dir)

print ("Files for publication ready at {}".format(wav2vec2_published_file_path))
#print ("Files for publication ready at {}".format(wav2vec2_published_file_path))
50 changes: 26 additions & 24 deletions train/python/train_kenlm.py
@@ -21,7 +21,7 @@

DESCRIPTION = """
Train and optimize a KenLM language model from HuggingFace's provision of the Welsh corpus by the OSCAR project.
Train and optimize a KenLM language model from HuggingFace's provision of the Portuguese corpus.
"""

@@ -66,7 +66,7 @@ def optimize_lm_objective(trial):
blank_id=processor.tokenizer.pad_token_id,
log_probs_input=True
)
result = test_dataset.map(decode)
result = dataset_test.map(decode)
result_wer = wer.compute(predictions=result["pred_strings_with_lm"], references=result["sentence"])
trial.report(result_wer, step=0)

@@ -79,28 +79,31 @@



def train(lm_dir, oscar_dataset_name):
def train(lm_dir, dataset_name):

Path(lm_dir).mkdir(parents=True, exist_ok=True)
corpus_file_path = os.path.join(lm_dir, "corpus.txt")

print ("\nLoading OSCAR {} dataset...".format(oscar_dataset_name))
oscar_corpus = load_dataset("oscar", oscar_dataset_name)
#corpus_name="pt_sample_dataset"

print ("\nExporting OSCAR to text file {}...".format(corpus_file_path))
with open(corpus_file_path, 'w', encoding='utf-8') as corpus_file:
for line in oscar_corpus["train"]:
t = text_preprocess.cleanup(line["text"])
corpus_file.write(t)
#print ("\n Train LM from {} corpus...".format(corpus_name))
#print ("\nLoading {} dataset...".format(dataset_name))
#oscar_corpus = load_dataset("oscar", dataset_name)

#print ("\nExporting PT corpus to text file {}...".format(corpus_file_path))
#with open(corpus_file_path, 'w', encoding='utf-8') as corpus_file:
# for line in oscar_corpus["train"]:
# t = text_preprocess.cleanup(line["text"])
# corpus_file.write(t)

# generate KenLM ARPA file language model
lm_arpa_file_path=os.path.join(lm_dir, "lm.arpa")
lm_bin_file_path=os.path.join(lm_dir, "lm.binary")
lm_arpa_file_path=os.path.join(lm_dir, "lm_vaudimus_small.arpa.gz")
lm_bin_file_path=os.path.join(lm_dir, "lm_vaudimus_small.binary")

cmd = "lmplz -o {n} --text {corpus_file} --arpa {lm_file}".format(n=5, corpus_file=corpus_file_path, lm_file=lm_arpa_file_path)
print (cmd)
#cmd = "lmplz -o {n} --text {corpus_file} --arpa {lm_file}".format(n=5, corpus_file=corpus_file_path, lm_file=lm_arpa_file_path)
#print (cmd)

subprocess.run(shlex.split(cmd), stderr=sys.stderr, stdout=sys.stdout)
#subprocess.run(shlex.split(cmd), stderr=sys.stderr, stdout=sys.stdout)

# generate binary version
cmd = "build_binary trie {arpa_file} {bin_file}".format(arpa_file=lm_arpa_file_path, bin_file=lm_bin_file_path)
@@ -109,26 +112,25 @@ def train(lm_dir, oscar_dataset_name):
subprocess.run(shlex.split(cmd), stderr=sys.stderr, stdout=sys.stdout)

#
os.remove(corpus_file_path)
#os.remove(corpus_file_path)
os.remove(lm_arpa_file_path)

return lm_dir



def optimize(lm_dir, wav2vec_model_path):
def optimize(lm_dir, wav2vec_model_path, dataset_name):
global processor
global model
global vocab
global wer
global resampler
global test_dataset
global dataset_test
global lm_model_dir

lm_model_dir=lm_dir

test_dataset = load_dataset("custom_common_voice.py", "cy", split="test")
#test_dataset = load_dataset("common_voice", "cy", split="test")
dataset_test = load_dataset(dataset_name, split="test")

wer = load_metric("wer")

@@ -144,12 +146,12 @@ def optimize(lm_dir, wav2vec_model_path):
vocab[space_ix]=' '

print ("Preprocessing speech files")
test_dataset = test_dataset.map(speech_file_to_array_fn)
dataset_test = dataset_test.map(speech_file_to_array_fn)


print ("Beginning alpha and beta hyperparameter optimization")
study = optuna.create_study()
study.optimize(optimize_lm_objective, n_jobs=1, n_trials=100)
study.optimize(optimize_lm_objective, n_jobs=1, n_trials=10)

#
lm_best = {'alpha':study.best_params['lm_alpha'], 'beta':study.best_params['lm_beta']}
@@ -163,8 +165,8 @@


def main(lm_root_dir, wav2vec2_model_path, **args):
lm_file_path=train_kenlm(lm_root_dir, "unshuffled_deduplicated_cy")
optimize_kenlm(lm_file_path, wav2vec2_model_path)
lm_file_path=train_kenlm(lm_root_dir, "pt_sample_dataset.py")
optimize_kenlm(lm_file_path, wav2vec2_model_path, "pt_sample_dataset.py")



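The train_kenlm.py change matches the commit title: corpus export and the lmplz step are commented out, and the binary model is built directly from an ARPA file already sitting in lm_dir (lm_vaudimus_small.arpa.gz). A minimal sketch (not part of the commit) of that conversion step on its own, assuming the KenLM binaries were built with zlib support so build_binary can read the gzipped ARPA directly:

import os
import shlex
import subprocess
import sys

def arpa_to_binary(lm_dir):
    # file names as they appear in the commit
    arpa_path = os.path.join(lm_dir, "lm_vaudimus_small.arpa.gz")
    bin_path = os.path.join(lm_dir, "lm_vaudimus_small.binary")

    # "trie" selects KenLM's trie layout, the same option the script already used
    cmd = "build_binary trie {arpa} {binary}".format(arpa=arpa_path, binary=bin_path)
    subprocess.run(shlex.split(cmd), stderr=sys.stderr, stdout=sys.stdout, check=True)
    return bin_path
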
35 changes: 23 additions & 12 deletions train/python/train_wav2vec2.py
@@ -7,6 +7,7 @@
import librosa
import numpy as np
import soundfile as sf
import warnings

import publish

@@ -119,8 +120,11 @@ def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) ->


def speech_file_to_array_fn(batch):
speech_array, sampling_rate = torchaudio.load(batch["path"])
batch["speech"] = speech_array[0].numpy()
#speech_array, sampling_rate = torchaudio.load(batch["audio"])
with warnings.catch_warnings():
warnings.simplefilter("ignore")
speech_array, sampling_rate = librosa.load(batch["audio"], sr=16_000)
batch["speech"] = speech_array
batch["sampling_rate"] = sampling_rate
batch["target_text"] = batch["sentence"]
return batch
@@ -134,11 +138,12 @@ def resample(batch):

def prepare_dataset(batch):
# check that all files have the correct sampling rate
assert (
len(set(batch["sampling_rate"])) == 1
), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
#assert (
# len(set(batch["sampling_rate"])) == 1
#), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"]).input_values[0]
batch["input_length"] = len(batch["input_values"])

with processor.as_target_processor():
batch["labels"] = processor(batch["target_text"]).input_ids
@@ -241,14 +246,15 @@ def train(output_dir, train=True):
dataset_test = dataset_test.map(speech_file_to_array_fn, remove_columns=dataset_test.column_names)

print ("\nDownsampling all speech files")
dataset_train = dataset_train.map(resample, num_proc=4)
dataset_test = dataset_test.map(resample, num_proc=4)
print ("\n ---- Not necessary")
#dataset_train = dataset_train.map(resample, num_proc=4)
#dataset_test = dataset_test.map(resample, num_proc=4)

print ("\nPreparing the training dataset")
dataset_train = dataset_train.map(prepare_dataset, remove_columns=dataset_train.column_names, batch_size=8, num_proc=4, batched=True)
dataset_train = dataset_train.map(prepare_dataset, remove_columns=dataset_train.column_names, batch_size=8, num_proc=4)

print ("\nPreparing test set")
dataset_test = dataset_test.map(prepare_dataset, remove_columns=dataset_test.column_names, batch_size=8, num_proc=4, batched=True)
dataset_test = dataset_test.map(prepare_dataset, remove_columns=dataset_test.column_names, batch_size=8, num_proc=4)

print ("\nTESTING =====> Getting sample <=====")
max_input_length_in_sec = 30.0
Expand Down Expand Up @@ -326,8 +332,13 @@ def train(output_dir, train=True):
print ("\nTraining...")
trainer.train()

# copy config and model binary file
publish.export_checkpoint(output_dir)
try:
# copy config and model binary file
publish.export_checkpoint(output_dir)
print ("\n ==> Saving model as publish")
except:
trainer.save_model(output_dir)
print ("\n ==> Saving model as trainer")

print ("\n\nModel trained. See %s" % output_dir)

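The train_wav2vec2.py change reads audio with librosa at 16 kHz (making the separate resample pass unnecessary), prepares examples one at a time instead of in batches, and falls back to trainer.save_model when publish.export_checkpoint raises. A minimal sketch (not part of the commit) of the per-example load-and-prepare path, assuming a Wav2Vec2Processor and a dataset whose rows carry "audio" (a file path) and "sentence" columns as in the diff:

import warnings
import librosa

def load_and_prepare(batch, processor):
    # librosa resamples to 16 kHz on load, so no separate downsampling pass is needed
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # silence audioread/PySoundFile fallback warnings
        speech, sampling_rate = librosa.load(batch["audio"], sr=16_000)

    # per-example (non-batched) preparation: take the single array out of input_values
    batch["input_values"] = processor(speech, sampling_rate=sampling_rate).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch
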
