From 999d45feb4b3f8fc9ae9552c395c493357ee9032 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 14:02:42 +0200 Subject: [PATCH 1/8] convert parse prompts --- convert.py | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/convert.py b/convert.py index 6162aae..6baef79 100644 --- a/convert.py +++ b/convert.py @@ -28,6 +28,7 @@ --codec-path ~/Documents/encodec.cpp/ggml_weights \ --vocab-path ./ggml_weights/ \ --out-dir ./ggml_weights/ \ + --prompts-path ~/Documents/bark/bark/assets/prompts/ \ --use-f16 ``` """ @@ -45,10 +46,11 @@ parser.add_argument("--codec-path", type=str, required=True) parser.add_argument("--vocab-path", type=str, required=True) parser.add_argument("--out-dir", type=str, required=True) +parser.add_argument("--prompts-path", type=str, required=False) parser.add_argument("--use-f16", action="store_true") -def parse_codec_model(checkpoint, out_dir): +def parse_codec_model(checkpoint: dict, out_dir: Path): """Load encodec model checkpoint.""" outfile = open(out_dir, "wb") outfile.write(struct.pack("i", 0x67676d6c)) # ggml magic @@ -106,7 +108,7 @@ def parse_codec_model(checkpoint, out_dir): outfile.close() -def parse_hparams(hparams, outfile, use_f16): +def parse_hparams(hparams: dict[str, any], outfile: Path, use_f16: bool): """Parse GPT hyperparameters.""" outfile.write(struct.pack("i", hparams["n_layer"])) outfile.write(struct.pack("i", hparams["n_head"])) @@ -127,12 +129,12 @@ def parse_hparams(hparams, outfile, use_f16): n_wtes = hparams["n_codes_total"] except KeyError: n_lm_heads, n_wtes = 1, 1 - + ftype = int(use_f16) outfile.write(struct.pack("iii", n_lm_heads, n_wtes, ftype)) -def parse_text_models(checkpoint, outfile, use_f16): +def parse_text_models(checkpoint: dict, outfile: Path, use_f16: bool): """Load GPT model checkpoint (text, fine, coarse).""" for name in checkpoint.keys(): var_data = checkpoint[name].squeeze().numpy() @@ -233,7 +235,7 @@ def parse_text_models(checkpoint, outfile, use_f16): var_data.tofile(outfile) -def generate_file(in_file, out_dir, use_f16): +def generate_file(in_file: Path, out_dir: Path, use_f16: bool): with open(out_dir, "wb") as fout: fout.write(struct.pack("i", 0x67676d6c)) # ggml magic @@ -241,7 +243,7 @@ def generate_file(in_file, out_dir, use_f16): parse_hparams(checkpoint["model_args"], fout, use_f16) parse_text_models(checkpoint["model"], fout, use_f16) -def generate_vocab_file(dir_model, out_dir): +def generate_vocab_file(dir_model: Path, out_dir: Path): """Parse vocabulary.""" # Even if bark relies on GPT to encode text, it uses BertTokenizer (WordPiece) with open(dir_model / "vocab.txt", "r", encoding="utf-8") as fin: @@ -257,6 +259,36 @@ def generate_vocab_file(dir_model, out_dir): fout.write(struct.pack("i", len(data))) fout.write(data) +def generate_prompts_file(dir_model: Path, out_dir: Path): + """Parse history prompts (custom voices).""" + all_prompts_path = list(dir_model.glob("**/*_speaker_*.npz")) + + with open(out_dir, "wb") as fout: + fout.write(struct.pack("i", 0x67676d6c)) # ggml magic + fout.write(struct.pack("i", len(all_prompts_path))) + print("Number of prompts detected:", len(all_prompts_path)) + + for path in all_prompts_path: + print(f" {path.stem} loaded.") + prompt_name = path.stem.encode("utf-8") + history_prompt = np.load(path) + + fout.write(struct.pack("i", len(prompt_name))) + fout.write(prompt_name) + + for k in history_prompt.keys(): + arr = history_prompt[k] + n_dims = len(arr.shape) + encoded_k = 
k.encode("utf-8") + + fout.write(struct.pack("ii", n_dims, len(encoded_k))) + for i in range(n_dims): + fout.write(struct.pack("i", arr.shape[n_dims - 1 - i])) + fout.write(encoded_k) + + arr.tofile(fout) + print(f" {k} loaded.") + if __name__ == "__main__": args = parser.parse_args() @@ -284,4 +316,9 @@ def generate_vocab_file(dir_model, out_dir): parse_codec_model(codec_chkpt, out_dir / "ggml_weights_codec.bin") print(" Codec model loaded.") + if args.prompts_path: + prompts_path = Path(args.prompts_path) + generate_prompts_file(prompts_path, out_dir / "ggml_prompts.bin") + print(" Prompts loaded.") + print("Done.") From 9af3ede09bc1a650bff135351ae71f430b4329e6 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 14:02:53 +0200 Subject: [PATCH 2/8] enum for bark languages --- bark.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/bark.h b/bark.h index 3beae13..a58a471 100644 --- a/bark.h +++ b/bark.h @@ -1,3 +1,15 @@ +/* Bark is a text-to-speech model for realistic speech generation. +The model supports 13 languages that can be found in `bark_languages`. +Multiple preset voices (history prompts) are shipped with Bark, allowing the user to +generate the same speech with multiple different voices. + +You can try any combination of voices by using the following pattern: + _speaker_ + +where can be either "" or "v2" + can be the last two letters of any languages supported by bark + is an integer between 0 and 9 (inclusive). +*/ #pragma once #include "encodec.h" @@ -27,6 +39,22 @@ #define COARSE_SEMANTIC_PAD_TOKEN 12048 #define COARSE_INFER_TOKEN 12050 +enum bark_languages { + BARK_LANG_EN = 0, // English + BARK_LANG_DE = 1, // German + BARK_LANG_ES = 2, // Spanish + BARK_LANG_FR = 3, // French + BARK_LANG_HI = 4, // Hindi + BARK_LANG_IT = 5, // Italian + BARK_LANG_JA = 6, // Japanese + BARK_LANG_KO = 7, // Korean + BARK_LANG_PL = 8, // Polish + BARK_LANG_PT = 9, // Portuguese + BARK_LANG_RU = 10, // Russian + BARK_LANG_TR = 11, // Turkish + BARK_LANG_ZH = 12, // Chinese +}; + struct bark_params { int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); From 849727384f24587d92b80a1c74426b24dea41559 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 14:57:01 +0200 Subject: [PATCH 3/8] bark prompt load --- bark.cpp | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ bark.h | 21 ++++++++++ 2 files changed, 146 insertions(+) diff --git a/bark.cpp b/bark.cpp index c820086..be6ee91 100644 --- a/bark.cpp +++ b/bark.cpp @@ -228,6 +228,131 @@ bool bark_vocab_load(const std::string& fname, bark_vocab& vocab, int32_t expect return true; } +bool bark_prompt_load(const std::string & fname, bark_history_prompts & history_prompts) { + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: faield to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // upper bound on the ctx size needed to store all prompts (not very large) + size_t ctx_size = 10*MB; + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + history_prompts.ctx = ggml_init(params); + if (!history_prompts.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", 
__func__); + return false; + } + } + + auto & ctx = history_prompts.ctx; + + int32_t n_prompts; + read_safe(fin, n_prompts); + + std::string prompt_name; + std::vector tmp; + + tmp.reserve(128); + + for (int i = 0; i < n_prompts; i++) { + uint32_t len; + read_safe(fin, len); + + if (len > 0) { + tmp.resize(len); + fin.read(&tmp[0], tmp.size()); // read to buffer + prompt_name.assign(&tmp[0], tmp.size()); + } else { + fprintf(stderr, "%s: invalid prompt name\n", __func__); + } + + int64_t memsize = 0; + + struct ggml_tensor * semantic_prompt; + struct ggml_tensor * coarse_prompt; + struct ggml_tensor * fine_prompt; + + std::map prompt_tensors = { + { "semantic_prompt", semantic_prompt }, + { "coarse_prompt" , coarse_prompt }, + { "fine_prompt" , fine_prompt }, + }; + + int32_t n_keys; + read_safe(fin, n_keys); + + for (int k = 0; k < n_keys; k++) { + int32_t n_dims; + int32_t length; + + read_safe(fin, n_dims); + read_safe(fin, length); + + int64_t nelements = 1; + int64_t ne[4] = { 1, 1, 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + read_safe(fin, ne[i]); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if ((name != "semantic_prompt") && (name != "coarse_prompt") && (name != "fine_prompt")) { + fprintf(stderr, "%s: tensor '%s' has an unknown key: '%s'\n", __func__, prompt_name, name); + return false; + } + + const size_t bpe = ggml_type_size(GGML_TYPE_I32); + + auto & tensor = prompt_tensors[name]; + tensor = ggml_new_tensor(ctx, GGML_TYPE_I32, 4, ne); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + memsize += ggml_nbytes(tensor); + } + + struct bark_voice voice = { + /*.name =*/ prompt_name, + /*.semantic_prompt =*/ prompt_tensors["semantic_prompt"], + /*.coarse_prompt =*/ prompt_tensors["coarse_prompt"], + /*.fine_prompt =*/ prompt_tensors["fine_prompt"], + /*.memsize =*/ memsize, + }; + + history_prompts.voices[prompt_name] = &voice; + history_prompts.memsize += memsize; + } + + return true; +} + bool gpt_model_load(const std::string& fname, gpt_model& model) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { diff --git a/bark.h b/bark.h index a58a471..7051ddd 100644 --- a/bark.h +++ b/bark.h @@ -81,6 +81,24 @@ struct gpt_hparams { int32_t n_codes_given = 1; }; +struct bark_voice { + std::string name; + + struct ggml_tensor * semantic_prompt; + struct ggml_tensor * coarse_prompt; + struct ggml_tensor * fine_prompt; + + int64_t memsize; +}; + +struct bark_history_prompts { + struct ggml_context * ctx; + + std::map voices; + + int64_t memsize; +}; + struct bark_vocab { using id = int32_t; using token = std::string; @@ -159,6 +177,9 @@ struct bark_model { // vocab bark_vocab vocab; + // history prompts + bark_history_prompts history_prompts; + int64_t memsize = 0; }; From db24cffc879f6ab866f68d90dd3807050083263f Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 15:03:01 +0200 Subject: [PATCH 4/8] load history prompt --- bark.cpp | 23 +++++++++++++++++------ bark.h | 12 +++++++++--- examples/main.cpp | 2 +- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/bark.cpp b/bark.cpp index be6ee91..7183b5b 100644 --- a/bark.cpp +++ b/bark.cpp @@ -638,14 +638,14 @@ bool gpt_model_load(const std::string& fname, gpt_model& model) { 
return true; } -bool bark_model_load(const std::string & dirname, bark_model & model) { +bool bark_model_load(const std::string & dirname, bark_model & model, bool load_history_prompts) { printf("%s: loading model from '%s'\n", __func__, dirname.c_str()); // text { printf("%s: reading bark text model\n", __func__); const std::string fname = dirname + "/ggml_weights_text.bin"; - if(!gpt_model_load(fname, model.text_model)) { + if (!gpt_model_load(fname, model.text_model)) { fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); return false; } @@ -658,7 +658,7 @@ bool bark_model_load(const std::string & dirname, bark_model & model) { const std::string fname = dirname + "/ggml_vocab.bin"; const gpt_hparams hparams = model.text_model.hparams; const int32_t expected_size = hparams.n_in_vocab - hparams.n_out_vocab - 5; - if(!bark_vocab_load(fname, model.vocab, expected_size)) { + if (!bark_vocab_load(fname, model.vocab, expected_size)) { fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); return false; } @@ -668,7 +668,7 @@ bool bark_model_load(const std::string & dirname, bark_model & model) { { printf("\n%s: reading bark coarse model\n", __func__); const std::string fname = dirname + "/ggml_weights_coarse.bin"; - if(!gpt_model_load(fname, model.coarse_model)) { + if (!gpt_model_load(fname, model.coarse_model)) { fprintf(stderr, "%s: invalid model file '%s' (bad coarse)\n", __func__, fname.c_str()); return false; } @@ -679,7 +679,7 @@ bool bark_model_load(const std::string & dirname, bark_model & model) { { printf("\n%s: reading bark fine model\n", __func__); const std::string fname = dirname + "/ggml_weights_fine.bin"; - if(!gpt_model_load(fname, model.fine_model)) { + if (!gpt_model_load(fname, model.fine_model)) { fprintf(stderr, "%s: invalid model file '%s' (bad fine)\n", __func__, fname.c_str()); return false; } @@ -690,13 +690,24 @@ bool bark_model_load(const std::string & dirname, bark_model & model) { { printf("\n%s: reading bark codec model\n", __func__); const std::string fname = dirname + "/ggml_weights_codec.bin"; - if(!encodec_model_load(fname, model.codec_model)) { + if (!encodec_model_load(fname, model.codec_model)) { fprintf(stderr, "%s: invalid model file '%s' (bad codec)\n", __func__, fname.c_str()); return false; } model.memsize += model.codec_model.memsize; } + // history prompts + if (load_history_prompts) { + printf("\n%s: reading history prompts\n", __func__); + const std::string fname = dirname + "/ggml_prompts.bin"; + if (!bark_prompt_load(fname, model.history_prompts)) { + fprintf(stderr, "%s: invalid prompt file '%s'\n", __func__, fname.c_str()); + return false; + } + model.memsize += model.history_prompts.memsize; + } + printf("\n%s: total model size = %8.2f MB\n", __func__, model.memsize/1024.0/1024.0); return true; diff --git a/bark.h b/bark.h index 7051ddd..d6dbdac 100644 --- a/bark.h +++ b/bark.h @@ -208,9 +208,15 @@ bark_vocab::id gpt_sample( float temp, float * eos_p); -bool bark_model_load(const std::string & dirname, bark_model & model); - -bool bark_vocab_load(const std::string & fname, bark_vocab& vocab, int32_t expected_size); +bool bark_model_load( + const std::string & dirname, + bark_model & model, + bool load_history_prompts); + +bool bark_vocab_load( + const std::string & fname, + bark_vocab & vocab, + int32_t expected_size); void bert_tokenize( const bark_vocab & vocab, diff --git a/examples/main.cpp b/examples/main.cpp index 3abdef3..b5b9cff 100644 --- a/examples/main.cpp +++ 
b/examples/main.cpp @@ -26,7 +26,7 @@ int main(int argc, char **argv) { { const int64_t t_start_us = ggml_time_us(); - if(!bark_model_load(fname, model)) { + if(!bark_model_load(fname, model, false)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, fname.c_str()); return 1; } From aad66efefe6150a312e24fecb8d92a2fd4c06b78 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 15:10:29 +0200 Subject: [PATCH 5/8] add custom voice to API --- bark.cpp | 8 ++++++-- bark.h | 1 + examples/main.cpp | 7 ++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/bark.cpp b/bark.cpp index 7183b5b..631e8d4 100644 --- a/bark.cpp +++ b/bark.cpp @@ -1959,6 +1959,8 @@ bool bark_params_parse(int argc, char ** argv, bark_params & params) { params.seed = std::stoi(argv[++i]); } else if (arg == "-o" || arg == "--outwav") { params.dest_wav_path = argv[++i]; + } else if (arg == "-v" || arg == "--voice") { + params.voice = argv[++i]; } else if (arg == "-h" || arg == "--help") { bark_print_usage(argv, params); exit(0); @@ -1981,9 +1983,11 @@ void bark_print_usage(char ** argv, const bark_params & params) { fprintf(stderr, " -s N, --seed N seed for random number generator (default: %d)\n", params.seed); fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, " -o FNAME, --outwav FNAME\n"); + fprintf(stderr, " -o FNAME, --outwav FNAME\n"); fprintf(stderr, " output generated wav (default: %s)\n", params.dest_wav_path.c_str()); + fprintf(stderr, " -v VOICE, --voice VOICE\n"); + fprintf(stderr, " custom voice (default: none)\n", params.voice.c_str()); fprintf(stderr, "\n"); } diff --git a/bark.h b/bark.h index d6dbdac..90af8b3 100644 --- a/bark.h +++ b/bark.h @@ -63,6 +63,7 @@ struct bark_params { int32_t seed = 0; std::string prompt; // user prompt + std::string voice; // custom voice (history prompts) std::string dest_wav_path = "./output.wav"; }; diff --git a/examples/main.cpp b/examples/main.cpp index b5b9cff..f4d2e41 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -17,16 +17,21 @@ int main(int argc, char **argv) { bark_model model; std::string fname = "./ggml_weights"; + bool load_history_prompts = false; if (!params.model.empty()) { fname = params.model; } + if (!params.voice.empty()) { + load_history_prompts = true; + } + // load the model { const int64_t t_start_us = ggml_time_us(); - if(!bark_model_load(fname, model, false)) { + if(!bark_model_load(fname, model, load_history_prompts)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, fname.c_str()); return 1; } From 922b929333208551dbe2b80b7b105e48cbbfbd46 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 15:23:09 +0200 Subject: [PATCH 6/8] pass args to text 3 forward pass funcs --- bark.cpp | 21 ++++++++++----------- bark.h | 8 ++++++-- examples/main.cpp | 4 +++- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/bark.cpp b/bark.cpp index 631e8d4..964a5c1 100644 --- a/bark.cpp +++ b/bark.cpp @@ -1481,6 +1481,7 @@ bark_sequence bark_tokenize_input(const char * text, const bark_vocab & vocab, i bark_sequence bark_forward_text_encoder( const bark_sequence & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp, @@ -1547,6 +1548,7 @@ 
bark_sequence bark_forward_text_encoder( bark_codes bark_forward_coarse_encoder( const bark_sequence & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp, @@ -1670,6 +1672,7 @@ bark_codes bark_forward_coarse_encoder( bark_codes bark_forward_fine_encoder( const bark_codes & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp) { @@ -1891,16 +1894,12 @@ int write_wav_on_disk(audio_arr_t& audio_arr, std::string dest_path) { bool bark_generate_audio( bark_model model, - const bark_vocab& vocab, + const bark_vocab & vocab, const char * text, const int n_threads, const int32_t seed, - const std::string& dest_wav_path) { - // TODO move into params - // const int top_k = 10; - // const int seed = 0; - - // const float top_p = 0.2; + const std::string & dest_wav_path, + const std::string & voice) { const float temp = 0.7; const float fine_temp = 0.5; @@ -1912,7 +1911,7 @@ bool bark_generate_audio( std::mt19937 rng(seed); // tokenize input (bert tokenizer) - int32_t block_size = model.text_model.hparams.block_size; + int32_t block_size = model.text_model.hparams.block_size; bark_sequence tokens = bark_tokenize_input(text, vocab, block_size); printf("%s: prompt: '%s'\n", __func__, text); @@ -1924,15 +1923,15 @@ bool bark_generate_audio( printf("\n"); bark_sequence semantic_tokens = bark_forward_text_encoder( - tokens, model.text_model, rng, n_threads, temp, min_eos_p); + tokens, model.text_model, voice, rng, n_threads, temp, min_eos_p); printf("\n"); bark_codes coarse_tokens = bark_forward_coarse_encoder( - semantic_tokens, model.coarse_model, rng, n_threads, temp, max_coarse_history, sliding_window_size); + semantic_tokens, model.coarse_model, voice, rng, n_threads, temp, max_coarse_history, sliding_window_size); printf("\n"); bark_codes fine_tokens = bark_forward_fine_encoder( - coarse_tokens, model.fine_model, rng, n_threads, fine_temp); + coarse_tokens, model.fine_model, voice, rng, n_threads, fine_temp); printf("\n"); audio_arr_t audio_arr = bark_forward_encodec(fine_tokens, model.codec_model); diff --git a/bark.h b/bark.h index 90af8b3..6d15e0d 100644 --- a/bark.h +++ b/bark.h @@ -228,15 +228,17 @@ void bert_tokenize( bool bark_generate_audio( bark_model model, - const bark_vocab& vocab, + const bark_vocab & vocab, const char * text, const int n_threads, const int32_t seed, - const std::string& dest_wav_path); + const std::string & dest_wav_path, + const std::string & voice); bark_sequence bark_forward_text_encoder( const bark_sequence & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp, @@ -245,6 +247,7 @@ bark_sequence bark_forward_text_encoder( bark_codes bark_forward_coarse_encoder( const bark_sequence & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp, @@ -254,6 +257,7 @@ bark_codes bark_forward_coarse_encoder( bark_codes bark_forward_fine_encoder( const bark_codes & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp); diff --git a/examples/main.cpp b/examples/main.cpp index f4d2e41..805a3f3 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -47,7 +47,9 @@ int main(int argc, char **argv) { } const int64_t t_eval_us_start = ggml_time_us(); - bark_generate_audio(model, model.vocab, prompt.data(), params.n_threads, params.seed, params.dest_wav_path); + 
bark_generate_audio( + model, model.vocab, prompt.data(), params.n_threads, params.seed, + params.dest_wav_path, params.voice); t_eval_us = ggml_time_us() - t_eval_us_start; // report timing From c4753ce1200997cb10527d0aa91fe2c02e0750a6 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sat, 19 Aug 2023 20:14:48 +0200 Subject: [PATCH 7/8] semantic tokens --- bark.cpp | 77 +++++++++++++++++++++++++++++++++++++++++++++----------- bark.h | 7 ++++-- 2 files changed, 68 insertions(+), 16 deletions(-) diff --git a/bark.cpp b/bark.cpp index 964a5c1..1424c89 100644 --- a/bark.cpp +++ b/bark.cpp @@ -318,7 +318,7 @@ bool bark_prompt_load(const std::string & fname, bark_history_prompts & history_ fin.read(&name[0], length); if ((name != "semantic_prompt") && (name != "coarse_prompt") && (name != "fine_prompt")) { - fprintf(stderr, "%s: tensor '%s' has an unknown key: '%s'\n", __func__, prompt_name, name); + fprintf(stderr, "%s: tensor '%s' has an unknown key: '%s'\n", __func__, prompt_name.c_str(), name.c_str()); return false; } @@ -1468,20 +1468,65 @@ bark_sequence bark_tokenize_input(const char * text, const bark_vocab & vocab, i tokens.resize(max_ctx_size); - // semantic history - for (int i = 0; i < 256; i++) - tokens.push_back(SEMANTIC_PAD_TOKEN); - tokens.push_back(SEMANTIC_INFER_TOKEN); + return tokens; +} - assert(tokens.size() == 256 + 256 + 1); +int bark_get_input_sequence( + struct bark_history_prompts * history_prompts, + std::vector & tokens, + std::vector & out, + const std::string & voice) { + BARK_ASSERT(tokens.size() == 256); - return tokens; + out.resize(513); + + struct bark_voice * history_prompt = nullptr; + if (!voice.empty()) { + if (history_prompts->voices.find(voice) != history_prompts->voices.end()) { + history_prompt = history_prompts->voices[voice]; + } else { + fprintf(stderr, "Could not find voice '%s'\n", voice.c_str()); + return false; + } + } + + auto & ctx = history_prompts->ctx; + struct ggml_cgraph gf = {}; + + struct ggml_tensor * semantic_history = nullptr; + if (history_prompt) { + semantic_history = history_prompt->semantic_prompt; + if (semantic_history->ne[0] >= 256) { + size_t offset = (semantic_history->ne[0] - 256) * semantic_history->nb[0]; + semantic_history = ggml_view_1d(ctx, semantic_history, 256, offset); + } else { + // constant padding + struct ggml_tensor * out = ggml_new_tensor_1d(ctx, semantic_history->type, 256); + out = ggml_set_f32(out, SEMANTIC_PAD_TOKEN); + semantic_history = ggml_set_1d(ctx, out, semantic_history, 0); + } + } else { + semantic_history = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 256); + semantic_history = ggml_set_i32(semantic_history, SEMANTIC_PAD_TOKEN); + } + + // concatenate tokens, semantic_history and [SEMANTIC_INFER_TOKEN] + struct ggml_tensor * input = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 513); + memcpy(input->data, tokens.data(), tokens.size()*sizeof(int32_t)); + input = ggml_set_1d(ctx, input, semantic_history, tokens.size()*sizeof(int32_t)); + *((float *) ((char *) ggml_get_data(input) + 512*input->nb[0])) = SEMANTIC_INFER_TOKEN; + + ggml_build_forward_expand(&gf, input); + ggml_graph_compute_with_ctx(ctx, &gf, 1); + + memcpy(out.data(), input->data, 513*sizeof(int32_t)); } bark_sequence bark_forward_text_encoder( - const bark_sequence & tokens, - const gpt_model model, + bark_sequence & tokens, + struct bark_history_prompts * history_prompts, const std::string & voice, + const gpt_model model, std::mt19937 & rng, const int n_threads, const float temp, @@ -1499,7 +1544,10 @@ bark_sequence 
bark_forward_text_encoder( float eos_p = 0; - bark_sequence input = tokens; + // build input token sequence + bark_sequence input; + bark_get_input_sequence(history_prompts, tokens, input, voice); + std::vector logits; // dry run to estimate mem_per_token @@ -1923,15 +1971,16 @@ bool bark_generate_audio( printf("\n"); bark_sequence semantic_tokens = bark_forward_text_encoder( - tokens, model.text_model, voice, rng, n_threads, temp, min_eos_p); + tokens, &model.history_prompts, voice, model.text_model, rng, n_threads, temp, min_eos_p); printf("\n"); bark_codes coarse_tokens = bark_forward_coarse_encoder( - semantic_tokens, model.coarse_model, voice, rng, n_threads, temp, max_coarse_history, sliding_window_size); + semantic_tokens, history_prompt, model.coarse_model, rng, n_threads, temp, + max_coarse_history, sliding_window_size); printf("\n"); bark_codes fine_tokens = bark_forward_fine_encoder( - coarse_tokens, model.fine_model, voice, rng, n_threads, fine_temp); + coarse_tokens, history_prompt, model.fine_model, rng, n_threads, fine_temp); printf("\n"); audio_arr_t audio_arr = bark_forward_encodec(fine_tokens, model.codec_model); @@ -1987,6 +2036,6 @@ void bark_print_usage(char ** argv, const bark_params & params) { fprintf(stderr, " -o FNAME, --outwav FNAME\n"); fprintf(stderr, " output generated wav (default: %s)\n", params.dest_wav_path.c_str()); fprintf(stderr, " -v VOICE, --voice VOICE\n"); - fprintf(stderr, " custom voice (default: none)\n", params.voice.c_str()); + fprintf(stderr, " custom voice (default: none)\n"); fprintf(stderr, "\n"); } diff --git a/bark.h b/bark.h index 6d15e0d..dc845ec 100644 --- a/bark.h +++ b/bark.h @@ -236,9 +236,10 @@ bool bark_generate_audio( const std::string & voice); bark_sequence bark_forward_text_encoder( - const bark_sequence & tokens, - const gpt_model model, + bark_sequence & tokens, + struct bark_history_prompts * history_prompts, const std::string & voice, + const gpt_model model, std::mt19937 & rng, const int n_threads, const float temp, @@ -246,6 +247,7 @@ bark_sequence bark_forward_text_encoder( bark_codes bark_forward_coarse_encoder( const bark_sequence & tokens, + struct bark_voice * history_prompt, const gpt_model model, const std::string & voice, std::mt19937 & rng, @@ -256,6 +258,7 @@ bark_codes bark_forward_coarse_encoder( bark_codes bark_forward_fine_encoder( const bark_codes & tokens, + struct bark_voice * history_prompt, const gpt_model model, const std::string & voice, std::mt19937 & rng, From ba2fccd09ea9eb733e34b2a8129f35bed4de0f48 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sat, 19 Aug 2023 22:42:02 +0200 Subject: [PATCH 8/8] custom voices coarse --- bark.cpp | 192 ++++++++++++++++++++++++++++++++++++++++--------------- bark.h | 6 +- 2 files changed, 142 insertions(+), 56 deletions(-) diff --git a/bark.cpp b/bark.cpp index 1424c89..7f2f55c 100644 --- a/bark.cpp +++ b/bark.cpp @@ -1471,12 +1471,12 @@ bark_sequence bark_tokenize_input(const char * text, const bark_vocab & vocab, i return tokens; } -int bark_get_input_sequence( +int bark_get_semantic_input_sequence( struct bark_history_prompts * history_prompts, - std::vector & tokens, - std::vector & out, + const bark_sequence & semantic_tokens, + bark_sequence & out, const std::string & voice) { - BARK_ASSERT(tokens.size() == 256); + BARK_ASSERT(semantic_tokens.size() == 256); out.resize(513); @@ -1512,18 +1512,22 @@ int bark_get_input_sequence( // concatenate tokens, semantic_history and [SEMANTIC_INFER_TOKEN] struct ggml_tensor * input = ggml_new_tensor_1d(ctx, 
GGML_TYPE_I32, 513); - memcpy(input->data, tokens.data(), tokens.size()*sizeof(int32_t)); - input = ggml_set_1d(ctx, input, semantic_history, tokens.size()*sizeof(int32_t)); + memcpy(input->data, semantic_tokens.data(), semantic_tokens.size()*sizeof(int32_t)); + input = ggml_set_1d(ctx, input, semantic_history, semantic_tokens.size()*sizeof(int32_t)); *((float *) ((char *) ggml_get_data(input) + 512*input->nb[0])) = SEMANTIC_INFER_TOKEN; ggml_build_forward_expand(&gf, input); ggml_graph_compute_with_ctx(ctx, &gf, 1); memcpy(out.data(), input->data, 513*sizeof(int32_t)); + + ggml_free(ctx); + + return 0; } bark_sequence bark_forward_text_encoder( - bark_sequence & tokens, + const bark_sequence & tokens, struct bark_history_prompts * history_prompts, const std::string & voice, const gpt_model model, @@ -1546,7 +1550,7 @@ bark_sequence bark_forward_text_encoder( // build input token sequence bark_sequence input; - bark_get_input_sequence(history_prompts, tokens, input, voice); + bark_get_semantic_input_sequence(history_prompts, tokens, input, voice); std::vector logits; @@ -1593,17 +1597,79 @@ bark_sequence bark_forward_text_encoder( return out; } +int bark_get_coarse_input_sequence( + struct bark_history_prompts * history_prompts, + const bark_sequence & tokens, + const std::string & voice, + bark_sequence & out_semantic, + bark_sequence & out_semantic_history, + bark_sequence & out_coarse_history, + int max_semantic_history, + float semantic_to_coarse_ratio) { + struct bark_voice * history_prompt = nullptr; + if (!voice.empty()) { + if (history_prompts->voices.find(voice) != history_prompts->voices.end()) { + history_prompt = history_prompts->voices[voice]; + } else { + fprintf(stderr, "Could not find voice '%s'\n", voice.c_str()); + return 1; + } + } + + auto & ctx = history_prompts->ctx; + + struct ggml_tensor * x_semantic_history = history_prompt->semantic_prompt; + struct ggml_tensor * x_coarse_history = history_prompt->coarse_prompt; + + // TODO: offset CODEBOOK_SIZE + + struct ggml_tensor * flattened_history = ggml_cpy(ctx, + x_coarse_history, + ggml_new_tensor_1d(ctx, GGML_TYPE_I32, x_coarse_history->ne[0]*x_coarse_history->ne[1])); + + struct ggml_tensor * offset = ggml_new_i32(ctx, SEMANTIC_VOCAB_SIZE); + flattened_history = ggml_add(ctx, flattened_history, ggml_repeat(ctx, offset, flattened_history)); + + int n_semantic_hist_provided = std::min( + max_semantic_history, + std::min( + (int) (x_semantic_history->ne[0] - (x_semantic_history->ne[0] % 2)), + (int) floorf(flattened_history->ne[0] / semantic_to_coarse_ratio) + ) + ); + int n_coarse_hist_provided = (int) roundf(n_semantic_hist_provided * semantic_to_coarse_ratio); + + out_semantic_history.resize(n_semantic_hist_provided); + int Ns = x_semantic_history->ne[0]; + memcpy( + out_semantic_history.data(), + (char *) x_semantic_history + (Ns - n_semantic_hist_provided), + n_semantic_hist_provided*sizeof(int32_t)); + + out_coarse_history.resize(n_coarse_hist_provided); + int Nc = flattened_history->ne[0]; + memcpy( + out_coarse_history.data(), + (char *) flattened_history + (Nc - n_coarse_hist_provided), + n_coarse_hist_provided*sizeof(int32_t)); + + return 0; +} + bark_codes bark_forward_coarse_encoder( - const bark_sequence & tokens, - const gpt_model model, + const bark_sequence & semantic_tokens, + struct bark_history_prompts * history_prompts, const std::string & voice, + const gpt_model model, std::mt19937 & rng, const int n_threads, const float temp, const int max_coarse_history, const int sliding_window_size) { - 
bark_codes out_coarse; - bark_sequence out; + + BARK_ASSERT(semantic_tokens.size() > 0); + BARK_ASSERT((max_coarse_history >= 60) && (max_coarse_history <= 630)); + BARK_ASSERT(max_coarse_history + sliding_window_size <= 1024 - 256); bark_progress progress; progress.func = __func__; @@ -1616,15 +1682,22 @@ bark_codes bark_forward_coarse_encoder( float semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS; int max_semantic_history = floorf(max_coarse_history / semantic_to_coarse_ratio); - int n_steps = floorf(tokens.size() * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS) * N_COARSE_CODEBOOKS; - int step_ix = 0; + bark_sequence x_semantic; + bark_sequence x_semantic_history; + bark_sequence x_coarse_history; + + bark_get_coarse_input_sequence(history_prompts, semantic_tokens, voice, x_semantic, + x_semantic_history, x_coarse_history); + int n_steps = floorf(semantic_tokens.size() * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS) * N_COARSE_CODEBOOKS; BARK_ASSERT(n_steps > 0); BARK_ASSERT(n_steps % N_COARSE_CODEBOOKS == 0); - int n_window_steps = ceilf(static_cast(n_steps) / sliding_window_size); + // concatenate x_semantic_history and x_semantic + x_semantic.insert(x_semantic.begin(), x_semantic_history.begin(), x_semantic_history.end()); + + bark_sequence x_coarse = x_coarse_history; - bark_sequence input = tokens; std::vector logits; // dry run to estimate mem_per_token @@ -1634,76 +1707,89 @@ bark_codes bark_forward_coarse_encoder( gpt_eval(model, n_threads, &n_past, false, { 0, 1, 2, 3 }, logits, mem_per_token); } + int base_semantic_idx = x_semantic_history.size(); + + bark_sequence x_semantic_in = x_semantic; + bark_sequence x_coarse_in = x_coarse; + + int n_window_steps = ceilf(static_cast(n_steps) / sliding_window_size); + int n_step = 0; + for (int i = 0; i < n_window_steps; i++) { - int semantic_ix = roundf(n_steps / semantic_to_coarse_ratio); + int semantic_idx = base_semantic_idx + roundf(n_steps / semantic_to_coarse_ratio); - bark_sequence input_in( - input.begin() + std::max(semantic_ix-max_semantic_history, 0), - input.end() + bark_sequence x_in( + x_semantic_in.begin() + std::max(semantic_idx - max_semantic_history, 0), + x_semantic_in.end() ); - size_t original_size = input_in.size(); - input_in.resize(256); + size_t original_size = x_in.size(); + x_in.resize(256); // padding from the right side - for (int ix = original_size; ix < 256; ix++) - input_in[ix] = COARSE_SEMANTIC_PAD_TOKEN; + for (int i = original_size; i < 256; i++) + x_in[i] = COARSE_SEMANTIC_PAD_TOKEN; - input_in.push_back(COARSE_INFER_TOKEN); + x_in.push_back(COARSE_INFER_TOKEN); // concatenate input_in and input_coarse - input_in.insert( - input_in.end(), - std::make_move_iterator(out.end() - std::min(max_coarse_history, (int) out.size())), - std::make_move_iterator(out.end()) + x_in.insert( + x_in.end(), + x_coarse_in.end() - std::min(max_coarse_history, (int) x_coarse_in.size()), + x_coarse_in.end() ); int n_past = 0; mem_per_token *= 1.1; // context length is growing, mem_per_token must grow as well for (int j = 0; j < sliding_window_size; j++) { - if (step_ix >= n_steps) + if (n_step >= n_steps) continue; + bool is_major = n_step % N_COARSE_CODEBOOKS == 0; + int64_t t_predict_start_us = ggml_time_us(); - gpt_eval(model, n_threads, &n_past, false, input_in, logits, mem_per_token); + gpt_eval(model, n_threads, &n_past, false, x_in, logits, mem_per_token); t_predict_us += (ggml_time_us() - t_predict_start_us); - input_in.clear(); + x_in.clear(); - bool is_major = step_ix % 
N_COARSE_CODEBOOKS == 0; - int start_ix = SEMANTIC_VOCAB_SIZE + (1 - is_major) * CODEBOOK_SIZE; - int end_ix = SEMANTIC_VOCAB_SIZE + (2 - is_major) * CODEBOOK_SIZE; - std::vector relevant_logits(logits.begin() + start_ix, logits.begin() + end_ix); + int logit_start_ix = SEMANTIC_VOCAB_SIZE + (1 - is_major) * CODEBOOK_SIZE; + int logit_end_ix = SEMANTIC_VOCAB_SIZE + (2 - is_major) * CODEBOOK_SIZE; + std::vector relevant_logits( + logits.begin() + logit_start_ix, + logits.begin() + logit_end_ix + ); int64_t t_sample_start_us = ggml_time_us(); - bark_vocab::id next = gpt_sample(relevant_logits, rng, temp, NULL); + bark_vocab::id item_next = gpt_sample(relevant_logits, rng, temp, NULL); t_sample_us += (ggml_time_us() - t_sample_start_us); - next += start_ix; + item_next += logit_start_ix; - input_in.push_back(next); - out.push_back(next); - - // printf("%d ", next); - // fflush(stdout); - - step_ix += 1; + x_in.push_back(item_next); + x_coarse_in.push_back(item_next); + n_step += 1; progress.callback((float) (i*sliding_window_size+j)/n_steps); } } - BARK_ASSERT((int) out.size() == n_steps); - BARK_ASSERT(out.size() % N_COARSE_CODEBOOKS == 0); + size_t history_size = x_coarse_history.size(); + x_coarse_in.erase(x_coarse_in.begin(), x_coarse_in.begin() + history_size); + + BARK_ASSERT((int) x_coarse_in.size() == n_steps); + BARK_ASSERT(x_coarse_in.size() % N_COARSE_CODEBOOKS == 0); // out_coarse: [seq_length, n_codes] - for (int i = 0; i < (int) out.size(); i += N_COARSE_CODEBOOKS) { + bark_codes coarse_audio_arr; + + for (int i = 0; i < (int) x_coarse_in.size(); i += N_COARSE_CODEBOOKS) { // this assumes N_COARSE_CODEBOOKS = 2 bark_sequence _tmp = { - out[i] - SEMANTIC_VOCAB_SIZE, - out[i+1] - SEMANTIC_VOCAB_SIZE - CODEBOOK_SIZE + x_coarse_in[i] - SEMANTIC_VOCAB_SIZE, + x_coarse_in[i+1] - SEMANTIC_VOCAB_SIZE - CODEBOOK_SIZE }; - out_coarse.push_back(_tmp); + coarse_audio_arr.push_back(_tmp); } const int64_t t_main_end_us = ggml_time_us(); @@ -1711,10 +1797,10 @@ bark_codes bark_forward_coarse_encoder( printf("\n\n"); printf("%s: mem per token = %8.2f MB\n", __func__, mem_per_token/1000.0f/1000.0f); printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/step_ix); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_step); printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - return out_coarse; + return coarse_audio_arr; } bark_codes bark_forward_fine_encoder( diff --git a/bark.h b/bark.h index dc845ec..e41623f 100644 --- a/bark.h +++ b/bark.h @@ -236,7 +236,7 @@ bool bark_generate_audio( const std::string & voice); bark_sequence bark_forward_text_encoder( - bark_sequence & tokens, + const bark_sequence & tokens, struct bark_history_prompts * history_prompts, const std::string & voice, const gpt_model model, @@ -247,9 +247,9 @@ bark_sequence bark_forward_text_encoder( bark_codes bark_forward_coarse_encoder( const bark_sequence & tokens, - struct bark_voice * history_prompt, - const gpt_model model, + struct bark_history_prompts * history_prompt, const std::string & voice, + const gpt_model model, std::mt19937 & rng, const int n_threads, const float temp,
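
---

A note on the new on-disk format introduced by this series: PATCH 1/8 teaches convert.py to pack every `*_speaker_*.npz` history prompt into a single `ggml_prompts.bin` (ggml magic, prompt count, then per prompt a length-prefixed name followed by its arrays, each preceded by an n_dims/key-length header, the shape written innermost-first, the key bytes, and the raw array data), and PATCH 3/8 adds `bark_prompt_load` to read that file back into ggml tensors. Below is a minimal Python sketch of a reader for the layout produced by `generate_prompts_file`; the function name `read_prompts` is illustrative only, and the sketch assumes each voice carries exactly the three arrays `semantic_prompt`, `coarse_prompt` and `fine_prompt`, stored as int32 (which is how `bark_prompt_load` interprets the data) — actual prompt files may differ.

```python
# Hypothetical reader for the ggml_prompts.bin layout written by
# generate_prompts_file in PATCH 1/8. Assumptions (not guaranteed by the
# patches): three arrays per prompt, int32 element type.
import struct
import numpy as np

def read_prompts(path: str) -> dict:
    prompts = {}
    with open(path, "rb") as fin:
        magic, n_prompts = struct.unpack("ii", fin.read(8))
        assert magic == 0x67676d6c, "not a ggml file"

        for _ in range(n_prompts):
            (name_len,) = struct.unpack("i", fin.read(4))
            name = fin.read(name_len).decode("utf-8")

            # each bark voice is expected to hold three arrays:
            # semantic_prompt, coarse_prompt, fine_prompt
            arrays = {}
            for _ in range(3):
                n_dims, key_len = struct.unpack("ii", fin.read(8))
                # dims were written innermost-first; reverse to get the shape back
                shape = struct.unpack(f"{n_dims}i", fin.read(4 * n_dims))[::-1]
                key = fin.read(key_len).decode("utf-8")
                count = int(np.prod(shape))
                data = np.fromfile(fin, dtype=np.int32, count=count)
                arrays[key] = data.reshape(shape)

            prompts[name] = arrays
    return prompts
```

Once `ggml_prompts.bin` has been generated, a voice can be selected at generation time with the new `-v`/`--voice` flag added in PATCH 5/8 (for example `-v en_speaker_0`, assuming the stock Bark prompt names), which makes examples/main.cpp pass `load_history_prompts = true` to `bark_model_load` so the prompt file is only read when a custom voice is actually requested.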