From 999d45feb4b3f8fc9ae9552c395c493357ee9032 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 14:02:42 +0200 Subject: [PATCH 1/8] convert parse prompts --- convert.py | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/convert.py b/convert.py index 6162aae..6baef79 100644 --- a/convert.py +++ b/convert.py @@ -28,6 +28,7 @@ --codec-path ~/Documents/encodec.cpp/ggml_weights \ --vocab-path ./ggml_weights/ \ --out-dir ./ggml_weights/ \ + --prompts-path ~/Documents/bark/bark/assets/prompts/ \ --use-f16 ``` """ @@ -45,10 +46,11 @@ parser.add_argument("--codec-path", type=str, required=True) parser.add_argument("--vocab-path", type=str, required=True) parser.add_argument("--out-dir", type=str, required=True) +parser.add_argument("--prompts-path", type=str, required=False) parser.add_argument("--use-f16", action="store_true") -def parse_codec_model(checkpoint, out_dir): +def parse_codec_model(checkpoint: dict, out_dir: Path): """Load encodec model checkpoint.""" outfile = open(out_dir, "wb") outfile.write(struct.pack("i", 0x67676d6c)) # ggml magic @@ -106,7 +108,7 @@ def parse_codec_model(checkpoint, out_dir): outfile.close() -def parse_hparams(hparams, outfile, use_f16): +def parse_hparams(hparams: dict[str, any], outfile: Path, use_f16: bool): """Parse GPT hyperparameters.""" outfile.write(struct.pack("i", hparams["n_layer"])) outfile.write(struct.pack("i", hparams["n_head"])) @@ -127,12 +129,12 @@ def parse_hparams(hparams, outfile, use_f16): n_wtes = hparams["n_codes_total"] except KeyError: n_lm_heads, n_wtes = 1, 1 - + ftype = int(use_f16) outfile.write(struct.pack("iii", n_lm_heads, n_wtes, ftype)) -def parse_text_models(checkpoint, outfile, use_f16): +def parse_text_models(checkpoint: dict, outfile: Path, use_f16: bool): """Load GPT model checkpoint (text, fine, coarse).""" for name in checkpoint.keys(): var_data = checkpoint[name].squeeze().numpy() @@ -233,7 +235,7 @@ def parse_text_models(checkpoint, outfile, use_f16): var_data.tofile(outfile) -def generate_file(in_file, out_dir, use_f16): +def generate_file(in_file: Path, out_dir: Path, use_f16: bool): with open(out_dir, "wb") as fout: fout.write(struct.pack("i", 0x67676d6c)) # ggml magic @@ -241,7 +243,7 @@ def generate_file(in_file, out_dir, use_f16): parse_hparams(checkpoint["model_args"], fout, use_f16) parse_text_models(checkpoint["model"], fout, use_f16) -def generate_vocab_file(dir_model, out_dir): +def generate_vocab_file(dir_model: Path, out_dir: Path): """Parse vocabulary.""" # Even if bark relies on GPT to encode text, it uses BertTokenizer (WordPiece) with open(dir_model / "vocab.txt", "r", encoding="utf-8") as fin: @@ -257,6 +259,36 @@ def generate_vocab_file(dir_model, out_dir): fout.write(struct.pack("i", len(data))) fout.write(data) +def generate_prompts_file(dir_model: Path, out_dir: Path): + """Parse history prompts (custom voices).""" + all_prompts_path = list(dir_model.glob("**/*_speaker_*.npz")) + + with open(out_dir, "wb") as fout: + fout.write(struct.pack("i", 0x67676d6c)) # ggml magic + fout.write(struct.pack("i", len(all_prompts_path))) + print("Number of prompts detected:", len(all_prompts_path)) + + for path in all_prompts_path: + print(f" {path.stem} loaded.") + prompt_name = path.stem.encode("utf-8") + history_prompt = np.load(path) + + fout.write(struct.pack("i", len(prompt_name))) + fout.write(prompt_name) + + for k in history_prompt.keys(): + arr = history_prompt[k] + n_dims = len(arr.shape) + encoded_k = 
k.encode("utf-8") + + fout.write(struct.pack("ii", n_dims, len(encoded_k))) + for i in range(n_dims): + fout.write(struct.pack("i", arr.shape[n_dims - 1 - i])) + fout.write(encoded_k) + + arr.tofile(fout) + print(f" {k} loaded.") + if __name__ == "__main__": args = parser.parse_args() @@ -284,4 +316,9 @@ def generate_vocab_file(dir_model, out_dir): parse_codec_model(codec_chkpt, out_dir / "ggml_weights_codec.bin") print(" Codec model loaded.") + if args.prompts_path: + prompts_path = Path(args.prompts_path) + generate_prompts_file(prompts_path, out_dir / "ggml_prompts.bin") + print(" Prompts loaded.") + print("Done.") From 9af3ede09bc1a650bff135351ae71f430b4329e6 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 14:02:53 +0200 Subject: [PATCH 2/8] enum for bark languages --- bark.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/bark.h b/bark.h index 3beae13..a58a471 100644 --- a/bark.h +++ b/bark.h @@ -1,3 +1,15 @@ +/* Bark is a text-to-speech model for realistic speech generation. +The model supports 13 languages that can be found in `bark_languages`. +Multiple preset voices (history prompts) are shipped with Bark, allowing the user to +generate the same speech with multiple different voices. + +You can try any combination of voices by using the following pattern: + _speaker_ + +where can be either "" or "v2" + can be the last two letters of any languages supported by bark + is an integer between 0 and 9 (inclusive). +*/ #pragma once #include "encodec.h" @@ -27,6 +39,22 @@ #define COARSE_SEMANTIC_PAD_TOKEN 12048 #define COARSE_INFER_TOKEN 12050 +enum bark_languages { + BARK_LANG_EN = 0, // English + BARK_LANG_DE = 1, // German + BARK_LANG_ES = 2, // Spanish + BARK_LANG_FR = 3, // French + BARK_LANG_HI = 4, // Hindi + BARK_LANG_IT = 5, // Italian + BARK_LANG_JA = 6, // Japanese + BARK_LANG_KO = 7, // Korean + BARK_LANG_PL = 8, // Polish + BARK_LANG_PT = 9, // Portuguese + BARK_LANG_RU = 10, // Russian + BARK_LANG_TR = 11, // Turkish + BARK_LANG_ZH = 12, // Chinese +}; + struct bark_params { int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); From 849727384f24587d92b80a1c74426b24dea41559 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 14:57:01 +0200 Subject: [PATCH 3/8] bark prompt load --- bark.cpp | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ bark.h | 21 ++++++++++ 2 files changed, 146 insertions(+) diff --git a/bark.cpp b/bark.cpp index c820086..be6ee91 100644 --- a/bark.cpp +++ b/bark.cpp @@ -228,6 +228,131 @@ bool bark_vocab_load(const std::string& fname, bark_vocab& vocab, int32_t expect return true; } +bool bark_prompt_load(const std::string & fname, bark_history_prompts & history_prompts) { + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: faield to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // upper bound on the ctx size needed to store all prompts (not very large) + size_t ctx_size = 10*MB; + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + + history_prompts.ctx = ggml_init(params); + if (!history_prompts.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", 
__func__); + return false; + } + } + + auto & ctx = history_prompts.ctx; + + int32_t n_prompts; + read_safe(fin, n_prompts); + + std::string prompt_name; + std::vector tmp; + + tmp.reserve(128); + + for (int i = 0; i < n_prompts; i++) { + uint32_t len; + read_safe(fin, len); + + if (len > 0) { + tmp.resize(len); + fin.read(&tmp[0], tmp.size()); // read to buffer + prompt_name.assign(&tmp[0], tmp.size()); + } else { + fprintf(stderr, "%s: invalid prompt name\n", __func__); + } + + int64_t memsize = 0; + + struct ggml_tensor * semantic_prompt; + struct ggml_tensor * coarse_prompt; + struct ggml_tensor * fine_prompt; + + std::map prompt_tensors = { + { "semantic_prompt", semantic_prompt }, + { "coarse_prompt" , coarse_prompt }, + { "fine_prompt" , fine_prompt }, + }; + + int32_t n_keys; + read_safe(fin, n_keys); + + for (int k = 0; k < n_keys; k++) { + int32_t n_dims; + int32_t length; + + read_safe(fin, n_dims); + read_safe(fin, length); + + int64_t nelements = 1; + int64_t ne[4] = { 1, 1, 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + read_safe(fin, ne[i]); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if ((name != "semantic_prompt") && (name != "coarse_prompt") && (name != "fine_prompt")) { + fprintf(stderr, "%s: tensor '%s' has an unknown key: '%s'\n", __func__, prompt_name, name); + return false; + } + + const size_t bpe = ggml_type_size(GGML_TYPE_I32); + + auto & tensor = prompt_tensors[name]; + tensor = ggml_new_tensor(ctx, GGML_TYPE_I32, 4, ne); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + memsize += ggml_nbytes(tensor); + } + + struct bark_voice voice = { + /*.name =*/ prompt_name, + /*.semantic_prompt =*/ prompt_tensors["semantic_prompt"], + /*.coarse_prompt =*/ prompt_tensors["coarse_prompt"], + /*.fine_prompt =*/ prompt_tensors["fine_prompt"], + /*.memsize =*/ memsize, + }; + + history_prompts.voices[prompt_name] = &voice; + history_prompts.memsize += memsize; + } + + return true; +} + bool gpt_model_load(const std::string& fname, gpt_model& model) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { diff --git a/bark.h b/bark.h index a58a471..7051ddd 100644 --- a/bark.h +++ b/bark.h @@ -81,6 +81,24 @@ struct gpt_hparams { int32_t n_codes_given = 1; }; +struct bark_voice { + std::string name; + + struct ggml_tensor * semantic_prompt; + struct ggml_tensor * coarse_prompt; + struct ggml_tensor * fine_prompt; + + int64_t memsize; +}; + +struct bark_history_prompts { + struct ggml_context * ctx; + + std::map voices; + + int64_t memsize; +}; + struct bark_vocab { using id = int32_t; using token = std::string; @@ -159,6 +177,9 @@ struct bark_model { // vocab bark_vocab vocab; + // history prompts + bark_history_prompts history_prompts; + int64_t memsize = 0; }; From db24cffc879f6ab866f68d90dd3807050083263f Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 15:03:01 +0200 Subject: [PATCH 4/8] load history prompt --- bark.cpp | 23 +++++++++++++++++------ bark.h | 12 +++++++++--- examples/main.cpp | 2 +- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/bark.cpp b/bark.cpp index be6ee91..7183b5b 100644 --- a/bark.cpp +++ b/bark.cpp @@ -638,14 +638,14 @@ bool gpt_model_load(const std::string& fname, gpt_model& model) { 
return true; } -bool bark_model_load(const std::string & dirname, bark_model & model) { +bool bark_model_load(const std::string & dirname, bark_model & model, bool load_history_prompts) { printf("%s: loading model from '%s'\n", __func__, dirname.c_str()); // text { printf("%s: reading bark text model\n", __func__); const std::string fname = dirname + "/ggml_weights_text.bin"; - if(!gpt_model_load(fname, model.text_model)) { + if (!gpt_model_load(fname, model.text_model)) { fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); return false; } @@ -658,7 +658,7 @@ bool bark_model_load(const std::string & dirname, bark_model & model) { const std::string fname = dirname + "/ggml_vocab.bin"; const gpt_hparams hparams = model.text_model.hparams; const int32_t expected_size = hparams.n_in_vocab - hparams.n_out_vocab - 5; - if(!bark_vocab_load(fname, model.vocab, expected_size)) { + if (!bark_vocab_load(fname, model.vocab, expected_size)) { fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); return false; } @@ -668,7 +668,7 @@ bool bark_model_load(const std::string & dirname, bark_model & model) { { printf("\n%s: reading bark coarse model\n", __func__); const std::string fname = dirname + "/ggml_weights_coarse.bin"; - if(!gpt_model_load(fname, model.coarse_model)) { + if (!gpt_model_load(fname, model.coarse_model)) { fprintf(stderr, "%s: invalid model file '%s' (bad coarse)\n", __func__, fname.c_str()); return false; } @@ -679,7 +679,7 @@ bool bark_model_load(const std::string & dirname, bark_model & model) { { printf("\n%s: reading bark fine model\n", __func__); const std::string fname = dirname + "/ggml_weights_fine.bin"; - if(!gpt_model_load(fname, model.fine_model)) { + if (!gpt_model_load(fname, model.fine_model)) { fprintf(stderr, "%s: invalid model file '%s' (bad fine)\n", __func__, fname.c_str()); return false; } @@ -690,13 +690,24 @@ bool bark_model_load(const std::string & dirname, bark_model & model) { { printf("\n%s: reading bark codec model\n", __func__); const std::string fname = dirname + "/ggml_weights_codec.bin"; - if(!encodec_model_load(fname, model.codec_model)) { + if (!encodec_model_load(fname, model.codec_model)) { fprintf(stderr, "%s: invalid model file '%s' (bad codec)\n", __func__, fname.c_str()); return false; } model.memsize += model.codec_model.memsize; } + // history prompts + if (load_history_prompts) { + printf("\n%s: reading history prompts\n", __func__); + const std::string fname = dirname + "/ggml_prompts.bin"; + if (!bark_prompt_load(fname, model.history_prompts)) { + fprintf(stderr, "%s: invalid prompt file '%s'\n", __func__, fname.c_str()); + return false; + } + model.memsize += model.history_prompts.memsize; + } + printf("\n%s: total model size = %8.2f MB\n", __func__, model.memsize/1024.0/1024.0); return true; diff --git a/bark.h b/bark.h index 7051ddd..d6dbdac 100644 --- a/bark.h +++ b/bark.h @@ -208,9 +208,15 @@ bark_vocab::id gpt_sample( float temp, float * eos_p); -bool bark_model_load(const std::string & dirname, bark_model & model); - -bool bark_vocab_load(const std::string & fname, bark_vocab& vocab, int32_t expected_size); +bool bark_model_load( + const std::string & dirname, + bark_model & model, + bool load_history_prompts); + +bool bark_vocab_load( + const std::string & fname, + bark_vocab & vocab, + int32_t expected_size); void bert_tokenize( const bark_vocab & vocab, diff --git a/examples/main.cpp b/examples/main.cpp index 3abdef3..b5b9cff 100644 --- a/examples/main.cpp +++ 
b/examples/main.cpp @@ -26,7 +26,7 @@ int main(int argc, char **argv) { { const int64_t t_start_us = ggml_time_us(); - if(!bark_model_load(fname, model)) { + if(!bark_model_load(fname, model, false)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, fname.c_str()); return 1; } From aad66efefe6150a312e24fecb8d92a2fd4c06b78 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 15:10:29 +0200 Subject: [PATCH 5/8] add custom voice to API --- bark.cpp | 8 ++++++-- bark.h | 1 + examples/main.cpp | 7 ++++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/bark.cpp b/bark.cpp index 7183b5b..631e8d4 100644 --- a/bark.cpp +++ b/bark.cpp @@ -1959,6 +1959,8 @@ bool bark_params_parse(int argc, char ** argv, bark_params & params) { params.seed = std::stoi(argv[++i]); } else if (arg == "-o" || arg == "--outwav") { params.dest_wav_path = argv[++i]; + } else if (arg == "-v" || arg == "--voice") { + params.voice = argv[++i]; } else if (arg == "-h" || arg == "--help") { bark_print_usage(argv, params); exit(0); @@ -1981,9 +1983,11 @@ void bark_print_usage(char ** argv, const bark_params & params) { fprintf(stderr, " -s N, --seed N seed for random number generator (default: %d)\n", params.seed); fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, " -o FNAME, --outwav FNAME\n"); + fprintf(stderr, " -o FNAME, --outwav FNAME\n"); fprintf(stderr, " output generated wav (default: %s)\n", params.dest_wav_path.c_str()); + fprintf(stderr, " -v VOICE, --voice VOICE\n"); + fprintf(stderr, " custom voice (default: none)\n", params.voice.c_str()); fprintf(stderr, "\n"); } diff --git a/bark.h b/bark.h index d6dbdac..90af8b3 100644 --- a/bark.h +++ b/bark.h @@ -63,6 +63,7 @@ struct bark_params { int32_t seed = 0; std::string prompt; // user prompt + std::string voice; // custom voice (history prompts) std::string dest_wav_path = "./output.wav"; }; diff --git a/examples/main.cpp b/examples/main.cpp index b5b9cff..f4d2e41 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -17,16 +17,21 @@ int main(int argc, char **argv) { bark_model model; std::string fname = "./ggml_weights"; + bool load_history_prompts = false; if (!params.model.empty()) { fname = params.model; } + if (!params.voice.empty()) { + load_history_prompts = true; + } + // load the model { const int64_t t_start_us = ggml_time_us(); - if(!bark_model_load(fname, model, false)) { + if(!bark_model_load(fname, model, load_history_prompts)) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, fname.c_str()); return 1; } From 922b929333208551dbe2b80b7b105e48cbbfbd46 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Thu, 17 Aug 2023 15:23:09 +0200 Subject: [PATCH 6/8] pass args to text 3 forward pass funcs --- bark.cpp | 21 ++++++++++----------- bark.h | 8 ++++++-- examples/main.cpp | 4 +++- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/bark.cpp b/bark.cpp index 631e8d4..964a5c1 100644 --- a/bark.cpp +++ b/bark.cpp @@ -1481,6 +1481,7 @@ bark_sequence bark_tokenize_input(const char * text, const bark_vocab & vocab, i bark_sequence bark_forward_text_encoder( const bark_sequence & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp, @@ -1547,6 +1548,7 @@ 
bark_sequence bark_forward_text_encoder( bark_codes bark_forward_coarse_encoder( const bark_sequence & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp, @@ -1670,6 +1672,7 @@ bark_codes bark_forward_coarse_encoder( bark_codes bark_forward_fine_encoder( const bark_codes & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp) { @@ -1891,16 +1894,12 @@ int write_wav_on_disk(audio_arr_t& audio_arr, std::string dest_path) { bool bark_generate_audio( bark_model model, - const bark_vocab& vocab, + const bark_vocab & vocab, const char * text, const int n_threads, const int32_t seed, - const std::string& dest_wav_path) { - // TODO move into params - // const int top_k = 10; - // const int seed = 0; - - // const float top_p = 0.2; + const std::string & dest_wav_path, + const std::string & voice) { const float temp = 0.7; const float fine_temp = 0.5; @@ -1912,7 +1911,7 @@ bool bark_generate_audio( std::mt19937 rng(seed); // tokenize input (bert tokenizer) - int32_t block_size = model.text_model.hparams.block_size; + int32_t block_size = model.text_model.hparams.block_size; bark_sequence tokens = bark_tokenize_input(text, vocab, block_size); printf("%s: prompt: '%s'\n", __func__, text); @@ -1924,15 +1923,15 @@ bool bark_generate_audio( printf("\n"); bark_sequence semantic_tokens = bark_forward_text_encoder( - tokens, model.text_model, rng, n_threads, temp, min_eos_p); + tokens, model.text_model, voice, rng, n_threads, temp, min_eos_p); printf("\n"); bark_codes coarse_tokens = bark_forward_coarse_encoder( - semantic_tokens, model.coarse_model, rng, n_threads, temp, max_coarse_history, sliding_window_size); + semantic_tokens, model.coarse_model, voice, rng, n_threads, temp, max_coarse_history, sliding_window_size); printf("\n"); bark_codes fine_tokens = bark_forward_fine_encoder( - coarse_tokens, model.fine_model, rng, n_threads, fine_temp); + coarse_tokens, model.fine_model, voice, rng, n_threads, fine_temp); printf("\n"); audio_arr_t audio_arr = bark_forward_encodec(fine_tokens, model.codec_model); diff --git a/bark.h b/bark.h index 90af8b3..6d15e0d 100644 --- a/bark.h +++ b/bark.h @@ -228,15 +228,17 @@ void bert_tokenize( bool bark_generate_audio( bark_model model, - const bark_vocab& vocab, + const bark_vocab & vocab, const char * text, const int n_threads, const int32_t seed, - const std::string& dest_wav_path); + const std::string & dest_wav_path, + const std::string & voice); bark_sequence bark_forward_text_encoder( const bark_sequence & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp, @@ -245,6 +247,7 @@ bark_sequence bark_forward_text_encoder( bark_codes bark_forward_coarse_encoder( const bark_sequence & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp, @@ -254,6 +257,7 @@ bark_codes bark_forward_coarse_encoder( bark_codes bark_forward_fine_encoder( const bark_codes & tokens, const gpt_model model, + const std::string & voice, std::mt19937 & rng, const int n_threads, const float temp); diff --git a/examples/main.cpp b/examples/main.cpp index f4d2e41..805a3f3 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -47,7 +47,9 @@ int main(int argc, char **argv) { } const int64_t t_eval_us_start = ggml_time_us(); - bark_generate_audio(model, model.vocab, prompt.data(), params.n_threads, params.seed, params.dest_wav_path); + 
bark_generate_audio( + model, model.vocab, prompt.data(), params.n_threads, params.seed, + params.dest_wav_path, params.voice); t_eval_us = ggml_time_us() - t_eval_us_start; // report timing From c4753ce1200997cb10527d0aa91fe2c02e0750a6 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sat, 19 Aug 2023 20:14:48 +0200 Subject: [PATCH 7/8] semantic tokens --- bark.cpp | 77 +++++++++++++++++++++++++++++++++++++++++++++----------- bark.h | 7 ++++-- 2 files changed, 68 insertions(+), 16 deletions(-) diff --git a/bark.cpp b/bark.cpp index 964a5c1..1424c89 100644 --- a/bark.cpp +++ b/bark.cpp @@ -318,7 +318,7 @@ bool bark_prompt_load(const std::string & fname, bark_history_prompts & history_ fin.read(&name[0], length); if ((name != "semantic_prompt") && (name != "coarse_prompt") && (name != "fine_prompt")) { - fprintf(stderr, "%s: tensor '%s' has an unknown key: '%s'\n", __func__, prompt_name, name); + fprintf(stderr, "%s: tensor '%s' has an unknown key: '%s'\n", __func__, prompt_name.c_str(), name.c_str()); return false; } @@ -1468,20 +1468,65 @@ bark_sequence bark_tokenize_input(const char * text, const bark_vocab & vocab, i tokens.resize(max_ctx_size); - // semantic history - for (int i = 0; i < 256; i++) - tokens.push_back(SEMANTIC_PAD_TOKEN); - tokens.push_back(SEMANTIC_INFER_TOKEN); + return tokens; +} - assert(tokens.size() == 256 + 256 + 1); +int bark_get_input_sequence( + struct bark_history_prompts * history_prompts, + std::vector & tokens, + std::vector & out, + const std::string & voice) { + BARK_ASSERT(tokens.size() == 256); - return tokens; + out.resize(513); + + struct bark_voice * history_prompt = nullptr; + if (!voice.empty()) { + if (history_prompts->voices.find(voice) != history_prompts->voices.end()) { + history_prompt = history_prompts->voices[voice]; + } else { + fprintf(stderr, "Could not find voice '%s'\n", voice.c_str()); + return false; + } + } + + auto & ctx = history_prompts->ctx; + struct ggml_cgraph gf = {}; + + struct ggml_tensor * semantic_history = nullptr; + if (history_prompt) { + semantic_history = history_prompt->semantic_prompt; + if (semantic_history->ne[0] >= 256) { + size_t offset = (semantic_history->ne[0] - 256) * semantic_history->nb[0]; + semantic_history = ggml_view_1d(ctx, semantic_history, 256, offset); + } else { + // constant padding + struct ggml_tensor * out = ggml_new_tensor_1d(ctx, semantic_history->type, 256); + out = ggml_set_f32(out, SEMANTIC_PAD_TOKEN); + semantic_history = ggml_set_1d(ctx, out, semantic_history, 0); + } + } else { + semantic_history = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 256); + semantic_history = ggml_set_i32(semantic_history, SEMANTIC_PAD_TOKEN); + } + + // concatenate tokens, semantic_history and [SEMANTIC_INFER_TOKEN] + struct ggml_tensor * input = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 513); + memcpy(input->data, tokens.data(), tokens.size()*sizeof(int32_t)); + input = ggml_set_1d(ctx, input, semantic_history, tokens.size()*sizeof(int32_t)); + *((float *) ((char *) ggml_get_data(input) + 512*input->nb[0])) = SEMANTIC_INFER_TOKEN; + + ggml_build_forward_expand(&gf, input); + ggml_graph_compute_with_ctx(ctx, &gf, 1); + + memcpy(out.data(), input->data, 513*sizeof(int32_t)); } bark_sequence bark_forward_text_encoder( - const bark_sequence & tokens, - const gpt_model model, + bark_sequence & tokens, + struct bark_history_prompts * history_prompts, const std::string & voice, + const gpt_model model, std::mt19937 & rng, const int n_threads, const float temp, @@ -1499,7 +1544,10 @@ bark_sequence 
bark_forward_text_encoder( float eos_p = 0; - bark_sequence input = tokens; + // build input token sequence + bark_sequence input; + bark_get_input_sequence(history_prompts, tokens, input, voice); + std::vector logits; // dry run to estimate mem_per_token @@ -1923,15 +1971,16 @@ bool bark_generate_audio( printf("\n"); bark_sequence semantic_tokens = bark_forward_text_encoder( - tokens, model.text_model, voice, rng, n_threads, temp, min_eos_p); + tokens, &model.history_prompts, voice, model.text_model, rng, n_threads, temp, min_eos_p); printf("\n"); bark_codes coarse_tokens = bark_forward_coarse_encoder( - semantic_tokens, model.coarse_model, voice, rng, n_threads, temp, max_coarse_history, sliding_window_size); + semantic_tokens, history_prompt, model.coarse_model, rng, n_threads, temp, + max_coarse_history, sliding_window_size); printf("\n"); bark_codes fine_tokens = bark_forward_fine_encoder( - coarse_tokens, model.fine_model, voice, rng, n_threads, fine_temp); + coarse_tokens, history_prompt, model.fine_model, rng, n_threads, fine_temp); printf("\n"); audio_arr_t audio_arr = bark_forward_encodec(fine_tokens, model.codec_model); @@ -1987,6 +2036,6 @@ void bark_print_usage(char ** argv, const bark_params & params) { fprintf(stderr, " -o FNAME, --outwav FNAME\n"); fprintf(stderr, " output generated wav (default: %s)\n", params.dest_wav_path.c_str()); fprintf(stderr, " -v VOICE, --voice VOICE\n"); - fprintf(stderr, " custom voice (default: none)\n", params.voice.c_str()); + fprintf(stderr, " custom voice (default: none)\n"); fprintf(stderr, "\n"); } diff --git a/bark.h b/bark.h index 6d15e0d..dc845ec 100644 --- a/bark.h +++ b/bark.h @@ -236,9 +236,10 @@ bool bark_generate_audio( const std::string & voice); bark_sequence bark_forward_text_encoder( - const bark_sequence & tokens, - const gpt_model model, + bark_sequence & tokens, + struct bark_history_prompts * history_prompts, const std::string & voice, + const gpt_model model, std::mt19937 & rng, const int n_threads, const float temp, @@ -246,6 +247,7 @@ bark_sequence bark_forward_text_encoder( bark_codes bark_forward_coarse_encoder( const bark_sequence & tokens, + struct bark_voice * history_prompt, const gpt_model model, const std::string & voice, std::mt19937 & rng, @@ -256,6 +258,7 @@ bark_codes bark_forward_coarse_encoder( bark_codes bark_forward_fine_encoder( const bark_codes & tokens, + struct bark_voice * history_prompt, const gpt_model model, const std::string & voice, std::mt19937 & rng, From ba2fccd09ea9eb733e34b2a8129f35bed4de0f48 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sat, 19 Aug 2023 22:42:02 +0200 Subject: [PATCH 8/8] custom voices coarse --- bark.cpp | 192 ++++++++++++++++++++++++++++++++++++++++--------------- bark.h | 6 +- 2 files changed, 142 insertions(+), 56 deletions(-) diff --git a/bark.cpp b/bark.cpp index 1424c89..7f2f55c 100644 --- a/bark.cpp +++ b/bark.cpp @@ -1471,12 +1471,12 @@ bark_sequence bark_tokenize_input(const char * text, const bark_vocab & vocab, i return tokens; } -int bark_get_input_sequence( +int bark_get_semantic_input_sequence( struct bark_history_prompts * history_prompts, - std::vector & tokens, - std::vector & out, + const bark_sequence & semantic_tokens, + bark_sequence & out, const std::string & voice) { - BARK_ASSERT(tokens.size() == 256); + BARK_ASSERT(semantic_tokens.size() == 256); out.resize(513); @@ -1512,18 +1512,22 @@ int bark_get_input_sequence( // concatenate tokens, semantic_history and [SEMANTIC_INFER_TOKEN] struct ggml_tensor * input = ggml_new_tensor_1d(ctx, 
GGML_TYPE_I32, 513); - memcpy(input->data, tokens.data(), tokens.size()*sizeof(int32_t)); - input = ggml_set_1d(ctx, input, semantic_history, tokens.size()*sizeof(int32_t)); + memcpy(input->data, semantic_tokens.data(), semantic_tokens.size()*sizeof(int32_t)); + input = ggml_set_1d(ctx, input, semantic_history, semantic_tokens.size()*sizeof(int32_t)); *((float *) ((char *) ggml_get_data(input) + 512*input->nb[0])) = SEMANTIC_INFER_TOKEN; ggml_build_forward_expand(&gf, input); ggml_graph_compute_with_ctx(ctx, &gf, 1); memcpy(out.data(), input->data, 513*sizeof(int32_t)); + + ggml_free(ctx); + + return 0; } bark_sequence bark_forward_text_encoder( - bark_sequence & tokens, + const bark_sequence & tokens, struct bark_history_prompts * history_prompts, const std::string & voice, const gpt_model model, @@ -1546,7 +1550,7 @@ bark_sequence bark_forward_text_encoder( // build input token sequence bark_sequence input; - bark_get_input_sequence(history_prompts, tokens, input, voice); + bark_get_semantic_input_sequence(history_prompts, tokens, input, voice); std::vector logits; @@ -1593,17 +1597,79 @@ bark_sequence bark_forward_text_encoder( return out; } +int bark_get_coarse_input_sequence( + struct bark_history_prompts * history_prompts, + const bark_sequence & tokens, + const std::string & voice, + bark_sequence & out_semantic, + bark_sequence & out_semantic_history, + bark_sequence & out_coarse_history, + int max_semantic_history, + float semantic_to_coarse_ratio) { + struct bark_voice * history_prompt = nullptr; + if (!voice.empty()) { + if (history_prompts->voices.find(voice) != history_prompts->voices.end()) { + history_prompt = history_prompts->voices[voice]; + } else { + fprintf(stderr, "Could not find voice '%s'\n", voice.c_str()); + return 1; + } + } + + auto & ctx = history_prompts->ctx; + + struct ggml_tensor * x_semantic_history = history_prompt->semantic_prompt; + struct ggml_tensor * x_coarse_history = history_prompt->coarse_prompt; + + // TODO: offset CODEBOOK_SIZE + + struct ggml_tensor * flattened_history = ggml_cpy(ctx, + x_coarse_history, + ggml_new_tensor_1d(ctx, GGML_TYPE_I32, x_coarse_history->ne[0]*x_coarse_history->ne[1])); + + struct ggml_tensor * offset = ggml_new_i32(ctx, SEMANTIC_VOCAB_SIZE); + flattened_history = ggml_add(ctx, flattened_history, ggml_repeat(ctx, offset, flattened_history)); + + int n_semantic_hist_provided = std::min( + max_semantic_history, + std::min( + (int) (x_semantic_history->ne[0] - (x_semantic_history->ne[0] % 2)), + (int) floorf(flattened_history->ne[0] / semantic_to_coarse_ratio) + ) + ); + int n_coarse_hist_provided = (int) roundf(n_semantic_hist_provided * semantic_to_coarse_ratio); + + out_semantic_history.resize(n_semantic_hist_provided); + int Ns = x_semantic_history->ne[0]; + memcpy( + out_semantic_history.data(), + (char *) x_semantic_history + (Ns - n_semantic_hist_provided), + n_semantic_hist_provided*sizeof(int32_t)); + + out_coarse_history.resize(n_coarse_hist_provided); + int Nc = flattened_history->ne[0]; + memcpy( + out_coarse_history.data(), + (char *) flattened_history + (Nc - n_coarse_hist_provided), + n_coarse_hist_provided*sizeof(int32_t)); + + return 0; +} + bark_codes bark_forward_coarse_encoder( - const bark_sequence & tokens, - const gpt_model model, + const bark_sequence & semantic_tokens, + struct bark_history_prompts * history_prompts, const std::string & voice, + const gpt_model model, std::mt19937 & rng, const int n_threads, const float temp, const int max_coarse_history, const int sliding_window_size) { - 
bark_codes out_coarse; - bark_sequence out; + + BARK_ASSERT(semantic_tokens.size() > 0); + BARK_ASSERT((max_coarse_history >= 60) && (max_coarse_history <= 630)); + BARK_ASSERT(max_coarse_history + sliding_window_size <= 1024 - 256); bark_progress progress; progress.func = __func__; @@ -1616,15 +1682,22 @@ bark_codes bark_forward_coarse_encoder( float semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS; int max_semantic_history = floorf(max_coarse_history / semantic_to_coarse_ratio); - int n_steps = floorf(tokens.size() * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS) * N_COARSE_CODEBOOKS; - int step_ix = 0; + bark_sequence x_semantic; + bark_sequence x_semantic_history; + bark_sequence x_coarse_history; + + bark_get_coarse_input_sequence(history_prompts, semantic_tokens, voice, x_semantic, + x_semantic_history, x_coarse_history); + int n_steps = floorf(semantic_tokens.size() * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS) * N_COARSE_CODEBOOKS; BARK_ASSERT(n_steps > 0); BARK_ASSERT(n_steps % N_COARSE_CODEBOOKS == 0); - int n_window_steps = ceilf(static_cast(n_steps) / sliding_window_size); + // concatenate x_semantic_history and x_semantic + x_semantic.insert(x_semantic.begin(), x_semantic_history.begin(), x_semantic_history.end()); + + bark_sequence x_coarse = x_coarse_history; - bark_sequence input = tokens; std::vector logits; // dry run to estimate mem_per_token @@ -1634,76 +1707,89 @@ bark_codes bark_forward_coarse_encoder( gpt_eval(model, n_threads, &n_past, false, { 0, 1, 2, 3 }, logits, mem_per_token); } + int base_semantic_idx = x_semantic_history.size(); + + bark_sequence x_semantic_in = x_semantic; + bark_sequence x_coarse_in = x_coarse; + + int n_window_steps = ceilf(static_cast(n_steps) / sliding_window_size); + int n_step = 0; + for (int i = 0; i < n_window_steps; i++) { - int semantic_ix = roundf(n_steps / semantic_to_coarse_ratio); + int semantic_idx = base_semantic_idx + roundf(n_steps / semantic_to_coarse_ratio); - bark_sequence input_in( - input.begin() + std::max(semantic_ix-max_semantic_history, 0), - input.end() + bark_sequence x_in( + x_semantic_in.begin() + std::max(semantic_idx - max_semantic_history, 0), + x_semantic_in.end() ); - size_t original_size = input_in.size(); - input_in.resize(256); + size_t original_size = x_in.size(); + x_in.resize(256); // padding from the right side - for (int ix = original_size; ix < 256; ix++) - input_in[ix] = COARSE_SEMANTIC_PAD_TOKEN; + for (int i = original_size; i < 256; i++) + x_in[i] = COARSE_SEMANTIC_PAD_TOKEN; - input_in.push_back(COARSE_INFER_TOKEN); + x_in.push_back(COARSE_INFER_TOKEN); // concatenate input_in and input_coarse - input_in.insert( - input_in.end(), - std::make_move_iterator(out.end() - std::min(max_coarse_history, (int) out.size())), - std::make_move_iterator(out.end()) + x_in.insert( + x_in.end(), + x_coarse_in.end() - std::min(max_coarse_history, (int) x_coarse_in.size()), + x_coarse_in.end() ); int n_past = 0; mem_per_token *= 1.1; // context length is growing, mem_per_token must grow as well for (int j = 0; j < sliding_window_size; j++) { - if (step_ix >= n_steps) + if (n_step >= n_steps) continue; + bool is_major = n_step % N_COARSE_CODEBOOKS == 0; + int64_t t_predict_start_us = ggml_time_us(); - gpt_eval(model, n_threads, &n_past, false, input_in, logits, mem_per_token); + gpt_eval(model, n_threads, &n_past, false, x_in, logits, mem_per_token); t_predict_us += (ggml_time_us() - t_predict_start_us); - input_in.clear(); + x_in.clear(); - bool is_major = step_ix % 
N_COARSE_CODEBOOKS == 0; - int start_ix = SEMANTIC_VOCAB_SIZE + (1 - is_major) * CODEBOOK_SIZE; - int end_ix = SEMANTIC_VOCAB_SIZE + (2 - is_major) * CODEBOOK_SIZE; - std::vector relevant_logits(logits.begin() + start_ix, logits.begin() + end_ix); + int logit_start_ix = SEMANTIC_VOCAB_SIZE + (1 - is_major) * CODEBOOK_SIZE; + int logit_end_ix = SEMANTIC_VOCAB_SIZE + (2 - is_major) * CODEBOOK_SIZE; + std::vector relevant_logits( + logits.begin() + logit_start_ix, + logits.begin() + logit_end_ix + ); int64_t t_sample_start_us = ggml_time_us(); - bark_vocab::id next = gpt_sample(relevant_logits, rng, temp, NULL); + bark_vocab::id item_next = gpt_sample(relevant_logits, rng, temp, NULL); t_sample_us += (ggml_time_us() - t_sample_start_us); - next += start_ix; + item_next += logit_start_ix; - input_in.push_back(next); - out.push_back(next); - - // printf("%d ", next); - // fflush(stdout); - - step_ix += 1; + x_in.push_back(item_next); + x_coarse_in.push_back(item_next); + n_step += 1; progress.callback((float) (i*sliding_window_size+j)/n_steps); } } - BARK_ASSERT((int) out.size() == n_steps); - BARK_ASSERT(out.size() % N_COARSE_CODEBOOKS == 0); + size_t history_size = x_coarse_history.size(); + x_coarse_in.erase(x_coarse_in.begin(), x_coarse_in.begin() + history_size); + + BARK_ASSERT((int) x_coarse_in.size() == n_steps); + BARK_ASSERT(x_coarse_in.size() % N_COARSE_CODEBOOKS == 0); // out_coarse: [seq_length, n_codes] - for (int i = 0; i < (int) out.size(); i += N_COARSE_CODEBOOKS) { + bark_codes coarse_audio_arr; + + for (int i = 0; i < (int) x_coarse_in.size(); i += N_COARSE_CODEBOOKS) { // this assumes N_COARSE_CODEBOOKS = 2 bark_sequence _tmp = { - out[i] - SEMANTIC_VOCAB_SIZE, - out[i+1] - SEMANTIC_VOCAB_SIZE - CODEBOOK_SIZE + x_coarse_in[i] - SEMANTIC_VOCAB_SIZE, + x_coarse_in[i+1] - SEMANTIC_VOCAB_SIZE - CODEBOOK_SIZE }; - out_coarse.push_back(_tmp); + coarse_audio_arr.push_back(_tmp); } const int64_t t_main_end_us = ggml_time_us(); @@ -1711,10 +1797,10 @@ bark_codes bark_forward_coarse_encoder( printf("\n\n"); printf("%s: mem per token = %8.2f MB\n", __func__, mem_per_token/1000.0f/1000.0f); printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/step_ix); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_step); printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - return out_coarse; + return coarse_audio_arr; } bark_codes bark_forward_fine_encoder( diff --git a/bark.h b/bark.h index dc845ec..e41623f 100644 --- a/bark.h +++ b/bark.h @@ -236,7 +236,7 @@ bool bark_generate_audio( const std::string & voice); bark_sequence bark_forward_text_encoder( - bark_sequence & tokens, + const bark_sequence & tokens, struct bark_history_prompts * history_prompts, const std::string & voice, const gpt_model model, @@ -247,9 +247,9 @@ bark_sequence bark_forward_text_encoder( bark_codes bark_forward_coarse_encoder( const bark_sequence & tokens, - struct bark_voice * history_prompt, - const gpt_model model, + struct bark_history_prompts * history_prompt, const std::string & voice, + const gpt_model model, std::mt19937 & rng, const int n_threads, const float temp,
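
---

A note on the new on-disk format introduced by this series: PATCH 1/8 teaches convert.py to pack every `*_speaker_*.npz` history prompt into a single `ggml_prompts.bin` (ggml magic, prompt count, then per prompt a length-prefixed name followed by its arrays, each preceded by an n_dims/key-length header, the shape written innermost-first, the key bytes, and the raw array data), and PATCH 3/8 adds `bark_prompt_load` to read that file back into ggml tensors. Below is a minimal Python sketch of a reader for the layout produced by `generate_prompts_file`; the function name `read_prompts` is illustrative only, and the sketch assumes each voice carries exactly the three arrays `semantic_prompt`, `coarse_prompt` and `fine_prompt`, stored as int32 (which is how `bark_prompt_load` interprets the data) — actual prompt files may differ.

```python
# Hypothetical reader for the ggml_prompts.bin layout written by
# generate_prompts_file in PATCH 1/8. Assumptions (not guaranteed by the
# patches): three arrays per prompt, int32 element type.
import struct
import numpy as np

def read_prompts(path: str) -> dict:
    prompts = {}
    with open(path, "rb") as fin:
        magic, n_prompts = struct.unpack("ii", fin.read(8))
        assert magic == 0x67676d6c, "not a ggml file"

        for _ in range(n_prompts):
            (name_len,) = struct.unpack("i", fin.read(4))
            name = fin.read(name_len).decode("utf-8")

            # each bark voice is expected to hold three arrays:
            # semantic_prompt, coarse_prompt, fine_prompt
            arrays = {}
            for _ in range(3):
                n_dims, key_len = struct.unpack("ii", fin.read(8))
                # dims were written innermost-first; reverse to get the shape back
                shape = struct.unpack(f"{n_dims}i", fin.read(4 * n_dims))[::-1]
                key = fin.read(key_len).decode("utf-8")
                count = int(np.prod(shape))
                data = np.fromfile(fin, dtype=np.int32, count=count)
                arrays[key] = data.reshape(shape)

            prompts[name] = arrays
    return prompts
```

Once `ggml_prompts.bin` has been generated, a voice can be selected at generation time with the new `-v`/`--voice` flag added in PATCH 5/8 (for example `-v en_speaker_0`, assuming the stock Bark prompt names), which makes examples/main.cpp pass `load_history_prompts = true` to `bark_model_load` so the prompt file is only read when a custom voice is actually requested.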